// Converting slashdot.xml to Tab Delimited Text
import org.xml.sax.*;
import org.xml.sax.helpers.*;
import java.io.*;
import java.net.*;
/**
 * SAX 1.0 document handler that flattens a Slashdot headline feed into
 * tab-delimited text.
 *
 * <p>For every story the text of the title, url, author, department, topic,
 * comments and section elements is written followed by a tab; the text of the
 * image element is written followed by CR LF, which ends the record. All
 * other elements (including time) are ignored.
 */
public class SlashdotTab implements DocumentHandler {

    /** Destination of the tab-delimited text. */
    private final Writer out;

    // one state; either we're in a tag that needs to use
    // characters or we're not
    boolean useCharacters = false;

    /**
     * Creates a handler that writes records to the given writer.
     *
     * @param out destination of the tab-delimited text
     */
    public SlashdotTab(Writer out) {
        this.out = out;
    }

    /**
     * Creates a handler that writes records to the given stream, encoding
     * characters with the default encoding of {@link OutputStreamWriter}.
     *
     * @param out destination of the tab-delimited text
     */
    public SlashdotTab(OutputStream out) {
        this(new OutputStreamWriter(out));
    }

    /** Not used by this handler. */
    public void setDocumentLocator(Locator locator) {}

    /** Not used by this handler. */
    public void startDocument() throws SAXException {}

    /**
     * Flushes the output once the whole document has been seen.
     * Never forget to flush!
     *
     * @throws SAXException wrapping any IOException raised by the writer
     */
    public void endDocument() throws SAXException {
        try {
            out.flush();
        }
        catch (IOException e) {
            throw new SAXException(e);
        }
    }

    /* <story>
         <title>The Onion to buy the New York Times</title>
         <url>http://slashdot.org/articles/00/02/19/1128240.shtml</url>
         <time>2000-02-19 17:25:15</time>
         <author>CmdrTaco</author>
         <department>stuff-to-read</department>
         <topic>media</topic>
         <comments>20</comments>
         <section>articles</section>
         <image>topicmedia.gif</image>
       </story>
    */

    /**
     * Starts copying character data when one of the exported fields opens.
     *
     * @param name element name as reported by the parser
     * @param atts attributes of the element; not used
     */
    public void startElement(String name, AttributeList atts)
            throws SAXException {
        if (isTabTerminatedField(name) || isRecordTerminator(name)) {
            useCharacters = true;
        }
    }

    /**
     * Writes the delimiter that follows a field and stops copying character
     * data: a tab after an ordinary field, CR LF after the image element.
     *
     * @param name element name as reported by the parser
     * @throws SAXException wrapping any IOException raised by the writer
     */
    public void endElement(String name) throws SAXException {
        try {
            if (isTabTerminatedField(name)) {
                out.write('\t');
                useCharacters = false;
            }
            else if (isRecordTerminator(name)) {
                out.write("\r\n");
                useCharacters = false;
            }
        }
        catch (IOException e) {
            throw new SAXException(e);
        }
    }

    /**
     * Copies character data of an exported field to the output.
     *
     * <p>Tabs, carriage returns and line feeds inside the data are each
     * written as a single space, because emitting them verbatim would be
     * indistinguishable from the field and record delimiters. The check is
     * per character, so it also works when the parser delivers one field in
     * several calls. The caller's array is never modified.
     *
     * @param text   buffer holding the character data
     * @param start  index of the first character to use
     * @param length number of characters to use
     * @throws SAXException wrapping any IOException raised by the writer
     */
    public void characters(char[] text, int start, int length)
            throws SAXException {
        if (!useCharacters) {
            return;
        }
        try {
            int end = start + length;
            int runStart = start;
            for (int i = start; i < end; i++) {
                char c = text[i];
                if (c == '\t' || c == '\r' || c == '\n') {
                    // flush the clean run before the delimiter-like character
                    out.write(text, runStart, i - runStart);
                    out.write(' ');
                    runStart = i + 1;
                }
            }
            out.write(text, runStart, end - runStart);
        }
        catch (IOException e) {
            throw new SAXException(e);
        }
    }

    /** Not used by this handler. */
    public void ignorableWhitespace(char[] text, int start, int length)
            throws SAXException {}

    /** Not used by this handler. */
    public void processingInstruction(String target, String data)
            throws SAXException {}

    /** Tells whether the named element is a field that is followed by a tab. */
    private static boolean isTabTerminatedField(String name) {
        return name.equals("title") || name.equals("url")
            || name.equals("author") || name.equals("department")
            || name.equals("topic") || name.equals("comments")
            || name.equals("section");
    }

    /** Tells whether the named element is the field that ends a record. */
    private static boolean isRecordTerminator(String name) {
        return name.equals("image");
    }

    /**
     * Finds a SAX 1.0 parser: first whatever ParserFactory selects on its
     * own, then Xerces by class name.
     *
     * @return a parser, or null if neither attempt succeeded
     */
    private static Parser locateParser() {
        try {
            return ParserFactory.makeParser();
        }
        catch (Exception e) {
            // fall back on Xerces parser by name
            try {
                return ParserFactory.makeParser(
                    "org.apache.xerces.parsers.SAXParser");
            }
            catch (Exception ee) {
                return null;
            }
        }
    }

    /**
     * Parses the document named by the first argument (or the Slashdot feed
     * if there is none) and prints it to System.out as tab-delimited text.
     * Diagnostics go to System.err so they never mix with the data.
     *
     * @param args optional URL of the document to convert
     */
    // Could easily have put main() method in a separate class
    public static void main(String[] args) {
        Parser parser = locateParser();
        if (parser == null) {
            System.err.println("Couldn't locate a SAX parser");
            return;
        }

        String url = "http://www.slashdot.org/slashdot.xml";
        if (args.length != 0) {
            url = args[0];
        }

        // Install the Document Handler
        parser.setDocumentHandler(new SlashdotTab(System.out));

        // command line should offer URIs or file names
        try {
            parser.parse(url);
        }
        catch (SAXParseException e) { // well-formedness error
            System.err.println(url + " is not well formed.");
            System.err.println(e.getMessage()
                + " at line " + e.getLineNumber()
                + ", column " + e.getColumnNumber());
        }
        catch (SAXException e) { // some other kind of error
            System.err.println(e.getMessage());
        }
        catch (IOException e) {
            System.err.println("Could not read " + url
                + " because of the IOException " + e);
        }
    }
}