// NekoSpider
import org.apache.xerces.xni.*;
import org.apache.xerces.xni.parser.XMLInputSource;
import org.cyberneko.pull.*;
import org.cyberneko.pull.event.*;
import org.cyberneko.pull.parsers.Xerces2;
import java.net.*;
import java.io.*;
import java.util.*;
/**
 * A small XLink spider built on the NekoPull pull parser.
 *
 * <p>Starting from one URL, it parses each document, collects the
 * {@code xlink:href} of every start tag that also carries an
 * {@code xlink:type} attribute, and visits the linked documents in
 * first-in/first-out order. Documents that cannot be read or parsed are
 * reported on standard error and skipped.
 *
 * <p>Instances keep mutable traversal state and use no synchronization.
 */
public class NekoSpider {

    /** Namespace URI of the XLink attributes this spider follows. */
    private static final String XLINK_NAMESPACE = "http://www.w3.org/1999/xlink";

    // External forms of every URL that has been spidered or is waiting in
    // the queue, so nothing is fetched twice and cycles cannot loop forever.
    // Strings are stored instead of URL objects because URL.equals() and
    // URL.hashCode() may perform blocking host-name resolution.
    private final Set seenURIs = new HashSet();

    // Where we're going next. Always accessed first-in/first-out:
    // addFirst() to enqueue, removeLast() to dequeue.
    private final LinkedList queue = new LinkedList();

    // Document currently being parsed; base for resolving relative hrefs.
    private URL currentURL;

    private final XMLPullParser parser;

    /** Creates a spider backed by a Xerces2 pull parser. */
    public NekoSpider() {
        this.parser = new Xerces2();
    }

    /**
     * Queues the XLink target of a start tag, if it has one that has not
     * been seen before.
     *
     * @param element the start-tag event to inspect
     */
    private void processStartTag(ElementEvent element) {
        XMLAttributes attributes = element.attributes;

        // Only elements that declare an xlink:type are treated as links.
        if (attributes.getValue(XLINK_NAMESPACE, "type") == null) {
            return;
        }
        String href = attributes.getValue(XLINK_NAMESPACE, "href");
        if (href == null) {
            return;
        }

        try {
            // Resolve relative references against the current document.
            URL foundURL = new URL(currentURL, href);
            // add() returns false when the URL was already spidered or is
            // already queued, so each document is queued at most once.
            if (seenURIs.add(foundURL.toExternalForm())) {
                queue.addFirst(foundURL);
            }
        }
        catch (MalformedURLException ignored) {
            // Deliberately best-effort: an unusable href is just skipped.
        }
    }

    /**
     * Spiders the given document and then every document reachable from it
     * through XLinks, in first-in/first-out (breadth-first) order.
     *
     * <p>Prints a {@code "Spidering ..."} line to standard output for each
     * document visited. Failures never propagate to the caller.
     *
     * @param uri the URL of the document to start from
     */
    public void spider(URL uri) {
        queue.addFirst(uri);
        // Drain iteratively rather than recursing per document, so stack
        // depth does not grow with the length of the link chain.
        while (!queue.isEmpty()) {
            spiderDocument((URL) queue.removeLast());
        }
    }

    /**
     * Parses a single document and queues the new links found in it.
     *
     * @param uri the URL of the document to parse
     */
    private void spiderDocument(URL uri) {
        System.out.println("Spidering " + uri);
        try {
            XMLInputSource source
                = new XMLInputSource(null, uri.toExternalForm(), null);
            parser.setInputSource(source);
            // Must be set before any start tag is processed; with a null
            // base every relative href would be rejected as malformed.
            currentURL = uri;
            seenURIs.add(uri.toExternalForm());

            XMLEvent event;
            while ((event = parser.nextEvent()) != null) {
                if (event.type == XMLEvent.ELEMENT) {
                    ElementEvent element = (ElementEvent) event;
                    if (element.start) {
                        processStartTag(element);
                    }
                }
            } // end while
        }
        catch (Exception ex) {
            // Skip this document, but say so instead of failing silently.
            System.err.println("Skipping " + uri + ": " + ex);
        }
    }

    /**
     * Command-line entry point.
     *
     * @param args a single argument: the URL to start spidering from
     * @throws Exception if the argument is not a valid URL
     */
    public static void main(String[] args) throws Exception {
        if (args.length == 0) {
            System.err.println("Usage: java NekoSpider url" );
            return;
        }
        NekoSpider spider = new NekoSpider();
        spider.spider(new URL(args[0]));
    } // end main

} // end NekoSpider