// Attributes Example: XLinkSpider
import org.xmlpull.v1.*;
import java.net.*;
import java.io.*;
import java.util.*;
/**
 * A simple spider that follows XLinks from one XML document to the next.
 *
 * <p>Each document is read with an XmlPull parser. Every start tag that
 * carries both an {@code xlink:type} and an {@code xlink:href} attribute
 * contributes a link, resolved against the URL of the document being read.
 * Documents are visited in first-in/first-out order, and each distinct URL
 * is fetched at most once per spider instance unless it is passed to
 * {@link #spider(URL)} again explicitly.
 *
 * <p>Instances are not thread-safe: a single parser and the "current
 * document" field are shared by all calls without synchronization.
 */
public class PullSpider {

  /** Namespace URI of the XLink attributes this spider looks for. */
  private static final String XLINK_NAMESPACE = "http://www.w3.org/1999/xlink";

  // Need to keep track of where we've been (and where we already plan to
  // go) so we don't get stuck in an infinite loop or fetch a document twice.
  // URLs are stored in their string form on purpose: URL.equals() and
  // URL.hashCode() resolve host names, which can block on DNS and treats
  // different virtual hosts on the same IP address as equal.
  private final Set<String> seenURIs = new HashSet<String>();

  // Documents waiting to be spidered. Always accessed first-in/first-out:
  // entries are added at the head and removed from the tail.
  private final LinkedList<URL> queue = new LinkedList<URL>();

  // URL of the document currently being parsed; base for relative hrefs.
  private URL currentURL;

  private final XmlPullParser parser;

  /**
   * Creates a spider backed by a namespace-aware pull parser.
   *
   * @throws RuntimeException if no XmlPull parser implementation can be
   *     created; the underlying parser exception is attached as the cause
   */
  public PullSpider() {
    try {
      XmlPullParserFactory factory = XmlPullParserFactory.newInstance();
      // Namespace awareness is required to look attributes up by the
      // XLink namespace URI rather than by prefix.
      factory.setNamespaceAware(true);
      this.parser = factory.newPullParser();
    }
    catch (XmlPullParserException ex) {
      // Keep the original exception so the reason is not lost.
      throw new RuntimeException("Could not locate a pull parser", ex);
    }
  }

  /**
   * Inspects the start tag the parser is positioned on and, if it has both
   * an xlink:type and an xlink:href attribute, queues the link target
   * unless that target has already been seen.
   */
  private void processStartTag() {
    String type = parser.getAttributeValue(XLINK_NAMESPACE, "type");
    if (type == null) {
      return;
    }
    String href = parser.getAttributeValue(XLINK_NAMESPACE, "href");
    if (href == null) {
      return;
    }
    try {
      URL foundURL = new URL(currentURL, href);
      // Set.add() returns false for a URL that is already known, so a
      // link repeated before its turn comes up is queued only once.
      if (seenURIs.add(foundURL.toExternalForm())) {
        queue.addFirst(foundURL);
      }
    }
    catch (MalformedURLException ex) {
      // The href cannot be turned into a URL; skip this link.
    }
  }

  /**
   * Parses a single document and queues the links found in it. Failures
   * are reported on standard error and otherwise ignored, so one bad
   * document does not stop the rest of the crawl.
   */
  private void spiderDocument(URL uri) {
    System.out.println("Spidering " + uri);
    currentURL = uri;
    try {
      InputStream in = uri.openStream();
      try {
        parser.setInput(in, null);
        for (int event = parser.next();
             event != XmlPullParser.END_DOCUMENT;
             event = parser.next()) {
          if (event == XmlPullParser.START_TAG) {
            processStartTag();
          }
        }
      }
      finally {
        try {
          in.close();
        }
        catch (IOException ignored) {
          // Nothing useful can be done if closing fails.
        }
      }
    }
    catch (Exception ex) {
      // Per-document boundary: the crawl is best-effort, so report the
      // problem and skip this document instead of aborting.
      System.err.println("Skipping " + uri + ": " + ex);
    }
  }

  /**
   * Spiders the given document and then every document reachable from it
   * through XLinks, in first-in/first-out order. Documents that cannot be
   * opened or parsed are reported on standard error and skipped; this
   * method does not throw for them.
   *
   * @param uri the URL of the document to start from; if {@code null},
   *     a message is printed to standard error and nothing is spidered
   */
  public void spider(URL uri) {
    if (uri == null) {
      System.err.println("No URL to spider");
      return;
    }
    seenURIs.add(uri.toExternalForm());
    spiderDocument(uri);
    // Drain the queue with a loop rather than recursion so that the call
    // stack does not grow with the number of documents visited.
    while (!queue.isEmpty()) {
      spiderDocument(queue.removeLast());
    }
  }

  /**
   * Command-line entry point: spiders the URL given as the first argument.
   *
   * @param args the command line; {@code args[0]} is the start URL
   * @throws Exception if the start URL is malformed
   */
  public static void main(String[] args) throws Exception {
    if (args.length == 0) {
      System.err.println("Usage: java PullSpider url" );
      return;
    }
    PullSpider spider = new PullSpider();
    spider.spider(new URL(args[0]));
  } // end main

} // end PullSpider