// Attributes Example: XLinkSpider
import javax.xml.stream.*;
import java.net.*;
import java.io.*;
import java.util.*;
public class PullSpider {
// Need to keep track of where we've been
// so we don't get stuck in an infinite loop
private List spideredURIs = new Vector();
// This linked list keeps track of where we're going.
// Although the LinkedList class does not guarantee queue like
// access, I always access it in a first-in/first-out fashion.
private LinkedList queue = new LinkedList();
private URL currentURL;
private XMLInputFactory factory;
public PullSpider() {
this.factory = XMLInputFactory.newInstance();
}
private void processStartTag(XMLStreamReader parser) {
String type
= parser.getAttributeValue("http://www.w3.org/1999/xlink", "type");
if (type != null) {
String href
= parser.getAttributeValue("http://www.w3.org/1999/xlink", "href");
if (href != null) {
try {
URL foundURL = new URL(currentURL, href);
if (!spideredURIs.contains(foundURL)) {
queue.addFirst(foundURL);
}
}
catch (MalformedURLException ex) {
// skip this URL
}
}
}
}
public void spider(URL url) {
System.out.println("Spidering " + url);
currentURL = url;
try {
XMLStreamReader parser = factory.createXMLStreamReader(currentURL.openStream());
spideredURIs.add(currentURL);
for (int event = parser.next();
parser.hasNext();
event = parser.next()) {
if (event == XMLStreamConstants.START_ELEMENT) {
processStartTag(parser);
}
} // end for
parser.close();
while (!queue.isEmpty()) {
URL nextURL = (URL) queue.removeLast();
spider(nextURL);
}
}
catch (Exception ex) {
// skip this document
}
}
public static void main(String[] args) throws Exception {
if (args.length == 0) {
System.err.println("Usage: java PullSpider url" );
return;
}
PullSpider spider = new PullSpider();
spider.spider(new URL(args[0]));
} // end main
} // end PullSpider