// XLinkSpider that respects the robots processing instruction
import java.io.*;
import java.util.*;
import org.jdom.*;
import org.jdom.input.SAXBuilder;
/**
 * Command-line spider that loads XML documents with JDOM, prints every
 * XLink <code>href</code> value it finds, and recursively loads the linked
 * documents up to a fixed depth.
 *
 * <p>A document can restrict the spider with a <code>robots</code>
 * processing instruction, for example
 * <code>&lt;?robots index="yes" follow="no"?&gt;</code>. A document without
 * that instruction is processed without restrictions.</p>
 *
 * <p>All state is held in static fields without synchronization, so the
 * class is meant to be driven from a single thread. URIs are handed to the
 * builder exactly as they appear in the document; relative references are
 * not resolved here.</p>
 */
public class XLinkSpider {

    /** Namespace URI that identifies XLink attributes. */
    private static final String XLINK_NAMESPACE_URI = "http://www.w3.org/1999/xlink";

    /** Parser shared by every document load. */
    private static final SAXBuilder builder = new SAXBuilder();

    /** URIs already seen; a hash set keeps the per-link lookup cheap. */
    private static final Set visited = new HashSet();

    /** Recursion stops once currentDepth reaches this value. */
    private static int maxDepth = 5;

    /** Number of listURIs calls currently on the stack. */
    private static int currentDepth = 0;

    /**
     * Loads the document at <code>systemID</code>, prints the XLink URIs it
     * contains that have not been seen before, and spiders each of them in
     * turn.
     *
     * <p>Documents that cannot be built are reported on
     * <code>System.err</code> and skipped; the crawl continues.</p>
     *
     * @param systemID location of the XML document to load
     */
    public static void listURIs(String systemID) {
        currentDepth++;
        try {
            // Remember the document we are about to load so that a link
            // pointing back at a starting document is not listed and
            // parsed a second time.
            visited.add(systemID);

            if (currentDepth < maxDepth) {
                Document document = builder.build(systemID);

                // check to see if we're allowed to spider
                boolean index = true;
                boolean follow = true;
                try {
                    ProcessingInstruction robots
                            = document.getProcessingInstruction("robots");
                    // Constant-first comparison: if getValue hands back null
                    // for a missing pseudo-attribute this cannot throw.
                    if ("no".equalsIgnoreCase(robots.getValue("index"))) {
                        index = false;
                    }
                    if ("no".equalsIgnoreCase(robots.getValue("follow"))) {
                        follow = false;
                    }
                } catch (NoSuchProcessingInstructionException e) {
                    // no robots instruction: spidering and indexing allowed
                }

                // NOTE(review): "follow" gates link discovery (and printing)
                // while "index" gates the recursion. That looks inverted
                // relative to the usual robots semantics, but it is kept
                // as-is to preserve existing behavior -- confirm intent.
                Vector uris = new Vector();
                if (follow) {
                    searchForURIs(document.getRootElement(), uris);
                }

                Enumeration found = uris.elements();
                while (found.hasMoreElements()) {
                    String uri = (String) found.nextElement();
                    visited.add(uri);
                    if (index) {
                        listURIs(uri);
                    }
                }
            }
        } catch (JDOMException e) {
            // Couldn't load the document, probably not well-formed XML.
            // Say which one was skipped instead of dropping it silently.
            System.err.println("Skipping " + systemID + ": " + e.getMessage());
        } finally {
            currentDepth--;
            System.out.flush();
        }
    }

    /**
     * Recursively walks <code>element</code> and its descendants looking for
     * XLink <code>href</code> attributes. Each non-empty URI that is neither
     * in the visited set nor already in <code>uris</code> is printed to
     * <code>System.out</code> and appended to <code>uris</code>.
     *
     * @param element root of the subtree to search
     * @param uris    receives the newly discovered URIs, in document order
     */
    public static void searchForURIs(Element element, Vector uris) {
        // look for an XLink on this element
        try {
            Attribute href = element.getAttribute("href", XLINK_NAMESPACE_URI);
            String uri = href.getValue();
            if (!uri.equals("") && !visited.contains(uri) && !uris.contains(uri)) {
                System.out.println(uri);
                uris.addElement(uri);
            }
        } catch (NoSuchAttributeException e) {
            // No big deal. This element just isn't an XLink.
        }

        // process child elements recursively
        Iterator children = element.getChildren().iterator();
        while (children.hasNext()) {
            searchForURIs((Element) children.next(), uris);
        }
    }

    /**
     * Spiders each URL given on the command line, echoing the starting URL
     * to <code>System.err</code> before its crawl begins. Prints a usage
     * message when no arguments are supplied.
     *
     * @param args URLs of the documents to start from
     */
    public static void main(String[] args) {
        if (args.length == 0) {
            System.out.println("Usage: java XLinkSpider URL1 URL2...");
            return;
        }

        for (int i = 0; i < args.length; i++) {
            System.err.println(args[i]);
            listURIs(args[i]);
        }
    }
}