// XLinkSpider that respects the robots processing instruction
import java.io.*;
import java.util.*;
import org.jdom.*;
import org.jdom.input.SAXBuilder;
/**
 * A command-line XLink spider that honors the <code>robots</code>
 * processing instruction.
 *
 * <p>Starting from each URL given on the command line, the spider parses
 * the document with JDOM, prints every not-yet-seen
 * <code>xlink:href</code> value to <code>System.out</code>, and then
 * recursively processes the linked documents up to a fixed depth.</p>
 *
 * <p>A processing instruction with the target <code>robots</code> that
 * appears before the root element can switch parts of this off through
 * its <code>index</code> and <code>follow</code> pseudo-attributes
 * (see {@link #listURIs(String)} for the exact effect of each).</p>
 *
 * <p>All state is held in static fields, so the list of visited URIs
 * accumulates across every call made within the same JVM.</p>
 */
public class AdvancedSpider {

    // NOTE(review): documents are fetched from arbitrary URIs and parsed
    // with the builder's default configuration. Consider restricting
    // DTD / external-entity processing if the JDOM and parser versions
    // in use allow it.
    private static final SAXBuilder builder = new SAXBuilder();

    /** Every URI (String) that has already been discovered in some document. */
    private static final Vector visited = new Vector();

    /** A document is only loaded while the current depth is below this value. */
    private static final int maxDepth = 5;

    /** Nesting level of the listURIs() call that is currently executing. */
    private static int currentDepth = 0;

    /** The XLink namespace; used to look up the href attribute by namespace. */
    private static final Namespace xlink
        = Namespace.getNamespace("http://www.w3.org/1999/xlink");

    /**
     * Loads the document at <code>systemID</code>, prints the new XLink
     * URIs it contains and recursively spiders them.
     *
     * <p>The document is skipped silently if it cannot be read or is not
     * well-formed, or if the depth limit has been reached. When the
     * document carries a <code>robots</code> processing instruction:</p>
     * <ul>
     *   <li><code>follow="no"</code> - the document is not searched for
     *       links at all;</li>
     *   <li><code>index="no"</code> - links are still printed and recorded
     *       as visited, but the linked documents are not loaded.</li>
     * </ul>
     * <p>The comparison with <code>"no"</code> is case-insensitive. A
     * missing pseudo-attribute is treated the same as any value other
     * than <code>"no"</code>, i.e. the action stays enabled.</p>
     *
     * @param systemID the URL or file name of the document to spider
     */
    public static void listURIs(String systemID) {
        currentDepth++;
        try {
            if (currentDepth < maxDepth) {
                Document document = builder.build(systemID);

                // check to see if we're allowed to spider
                // NOTE(review): "index" gates the recursion and "follow"
                // gates the link search; confirm this mapping is the
                // intended one before changing it.
                boolean index = true;
                boolean follow = true;
                ProcessingInstruction robots = findRobots(document);
                if (robots != null) {
                    // Constant-first comparison: the PI may omit either
                    // pseudo-attribute, and the lookup can then yield null
                    // (behavior depends on the JDOM version). Calling
                    // equalsIgnoreCase on that result would throw a
                    // NullPointerException that no catch below handles.
                    if ("no".equalsIgnoreCase(robots.getValue("index"))) {
                        index = false;
                    }
                    if ("no".equalsIgnoreCase(robots.getValue("follow"))) {
                        follow = false;
                    }
                }

                // search the document for uris,
                // store them in vector, and print them
                Vector uris = new Vector();
                if (follow) {
                    searchForURIs(document.getRootElement(), uris);
                }

                Enumeration e = uris.elements();
                while (e.hasMoreElements()) {
                    String uri = (String) e.nextElement();
                    visited.addElement(uri);
                    if (index) {
                        listURIs(uri);
                    }
                }
            }
        }
        catch (JDOMException e) {
            // couldn't load the document,
            // probably not well-formed XML, skip it
        }
        catch (IOException ex) {
            // couldn't load the document,
            // probably broken link, skip it
        }
        finally {
            // Always undo the increment so that sibling links are
            // processed at the correct depth, even after a failure.
            currentDepth--;
            System.out.flush();
        }
    }

    /**
     * Looks for a <code>robots</code> processing instruction among the
     * document-level content that precedes the root element.
     *
     * @param doc the document to inspect
     * @return the first processing instruction whose target is
     *         <code>robots</code>, or <code>null</code> if the root element
     *         (or the end of the content) is reached without finding one
     */
    private static ProcessingInstruction findRobots(Document doc) {
        List content = doc.getContent();
        Iterator children = content.iterator();
        while (children.hasNext()) {
            Object o = children.next();
            // Stop at the root element: anything after it is not considered.
            if (o instanceof Element) {
                return null;
            }
            if (o instanceof ProcessingInstruction) {
                ProcessingInstruction candidate = (ProcessingInstruction) o;
                if (candidate.getTarget().equals("robots")) {
                    return candidate;
                }
            }
        }
        return null;
    }

    /**
     * Recursively walks <code>element</code> and its descendants, collecting
     * the value of every <code>href</code> attribute in the XLink namespace.
     *
     * <p>A URI is printed to <code>System.out</code> and appended to
     * <code>uris</code> only if it is non-empty, has not been visited
     * before, and is not already present in <code>uris</code>.</p>
     *
     * @param element the element at which to start the search
     * @param uris    receives the newly discovered URIs (as Strings)
     */
    public static void searchForURIs(Element element, Vector uris) {
        // look for XLinks in this element
        String uri = element.getAttributeValue("href", xlink);
        if (uri != null && !uri.equals("")
                && !visited.contains(uri) && !uris.contains(uri)) {
            System.out.println(uri);
            uris.addElement(uri);
        }

        // process child elements recursively
        List children = element.getChildren();
        Iterator iterator = children.iterator();
        while (iterator.hasNext()) {
            searchForURIs((Element) iterator.next(), uris);
        }
    }

    /**
     * Spiders each URL named on the command line. Each starting URL is
     * echoed to <code>System.err</code> before it is processed; a usage
     * message is printed if no arguments are given.
     *
     * @param args the URLs at which to start spidering
     */
    public static void main(String[] args) {
        if (args.length == 0) {
            System.out.println("Usage: java AdvancedSpider URL1 URL2...");
            return;
        }

        // start parsing...
        for (int i = 0; i < args.length; i++) {
            System.err.println(args[i]);
            listURIs(args[i]);
        } // end for
    } // end main

} // end AdvancedSpider