// XLink spider that respects the robots processing instruction
import org.xml.sax.*;
import org.apache.xerces.parsers.*;
import java.io.*;
import java.util.*;
import org.w3c.dom.*;
/**
 * A recursive XLink spider that honors the <code>robots</code> processing
 * instruction.
 *
 * <p>Starting from each URL given on the command line, the spider parses the
 * document into a DOM tree, prints every XLink <code>href</code> value it has
 * not seen before, and then loads each of those URIs in turn. A document whose
 * prolog or epilog contains a <code>robots</code> processing instruction with
 * the pseudo-attribute <code>follow="no"</code> is loaded but its links are
 * neither printed nor followed.</p>
 *
 * <p>All state is held in static fields and a single shared parser, so the
 * class is written for single-threaded use.</p>
 *
 * <p>Link values are handed to the parser exactly as they appear in the
 * document; this class does not resolve relative references against the
 * referring document itself.</p>
 */
public class PoliteDOMSpider {

    /** Namespace URI that identifies XLink attributes. */
    private static final String XLINK_NAMESPACE = "http://www.w3.org/1999/xlink";

    /** Recursion stops once the nesting level reaches this value. */
    private static final int MAX_DEPTH = 5;

    private static DOMParser parser = new DOMParser();

    // namespace support is turned off by default in Xerces
    static {
        try {
            parser.setFeature("http://xml.org/sax/features/namespaces", true);
        }
        catch (Exception e) {
            // Keep going without namespace support; searchForURIs() falls back
            // to a lookup by the literal attribute name "xlink:href".
            System.err.println(e);
        }
    }

    /**
     * Every URI that has been handed to {@link #listURIs(String)} so far.
     * Only membership is ever tested, so a hash set replaces the linear
     * scans a Vector would need.
     */
    private static final Set visited = new HashSet();

    /** Nesting level of the listURIs() call that is currently running. */
    private static int currentDepth = 0;

    /**
     * Loads the document at <code>systemId</code>, prints the XLink URIs in it
     * that have not been seen before, and recursively processes each of them.
     *
     * <p>Documents that cannot be read or are not well-formed are skipped
     * silently. Nothing is loaded once the nesting level reaches the depth
     * limit.</p>
     *
     * @param systemId the URI of the document to load
     */
    public static void listURIs(String systemId) {
        currentDepth++;
        try {
            // Record the document itself, not only the links found inside it,
            // so that a link pointing back to a start document is not loaded
            // a second time.
            visited.add(systemId);

            if (currentDepth < MAX_DEPTH) {
                parser.parse(systemId);
                Document document = parser.getDocument();
                if (robotsAllowed(document)) {
                    // search the document for URIs; searchForURIs() prints
                    // each new one and stores it in the vector
                    Vector uris = new Vector();
                    searchForURIs(document.getDocumentElement(), uris);
                    Enumeration e = uris.elements();
                    while (e.hasMoreElements()) {
                        // the recursive call marks the URI as visited
                        listURIs((String) e.nextElement());
                    }
                }
            }
        }
        catch (SAXException ignored) {
            // couldn't load the document,
            // probably not well-formed XML, skip it
        }
        catch (IOException ignored) {
            // couldn't load the document,
            // likely network failure, skip it
        }
        finally {
            currentDepth--;
            System.out.flush();
        }
    }

    /**
     * Reports whether the links of a document may be followed.
     *
     * <p>The direct children of the document node are searched for processing
     * instructions whose target is <code>robots</code>. If any of them carries
     * a <code>follow</code> pseudo-attribute with the value <code>no</code>,
     * the answer is <code>false</code>. Either quote character may delimit the
     * value and whitespace is allowed around the equals sign.</p>
     *
     * @param document the parsed document to inspect
     * @return <code>false</code> if a robots instruction forbids following
     *         links, <code>true</code> otherwise
     */
    public static boolean robotsAllowed(Document document) {
        NodeList children = document.getChildNodes();
        for (int i = 0; i < children.getLength(); i++) {
            Node n = children.item(i);
            if (n instanceof ProcessingInstruction) {
                ProcessingInstruction pi = (ProcessingInstruction) n;
                if (pi.getTarget().equals("robots")) {
                    String data = pi.getData();
                    if (data != null && hasPseudoAttribute(data, "follow", "no")) {
                        return false;
                    }
                }
            }
        }
        return true;
    }

    /**
     * Scans processing-instruction data for <code>name = "value"</code> or
     * <code>name = 'value'</code>. Every occurrence of the name is examined,
     * so a match anywhere in the data is enough.
     */
    private static boolean hasPseudoAttribute(String data, String name, String value) {
        int searchFrom = 0;
        while (searchFrom < data.length()) {
            int nameStart = data.indexOf(name, searchFrom);
            if (nameStart < 0) {
                return false;
            }
            searchFrom = nameStart + 1;

            // Reject matches that are only the tail of a longer name.
            if (nameStart > 0 && isNameChar(data.charAt(nameStart - 1))) {
                continue;
            }
            int pos = skipWhitespace(data, nameStart + name.length());
            if (pos >= data.length() || data.charAt(pos) != '=') {
                continue;
            }
            pos = skipWhitespace(data, pos + 1);
            if (pos >= data.length()) {
                continue;
            }
            char quote = data.charAt(pos);
            if (quote != '"' && quote != '\'') {
                continue;
            }
            int valueEnd = data.indexOf(quote, pos + 1);
            if (valueEnd < 0) {
                continue;
            }
            if (value.equals(data.substring(pos + 1, valueEnd))) {
                return true;
            }
        }
        return false;
    }

    /** Returns the index of the first non-whitespace character at or after pos. */
    private static int skipWhitespace(String text, int pos) {
        while (pos < text.length() && Character.isWhitespace(text.charAt(pos))) {
            pos++;
        }
        return pos;
    }

    /** Tells whether c can be part of a pseudo-attribute name. */
    private static boolean isNameChar(char c) {
        return Character.isLetterOrDigit(c)
            || c == '-' || c == '_' || c == '.' || c == ':';
    }

    /**
     * Collects the XLink URIs found on <code>element</code> and on all of its
     * descendant elements, using recursion.
     *
     * <p>A URI is printed to standard output and appended to
     * <code>uris</code> only if it is non-empty, has not been visited yet and
     * is not already in <code>uris</code>.</p>
     *
     * @param element the element at which the search starts
     * @param uris    receives the newly found URIs as <code>String</code>s
     */
    public static void searchForURIs(Element element, Vector uris) {
        // Look the attribute up by namespace so that any prefix bound to the
        // XLink namespace is recognized. Note the argument order: namespace
        // URI first, local name second.
        String uri = element.getAttributeNS(XLINK_NAMESPACE, "href");
        if (uri == null || uri.equals("")) {
            // Fall back to the literal qualified name, which is what still
            // works when the tree was built without namespace information.
            uri = element.getAttribute("xlink:href");
        }
        if (uri != null && !uri.equals("")
            && !visited.contains(uri)
            && !uris.contains(uri)) {
            System.out.println(uri);
            uris.addElement(uri);
        }

        // process child elements recursively
        NodeList children = element.getChildNodes();
        for (int i = 0; i < children.getLength(); i++) {
            Node n = children.item(i);
            if (n instanceof Element) {
                searchForURIs((Element) n, uris);
            }
        }
    }

    /**
     * Command-line entry point: spiders each URL given as an argument.
     *
     * @param args the URLs of the documents to start from
     */
    public static void main(String[] args) {
        if (args.length == 0) {
            System.out.println("Usage: java PoliteDOMSpider URL1 URL2...");
            return;
        }

        // start parsing...
        for (int i = 0; i < args.length; i++) {
            try {
                listURIs(args[i]);
            }
            catch (Exception e) {
                // A failure on one start URL must not stop the remaining
                // ones; the stack trace already begins with the exception's
                // own description.
                e.printStackTrace();
            }
        } // end for
    } // end main

} // end PoliteDOMSpider