// XLinkSpider with JDOM
import java.io.*;
import java.util.*;
import org.jdom.*;
import org.jdom.input.SAXBuilder;
/**
 * A minimal command-line spider that follows XLink <code>href</code>
 * attributes from document to document.
 *
 * <p>Each starting document is parsed with JDOM, every element is checked
 * for an <code>href</code> attribute in the XLink namespace, newly seen
 * values are printed to <code>System.out</code>, and each of them is then
 * loaded and searched in turn, up to a fixed recursion depth.</p>
 *
 * <p>All state is kept in static fields, so the crawl history is shared by
 * every call made within one JVM.</p>
 */
public class BasicXLinkSpider {

    /** Parser shared by every document load. */
    private static final SAXBuilder builder = new SAXBuilder();

    /** URIs that have already been discovered; these are never reported or followed again. */
    private static final Vector visited = new Vector();

    /** A document is only loaded while the current depth is strictly below this value. */
    private static final int maxDepth = 5;

    /** Number of {@link #listURIs(String)} calls currently on the stack. */
    private static int currentDepth = 0;

    /** The XLink namespace used to look up <code>href</code> attributes. */
    private static final Namespace xlink
            = Namespace.getNamespace("http://www.w3.org/1999/xlink");

    /**
     * Loads the document at <code>systemID</code>, prints every XLink URI in
     * it that has not been seen before, and then recursively does the same
     * for each of those URIs.
     *
     * <p>Documents that cannot be loaded or parsed are silently skipped.
     * Nothing is loaded once the depth limit has been reached.</p>
     *
     * @param systemID the location of the document to load
     */
    public static void listURIs(String systemID) {
        currentDepth++;
        try {
            if (currentDepth < maxDepth) {
                // Record the document being loaded so that a link pointing
                // back at a starting document is not reported and crawled again.
                if (!visited.contains(systemID)) {
                    visited.addElement(systemID);
                }

                Document document = builder.build(systemID);

                // search the document for uris,
                // store them in vector, and print them
                Vector uris = new Vector();
                searchForURIs(document.getRootElement(), uris);

                // Mark every newly found URI as visited BEFORE descending.
                // Otherwise a later sibling is still unmarked while an earlier
                // sibling is being crawled, and a document reachable by two
                // paths is printed and loaded twice. searchForURIs guarantees
                // these entries are unique and not yet in visited.
                visited.addAll(uris);

                Enumeration e = uris.elements();
                while (e.hasMoreElements()) {
                    listURIs((String) e.nextElement());
                }
            }
        }
        catch (JDOMException ex) {
            // couldn't load the document,
            // probably not well-formed XML, skip it
        }
        catch (IOException ex) {
            // couldn't load the document,
            // probably broken link, skip it
        }
        finally {
            // Always unwind the depth counter, even when loading failed.
            currentDepth--;
            System.out.flush();
        }
    }

    /**
     * Recursively walks <code>element</code> and its descendants looking for
     * XLink <code>href</code> attributes.
     *
     * <p>A value is printed to <code>System.out</code> and appended to
     * <code>uris</code> only if it is non-empty, has not been visited
     * already, and is not yet present in <code>uris</code>.</p>
     *
     * @param element the element at which to start searching
     * @param uris    receives each newly found URI as a <code>String</code>
     */
    public static void searchForURIs(Element element, Vector uris) {
        // look for XLinks in this element
        String uri = element.getAttributeValue("href", xlink);
        if (uri != null && !uri.equals("")
                && !visited.contains(uri) && !uris.contains(uri)) {
            System.out.println(uri);
            uris.addElement(uri);
        }

        // process child elements recursively
        List children = element.getChildren();
        Iterator iterator = children.iterator();
        while (iterator.hasNext()) {
            searchForURIs((Element) iterator.next(), uris);
        }
    }

    /**
     * Crawls each document named on the command line. Every starting
     * location is echoed to <code>System.err</code> before it is processed.
     *
     * @param args the locations of the documents to start from
     */
    public static void main(String[] args) {
        if (args.length == 0) {
            System.out.println("Usage: java BasicXLinkSpider URL1 URL2...");
            return; // nothing to crawl
        }

        // start parsing...
        for (int i = 0; i < args.length; i++) {
            System.err.println(args[i]);
            listURIs(args[i]);
        } // end for
    } // end main

} // end BasicXLinkSpider