// XLinkSpider with DOM
import org.xml.sax.*;
import org.apache.xerces.parsers.*;
import java.io.*;
import java.util.*;
import org.w3c.dom.*;
/**
 * Command-line spider that follows XLink references between XML documents.
 *
 * <p>Each document is parsed into a DOM tree with the Xerces
 * {@code DOMParser}; every non-empty XLink {@code href} attribute found in
 * the tree is printed to standard output and then crawled in turn, up to a
 * fixed recursion depth.
 *
 * <p>All state is held in static fields and a single parser instance is
 * shared, so the class is not safe for concurrent use.
 *
 * <p>NOTE(review): href values are handed to the parser exactly as written.
 * Relative references are therefore not resolved against the URI of the
 * document that contains them - confirm whether that is acceptable for the
 * documents being crawled.
 */
public class DOMSpider {

    /** Namespace URI that identifies XLink attributes. */
    private static final String XLINK_NAMESPACE = "http://www.w3.org/1999/xlink";

    /** Shared parser, reused for every document. */
    private static final DOMParser parser = new DOMParser();

    // Namespace support is turned off by default in Xerces.
    static {
        try {
            parser.setFeature("http://xml.org/sax/features/namespaces", true);
        }
        catch (Exception e) {
            // Broad catch on purpose: a failure here must not abort class
            // initialization. searchForURIs still falls back to a lookup by
            // qualified name when namespace-aware lookup finds nothing.
            System.err.println(e);
        }
    }

    /** Every URI that has been crawled or queued for crawling so far. */
    private static final Set visited = new HashSet();

    /**
     * Depth cutoff. A document is parsed only while the current depth
     * (1 for a document passed directly to {@link #listURIs}) is strictly
     * less than this value, so with 5 the crawl parses the start document
     * plus documents up to three links away from it.
     */
    private static final int MAX_DEPTH = 5;

    /** Number of listURIs invocations currently on the call stack. */
    private static int currentDepth = 0;

    /**
     * Parses the document at {@code systemId}, prints every XLink URI in it
     * that has not been seen before, and recursively crawls those URIs.
     *
     * <p>Documents that cannot be loaded or parsed are skipped; a one-line
     * diagnostic is written to standard error and the crawl continues.
     *
     * @param systemId system identifier (URL or file name) of the document
     */
    public static void listURIs(String systemId) {
        currentDepth++;
        try {
            // Record the document itself so that links pointing back to it
            // are neither printed nor crawled again.
            visited.add(systemId);
            if (currentDepth < MAX_DEPTH) {
                parser.parse(systemId);
                Document document = parser.getDocument();
                Vector uris = new Vector();
                // search the document for uris,
                // store them in vector, and print them
                searchForURIs(document.getDocumentElement(), uris);
                // Mark everything found here as visited before descending
                // into any of it; otherwise a URI shared by this document
                // and one of its children would be printed and crawled twice.
                visited.addAll(uris);
                Enumeration e = uris.elements();
                while (e.hasMoreElements()) {
                    listURIs((String) e.nextElement());
                }
            }
        }
        catch (SAXException e) {
            // couldn't load the document,
            // probably not well-formed XML, skip it
            System.err.println("Skipping " + systemId + ": " + e.getMessage());
        }
        catch (IOException e) {
            // couldn't load the document,
            // likely network failure, skip it
            System.err.println("Skipping " + systemId + ": " + e.getMessage());
        }
        finally {
            currentDepth--;
            System.out.flush();
        }
    }

    /**
     * Recursively collects the XLink {@code href} values of {@code element}
     * and all of its descendant elements.
     *
     * <p>A value is printed to standard output and appended to {@code uris}
     * only if it is non-empty, has not already been visited by the spider,
     * and is not already present in {@code uris}.
     *
     * @param element root of the subtree to search
     * @param uris    receives the newly found URIs as {@code String}s, in
     *                document order
     */
    public static void searchForURIs(Element element, Vector uris) {
        // DOM Level 2 signature is getAttributeNS(namespaceURI, localName).
        // Looking the attribute up by namespace finds XLinks whatever prefix
        // the document binds to the XLink namespace.
        String uri = element.getAttributeNS(XLINK_NAMESPACE, "href");
        if (uri == null || uri.equals("")) {
            // Fallback for trees built without namespace information, where
            // only the literal qualified name is available.
            uri = element.getAttribute("xlink:href");
        }
        if (uri != null && !uri.equals("")
                && !visited.contains(uri)
                && !uris.contains(uri)) {
            System.out.println(uri);
            uris.addElement(uri);
        }
        // process child elements recursively
        NodeList children = element.getChildNodes();
        for (int i = 0; i < children.getLength(); i++) {
            Node n = children.item(i);
            if (n instanceof Element) {
                searchForURIs((Element) n, uris);
            }
        }
    }

    /**
     * Crawls each URL given on the command line in turn.
     *
     * @param args system identifiers of the documents to start from
     */
    public static void main(String[] args) {
        if (args.length == 0) {
            System.out.println("Usage: java DOMSpider URL1 URL2...");
            return;
        }
        // start parsing...
        for (int i = 0; i < args.length; i++) {
            try {
                listURIs(args[i]);
            }
            catch (Exception e) {
                // Top-level boundary: report the failure and move on to the
                // next start URL. printStackTrace already prints the
                // exception's own description.
                e.printStackTrace();
            }
        } // end for
    } // end main
} // end DOMSpider