// Example: XLinkSpider
import java.net.*;
import java.util.*;
import nu.xom.*;
/**
 * A simple single-threaded spider that parses XML documents with XOM and
 * follows the <code>xlink:href</code> attributes it finds in them, honoring
 * <code>xml:base</code> when resolving relative links.
 *
 * <p>The URL of every document that parses successfully is printed on
 * standard output; documents that cannot be loaded are reported on standard
 * error and skipped.</p>
 *
 * <p>Instances keep unsynchronized mutable state (the set of seen URLs and
 * the work queue) and are therefore not meant to be shared between
 * threads.</p>
 */
public class XLinkSpider {

    /**
     * Normalized external forms (Strings) of every URL that has been crawled
     * or queued. Strings are used instead of URL objects because
     * URL.equals()/hashCode() resolve host names, which blocks and treats
     * distinct virtual hosts on the same IP address as the same document.
     */
    private final Set seen = new HashSet();

    /** XOM parser used to load each document. */
    private final Builder parser = new Builder();

    /** URLs that have been discovered but not yet crawled, in discovery order. */
    private final LinkedList queue = new LinkedList();

    /** Namespace URI of XLink attributes. */
    public static final String XLINK_NS
      = "http://www.w3.org/1999/xlink";

    /** Namespace URI bound to the <code>xml</code> prefix. */
    public static final String XML_NS
      = "http://www.w3.org/XML/1998/namespace";

    /**
     * Crawls the document at <code>url</code> and then every document
     * reachable from it through XLinks that has not been seen before.
     *
     * <p>The crawl is driven by a loop over the work queue rather than by
     * recursion, so the call stack does not grow with the number of
     * documents visited.</p>
     *
     * @param url the document to start from; it is always crawled, even if
     *            it was already seen during an earlier call
     */
    public void search(URL url) {
        URL current = url;
        while (current != null) {
            // Record the document before parsing it so that self-links and
            // back-links found inside it are not queued again.
            seen.add(keyFor(current));
            try {
                Document doc = parser.build(current.toExternalForm());
                System.out.println(current);
                search(doc.getRootElement(), current);
            }
            catch (Exception ex) {
                // Best effort: a document that cannot be fetched or parsed
                // must not stop the crawl, but say why it was skipped.
                System.err.println("Skipping " + current + ": " + ex);
            }
            current = queue.isEmpty() ? null : (URL) queue.removeFirst();
        }
    }

    /**
     * Queues the target of this element's <code>xlink:href</code> attribute,
     * if it has one and the target has not been seen yet, then does the same
     * for all descendant elements.
     *
     * @param element the element to inspect
     * @param base    the base URL in effect for the element's parent; it is
     *                replaced for this subtree if the element carries an
     *                <code>xml:base</code> attribute
     */
    private void search(Element element, URL base) {
        Attribute xmlbase = element.getAttribute("base", XML_NS);
        if (xmlbase != null) {
            try {
                base = new URL(base, xmlbase.getValue());
            }
            catch (MalformedURLException ex) {
                // Probably just no protocol handler for the kind of URLs
                // used inside this element; give up on the whole subtree.
                return;
            }
        }

        Attribute href = element.getAttribute("href", XLINK_NS);
        if (href != null) {
            try {
                // Absolutize against the base, then drop the fragment.
                URL discovered = normalize(new URL(base, href.getValue()));
                // add() returns false when the key is already present, so
                // each distinct document is queued at most once.
                if (seen.add(discovered.toExternalForm())) {
                    queue.add(discovered);
                }
            }
            catch (MalformedURLException ex) {
                // skip this one
            }
        }

        Elements children = element.getChildElements();
        for (int i = 0; i < children.size(); i++) {
            search(children.get(i), base);
        }
    }

    /**
     * Returns a copy of <code>url</code> without its fragment identifier,
     * with the host name lower-cased and an explicit default port removed,
     * so that equivalent URLs have the same external form.
     *
     * <p>Unlike the three-argument URL constructor, this keeps a
     * non-default port such as <code>:8080</code>.</p>
     *
     * @param url the URL to normalize
     * @return the normalized URL
     * @throws MalformedURLException if the URL cannot be rebuilt
     */
    private static URL normalize(URL url) throws MalformedURLException {
        String host = url.getHost();
        if (host != null) {
            host = host.toLowerCase(Locale.ENGLISH);
        }
        int port = url.getPort();
        if (port == url.getDefaultPort()) {
            port = -1; // -1 tells the constructor to use the default port
        }
        // getFile() is the path plus query; it never includes the fragment.
        return new URL(url.getProtocol(), host, port, url.getFile());
    }

    /**
     * Returns the string under which <code>url</code> is recorded in the
     * set of seen documents, falling back to the URL's own external form if
     * it cannot be normalized.
     */
    private static String keyFor(URL url) {
        try {
            return normalize(url).toExternalForm();
        }
        catch (MalformedURLException ex) {
            return url.toExternalForm();
        }
    }

    /**
     * Spiders each URL given on the command line in turn, using a single
     * spider so that documents seen from one starting point are not queued
     * again from another.
     *
     * @param args absolute URLs of the documents to start from
     */
    public static void main(String[] args) {
        XLinkSpider spider = new XLinkSpider();
        for (int i = 0; i < args.length; i++) {
            try {
                spider.search(new URL(args[i]));
            }
            catch (MalformedURLException ex) {
                System.err.println(ex);
            }
        }
    } // end main()

}