Robots processing instruction:
<?robots index="yes | no"
follow="yes | no" ?>
import java.net.*; import java.util.*; import nu.xom.*; public class PoliteSpider { private Set spidered = new HashSet(); private Builder parser = new Builder(); private List queue = new LinkedList(); public static final String XLINK_NS = "http://www.w3.org/1999/xlink"; public void search(URL url) { try { String systemID = url.toExternalForm(); Document doc = parser.build(systemID); boolean follow = true; boolean index = true; for (int i = 0; i < doc.getChildCount(); i++) { Node child = doc.getChild(i); if (child instanceof Element) break; if (child instanceof ProcessingInstruction){ ProcessingInstruction instruction = (ProcessingInstruction) child; if (instruction.getTarget().equals("robots")) { Element data = PseudoAttributes.getAttributes(instruction); Attribute indexAtt = data.getAttribute("index"); if (indexAtt != null) { String value = indexAtt.getValue().trim(); if (value.equals("no")) index = false; } Attribute followAtt = data.getAttribute("follow"); if (followAtt != null) { String value = followAtt.getValue().trim(); if (value.equals("no")) follow = false; } } } } if (index) System.out.println(url); if (follow) search(doc.getRootElement()); } catch (Exception ex) { // just skip this document } if (queue.isEmpty()) return; URL discovered = (URL) queue.remove(0); spidered.add(discovered); search(discovered); } private void search(Element element) { Attribute href = element.getAttribute("href", XLINK_NS); URL base = null; try { base = new URL(element.getBaseURI()); } catch (MalformedURLException ex) { // Probably just no protocol handler for the // kind of URLs used inside this element return; } if (href != null) { String uri = href.getValue(); // absolutize URL try { URL discovered = new URL(base, uri); // remove fragment identifier if any discovered = new URL( discovered.getProtocol(), discovered.getHost(), discovered.getFile() ); if (!spidered.contains(discovered) && !queue.contains(discovered)) { queue.add(discovered); } } catch (MalformedURLException ex) { // skip this one } } Elements children = element.getChildElements(); for (int i = 0; i < children.size(); i++) { search(children.get(i)); } } public static void main(String[] args) { PoliteSpider spider = new PoliteSpider(); for (int i = 0; i < args.length; i++) { try { spider.search(new URL(args[i])); } catch (MalformedURLException ex) { System.err.println(ex); } } } // end main() }