Example: XLinkSpider

import java.net.*;
import java.util.*;
import nu.xom.*;

/**
 * A simple spider that follows XLink <code>href</code> attributes from one
 * XML document to the next. Each document that parses successfully has its
 * URL printed on <code>System.out</code>; documents that cannot be loaded or
 * parsed are skipped. Discovered links are visited in the order in which
 * they were found (first in, first out).
 */
public class XLinkSpider {

    // External forms of every URL that has already been searched or is
    // waiting in the queue. Strings are stored instead of URL objects
    // because URL.equals() and URL.hashCode() resolve host names, which
    // blocks and treats different hosts on the same IP address as equal.
    private Set seen = new HashSet();
    private Builder parser = new Builder();
    // URLs that have been discovered but not yet searched, oldest first
    private List queue = new LinkedList();
    
    public static final String XLINK_NS 
      = "http://www.w3.org/1999/xlink";
    
    /**
     * Searches the document at <code>url</code> and then every document
     * reachable from it through XLink <code>href</code> attributes, until
     * no unvisited links remain.
     *
     * @param url the location of the first document to search
     */
    public void search(URL url) {
        // Iterate rather than recurse once per document, so the stack
        // depth does not grow with the number of documents crawled.
        URL current = url;
        while (current != null) {
            // Record the document before parsing it so that a link back
            // to it is not queued a second time.
            seen.add(current.toExternalForm());
            try {
                String systemID = current.toExternalForm();
                Document doc = parser.build(systemID);
                System.out.println(current);
                search(doc.getRootElement());
            }
            catch (Exception ex) {
                // Deliberate best effort: a document that cannot be
                // fetched or parsed is skipped and the crawl continues.
            }
            current = queue.isEmpty() ? null : (URL) queue.remove(0);
        }
    }

    /**
     * Queues the target of this element's XLink <code>href</code>
     * attribute, if it has one, and then does the same for all of its
     * descendant elements.
     *
     * @param element the root of the subtree to scan for links
     */
    private void search(Element element) {

        Attribute href = element.getAttribute("href", XLINK_NS);
        if (href != null) {
            enqueue(element, href.getValue());
        }
        
        Elements children = element.getChildElements();
        for (int i = 0; i < children.size(); i++) {
            search(children.get(i));
        }
        
    }

    /**
     * Resolves a link against the base URI of the element it appears in,
     * removes any fragment identifier, and adds the result to the queue
     * unless it has already been queued or searched.
     *
     * @param element the element carrying the link
     * @param uri     the raw value of the link attribute
     */
    private void enqueue(Element element, String uri) {

        URL base = null;
        try {
            base = new URL(element.getBaseURI());
        }
        catch (MalformedURLException ex) {
            // Probably just no protocol handler for the kind of URLs
            // used inside this element. Leave the base null: an absolute
            // link can still be followed, and a relative one is rejected
            // below.
        }
        
        try {
            // absolutize URL
            URL resolved = new URL(base, uri);
            // remove fragment identifier if any; the port is passed
            // through so that a non-default port is not lost
            URL discovered = new URL(
              resolved.getProtocol(),
              resolved.getHost(),
              resolved.getPort(),
              resolved.getFile()
            );
            
            // Set.add() returns false when the entry is already present
            if (seen.add(discovered.toExternalForm())) {
                queue.add(discovered);
            }
        }
        catch (MalformedURLException ex) {
            // skip this one   
        }
        
    }

    /**
     * Spiders each URL given on the command line in turn. Arguments that
     * are not well-formed URLs are reported on <code>System.err</code>.
     *
     * @param args the URLs of the documents to start from
     */
    public static void main(String[] args) {
      
        XLinkSpider spider = new XLinkSpider();
        for (int i = 0; i < args.length; i++) { 
            try { 
                spider.search(new URL(args[i]));
            }
            catch (MalformedURLException ex) {
                System.err.println(ex);   
            }
        }
      
    }  // end main()

}

Previous | Next | Top | Cafe con Leche

Copyright 2004-2006 Elliotte Rusty Harold
elharo@metalab.unc.edu
Last Modified February 5, 2004