Example: PoliteSpider

import java.net.*;
import java.util.*;
import nu.xom.*;

/**
 * A single-threaded spider for XML documents. Starting from a URL it
 * parses each document with XOM, prints the document's URL, and queues
 * every target of an XLink <code>href</code> attribute found in it.
 *
 * <p>The spider is "polite": before the root element of each document it
 * looks for <code>robots</code> processing instructions and honors their
 * <code>index="no"</code> (do not print the URL) and
 * <code>follow="no"</code> (do not follow the document's links)
 * pseudo-attributes.</p>
 *
 * <p>Note: visited and queued locations are compared as
 * <code>java.net.URL</code> objects, whose <code>equals</code> and
 * <code>hashCode</code> may perform host name resolution.</p>
 */
public class PoliteSpider {

    /** URLs that have already been taken off the queue and processed. */
    private Set spidered = new HashSet();
    private Builder parser = new Builder();
    /** URLs discovered but not yet processed, in discovery order. */
    private List queue = new LinkedList();
    
    public static final String XLINK_NS 
     = "http://www.w3.org/1999/xlink";
    
    /**
     * Spiders the document at <code>url</code> and then every document
     * reachable from it, in the order the links were discovered.
     * Documents that cannot be loaded or parsed are silently skipped.
     *
     * @param url the location of the first document to process
     */
    public void search(URL url) {
        
        // Drain the queue with a loop rather than by recursion so the
        // stack depth does not grow with the number of documents visited.
        URL current = url;
        while (current != null) {
            // Record the URL before processing it (this includes the 
            // starting URL) so that links back to it are not queued again.
            spidered.add(current);
            visit(current);
            current = queue.isEmpty() ? null : (URL) queue.remove(0);
        }
        
    }

    /**
     * Loads one document, applies any <code>robots</code> processing
     * instructions that precede its root element, prints the URL if
     * indexing is allowed, and queues its links if following is allowed.
     */
    private void visit(URL url) {
        
        try {
            String systemID = url.toExternalForm();
            Document doc = parser.build(systemID);
            
            boolean follow = true;
            boolean index = true;
            for (int i = 0; i < doc.getChildCount(); i++) {
                Node child = doc.getChild(i); 
                // only instructions before the root element are considered
                if (child instanceof Element) {
                    break;
                }
                if (!(child instanceof ProcessingInstruction)) {
                    continue;
                }
                ProcessingInstruction instruction 
                  = (ProcessingInstruction) child;
                if (!instruction.getTarget().equals("robots")) {
                    continue;
                }
                // PseudoAttributes is defined elsewhere; presumably it
                // exposes the instruction's pseudo-attributes as the 
                // attributes of a synthetic element.
                Element data 
                  = PseudoAttributes.getAttributes(instruction); 
                if (isNo(data.getAttribute("index"))) {
                    index = false;
                }
                if (isNo(data.getAttribute("follow"))) {
                    follow = false;
                }
            }
            
            if (index) {
                System.out.println(url);
            }
            if (follow) {
                search(doc.getRootElement());
            }
        }
        catch (Exception ex) {
            // Deliberate best effort: a document that cannot be loaded, 
            // parsed, or inspected is just skipped.
        }
        
    }

    /**
     * Returns true if the attribute is present and its trimmed value 
     * is exactly "no".
     */
    private static boolean isNo(Attribute attribute) {
        return attribute != null 
          && attribute.getValue().trim().equals("no");
    }

    /**
     * Queues the target of this element's XLink <code>href</code> 
     * attribute, if it has one, then does the same for all descendant 
     * elements. If the element's base URI cannot be turned into a URL, 
     * the element and its descendants are skipped.
     */
    private void search(Element element) {

        Attribute href = element.getAttribute("href", XLINK_NS);
        
        URL base = null;
        try {
            base = new URL(element.getBaseURI());
        }
        catch (MalformedURLException ex) {
            // Probably just no protocol handler for the 
            // kind of URLs used inside this element
            return;
        }
        if (href != null) {
            String uri = href.getValue();
            try {
                // absolutize URL and remove fragment identifier if any
                URL discovered = stripFragment(new URL(base, uri));
                
                if (!spidered.contains(discovered) 
                  && !queue.contains(discovered)) {
                    queue.add(discovered);   
                }
            }
            catch (MalformedURLException ex) {
                // skip this one   
            }
        }
        Elements children = element.getChildElements();
        for (int i = 0; i < children.size(); i++) {
            search(children.get(i));
        }
    }

    /**
     * Returns a copy of <code>url</code> without its fragment identifier.
     * The port is carried over explicitly; a port of -1 (none given) 
     * tells the URL constructor to use the protocol's default port.
     */
    private static URL stripFragment(URL url) 
      throws MalformedURLException {
        return new URL(
          url.getProtocol(),
          url.getHost(),
          url.getPort(),
          url.getFile()
        );
    }

    /**
     * Spiders each URL given on the command line in turn. Arguments
     * that are not well-formed URLs are reported on System.err.
     *
     * @param args the URLs to start spidering from
     */
    public static void main(String[] args) {
      
        PoliteSpider spider = new PoliteSpider();
        for (int i = 0; i < args.length; i++) { 
            try { 
                spider.search(new URL(args[i]));
            }
            catch (MalformedURLException ex) {
                System.err.println(ex);   
            }
        }
      
    } // end main()
}

Previous | Next | Top | Cafe con Leche

Copyright 2004-2006 Elliotte Rusty Harold
elharo@metalab.unc.edu
Last Modified February 9, 2004