NekoSpider

import org.apache.xerces.xni.*;
import org.apache.xerces.xni.parser.XMLInputSource;
import org.cyberneko.pull.*;
import org.cyberneko.pull.event.*;
import org.cyberneko.pull.parsers.Xerces2;
import java.net.*;
import java.io.*;
import java.util.*;

/**
 * A simple spider that follows XLinks from one XML document to the next.
 *
 * Each document is read with a NekoPull parser. Every start-tag that
 * carries both an xlink:type and an xlink:href attribute contributes
 * one URL to the list of documents still to be visited. Documents are
 * visited in first-in/first-out order, and each distinct URL is queued
 * at most once per NekoSpider instance.
 */
public class NekoSpider {

  // Namespace URI that identifies XLink attributes
  private static final String XLINK_NAMESPACE
   = "http://www.w3.org/1999/xlink";

  // Need to keep track of where we've been (and where we're already
  // planning to go) so we don't get stuck in an infinite loop.
  // The entries are the external-form strings of the URLs rather
  // than URL objects: URL.equals() may resolve host names, and a
  // hashed set of strings makes the membership test cheap.
  private Set spideredURIs = new HashSet();

  // This linked list keeps track of where we're going.
  // Although the LinkedList class does not guarantee queue like
  // access, I always access it in a first-in/first-out fashion:
  // addFirst() to enqueue, removeLast() to dequeue.
  private LinkedList queue = new LinkedList();
  
  // The document being parsed right now; relative hrefs found in
  // that document are resolved against it.
  private URL currentURL;
  private XMLPullParser parser;
  
  /**
   * Creates a spider backed by a Xerces2 pull parser.
   */
  public NekoSpider() {
      this.parser = new Xerces2();
  }

  /**
   * Inspects one start-tag and queues its XLink target, if it has one.
   * The element must have both an xlink:type and an xlink:href
   * attribute. The href is resolved relative to the URL of the
   * document currently being parsed.
   *
   * @param element the start-tag event to inspect
   */
  private void processStartTag(ElementEvent element) {
    
    XMLAttributes attributes = element.attributes;
    String type = attributes.getValue(XLINK_NAMESPACE, "type");
    if (type == null) {
      return; // not an XLink element
    }
    String href = attributes.getValue(XLINK_NAMESPACE, "href");
    if (href == null) {
      return; // an XLink element without a target
    }
    
    try {
      URL foundURL = new URL(currentURL, href);
      // add() returns false when the URL has already been seen, so
      // a document that is linked several times is queued only once
      if (spideredURIs.add(foundURL.toExternalForm())) {
        queue.addFirst(foundURL);
      }
    }
    catch (MalformedURLException ex) {
      // skip it; there is nothing we could retrieve from this href
    }
  }
  
  /**
   * Spiders the document at the given URL and then, in
   * first-in/first-out order, every document reachable from it
   * through XLinks that has not been seen before. A document that
   * cannot be read or parsed is reported on System.err and skipped.
   *
   * @param uri the URL of the document to start from
   */
  public void spider(URL uri) {
      
    if (uri == null) {
      System.err.println("No URL to spider");
      return;
    }
    
    // The starting document is always visited, even if an earlier
    // call already saw it.
    spideredURIs.add(uri.toExternalForm());
    queue.addFirst(uri);
    
    // Drain the queue with a loop rather than by recursion so the
    // call stack does not grow with the number of documents.
    while (!queue.isEmpty()) {
      URL nextURL = (URL) queue.removeLast();
      spiderDocument(nextURL);
    }
    
  }

  /**
   * Parses a single document and queues the XLink targets found in
   * its start-tags.
   *
   * @param uri the URL of the document to parse
   */
  private void spiderDocument(URL uri) {
      
    System.out.println("Spidering " + uri);
    // Remember the base for resolving relative hrefs in this document
    currentURL = uri;
    try {
      XMLInputSource source 
       = new XMLInputSource(null, uri.toExternalForm(), null);
      parser.setInputSource(source);
      
      XMLEvent event;
      while ((event = parser.nextEvent()) != null) {
        if (event.type == XMLEvent.ELEMENT) {
          ElementEvent element = (ElementEvent) event;
          if (element.start) {
            processStartTag(element);
          }
        }
      }  // end while
    }
    catch (Exception ex) {
      // skip this document, but say why instead of failing silently
      System.err.println("Skipping " + uri + ": " + ex);
    }
    
  }

  /**
   * Command line entry point. Expects the URL of the first document
   * to spider as the only argument.
   *
   * @param args the command line arguments
   * @throws Exception if the argument is not a well-formed URL
   */
  public static void main(String[] args) throws Exception {
        
    if (args.length == 0) {
      System.err.println("Usage: java NekoSpider url" );
      return;  
    }
        
    NekoSpider spider = new NekoSpider();
    spider.spider(new URL(args[0]));
        
  } // end main

} // end NekoSpider


Previous | Next | Top | Cafe con Leche

Copyright 2000-2003 Elliotte Rusty Harold
elharo@metalab.unc.edu
Last Modified November 16, 2002