// Attributes Example: XLinkSpider

import javax.xml.stream.*;
import java.net.*;
import java.io.*;
import java.util.*;

public class PullSpider {

  // Need to keep track of where we've been 
  // so we don't get stuck in an infinite loop
  private List spideredURIs = new Vector();

  // This linked list keeps track of where we're going.
  // Although the LinkedList class does not guarantee queue like
  // access, I always access it in a first-in/first-out fashion.
  private LinkedList queue = new LinkedList();
  
  private URL currentURL;
  private XMLInputFactory factory;
  
  public PullSpider() {
      this.factory = XMLInputFactory.newInstance();
  }

  private void processStartTag(XMLStreamReader parser) {
    
    String type 
     = parser.getAttributeValue("http://www.w3.org/1999/xlink", "type");
    if (type != null) {
      String href 
       = parser.getAttributeValue("http://www.w3.org/1999/xlink", "href");
          if (href != null) {
            try {
              URL foundURL = new URL(currentURL, href);
              if (!spideredURIs.contains(foundURL)) {
                queue.addFirst(foundURL);
              }
            }
           catch (MalformedURLException ex) {
             // skip this URL  
           }
        }
    }
  }
  
  public void spider(URL url) {
      
    System.out.println("Spidering " + url);
    currentURL = url;
    try {
      XMLStreamReader parser = factory.createXMLStreamReader(currentURL.openStream());
      spideredURIs.add(currentURL);
      
      for (int event = parser.next(); 
           parser.hasNext(); 
           event = parser.next()) {
         if (event == XMLStreamConstants.START_ELEMENT) {
             processStartTag(parser);
         }
       }  // end for
       parser.close();
       
       while (!queue.isEmpty()) {
         URL nextURL = (URL) queue.removeLast();
         spider(nextURL);
       }
      
    }
    catch (Exception ex) {
       // skip this document
    }
    
  }

  public static void main(String[] args) throws Exception {
        
    if (args.length == 0) {
      System.err.println("Usage: java PullSpider url" );
       return;  
    }
        
    PullSpider spider = new PullSpider();
    spider.spider(new URL(args[0]));
        
  } // end main

} // end PullSpider


// Previous | Next | Top | Cafe con Leche
//
// Copyright 2007 Elliotte Rusty Harold
// elharo@metalab.unc.edu
// Last Modified January 8, 2004