Streaming Processing of Large Documents

import java.io.IOException;
import nu.xom.*;

/**
 * A XOM NodeFactory that prints the text of every element whose
 * qualified name is "title" while the document is being parsed, and
 * discards everything else as it goes instead of keeping a full tree.
 *
 * Only the root element and the title element currently being read
 * are retained by this factory; attributes, comments, the DOCTYPE,
 * processing instructions and all text outside title elements are
 * dropped.
 */
public class RSSHeadlines extends NodeFactory {

    // True while the parser is between the start and end of a title element.
    private boolean inTitle = false;

    // Shared empty list returned for every node that should be dropped.
    private final Nodes empty = new Nodes();

    // The root element created for the document currently being built.
    private Element root = null;

    /**
     * Keeps title elements and filters out all others.
     *
     * @param name the qualified name of the element
     * @param namespace the namespace URI of the element
     * @return a new element for "title", or null to omit the element
     */
    public Element startMakingElement(String name, String namespace) {
        if ("title".equals(name)) {
            inTitle = true;
            return new Element(name, namespace);
        }
        // Per the XOM NodeFactory documentation, returning null only drops
        // this element's tags; its content is handled separately (see
        // makeText).
        return null;
    }

    /**
     * Keeps text only while inside a title element. Without this
     * filter, text from the omitted elements would be attached to the
     * nearest retained ancestor (the root) and accumulate for the whole
     * document.
     *
     * @param data the text content reported by the parser
     * @return the default text nodes inside a title, otherwise an
     *     empty list
     */
    public Nodes makeText(String data) {
        if (inTitle) {
            return super.makeText(data);
        }
        return empty;
    }

    /**
     * Prints the value of each completed title element and then
     * removes the element from the tree.
     *
     * @param element the element that has just been completed
     * @return an empty list for ordinary elements, or a list holding
     *     the element itself when it is the root
     */
    public Nodes finishMakingElement(Element element) {
        if ("title".equals(element.getQualifiedName())) {
            System.out.println(element.getValue());
            inTitle = false;
        }
        // The root element must not be removed: XOM requires the list
        // returned for the root to contain exactly one element, so hand
        // the root back unchanged.
        if (element == root) {
            return super.finishMakingElement(element);
        }
        return empty;
    }

    /**
     * Drops all comments.
     *
     * @param data the comment text
     * @return an empty list
     */
    public Nodes makeComment(String data) {
        return empty;
    }

    /**
     * Creates the root element and resets the per-document state.
     *
     * @param name the qualified name of the root element
     * @param namespace the namespace URI of the root element
     * @return the new root element
     */
    public Element makeRootElement(String name, String namespace) {
        // Start each document with a clean flag; a root that is itself
        // named "title" counts as being inside a title.
        inTitle = "title".equals(name);
        root = new Element(name, namespace);
        return root;
    }

    /**
     * Drops all attributes.
     *
     * @param name the qualified name of the attribute
     * @param namespace the namespace URI of the attribute
     * @param value the attribute value
     * @param type the attribute type
     * @return an empty list
     */
    public Nodes makeAttribute(String name, String namespace,
      String value, Attribute.Type type) {
        return empty;
    }

    /**
     * Drops the document type declaration.
     *
     * @param rootElementName the root element name given in the DOCTYPE
     * @param publicID the public identifier
     * @param systemID the system identifier
     * @return an empty list
     */
    public Nodes makeDocType(String rootElementName,
      String publicID, String systemID) {
        return empty;
    }

    /**
     * Drops all processing instructions.
     *
     * @param target the processing instruction target
     * @param data the processing instruction data
     * @return an empty list
     */
    public Nodes makeProcessingInstruction(
      String target, String data) {
        return empty;
    }

    /**
     * Parses the document at the URL given as the first command-line
     * argument (or a default feed URL when none is given), printing
     * each title as it is encountered.
     *
     * @param args optional; args[0] is the URL of the document to read
     */
    public static void main(String[] args) {

        String url = "http://www.bbc.co.uk/syndication/feeds/news/ukfs_news/world/rss091.xml";
        if (args.length > 0) {
          url = args[0];
        }

        try {
          Builder parser = new Builder(new RSSHeadlines());
          // The returned document is ignored; the output is produced by
          // the factory callbacks during the build.
          parser.build(url);
        }
        catch (ParsingException ex) {
          System.out.println(url + " is not well-formed.");
          System.out.println(ex.getMessage());
        }
        catch (IOException ex) {
          System.out.println(
           "Due to an IOException, the parser could not read " + url
          );
          // Report the underlying cause as well, as the handler above does.
          System.out.println(ex.getMessage());
        }

    }

}

Previous | Next | Top | Cafe con Leche

Copyright 2004-2006 Elliotte Rusty Harold
elharo@metalab.unc.edu
Last Modified February 9, 2004