XML Pull Parsing


XML Pull Parsing

Elliotte Rusty Harold

Software Development 2002 East

Wednesday, November 20, 2002

elharo@metalab.unc.edu

http://www.cafeconleche.org/


XML API Styles


A Quote from an XML Founder

pull parsing is the way to go in the future. The first 3 XML parsers (Lark, NXP, and expat) all were event-driven because... er well that was 1996, can't exactly remember, seemed like a good idea at the time.

--Tim Bray on the xml-dev mailing list, Wednesday, 18 Sep 2002


Pull Parsing is


Pull APIs


XMLPULL


Only Three Classes:

XmlPullParser:
an interface that represents the parser
XmlPullParserFactory:
the factory class that instantiates an implementation-dependent class that implements XmlPullParser
XmlPullParserException:
the generic class for everything other than an IOException that might go wrong when parsing an XML document, particularly well-formedness errors and tokens that don't have the expected type
XmlSerializer:
Under development; planned for 1.2

Simple Wellformedness Checker

import org.xmlpull.v1.*;
import java.net.*;
import java.io.*;

 
/**
 * Command-line well-formedness check. The first argument is opened as a URL,
 * or as a local file when it is not a syntactically valid URL, and the pull
 * parser is run to the end of the document. The verdict is printed on
 * System.out.
 */
public class PullChecker {

  public static void main(String[] args) {

    if (args.length == 0) {
      System.err.println("Usage: java PullChecker url" );
      return;
    }

    String location = args[0];
    try {
      XmlPullParser parser = XmlPullParserFactory.newInstance().newPullParser();
      parser.setInput(openDocument(location), null);

      // The individual events are irrelevant here; the parser is simply
      // driven forward until it reports the end of the document.
      int event;
      do {
        event = parser.next();
      } while (event != XmlPullParser.END_DOCUMENT);

      // Reaching this line means the parser threw nothing along the way
      System.out.println(location + " is well-formed");
    }
    catch (XmlPullParserException ex) {
      System.out.println(location + " is not well-formed");
      System.out.println(ex);
    }
    catch (IOException ex) {
      String failure = ex.getClass().getName();
      System.out.println(location + " could not be checked due to an " + failure);
      ex.printStackTrace();
    }

  }

  // Opens the argument as a URL; an argument that is not a valid URL is
  // treated as the name of a local file instead.
  private static InputStream openDocument(String location) throws IOException {
    try {
      return new URL(location).openStream();
    }
    catch (MalformedURLException ex) {
      // Maybe it's a file name
      return new FileInputStream(location);
    }
  }

}

Output from a Simple Wellformedness Checker

~

Event Codes


Listening to Events

import org.xmlpull.v1.*;
import java.net.*;
import java.io.*;

 
public class EventLister {

  public static void main(String[] args) {
        
    if (args.length == 0) {
      System.err.println("Usage: java EventLister url" );
     return;    
    }
        
    try {
      XmlPullParserFactory factory = XmlPullParserFactory.newInstance();
      XmlPullParser parser = factory.newPullParser();

      InputStream in;
      try {
        URL u = new URL(args[0]);
        in = u.openStream();
      }
      catch (MalformedURLException ex) {
          // Maybe it's a file name
          in = new FileInputStream(args[0]);
      }
      parser.setInput(in, null);
        
      while (true) {
         int event = parser.nextToken();
         if (event == XmlPullParser.START_TAG) {
             System.out.println("Start tag");
         }
         else if (event == XmlPullParser.END_TAG) {
             System.out.println("End tag");
         }
         else if (event == XmlPullParser.START_DOCUMENT) {
             System.out.println("Start document");
         }
         else if (event == XmlPullParser.TEXT) {
             System.out.println("Text");
         }
         else if (event == XmlPullParser.CDSECT) {
             System.out.println("CDATA Section");
         }
         else if (event == XmlPullParser.COMMENT) {
             System.out.println("Comment");
         }
         else if (event == XmlPullParser.DOCDECL) {
             System.out.println("Document type declaration");
         }
         else if (event == XmlPullParser.ENTITY_REF) {
             System.out.println("Entity Reference");
         }
         else if (event == XmlPullParser.IGNORABLE_WHITESPACE) {
             System.out.println("Ignorable white space");
         }
         else if (event == XmlPullParser.PROCESSING_INSTRUCTION) {
             System.out.println("Processing Instruction");
         }
         else if (event == XmlPullParser.END_DOCUMENT) {
             System.out.println("End Document");
             break;
         }
      }           
    }
    catch (XmlPullParserException ex) {
       System.out.println(ex);  
    }
    catch (IOException e) {
      System.out.println("IOException while parsing " + args[0]);   
    }
        
  }

}

Output from EventLister


getText()

The getText() method returns the text of the current event:

public String getText()

Exactly what this is depends on the type of the event:


getText() Example

import org.xmlpull.v1.*;
import java.net.*;
import java.io.*;

 
public class EventText {

  public static void main(String[] args) {
		
    if (args.length == 0) {
      System.err.println("Usage: java EventText url" );
	 return;	
    }
		
    try {
      XmlPullParserFactory factory = XmlPullParserFactory.newInstance();
      factory.setNamespaceAware(true);
      XmlPullParser parser = factory.newPullParser();

      
      InputStream in;
      try {
        URL u = new URL(args[0]);
        in = u.openStream();
      }
      catch (MalformedURLException ex) {
          // Maybe it's a file name
          in = new FileInputStream(args[0]);
      }
      parser.setInput(in, null);
        
      while (true) {
  	     int event = parser.nextToken();
 	     if (event == XmlPullParser.START_TAG) {
             System.out.println("Start-tag: " + parser.getText()) ;
    	 }
         else if (event == XmlPullParser.END_TAG) {
             System.out.println("End-tag: " + parser.getText());
         }
         else if (event == XmlPullParser.START_DOCUMENT) {
             System.out.println("Start document: "  + parser.getText());
         }
         else if (event == XmlPullParser.TEXT) {
             System.out.println("Text: " + parser.getText());
         }
         else if (event == XmlPullParser.CDSECT) {
             System.out.println("CDATA Section: " + parser.getText());
         }
         else if (event == XmlPullParser.COMMENT) {
             System.out.println("Comment: " + parser.getText());
         }
         else if (event == XmlPullParser.DOCDECL) {
             System.out.println("Document type declaration: " + parser.getText());
         }
         else if (event == XmlPullParser.ENTITY_REF) {
             System.out.println("Entity Reference: " + parser.getText());
         }
         else if (event == XmlPullParser.IGNORABLE_WHITESPACE) {
             System.out.println("Ignorable white space: " + parser.getText());
         }
         else if (event == XmlPullParser.PROCESSING_INSTRUCTION) {
             System.out.println("Processing Instruction: " + parser.getText());
         }
  	     else if (event == XmlPullParser.END_DOCUMENT) {
             System.out.println("End Document: " + parser.getText());
             break;
         } // end else if
      }  // end while
    } // end try
    catch (XmlPullParserException ex) {
       System.out.println(ex);	
    }
    catch (IOException e) {
      System.out.println("IOException while parsing " + args[0]);	
    }
		
  }
 
}

Things to note


Names

If the event is a tag, then the following methods in XmlPullParser also work:

public String getName()
public String getNamespace()
public String getPrefix()

Names Example

import org.xmlpull.v1.*;
import java.net.*;
import java.io.*;

 
public class NamePrinter {

  public static void main(String[] args) {
        
    if (args.length == 0) {
      System.err.println("Usage: java NamePrinter url" );
      return;   
    }
        
    try {
      XmlPullParserFactory factory = XmlPullParserFactory.newInstance();
      factory.setNamespaceAware(true);
      XmlPullParser parser = factory.newPullParser();
      
      InputStream in;
      try {
        URL u = new URL(args[0]);
        in = u.openStream();
      }
      catch (MalformedURLException ex) {
          // Maybe it's a file name
          in = new FileInputStream(args[0]);
      }
      parser.setInput(in, null);
        
      while (true) {
         int event = parser.nextToken();
         if (event == XmlPullParser.START_TAG) {
             System.out.println("Start tag: ");
             printEvent(parser);
         }
         else if (event == XmlPullParser.END_TAG) {
             System.out.println("End tag");
             printEvent(parser);
         }
         else if (event == XmlPullParser.START_DOCUMENT) {
             System.out.println("Start document");
         }
         else if (event == XmlPullParser.TEXT) {
             System.out.println("Text");
             printEvent(parser);
         }
         else if (event == XmlPullParser.CDSECT) {
             System.out.println("CDATA Section");
             printEvent(parser);
         }
         else if (event == XmlPullParser.COMMENT) {
             System.out.println("Comment");
             printEvent(parser);
         }
         else if (event == XmlPullParser.DOCDECL) {
             System.out.println("Document type declaration");
             printEvent(parser);
         }
         else if (event == XmlPullParser.ENTITY_REF) {
             System.out.println("Entity Reference");
             printEvent(parser);
         }
         else if (event == XmlPullParser.IGNORABLE_WHITESPACE) {
             System.out.println("Ignorable white space");
             printEvent(parser);
         }
         else if (event == XmlPullParser.PROCESSING_INSTRUCTION) {
             System.out.println("Processing Instruction");
             printEvent(parser);
         }
         else if (event == XmlPullParser.END_DOCUMENT) {
             System.out.println("End Document");
             break;
         } // end else if
      }  // end while
    }
    catch (XmlPullParserException ex) {
       System.out.println(ex);  
    }
    catch (IOException ex) {
      System.out.println("IOException while parsing " + args[0]);   
      ex.printStackTrace();
    }
        
  }
  
  private static void printEvent(XmlPullParser parser) {
      String localName = parser.getName();
      String prefix = parser.getPrefix();
      String uri = parser.getNamespace();
      
      if (localName != null) System.out.println("\tName: " + localName);
      if (prefix != null) System.out.println("\tPrefix: " + prefix);
      if (uri != null) System.out.println("\tNamespace URI: " + uri);
      System.out.println();
  }

}

The next() method


next() Example

List all the titles in an RSS 0.91 document:

<?xml version="1.0" encoding="iso-8859-1" ?>
<!-- generator="HPE/1.0" -->
<!-- Copyright (C) 2000-2002 News Is Free. Terms Of Service http://www.newsisfree.com/termsofservice.php -->

<rss version="0.91">
<channel>
<title>Ananova: <!-- interrupting comment -->Archeology</title>
<link>http://www.ananova.com/news/index.html?keywords=Archaeology&amp;menu=news.scienceanddiscovery.archaeology</link>
<description>Ananova: News on the move from the leading site for breaking 
UK and world news, sport, entertainment, business and weather stories and information. 
(By http://www.newsisfree.com/syndicate.php 
- FOR PERSONAL AND NON COMMERCIAL USE ONLY!)</description>
<language>en</language>
<webMaster>mkrus@newsisfree.com</webMaster>

<lastBuildDate>11/05/02 22:16 CET</lastBuildDate>
<image>
  <link>http://www.newsisfree.com/sources/info/3389/</link>
  <url>http://www.newsisfree.com/HPE/Images/button.gif</url>
  <title>Powered by News Is Free</title><width>88</width>
  <height>31</height>
</image>

<item>
<title>Britain's earliest leprosy victim may have been found</title>
<link>http://www.newsisfree.com/click/-2,9782455,3389/</link>
</item>
<item>
<title>20th anniversary of Mary Rose recovery</title>

<link>http://www.newsisfree.com/click/-2,9773139,3389/</link>
</item>
<item>
<title>'Proof of Jesus' burial box damaged on way to Canada</title>
<link>http://www.newsisfree.com/click/-6,9663454,3389/</link>
</item>
<item>
<title>Remains of four woolly rhinos give new insight into Ice Age</title>
<link>http://www.newsisfree.com/click/-4,9533904,3389/</link>
</item>
<item>
<title>Experts solve crop lines mystery</title>

<link>http://www.newsisfree.com/click/-5,9352720,3389/</link>
</item>
</channel>
</rss>

RSSLister

import org.xmlpull.v1.*;
import java.net.*;
import java.io.*;

 
public class RSSTitles {

  public static void main(String[] args) {
        
    if (args.length == 0) {
      System.err.println("Usage: java RSSTitles url" );
      return;   
    }
        
    try {
      XmlPullParserFactory factory = XmlPullParserFactory.newInstance();
      XmlPullParser parser = factory.newPullParser();
      
      InputStream in;
      try {
        URL u = new URL(args[0]);
        in = u.openStream();
      }
      catch (MalformedURLException ex) {
          // Maybe it's a file name
          in = new FileInputStream(args[0]);
      }
      parser.setInput(in, null);
      
      boolean printing = false;
      while (true) {
         int event = parser.next();
         if (event == XmlPullParser.START_TAG) {
             String name = parser.getName();
             if (name.equals("title")) printing = true;
         }
         else if (event == XmlPullParser.END_TAG) {
             String name = parser.getName();
             if (name.equals("title")) printing = false;
         }
         else if (event == XmlPullParser.TEXT) {
             if (printing) System.out.println(parser.getText());
         }
         else if (event == XmlPullParser.END_DOCUMENT) {
             break;
         } // end else if
      }  // end while
    }
    catch (XmlPullParserException ex) {
       System.out.println(ex);  
    }
    catch (IOException ex) {
      System.out.println("IOException while parsing " + args[0]);   
    }
        
  }

}

Improved RSSLister

Print only item titles:

import org.xmlpull.v1.*;
import java.net.*;
import java.io.*;

 
public class BetterRSSLister {

  public static void main(String[] args) {
        
    if (args.length == 0) {
      System.err.println("Usage: java BetterRSSLister url" );
      return;   
    }
        
    try {
      XmlPullParserFactory factory = XmlPullParserFactory.newInstance();
      XmlPullParser parser = factory.newPullParser();
      
      InputStream in;
      try {
        URL u = new URL(args[0]);
        in = u.openStream();
      }
      catch (MalformedURLException ex) {
          // Maybe it's a file name
          in = new FileInputStream(args[0]);
      }
      parser.setInput(in, null);
      
      boolean inItem = false;
      boolean inTitle = false;
      // Nested elements could be handled by incrementing
      // and decrementing an integer instead
      // of a simple boolean.
      while (true) {
         int event = parser.next();
         if (event == XmlPullParser.START_TAG) {
             String name = parser.getName();
             if (name.equals("title")) inTitle = true;
             if (name.equals("item")) inItem = true;
         }
         else if (event == XmlPullParser.END_TAG) {
             String name = parser.getName();
             if (name.equals("title")) inTitle = false;
             if (name.equals("item")) inItem = false;
         }
         else if (event == XmlPullParser.TEXT) {
             if (inTitle && inItem) System.out.println(parser.getText());
         }
         else if (event == XmlPullParser.END_DOCUMENT) {
             break;
         } // end else if
      }  // end while
    }
    catch (XmlPullParserException ex) {
       System.out.println(ex);  
    }
    catch (IOException ex) {
      System.out.println("IOException while parsing " + args[0]);   
    }
        
  }

}

The nextTag() method


The nextText() method


Attributes


Attributes Example: XLinkSpider

import org.xmlpull.v1.*;
import java.net.*;
import java.io.*;
import java.util.*;

/**
 * Crawls XML documents by following XLinks: every start-tag carrying an
 * xlink:type attribute together with an xlink:href attribute contributes the
 * resolved href to a first-in/first-out work queue, and each queued document
 * is parsed in turn.
 *
 * Instances keep crawl state in fields and share one parser, so a single
 * instance must not be used from several threads at once.
 */
public class PullSpider {

  private static final String XLINK_NS = "http://www.w3.org/1999/xlink";

  // External forms of every URL that has been parsed or is waiting in the
  // queue, so no document is queued (and therefore parsed) twice and link
  // cycles terminate. Strings are stored rather than URL objects because
  // URL.equals() resolves host names, which blocks on DNS for every lookup.
  private Set seenURIs = new HashSet();

  // This linked list keeps track of where we're going. It is always used
  // first-in/first-out: addFirst() to enqueue, removeLast() to dequeue.
  private LinkedList queue = new LinkedList();

  // Base against which relative hrefs in the current document are resolved
  private URL currentURL;
  private XmlPullParser parser;

  /**
   * Creates a spider backed by a namespace-aware pull parser.
   *
   * @throws RuntimeException if no pull parser implementation can be
   *         instantiated; the underlying XmlPullParserException is the cause
   */
  public PullSpider() {
      try {
        XmlPullParserFactory factory = XmlPullParserFactory.newInstance();
        factory.setNamespaceAware(true);
        this.parser = factory.newPullParser();
      }
      catch (XmlPullParserException ex) {
         // keep the cause so the reason the lookup failed is not lost
         throw new RuntimeException("Could not locate a pull parser", ex);
      }
  }

  // Inspects the attributes of the current start-tag and queues the XLink
  // target, if there is one that has not been seen before.
  private void processStartTag() {

    String type = parser.getAttributeValue(XLINK_NS, "type");
    if (type == null) {
      return;
    }
    String href = parser.getAttributeValue(XLINK_NS, "href");
    if (href == null) {
      return;
    }
    try {
      URL foundURL = new URL(currentURL, href);
      // Set.add() returns false for something already present, which covers
      // both "already parsed" and "already waiting in the queue"
      if (seenURIs.add(foundURL.toExternalForm())) {
        queue.addFirst(foundURL);
      }
    }
    catch (MalformedURLException ex) {
      // skip it
    }
  }

  /**
   * Parses the given document and then, in first-in/first-out order, every
   * document reachable from it through XLinks. Documents that cannot be read
   * or parsed are reported on System.err and skipped.
   *
   * @param uri the document at which to start
   */
  public void spider(URL uri) {

    seenURIs.add(uri.toExternalForm());
    queue.addFirst(uri);

    // Iterating over the queue instead of recursing keeps the call stack
    // flat no matter how many documents are linked together.
    while (!queue.isEmpty()) {
      parseDocument((URL) queue.removeLast());
    }

  }

  // Parses one document, queueing the links it contains, and always releases
  // the document's stream afterwards.
  private void parseDocument(URL uri) {

    System.out.println("Spidering " + uri);
    currentURL = uri;
    InputStream in = null;
    try {
      in = uri.openStream();
      parser.setInput(in, null);

      for (int event = parser.next();
           event != XmlPullParser.END_DOCUMENT;
           event = parser.next()) {
         if (event == XmlPullParser.START_TAG) {
             processStartTag();
         }
      }  // end for
    }
    catch (Exception ex) {
       // Best effort: one unreadable or malformed document must not stop
       // the crawl, but the reason is worth reporting.
       System.err.println("Skipping " + uri + ": " + ex);
    }
    finally {
      if (in != null) {
        try {
          in.close();
        }
        catch (IOException ignored) {
          // nothing useful can be done about a failed close
        }
      }
    }

  }

  public static void main(String[] args) throws Exception {

    if (args.length == 0) {
      System.err.println("Usage: java PullSpider url" );
       return;
    }

    PullSpider spider = new PullSpider();
    spider.spider(new URL(args[0]));

  } // end main

} // end PullSpider


Processing Instructions

import org.xmlpull.v1.*;
import java.net.*;
import java.io.*;

 
/**
 * Prints the target and data of every processing instruction in a document.
 *
 * The first argument is opened as a URL, or as a local file when it is not a
 * syntactically valid URL.
 */
public class PILister {

  public static void main(String[] args) {

    if (args.length == 0) {
      System.err.println("Usage: java PILister url" );
      return;
    }

    try {
      XmlPullParserFactory factory = XmlPullParserFactory.newInstance();
      XmlPullParser parser = factory.newPullParser();

      InputStream in = openDocument(args[0]);
      try {
        parser.setInput(in, null);

        for (int event = parser.nextToken();
             event != XmlPullParser.END_DOCUMENT;
             event = parser.nextToken()) {
          if (event == XmlPullParser.PROCESSING_INSTRUCTION) {
            // getName() is only defined for tags and entity references and
            // returns null here, so both parts are taken from getText(),
            // which holds the instruction's content: target, then data.
            String content = parser.getText();
            System.out.println("Target: " + piTarget(content));
            System.out.println("Data: " + piData(content));
            System.out.println();
          }
        }
      }
      finally {
        closeQuietly(in);
      }
    }
    catch (XmlPullParserException ex) {
      System.out.println(ex);
    }
    catch (IOException e) {
      System.out.println("IOException while parsing " + args[0]);
    }

  }

  // Index just past the target, i.e. of the first whitespace character in
  // the content, or the content's length when there is no whitespace.
  private static int targetEnd(String content) {
    int i = 0;
    while (i < content.length() && !Character.isWhitespace(content.charAt(i))) {
      i++;
    }
    return i;
  }

  // The target: everything before the first whitespace. Empty for null.
  private static String piTarget(String content) {
    if (content == null) {
      return "";
    }
    return content.substring(0, targetEnd(content));
  }

  // The data: everything after the whitespace that follows the target.
  // Empty for null content or an instruction that has a target only.
  private static String piData(String content) {
    if (content == null) {
      return "";
    }
    int i = targetEnd(content);
    while (i < content.length() && Character.isWhitespace(content.charAt(i))) {
      i++;
    }
    return content.substring(i);
  }

  // Opens the argument as a URL; an argument that is not a valid URL is
  // treated as the name of a local file instead.
  private static InputStream openDocument(String location) throws IOException {
    try {
      return new URL(location).openStream();
    }
    catch (MalformedURLException ex) {
      // Maybe it's a file name
      return new FileInputStream(location);
    }
  }

  // Closes the stream without letting a close failure replace whatever
  // exception (if any) is already propagating out of the parse.
  private static void closeQuietly(InputStream in) {
    try {
      in.close();
    }
    catch (IOException ignored) {
      // nothing useful can be done about a failed close of a read-only stream
    }
  }

}

Comments

import org.xmlpull.v1.*;
import java.net.*;
import java.io.*;

 
/**
 * Prints the text of every comment in a document, one per line. The first
 * argument is opened as a URL, or as a local file when it is not a
 * syntactically valid URL.
 */
public class CommentPuller {

  public static void main(String[] args) {

    if (args.length == 0) {
      System.err.println("Usage: java CommentPuller url" );
      return;
    }

    try {
      XmlPullParser parser = XmlPullParserFactory.newInstance().newPullParser();
      parser.setInput(openDocument(args[0]), null);

      // nextToken() rather than next(): this loop relies on it to see
      // COMMENT events
      for (int token = parser.nextToken();
           token != XmlPullParser.END_DOCUMENT;
           token = parser.nextToken()) {
        if (token == XmlPullParser.COMMENT) {
          System.out.println(parser.getText());
        }
      }
    }
    catch (XmlPullParserException ex) {
      System.out.println(ex);
    }
    catch (IOException e) {
      System.out.println("IOException while parsing " + args[0]);
    }

  }

  // Opens the argument as a URL; an argument that is not a valid URL is
  // treated as the name of a local file instead.
  private static InputStream openDocument(String location) throws IOException {
    try {
      return new URL(location).openStream();
    }
    catch (MalformedURLException ex) {
      // Maybe it's a file name
      return new FileInputStream(location);
    }
  }

}

Features and Properties

    // Feature/property accessors as declared on XmlPullParser. Features are
    // boolean switches and properties carry arbitrary objects; both are
    // looked up by a String name. The setters declare XmlPullParserException;
    // the PullValidator example treats that exception from setFeature() as
    // "this parser cannot do what was asked".
    public void setFeature(String name, boolean state) 
     throws XmlPullParserException;
    public boolean getFeature(String name);
    public void setProperty(String name, Object value)
     throws XmlPullParserException;
    public Object getProperty(String name);

Required Features


Optional Features


Example: PullValidator

import org.xmlpull.v1.*;
import java.net.*;
import java.io.*;

 
/**
 * Command-line validity check. Validation is requested from the parser, and
 * the program gives up straight away if the parser refuses. Otherwise the
 * first argument is opened as a URL, or as a local file when it is not a
 * syntactically valid URL, and parsed to the end. The verdict is printed on
 * System.out.
 */
public class PullValidator {

  public static void main(String[] args) {

    if (args.length == 0) {
      System.err.println("Usage: java PullValidator url" );
      return;
    }

    String location = args[0];
    try {
      XmlPullParser parser = XmlPullParserFactory.newInstance().newPullParser();
      try {
        parser.setFeature(XmlPullParser.FEATURE_VALIDATION, true);
      }
      catch (XmlPullParserException ex) {
        // The parser rejected the validation feature
        System.err.println("This is not a validating parser");
        return;
      }

      parser.setInput(openDocument(location), null);

      // The individual events are irrelevant here; the parser is simply
      // driven forward until it reports the end of the document.
      int event;
      do {
        event = parser.next();
      } while (event != XmlPullParser.END_DOCUMENT);

      // Reaching this line means the parser threw nothing along the way
      System.out.println(location + " is valid");
    }
    catch (XmlPullParserException ex) {
      System.out.println(location + " is not valid");
      System.out.println(ex);
    }
    catch (IOException ex) {
      String failure = ex.getClass().getName();
      System.out.println(location + " could not be checked due to an " + failure);
      ex.printStackTrace();
    }

  }

  // Opens the argument as a URL; an argument that is not a valid URL is
  // treated as the name of a local file instead.
  private static InputStream openDocument(String location) throws IOException {
    try {
      return new URL(location).openStream();
    }
    catch (MalformedURLException ex) {
      // Maybe it's a file name
      return new FileInputStream(location);
    }
  }

}

XML Declaration

<?xml version="1.0" encoding="ISO-8859-1" standalone="no"?>


Namespaces


Requirements


XmlPullParserFactory

package org.xmlpull.v1;

/**
 * Signature summary of the factory that hands out parsers. Method bodies are
 * omitted, so this listing is an API overview rather than compilable source.
 *
 * NOTE(review): the RDDLStripper example calls factory.newSerializer(),
 * which does not appear in this summary.
 */
public class XmlPullParserFactory {

  // Same string as the fully qualified name of this class
  public static final String PROPERTY_NAME =
        "org.xmlpull.v1.XmlPullParserFactory";

  // Configuration. The examples call setNamespaceAware(true) before
  // newPullParser() when they want namespace information from the parser.
  public void    setFeature(String name, boolean state) 
   throws XmlPullParserException;
  public boolean getFeature (String name);
  public void    setNamespaceAware(boolean awareness);
  public boolean isNamespaceAware();
  public void    setValidating(boolean validating) ;
  public boolean isValidating();
  
  // Creation: a factory comes from one of the static newInstance() methods,
  // and a parser from newPullParser() on that factory.
  public        XmlPullParser        newPullParser()
   throws XmlPullParserException;
  public static XmlPullParserFactory newInstance() 
   throws XmlPullParserException;
  public static XmlPullParserFactory newInstance(String classNames, Class context)
   throws XmlPullParserException;
   
}

XmlPullParser

package org.xmlpull.v1;

/**
 * Signature summary of the parser interface. The event-type constants are
 * shown without their values, so this listing is an API overview rather than
 * compilable source.
 *
 * NOTE(review): the RDDLStripper example calls getAttributeName(int), which
 * does not appear among the attribute methods listed here.
 */
public interface XmlPullParser {

    public final static String NO_NAMESPACE = "";

    // Event-type codes returned by next(), nextToken() and getEventType()
    // (values not shown in this summary)
    public final static int START_DOCUMENT;
    public final static int END_DOCUMENT;
    public final static int START_TAG;
    public final static int END_TAG;
    public final static int TEXT;
    public final static int CDSECT;
    public final static int ENTITY_REF;
    public final static int IGNORABLE_WHITESPACE;
    public final static int PROCESSING_INSTRUCTION;
    public final static int COMMENT;
    public final static int DOCDECL;

    // Names of the event types, in the same order as the constants above;
    // presumably indexed by event code -- confirm against the real values
    public final static String [] TYPES = {
        "START_DOCUMENT",
        "END_DOCUMENT",
        "START_TAG",
        "END_TAG",
        "TEXT",
        "CDSECT",
        "ENTITY_REF",
        "IGNORABLE_WHITESPACE",
        "PROCESSING_INSTRUCTION",
        "COMMENT",
        "DOCDECL"
    };

    // Feature identifiers to pass to setFeature()/getFeature()
    public final static String FEATURE_PROCESS_NAMESPACES =
        "http://xmlpull.org/v1/doc/features.html#process-namespaces";
    public final static String FEATURE_REPORT_NAMESPACE_ATTRIBUTES =
        "http://xmlpull.org/v1/doc/features.html#report-namespace-prefixes";
    public final static String FEATURE_PROCESS_DOCDECL =
        "http://xmlpull.org/v1/doc/features.html#process-docdecl";
    public final static String FEATURE_VALIDATION =
        "http://xmlpull.org/v1/doc/features.html#validation";

    // Features are boolean switches, properties carry arbitrary objects
    public void setFeature(String name, boolean state) 
     throws XmlPullParserException;
    public boolean getFeature(String name);
    public void setProperty(String name, Object value)
     throws XmlPullParserException;
    public Object getProperty(String name);

    // Input source; the examples pass null as inputEncoding
    public void setInput(Reader in) throws XmlPullParserException;
    public void setInput(InputStream inputStream, String inputEncoding)
        throws XmlPullParserException;

    // actual parsing methods
    public int getEventType()
        throws XmlPullParserException;
    public int next()
        throws XmlPullParserException, IOException;
    public int nextToken()
        throws XmlPullParserException, IOException;
        
    // Utility methods
    public void require(int type, String namespace, String name)
        throws XmlPullParserException, IOException;
    public String nextText() throws XmlPullParserException, IOException;
    public int    nextTag() throws XmlPullParserException, IOException;        
        
    // Encoding, entity and namespace bookkeeping, plus position reporting
    public String getInputEncoding();
    public void defineEntityReplacementText( String entityName,
     String replacementText ) throws XmlPullParserException;
    public int getNamespaceCount(int depth) 
     throws XmlPullParserException;
     
   public String getNamespacePrefix(int position) throws XmlPullParserException;
   public String getNamespaceUri(int position) throws XmlPullParserException;
   public String getNamespace(String prefix);
   public int    getDepth();
   public String getPositionDescription();
   public int    getLineNumber();
   public int    getColumnNumber();

   // Text methods
   public boolean isWhitespace() throws XmlPullParserException;
   public String  getText();
   public char[]  getTextCharacters(int[] holderForStartAndLength);

    // Tag methods
    public String  getNamespace();
    public String  getName();
    public String  getPrefix();
    public boolean isEmptyElementTag() throws XmlPullParserException;

    // Attribute methods
    public int     getAttributeCount();
    public String  getAttributeNamespace(int index);
    public String  getAttributePrefix(int index);
    public String  getAttributeType(int index);
    public boolean isAttributeDefault(int index);
    public String  getAttributeValue(int index);
    public String  getAttributeValue(String namespace, String name);
}

XmlPullParserException

package org.xmlpull.v1;

/**
 * Signature summary of the checked exception declared by the parser and
 * factory methods. Bodies are omitted, so this listing is an API overview
 * rather than compilable source.
 */
public class XmlPullParserException extends Exception {

    // Constructors: message only, message plus a Throwable, message plus a
    // row/column pair, or message plus the parser and a chained Throwable
    public XmlPullParserException(String message);
    public XmlPullParserException(String message, Throwable throwble) ;
    public XmlPullParserException(String message, int row, int column);
    public XmlPullParserException(String message, XmlPullParser parser, Throwable chain);

    // presumably returns the Throwable supplied at construction -- TODO confirm
    public Throwable getDetail();
    public void printStackTrace();

}

XmlSerializer

package org.xmlpull.v1;

/**
 * Signature summary of the output-side counterpart to the parser: one method
 * per kind of XML construct that can be written. Every writing method
 * declares IOException.
 */
public interface XmlSerializer {

  // Features are boolean switches, properties carry arbitrary objects
  public void setFeature(String name, boolean state)
   throws IllegalArgumentException, IllegalStateException;
  public boolean getFeature(String name);
  public void setProperty(String name, Object value)
   throws IllegalArgumentException, IllegalStateException;
  public Object getProperty(String name);

  // Output destination: a byte stream with a named encoding, or a Writer
  public void setOutput(OutputStream out, String encoding)
   throws IOException, IllegalArgumentException, IllegalStateException;
  public void setOutput(Writer out)
   throws IOException, IllegalArgumentException, IllegalStateException;

  // Document boundaries, namespace prefixes, and current-position queries
  public void startDocument(String encoding, Boolean standalone)
   throws IOException, IllegalArgumentException, IllegalStateException;
  public void endDocument()
   throws IOException, IllegalArgumentException, IllegalStateException;
  public void setPrefix(String prefix, String namespace)
   throws IOException, IllegalArgumentException, IllegalStateException;
  public String getPrefix(String namespace, boolean generatePrefix)
   throws IllegalArgumentException;
  public int getDepth();
  public String getNamespace();
  public String getName();

  // Element content. These return XmlSerializer, presumably the serializer
  // itself so that calls can be chained -- TODO confirm
  public XmlSerializer startTag(String namespace, String name)
   throws IOException, IllegalArgumentException, IllegalStateException;
  public XmlSerializer attribute(String namespace, String name, String value)
   throws IOException, IllegalArgumentException, IllegalStateException;
  public XmlSerializer endTag(String namespace, String name)
   throws IOException, IllegalArgumentException, IllegalStateException;
  public XmlSerializer text(String text)
   throws IOException, IllegalArgumentException, IllegalStateException;
  public XmlSerializer text(char [] buf, int start, int len)
   throws IOException, IllegalArgumentException, IllegalStateException;

  // The remaining constructs; each name matches a token type of the parser
  public void cdsect(String text)
   throws IOException, IllegalArgumentException, IllegalStateException;
  public void entityRef(String text)  throws IOException,
        IllegalArgumentException, IllegalStateException;
  public void processingInstruction(String text)
   throws IOException, IllegalArgumentException, IllegalStateException;
  public void comment(String text)
   throws IOException, IllegalArgumentException, IllegalStateException;
  public void docdecl(String text)
   throws IOException, IllegalArgumentException, IllegalStateException;
  public void ignorableWhitespace(String text)
   throws IOException, IllegalArgumentException, IllegalStateException;

  public void flush() throws IOException;

}

Serializer Example: Convert RDDL to XHTML


Example: RDDLStripper

import org.xmlpull.v1.*;
import java.net.*;
import java.io.*;

 
/**
 * Copies an XML document to System.out through an XmlSerializer, leaving out
 * the start-tags and end-tags of every element in the RDDL namespace.
 * Text, CDATA sections, comments, and the other token types are passed
 * through unchanged, including those that appear inside RDDL elements.
 *
 * Usage: java RDDLStripper url
 * The argument is tried as a URL first and then as a local file name.
 */
public class RDDLStripper {
    
  public final static String RDDL_NS = "http://www.rddl.org/";

  public static void main(String[] args) {
        
    if (args.length == 0) {
      System.err.println("Usage: java RDDLStripper url" );
      return;    
    }
        
    // Declared outside the try block so the finally clause can close it.
    InputStream in = null;
    try {
      XmlPullParserFactory factory = XmlPullParserFactory.newInstance();
      factory.setNamespaceAware(true);
      XmlPullParser parser = factory.newPullParser();
      XmlSerializer serializer = factory.newSerializer();
      serializer.setOutput(System.out, "ISO-8859-1");
      
      try {
        URL u = new URL(args[0]);
        in = u.openStream();
      }
      catch (MalformedURLException ex) {
          // Maybe it's a file name
          in = new FileInputStream(args[0]);
      }
      parser.setInput(in, null);
        
      while (true) {
         int event = parser.nextToken();
         if (event == XmlPullParser.START_TAG) {
             String namespaceURI = parser.getNamespace();
             // Constant-first comparison: a null namespace simply counts
             // as "not RDDL" instead of throwing NullPointerException.
             if (!RDDL_NS.equals(namespaceURI)) {
                 String prefix = parser.getPrefix();
                 if (prefix == null) prefix = "";
                 if (namespaceURI != null) {
                     serializer.setPrefix(prefix, namespaceURI);
                 }
                 serializer.startTag(namespaceURI, parser.getName());
                 // add attributes
                 for (int i = 0; i < parser.getAttributeCount(); i++) {
                     serializer.attribute(
                       parser.getAttributeNamespace(i),
                       parser.getAttributeName(i),
                       parser.getAttributeValue(i)
                     );
                     // How to define attribute prefixes????
                 }
             }
         }
         else if (event == XmlPullParser.END_TAG) {
             String namespaceURI = parser.getNamespace();
             // Must mirror the START_TAG test so tags stay balanced.
             if (!RDDL_NS.equals(namespaceURI)) {
                 serializer.endTag(namespaceURI, parser.getName());
             }
         }
         else if (event == XmlPullParser.TEXT) {
             serializer.text(parser.getText());
         }
         else if (event == XmlPullParser.CDSECT) {
             serializer.cdsect(parser.getText());
         }
         else if (event == XmlPullParser.COMMENT) {
             serializer.comment(parser.getText());
         }
         else if (event == XmlPullParser.DOCDECL) {
             serializer.docdecl(parser.getText());
         }
         else if (event == XmlPullParser.ENTITY_REF) {
             serializer.entityRef(parser.getName());
         }
         else if (event == XmlPullParser.IGNORABLE_WHITESPACE) {
             serializer.ignorableWhitespace(parser.getText());
         }
         else if (event == XmlPullParser.PROCESSING_INSTRUCTION) {
             serializer.processingInstruction(parser.getText());
         }
         else if (event == XmlPullParser.END_DOCUMENT) {
            serializer.flush();
            break;
         }
      }           
    }
    catch (XmlPullParserException ex) {
       System.out.println(ex);  
    }
    catch (IOException e) {
      System.out.println("IOException while parsing " + args[0]);   
    }
    finally {
      // Release the URL or file stream whether or not parsing succeeded.
      if (in != null) {
        try {
          in.close();
        }
        catch (IOException ignored) {
          // Nothing useful can be done if close fails on the way out.
        }
      }
    }
        
  }

}

One of my favorite features


Java Issues


XML Issues


NekoPull


XMLEvent


XMLEvent Subclasses

NekoPull Class Hierarchy diagram

Parsing Documents


Simple Wellformedness Checker

import org.apache.xerces.xni.parser.XMLInputSource;
import org.apache.xerces.xni.XNIException;
import org.cyberneko.pull.*;
import org.cyberneko.pull.event.*;
import org.cyberneko.pull.parsers.Xerces2;
import java.io.IOException;

 
/**
 * Command-line well-formedness check built on the NekoPull parser.
 * Every event of the document named by the first argument is pulled and
 * discarded; if no exception is thrown the document is reported as
 * well-formed.
 */
public class NekoChecker {

  public static void main(String[] args) {

    if (args.length == 0) {
      System.err.println("Usage: java NekoChecker url" );
      return;
    }

    final String systemId = args[0];
    try {
      XMLPullParser puller = new Xerces2();
      puller.setInputSource(new XMLInputSource(null, systemId, null));

      // Pull events until the parser signals the end by returning null.
      XMLEvent current;
      do {
        current = puller.nextEvent();
      } while (current != null);

      // Reaching this point means no exception was thrown.
      System.out.println(systemId + " is well-formed");
    }
    catch (XNIException ex) {
      System.out.println(systemId + " is not well-formed");
      System.out.println(ex);
    }
    catch (IOException ex) {
      String problem = ex.getClass().getName();
      System.out.println(systemId + " could not be checked due to an " + problem);
      ex.printStackTrace();
    }

  }

}

Listening to Events

import org.apache.xerces.xni.parser.XMLInputSource;
import org.apache.xerces.xni.XNIException;
import org.cyberneko.pull.*;
import org.cyberneko.pull.event.*;
import org.cyberneko.pull.parsers.Xerces2;
import java.io.IOException;

 
/**
 * Prints one line per NekoPull event, naming the kind of event, for the
 * document given as the first command-line argument.
 */
public class NekoLister {

  public static void main(String[] args) {
        
    if (args.length == 0) {
      System.err.println("Usage: java NekoLister url" );
      return;   
    }
        
    try {
      XMLPullParser parser = new Xerces2();
      XMLInputSource source = new XMLInputSource(null, args[0], null);
      parser.setInputSource(source);
        
      XMLEvent event;
      // nextEvent() returning null is the loop's end-of-input condition.
      while ((event = parser.nextEvent()) != null) {
        switch (event.type) {
          case XMLEvent.ELEMENT: 
            System.out.println("Element");
            break;
          case XMLEvent.DOCUMENT: 
            System.out.println("Document");
            break;
          case XMLEvent.CHARACTERS: 
            System.out.println("Characters");
            break;
          case XMLEvent.PREFIX_MAPPING: 
            System.out.println("Prefix mapping");
            break;
          case XMLEvent.GENERAL_ENTITY: 
            System.out.println("General Entity");
            break;
          case XMLEvent.COMMENT: 
            // Without this case comments fell through to "Unexpected event".
            System.out.println("Comment");
            break;
          case XMLEvent.PROCESSING_INSTRUCTION: 
            System.out.println("Processing instruction");
            break;
          case XMLEvent.CDATA: 
            System.out.println("CDATA section");
            break;
          case XMLEvent.TEXT_DECL: 
            System.out.println("Text declaration");
            break;
          case XMLEvent.DOCTYPE_DECL: 
            System.out.println("Document type declaration");
            break;
          default:
            System.out.println("Unexpected event");
        } 
      }
    }
    catch (XNIException ex) {
       System.out.println(args[0] + " is not well-formed"); 
       System.out.println(ex);  
    }
    catch (IOException ex) {
      System.out.println(args[0] + " could not be checked due to an " + ex.getClass().getName());   
      ex.printStackTrace();      
    }
        
  }

}

BoundedEvent

package org.cyberneko.pull.event;

// API synopsis (constructor body omitted): an abstract XMLEvent subclass
// that adds a single boolean flag, start.
public abstract class BoundedEvent extends XMLEvent {

    // The example programs in this document test this flag (for instance
    // "if (element.start)") before handling an event as the opening of its
    // construct. Presumably false marks the matching end event -- confirm
    // against the NekoPull documentation.
    public boolean start;

    // Protected, so only subclasses can invoke it; takes the event type as a short.
    protected BoundedEvent(short type);

} 

ElementEvent

package org.cyberneko.pull.event;

// API synopsis (constructor body omitted): the event object that the example
// programs cast to when event.type == XMLEvent.ELEMENT. Inherits the start
// flag from BoundedEvent.
public class ElementEvent extends BoundedEvent {

    // The element's name. Note the field is called "element", not "QName".
    public QName element;
    // The attributes carried by the element; NekoSpider reads XLink
    // attributes through this field.
    public XMLAttributes attributes;
    // NOTE(review): presumably true for an empty-element tag -- not
    // demonstrated anywhere in this document; confirm.
    public boolean empty;

    public ElementEvent();

} 

QName class

package org.apache.xerces.xni;

// API synopsis (method bodies omitted): a mutable holder of four public
// String fields describing a name, with copy/reset helpers and the usual
// Object overrides.
public class QName implements Cloneable {

    // NOTE(review): presumably the namespace prefix, the local part, the
    // full prefixed name, and the namespace URI respectively -- inferred
    // from the field names; confirm against the Xerces XNI documentation.
    // NekoRSSLister in this document compares localpart with "title" and
    // uri with null.
    public String prefix;
    public String localpart;
    public String rawname;
    public String uri;

    // Constructors: no-argument, all four fields, and copy from another QName.
    public QName();
    public QName(String prefix, String localpart, String rawname, String uri);
    public QName(QName qname);
    
    // Setters mirroring the two non-trivial constructors, plus clear().
    public void setValues(QName qname);
    public void setValues(String prefix, String localpart, String rawname, String uri);
    public void clear();
    
    // Overrides of the java.lang.Object methods.
    public Object  clone();
    public int     hashCode();
    public boolean equals(Object object);
    public String  toString();

}

CharactersEvent

package org.cyberneko.pull.event;

// API synopsis (constructor body omitted): the event object that
// NekoRSSLister casts to when event.type == XMLEvent.CHARACTERS.
public class CharactersEvent extends XMLEvent {

    // The character content; NekoRSSLister prints this field directly.
    public XMLString text;
    // NOTE(review): presumably flags ignorable white space -- not
    // demonstrated in this document; confirm.
    public boolean ignorable;

    public CharactersEvent();

}

NekoRSSLister

import org.apache.xerces.xni.parser.XMLInputSource;
import org.apache.xerces.xni.XNIException;
import org.cyberneko.pull.*;
import org.cyberneko.pull.event.*;
import org.cyberneko.pull.parsers.Xerces2;
import java.io.IOException;

 
public class NekoRSSLister {

  public static void main(String[] args) {
        
    if (args.length == 0) {
      System.err.println("Usage: java NekoRSSLister url");
      return;   
    }
        
    try {
      XMLPullParser parser = new Xerces2();
      XMLInputSource source = new XMLInputSource(null, args[0], null);
      parser.setInputSource(source);
        
      XMLEvent event;
      boolean inTitle = false
      while ((event = parser.nextEvent()) != null) {
        switch (event.type) {
          case XMLEvent.ELEMENT: 
            ElementEvent element = (ElementEvent) event;
            String name = element.QName.localpart;
            if (name.equals("title") && element.QName.uri == null) {
                if (element.start) inTitle = true;
                else inTitle = false;
            }
            break;
          case XMLEvent.CHARACTERS: 
            if (inTitle) {
              CharactersEvent text = (CharactersEvent) event;
              System.out.println(text.text);
            }
            break;
          case XMLEvent.CDATA: 
            if (inTitle) {
              CDATAEvent text = (CDATAEvent) event;
              System.out.println(text.text);
            }
            break;
          default:
            // do nothing
        } 
      }
    }
    catch (XNIException ex) {
       System.out.println(args[0] + " is not well-formed"); 
       System.out.println(ex);  
    }
    catch (IOException ex) {
      System.out.println(args[0] + " could not be checked due to an " 
       + ex.getClass().getName());  
      ex.printStackTrace();      
    }
        
  }

}

Attributes

package org.apache.xerces.xni;

// Interface for an indexed collection of attributes. Entries are addressed
// by int index, by a single qualified-name string, or by a (uri, local name)
// pair. NekoSpider in this document uses getValue(uri, localName) and treats
// a null result as "attribute not present".
public interface XMLAttributes {

  // Size, index lookup, and name access.
  public int     getLength();
  public int     getIndex(String qualifiedName);
  public int     getIndex(String uri, String localPart);
  public void    setName(int index, QName name);
  // Returns void and takes a QName argument; presumably the name is
  // written into the supplied object -- confirm.
  public void    getName(int index, QName name);
  public String  getPrefix(int index);
  public String  getURI(int index);
  public String  getLocalName(int index);
  public String  getQName(int index);
  
  // Attribute values, including a separate non-normalized form.
  public void    setValue(int index, String value);
  public String  getValue(int index);
  public String  getValue(String qualifiedName);
  public String  getValue(String uri, String localName);
  public void    setNonNormalizedValue(int index, String value);
  public String  getNonNormalizedValue(int index); 
  
  // Attribute type (passed as a String) and the "specified" flag.
  public void    setType(int index, String type);
  public String  getType(int index);
  public String  getType(String qualifiedName);
  public String  getType(String uri, String localName);
  public void    setSpecified(int index, boolean specified);
  public boolean isSpecified(int index);
  
  // Adding and removing entries.
  // NOTE(review): the int returned by addAttribute is presumably the new
  // attribute's index -- confirm.
  public int  addAttribute(QName name, String type, String value);
  public void removeAllAttributes();
  public void removeAttributeAt(int index);  
  
  // Augmentations lookup, by the same three addressing schemes.
  public Augmentations getAugmentations (int attributeIndex);
  public Augmentations getAugmentations (String uri, String localPart);
  public Augmentations getAugmentations(String qualifiedName);

}

NekoSpider

import org.apache.xerces.xni.*;
import org.apache.xerces.xni.parser.XMLInputSource;
import org.cyberneko.pull.*;
import org.cyberneko.pull.event.*;
import org.cyberneko.pull.parsers.Xerces2;
import java.net.*;
import java.io.*;
import java.util.*;

/**
 * A simple XLink spider. Starting from one URL, it parses each document
 * with NekoPull, queues the target of every element that carries both an
 * xlink:type and an xlink:href attribute, and then visits the queued
 * documents in first-in/first-out order. Each distinct URL is parsed at
 * most once per NekoSpider instance.
 */
public class NekoSpider {

  private static final String XLINK_NS = "http://www.w3.org/1999/xlink";

  // Need to keep track of where we've been 
  // so we don't get stuck in an infinite loop.
  // The external (string) form is stored rather than the URL object so
  // that membership tests do not go through URL.equals()/hashCode().
  private Set spideredURIs = new HashSet();

  // This linked list keeps track of where we're going.
  // Although the LinkedList class does not guarantee queue like
  // access, I always access it in a first-in/first-out fashion.
  private LinkedList queue = new LinkedList();
  
  // The document currently being parsed; relative hrefs resolve against it.
  private URL currentURL;
  private XMLPullParser parser;
  
  public NekoSpider() {
      this.parser = new Xerces2();
  }

  /**
   * Queues the link target of an element that has both xlink:type and
   * xlink:href, unless that target has already been visited.
   */
  private void processStartTag(ElementEvent element) {
    
    XMLAttributes attributes = element.attributes;
    String type = attributes.getValue(XLINK_NS, "type");
    if (type == null) {
      return;
    }
    String href = attributes.getValue(XLINK_NS, "href");
    if (href == null) {
      return;
    }
    try {
      URL foundURL = new URL(currentURL, href);
      if (!spideredURIs.contains(foundURL.toExternalForm())) {
        queue.addFirst(foundURL);
      }
    }
    catch (MalformedURLException ex) {
      // skip it   
    }
  }
  
  /**
   * Spiders the given document and everything reachable from it.
   * The work list is drained iteratively, so the call depth does not grow
   * with the number of documents visited.
   */
  public void spider(URL uri) {
      
    queue.addFirst(uri);
    while (!queue.isEmpty()) {
      URL nextURL = (URL) queue.removeLast();
      // add() returns false for a URL that was queued more than once
      // and has already been handled.
      if (spideredURIs.add(nextURL.toExternalForm())) {
        spiderDocument(nextURL);
      }
    }
    
  }

  /**
   * Parses one document and queues the links found in it. A failure is
   * reported on System.err and the document is skipped.
   */
  private void spiderDocument(URL uri) {
      
    System.out.println("Spidering " + uri);
    // Without this assignment every relative href would be resolved
    // against a null base and dropped as malformed.
    currentURL = uri;
    try {
      XMLInputSource source 
       = new XMLInputSource(null, uri.toExternalForm(), null);
      parser.setInputSource(source);
      
      XMLEvent event;
      while ((event = parser.nextEvent()) != null) {
         if (event.type == XMLEvent.ELEMENT) {
             ElementEvent element = (ElementEvent) event;
             if (element.start) processStartTag(element);
         }
      }  // end while
    }
    catch (Exception ex) {
       // skip this document, but say so
       System.err.println("Skipping " + uri + ": " + ex);
    }
    
  }

  public static void main(String[] args) throws Exception {
        
    if (args.length == 0) {
      System.err.println("Usage: java NekoSpider url" );
       return;  
    }
        
    NekoSpider spider = new NekoSpider();
    spider.spider(new URL(args[0]));
        
  } // end main

} // end NekoSpider


DocumentEvent

package org.cyberneko.pull.event;

// API synopsis (constructor body omitted): a BoundedEvent subclass, so it
// inherits the start flag, adding a locator and an encoding string.
// NOTE(review): presumably delivered for XMLEvent.DOCUMENT events -- the
// cast is not shown in this document; confirm.
public class DocumentEvent extends BoundedEvent {

    public XMLLocator locator;
    public String encoding;

    public DocumentEvent();

}

ProcessingInstructionEvent

package org.cyberneko.pull.event;

// API synopsis (constructor body omitted): the event object that
// NekoPILister casts to when event.type == XMLEvent.PROCESSING_INSTRUCTION.
// It extends XMLEvent directly, so it has no start flag.
public class ProcessingInstructionEvent extends XMLEvent {

    // NekoPILister prints these two fields as "Target" and "Data".
    public String target;
    public XMLString data;

    public ProcessingInstructionEvent();

}

NekoPILister

import org.apache.xerces.xni.parser.XMLInputSource;
import org.apache.xerces.xni.XNIException;
import org.cyberneko.pull.*;
import org.cyberneko.pull.event.*;
import org.cyberneko.pull.parsers.Xerces2;
import java.io.IOException;

 
/**
 * Lists the target and data of each processing instruction in the
 * document named by the first command-line argument.
 */
public class NekoPILister {

  public static void main(String[] args) {

    if (args.length == 0) {
      System.err.println("Usage: java NekoPILister url" );
      return;
    }

    final String systemId = args[0];
    try {
      XMLPullParser puller = new Xerces2();
      puller.setInputSource(new XMLInputSource(null, systemId, null));

      // A null event ends the loop.
      for (XMLEvent next = puller.nextEvent(); next != null; next = puller.nextEvent()) {
        if (next.type != XMLEvent.PROCESSING_INSTRUCTION) {
          continue;
        }
        ProcessingInstructionEvent pi = (ProcessingInstructionEvent) next;
        System.out.println("Target: " + pi.target);
        System.out.println("Data:   " + pi.data);
        System.out.println();
      }
    }
    catch (XNIException ex) {
      System.out.println(systemId + " is not well-formed");
      System.out.println(ex);
    }
    catch (IOException ex) {
      String problem = ex.getClass().getName();
      System.out.println(systemId + " could not be checked due to an " + problem);
      ex.printStackTrace();
    }

  }

}

CommentEvent

package org.cyberneko.pull.event;

// API synopsis (constructor body omitted): the event object that
// NekoCommentReader casts to when event.type == XMLEvent.COMMENT.
public class CommentEvent extends XMLEvent {

    // The comment's content; NekoCommentReader prints this field directly.
    public XMLString text;

    public CommentEvent();

} // class CommentEvent

NekoCommentReader

import org.apache.xerces.xni.parser.XMLInputSource;
import org.apache.xerces.xni.XNIException;
import org.cyberneko.pull.*;
import org.cyberneko.pull.event.*;
import org.cyberneko.pull.parsers.Xerces2;
import java.io.IOException;

 
/**
 * Prints the text of each comment found in the document named by the
 * first command-line argument.
 */
public class NekoCommentReader {

  public static void main(String[] args) {

    if (args.length == 0) {
      System.err.println("Usage: java NekoCommentReader url" );
      return;
    }

    final String systemId = args[0];
    try {
      XMLPullParser puller = new Xerces2();
      puller.setInputSource(new XMLInputSource(null, systemId, null));

      // A null event ends the loop.
      for (XMLEvent next = puller.nextEvent(); next != null; next = puller.nextEvent()) {
        if (next.type != XMLEvent.COMMENT) {
          continue;
        }
        CommentEvent found = (CommentEvent) next;
        System.out.println(found.text);
      }
    }
    catch (XNIException ex) {
      System.out.println(systemId + " is not well-formed");
      System.out.println(ex);
    }
    catch (IOException ex) {
      String problem = ex.getClass().getName();
      System.out.println(systemId + " could not be checked due to an " + problem);
      ex.printStackTrace();
    }

  }

}

TextDeclEvent

package org.cyberneko.pull.event;

// API synopsis (constructor body omitted): an XMLEvent subclass carrying
// one boolean and three String fields.
// NOTE(review): presumably delivered for XMLEvent.TEXT_DECL events, with
// xmldecl distinguishing an XML declaration from a text declaration --
// neither is demonstrated in this document; confirm.
public class TextDeclEvent extends XMLEvent {

    public boolean xmldecl;
    public String  version;
    public String  encoding;
    // Declared as a String, not a boolean.
    public String  standalone;

    public TextDeclEvent();

}

PrefixMappingEvent

package org.cyberneko.pull.event;

// API synopsis (constructor body omitted): the event object that
// PrefixLister casts to when event.type == XMLEvent.PREFIX_MAPPING.
// As a BoundedEvent subclass it also inherits the start flag.
public class PrefixMappingEvent extends BoundedEvent {

    // PrefixLister prints these two fields as "Prefix" and "URI".
    public String prefix;
    public String uri;

    public PrefixMappingEvent();

} 

PrefixLister

import org.apache.xerces.xni.parser.XMLInputSource;
import org.apache.xerces.xni.XNIException;
import org.cyberneko.pull.*;
import org.cyberneko.pull.event.*;
import org.cyberneko.pull.parsers.Xerces2;
import java.io.IOException;

 
/**
 * Lists each namespace prefix mapping declared in the document named by
 * the first command-line argument, printing the prefix and the URI.
 */
public class PrefixLister {

  public static void main(String[] args) {
        
    if (args.length == 0) {
      System.err.println("Usage: java PrefixLister url" );
      return;   
    }
        
    try {
      XMLPullParser parser = new Xerces2();
      XMLInputSource source = new XMLInputSource(null, args[0], null);
      parser.setInputSource(source);
        
      XMLEvent event;
      while ((event = parser.nextEvent()) != null) {
        if (event.type == XMLEvent.PREFIX_MAPPING) { 
            PrefixMappingEvent mapping = (PrefixMappingEvent) event;
            // PrefixMappingEvent is a BoundedEvent, so a mapping is
            // reported by more than one event. Print it only when it
            // starts, matching the entity.start guard in EntityLister.
            if (mapping.start) {
              System.out.println("Prefix: " + mapping.prefix);
              System.out.println("URI:    " + mapping.uri);
              System.out.println();
            }
        }
      }
    }
    catch (XNIException ex) {
       System.out.println(args[0] + " is not well-formed"); 
       System.out.println(ex);  
    }
    catch (IOException ex) {
      System.out.println(args[0] + " could not be checked due to an " 
       + ex.getClass().getName());  
      ex.printStackTrace();      
    }
        
  }

}

GeneralEntityEvent

package org.cyberneko.pull.event;

// API synopsis (constructor body omitted): the event object that
// EntityLister casts to when event.type == XMLEvent.GENERAL_ENTITY.
// As a BoundedEvent subclass it inherits the start flag, which EntityLister
// checks before printing.
public class GeneralEntityEvent extends BoundedEvent {

  // EntityLister prints these fields labelled, in order: Name, Public ID,
  // Base System ID, Literal System ID, Expanded System ID, Encoding.
  public String name;
  public String pubid;
  public String basesysid;
  public String literalsysid;
  public String expandedsysid;
  public String encoding;

  public GeneralEntityEvent();

}

EntityLister

import org.apache.xerces.xni.parser.XMLInputSource;
import org.apache.xerces.xni.XNIException;
import org.cyberneko.pull.*;
import org.cyberneko.pull.event.*;
import org.cyberneko.pull.parsers.Xerces2;
import java.io.IOException;

 
/**
 * Prints the name, identifiers, and encoding of each general entity whose
 * start event appears in the document named by the first command-line
 * argument.
 */
public class EntityLister {

  public static void main(String[] args) {

    if (args.length == 0) {
      System.err.println("Usage: java EntityLister url" );
      return;
    }

    final String systemId = args[0];
    try {
      XMLPullParser puller = new Xerces2();
      puller.setInputSource(new XMLInputSource(null, systemId, null));

      // A null event ends the loop.
      for (XMLEvent next = puller.nextEvent(); next != null; next = puller.nextEvent()) {
        if (next.type != XMLEvent.GENERAL_ENTITY) {
          continue;
        }
        GeneralEntityEvent ent = (GeneralEntityEvent) next;
        // Only the opening event of each entity is reported.
        if (!ent.start) {
          continue;
        }
        System.out.println("Name:               " + ent.name);
        System.out.println("Public ID:          " + ent.pubid);
        System.out.println("Base System ID:     " + ent.basesysid);
        System.out.println("Literal System ID:  " + ent.literalsysid);
        System.out.println("Expanded System ID: " + ent.expandedsysid);
        System.out.println("Encoding:           " + ent.encoding);
        System.out.println();
      }
    }
    catch (XNIException ex) {
      System.out.println(systemId + " is not well-formed");
      System.out.println(ex);
    }
    catch (IOException ex) {
      String problem = ex.getClass().getName();
      System.out.println(systemId + " could not be checked due to an " + problem);
      ex.printStackTrace();
    }

  }

}

XMLPullParser

package org.cyberneko.pull;

// The parser interface used by every NekoPull example in this document,
// which obtain an instance with "new Xerces2()". It extends
// XMLEventIterator and XMLComponentManager.
public interface XMLPullParser 
  extends XMLEventIterator, XMLComponentManager {

    // Selects the document to parse; the examples call this once before
    // the first nextEvent().
    public void setInputSource(XMLInputSource inputSource)
      throws XMLConfigurationException, IOException;
    // NOTE(review): not used by any example here; presumably releases
    // parser resources -- confirm.
    public void cleanup();
    
    // Accessor pairs for the error handler, entity resolver, and locale.
    public void setErrorHandler(XMLErrorHandler errorHandler);
    public XMLErrorHandler getErrorHandler();

    public void setEntityResolver(XMLEntityResolver entityResolver);
    public XMLEntityResolver getEntityResolver();

    public void setLocale(Locale locale) throws XNIException;
    public Locale getLocale();

    // Features are boolean and properties are Objects, both keyed by a
    // String identifier.
    public boolean getFeature(String featureId)
      throws XMLConfigurationException;
    public void setFeature(String featureId, boolean state)
      throws XMLConfigurationException;
    public void setProperty(String propertyId, Object value)
      throws XMLConfigurationException;
    public Object getProperty(String propertyId)
      throws XMLConfigurationException;

    // Returns the next event. The examples loop until this returns null
    // and treat an XNIException as "not well-formed".
    public XMLEvent nextEvent() throws XNIException, IOException;
    
}

StAX


To Learn More


Index | Cafe con Leche

Copyright 2000-2002 Elliotte Rusty Harold
elharo@metalab.unc.edu
Last Modified November 21, 2002