StAX


StAX

Elliotte Rusty Harold

Software Development 2004 West

Thursday, March 17, 2004

elharo@metalab.unc.edu

http://www.cafeconleche.org/


XML API Styles


Pull Parsing

pull parsing is the way to go in the future. The first 3 XML parsers (Lark, NXP, and expat) all were event-driven because... er well that was 1996, can't exactly remember, seemed like a good idea at the time.

--Tim Bray on the xml-dev mailing list, Wednesday, September 18, 2002


Pull Parsing is


Pull APIs


StAX


Major Classes and Interfaces

XMLStreamReader:
an interface that represents the parser
XMLInputFactory:
the factory class that instantiates a vendor-specific implementation of XMLStreamReader
XMLStreamException:
the generic class for everything other than an IOException that might go wrong when parsing an XML document, particularly well-formedness errors

Simple Wellformedness Checker

import javax.xml.stream.*;
import java.net.*;
import java.io.*;

 
/**
 * Command-line well-formedness checker built on the StAX cursor API.
 * The document named on the command line is pulled through an
 * XMLStreamReader from start to finish; if the parser never throws an
 * XMLStreamException the document is reported as well-formed.
 */
public class StAXChecker {

  /**
   * Checks one document and prints the verdict on System.out.
   *
   * @param args args[0] is a URL or, if it is not a valid URL, a file name
   */
  public static void main(String[] args) {

    if (args.length == 0) {
      System.err.println("Usage: java StAXChecker url" );
      return;
    }

    InputStream in = null;
    XMLStreamReader parser = null;
    try {

      try {
        URL u = new URL(args[0]);
        in = u.openStream();
      }
      catch (MalformedURLException ex) {
        // Maybe it's a file name
        in = new FileInputStream(args[0]);
      }

      XMLInputFactory factory = XMLInputFactory.newInstance();
      parser = factory.createXMLStreamReader(in);

      // next() throws XMLStreamException at the first well-formedness
      // error, so simply draining the event stream is the whole check.
      while (parser.next() != XMLStreamConstants.END_DOCUMENT) {
        // nothing to do with the individual events
      }

      // If we get here there are no exceptions
      System.out.println(args[0] + " is well-formed");
    }
    catch (XMLStreamException ex) {
      System.out.println(args[0] + " is not well-formed");
      System.out.println(ex);
    }
    catch (IOException ex) {
      System.out.println(args[0] + " could not be checked due to an "
       + ex.getClass().getName());
      ex.printStackTrace();
    }
    finally {
      closeQuietly(parser, in);
    }

  }

  /**
   * Releases the reader and the stream exactly once. Closing the reader
   * does not close the underlying stream, so both must be closed.
   */
  private static void closeQuietly(XMLStreamReader parser, InputStream in) {
    if (parser != null) {
      try {
        parser.close();
      }
      catch (XMLStreamException ignored) {
        // the verdict has already been printed; nothing useful to add
      }
    }
    if (in != null) {
      try {
        in.close();
      }
      catch (IOException ignored) {
        // the verdict has already been printed; nothing useful to add
      }
    }
  }

}

Output from a Simple Wellformedness Checker

$ java -classpath stax.jar:.:bea.jar StAXChecker http://www.cafeconleche.org/
http://www.cafeconleche.org/ is well-formed
$ java -classpath stax.jar:.:bea.jar StAXChecker http://www.xml.com/
http://www.xml.com/ is not well-formed
javax.xml.stream.XMLStreamException: ParseError at [row,col]:[44,7]
Message: could not resolve entity named 'nbsp'


Event Codes


Listening to Events

import javax.xml.stream.*;
import java.net.*;
import java.io.*;

 
/**
 * Prints one line per parse event, naming the kind of event, for the
 * document given on the command line.
 */
public class EventLister {

  /**
   * Lists the events of one document on System.out.
   *
   * @param args args[0] is a URL or, if it is not a valid URL, a file name
   */
  public static void main(String[] args) {

    if (args.length == 0) {
      System.err.println("Usage: java EventLister url" );
      return;
    }

    InputStream in = null;
    XMLStreamReader parser = null;
    try {

      try {
        URL u = new URL(args[0]);
        in = u.openStream();
      }
      catch (MalformedURLException ex) {
        // Maybe it's a file name
        in = new FileInputStream(args[0]);
      }

      XMLInputFactory factory = XMLInputFactory.newInstance();
      parser = factory.createXMLStreamReader(in);

      int event;
      do {
        event = parser.next();
        String label = describe(event);
        if (label != null) {
          System.out.println(label);
        }
      } while (event != XMLStreamConstants.END_DOCUMENT);
    }
    catch (XMLStreamException ex) {
      System.out.println(ex);
    }
    catch (IOException ex) {
      System.out.println("IOException while parsing " + args[0]);
    }
    finally {
      // Closing the reader does not close the stream, so close both.
      if (parser != null) {
        try {
          parser.close();
        }
        catch (XMLStreamException ignored) {
          // nothing left to report
        }
      }
      if (in != null) {
        try {
          in.close();
        }
        catch (IOException ignored) {
          // nothing left to report
        }
      }
    }

  }

  /**
   * Maps an event code to the label printed for it.
   * START_ENTITY and END_ENTITY are deliberately absent: they are not
   * constants of the final javax.xml.stream API.
   *
   * @param event one of the XMLStreamConstants event codes
   * @return the label, or null for event codes that are not listed
   */
  private static String describe(int event) {
    switch (event) {
      case XMLStreamConstants.START_ELEMENT:
        return "Start tag";
      case XMLStreamConstants.END_ELEMENT:
        return "End tag";
      case XMLStreamConstants.START_DOCUMENT:
        return "Start document";
      case XMLStreamConstants.CHARACTERS:
        return "Text";
      case XMLStreamConstants.CDATA:
        return "CDATA Section";
      case XMLStreamConstants.COMMENT:
        return "Comment";
      case XMLStreamConstants.DTD:
        return "Document type declaration";
      case XMLStreamConstants.ENTITY_REFERENCE:
        return "Entity Reference";
      case XMLStreamConstants.SPACE:
        return "Ignorable white space";
      case XMLStreamConstants.NOTATION_DECLARATION:
        return "Notation Declaration";
      case XMLStreamConstants.ENTITY_DECLARATION:
        return "Entity Declaration";
      case XMLStreamConstants.PROCESSING_INSTRUCTION:
        return "Processing Instruction";
      case XMLStreamConstants.END_DOCUMENT:
        return "End Document";
      default:
        return null;
    }
  }

}

Output from EventLister

% java -classpath stax.jar:.:bea.jar EventLister hotcop.xml
Ignorable white space
Start tag
Text
Start tag
Text
End tag
Text
Start tag
Text
End tag
Text
Start tag
Text
End tag
Text
Start tag
Text
End tag
Text
Start tag
Text
End tag
Text
Start tag
Text
End tag
Text
Start tag
Text
End tag
Text
Start tag
Text
End tag
Text
Start tag
Text
End tag
Text
End tag
Ignorable white space
End Document

XMLStreamReader getter methods depend on the current state

Invokable methods
Event Type Valid Methods
START_ELEMENT next(), getName(), getLocalName(), hasName(), getPrefix(), getAttributeCount(), getAttributeName(int index), getAttributeNamespace(int index), getAttributePrefix(int index), getAttributeQName(int index), getAttributeType(int index), getAttributeValue(int index), getAttributeValue(String namespaceURI, String localName), isAttributeSpecified(), getNamespaceContext(), getNamespaceCount(), getNamespacePrefix(int index), getNamespaceURI(), getNamespaceURI(int index), getNamespaceURI(String prefix), getElementText(), nextTag()
ATTRIBUTE next(), nextTag(), getAttributeCount(), getAttributeName(int index), getAttributeNamespace(int index), getAttributePrefix(int index), getAttributeQName(int index), getAttributeType(int index), getAttributeValue(int index), getAttributeValue(String namespaceURI, String localName), isAttributeSpecified()
NAMESPACE next(), nextTag(), getNamespaceContext(), getNamespaceCount(), getNamespacePrefix(int index), getNamespaceURI(), getNamespaceURI(int index), getNamespaceURI(String prefix)
END_ELEMENT next(), getName(), getLocalName(), hasName(), getPrefix(), getNamespaceContext(), getNamespaceCount(), getNamespacePrefix(int index), getNamespaceURI(), getNamespaceURI(int index), getNamespaceURI(String prefix), nextTag()
CHARACTERS next(), getText(), getTextCharacters(), getTextCharacters(int sourceStart, char[] target, int targetStart, int length), getTextLength(), nextTag()
CDATA next(), getText(), getTextCharacters(), getTextCharacters(int sourceStart, char[] target, int targetStart, int length), getTextLength(), nextTag()
COMMENT next(), getText(), getTextCharacters(), getTextCharacters(int sourceStart, char[] target, int targetStart, int length), getTextLength(), nextTag()
SPACE next(), getText(), getTextCharacters(), getTextCharacters(int sourceStart, char[] target, int targetStart, int length), getTextLength(), nextTag()
START_DOCUMENT next(), getEncoding(), getPrefix(), getVersion(), isStandalone(), standaloneSet(), getCharacterEncodingScheme(), nextTag()
END_DOCUMENT close()
PROCESSING_INSTRUCTION next(), getPITarget(), getPIData(), nextTag()
ENTITY_REFERENCE next(), getLocalName(), getText(), nextTag()
DTD next(), getText(), nextTag()

getText()


getText() Example

import javax.xml.stream.*;
import java.net.*;
import java.io.*;

 
/**
 * Prints the text content of a document: character data, ignorable white
 * space and CDATA sections verbatim, and comments wrapped in their
 * original delimiters.
 */
public class EventText {

  /**
   * Prints the text of one document on System.out.
   *
   * @param args args[0] is a URL or, if it is not a valid URL, a file name
   */
  public static void main(String[] args) {

    if (args.length == 0) {
      System.err.println("Usage: java EventText url" );
      return;
    }

    InputStream in = null;
    XMLStreamReader parser = null;
    try {

      try {
        URL u = new URL(args[0]);
        in = u.openStream();
      }
      catch (MalformedURLException ex) {
        // Maybe it's a file name
        in = new FileInputStream(args[0]);
      }

      XMLInputFactory factory = XMLInputFactory.newInstance();
      parser = factory.createXMLStreamReader(in);

      for (int event = parser.next();
           event != XMLStreamConstants.END_DOCUMENT;
           event = parser.next()) {
        if (event == XMLStreamConstants.CHARACTERS
          || event == XMLStreamConstants.SPACE
          || event == XMLStreamConstants.CDATA) {
          System.out.println(parser.getText());
        }
        else if (event == XMLStreamConstants.COMMENT) {
          // getText() is everything between the delimiters, including any
          // padding the author typed, so add the bare delimiters only.
          System.out.println("<!--" + parser.getText() + "-->");
        }
      }
    }
    catch (XMLStreamException ex) {
      System.out.println(ex);
    }
    catch (IOException ex) {
      System.out.println("IOException while parsing " + args[0]);
    }
    finally {
      // Closing the reader does not close the stream, so close both.
      if (parser != null) {
        try {
          parser.close();
        }
        catch (XMLStreamException ignored) {
          // nothing left to report
        }
      }
      if (in != null) {
        try {
          in.close();
        }
        catch (IOException ignored) {
          // nothing left to report
        }
      }
    }

  }

}

Output

[elharo@stallion examples]$ java -classpath stax.jar:.:bea.jar EventText hotcop.xml




Hot Cop


Jacques Morali


Henri Belolo


Victor Willis


Jacques Morali


PolyGram Records


6:20


1978


Village People


isFoo() and hasFoo()

Rather than testing for type, it's sometimes useful to ask if the current event can be queried for a certain characteristic:


hasText() Example

import javax.xml.stream.*;
import java.net.*;
import java.io.*;

 
/**
 * Prints the text of every event that reports having text, using
 * hasText() instead of comparing event codes.
 */
public class SimplerEventText {

  /**
   * Prints the text-bearing events of one document on System.out.
   *
   * @param args args[0] is a URL or, if it is not a valid URL, a file name
   */
  public static void main(String[] args) {

    if (args.length < 1) {
      System.err.println("Usage: java SimplerEventText url" );
      return;
    }

    try {
      InputStream source = openInput(args[0]);
      XMLStreamReader reader
       = XMLInputFactory.newInstance().createXMLStreamReader(source);

      // Advance first, then test hasNext(): the loop stops as soon as
      // the reader has moved onto the final event.
      reader.next();
      while (reader.hasNext()) {
        if (reader.hasText()) {
          System.out.println(reader.getText());
        }
        reader.next();
      }
    }
    catch (XMLStreamException ex) {
      System.out.println(ex);
    }
    catch (IOException ex) {
      System.out.println("IOException while parsing " + args[0]);
    }

  }

  /** Opens the argument as a URL, falling back to a local file name. */
  private static InputStream openInput(String location) throws IOException {
    URL u;
    try {
      u = new URL(location);
    }
    catch (MalformedURLException ex) {
      // Maybe it's a file name
      return new FileInputStream(location);
    }
    return u.openStream();
  }

}

More efficient way of getting text

import javax.xml.stream.*;
import java.net.*;
import java.io.*;

 
/**
 * Copies a document's character data to System.out straight from the
 * parser's internal character buffer, avoiding one String per event.
 */
public class EfficientEventText {

  /**
   * Writes the character data of one document to System.out.
   *
   * @param args args[0] is a URL or, if it is not a valid URL, a file name
   */
  public static void main(String[] args) {

    if (args.length == 0) {
      System.err.println("Usage: java EfficientEventText url" );
      return;
    }

    InputStream in = null;
    XMLStreamReader parser = null;
    try {

      try {
        URL u = new URL(args[0]);
        in = u.openStream();
      }
      catch (MalformedURLException ex) {
        // Maybe it's a file name
        in = new FileInputStream(args[0]);
      }

      XMLInputFactory factory = XMLInputFactory.newInstance();
      parser = factory.createXMLStreamReader(in);

      Writer out = new OutputStreamWriter(System.out);
      try {
        for (int event = parser.next();
             event != XMLStreamConstants.END_DOCUMENT;
             event = parser.next()) {
          if (event == XMLStreamConstants.CHARACTERS
            || event == XMLStreamConstants.SPACE
            || event == XMLStreamConstants.CDATA) {
            out.write(parser.getTextCharacters(),
             parser.getTextStart(), parser.getTextLength());
          }
        }
      }
      finally {
        // Flush even if parsing failed part way, so text already read is
        // not lost. Do NOT close the writer: that would close System.out
        // itself and silence anything printed afterwards.
        out.flush();
      }
    }
    catch (XMLStreamException ex) {
      System.out.println(ex);
    }
    catch (IOException ex) {
      System.out.println("IOException while parsing " + args[0]);
    }
    finally {
      // Closing the reader does not close the stream, so close both.
      if (parser != null) {
        try {
          parser.close();
        }
        catch (XMLStreamException ignored) {
          // nothing left to report
        }
      }
      if (in != null) {
        try {
          in.close();
        }
        catch (IOException ignored) {
          // nothing left to report
        }
      }
    }

  }

}

Reusable Text Arrays

public int getTextCharacters(int sourceStart, char[] target, int targetStart, int length)
  throws XMLStreamException, IndexOutOfBoundsException, 
         UnsupportedOperationException, IllegalStateException

Names

If the event is START_ELEMENT or END_ELEMENT, then the following methods in XMLStreamReader also work:

public String getLocalName()
public String getPrefix()
public QName getName()

QName Class

import javax.xml.namespace.*;

// API summary of the QName class as shown on the slide: signatures only,
// method bodies are omitted, so this listing is not compilable as written.
public class QName {

    // Three constructors, each adding one more component:
    // local part only; namespace URI + local part;
    // namespace URI + local part + prefix.
    public QName(String localPart);
    public QName(String namespaceURI, String localPart);
    public QName(String namespaceURI, String localPart, String prefix);
    
    // Accessors for the three components supplied to the constructors.
    public String getLocalPart();
    public String getPrefix();
    public String getNamespaceURI();
    
    // Static factory that builds a QName from a single string.
    // NOTE(review): the expected string format is not shown here; confirm
    // against the javax.xml.namespace.QName documentation.
    public static QName valueOf(String qNameAsString);

    // Standard java.lang.Object overrides.
    public int     hashCode();
    public boolean equals(Object object);
    public String  toString();

}

Names Example

import javax.xml.stream.*;
import java.net.*;
import java.io.*;

 
/**
 * Lists every parse event of a document and, for the events that carry a
 * name, prints the local name, prefix and namespace URI.
 */
public class NamePrinter {

  /**
   * Lists the events and names of one document on System.out.
   *
   * @param args args[0] is a URL or, if it is not a valid URL, a file name
   */
  public static void main(String[] args) {

    if (args.length == 0) {
      System.err.println("Usage: java NamePrinter url" );
      return;
    }

    InputStream in = null;
    XMLStreamReader parser = null;
    try {

      try {
        URL u = new URL(args[0]);
        in = u.openStream();
      }
      catch (MalformedURLException ex) {
        // Maybe it's a file name
        in = new FileInputStream(args[0]);
      }

      XMLInputFactory factory = XMLInputFactory.newInstance();
      parser = factory.createXMLStreamReader(in);

      int event;
      do {
        event = parser.next();
        String label = describe(event);
        if (label != null) {
          System.out.println(label);
          // The document boundaries are announced without any details
          if (event != XMLStreamConstants.START_DOCUMENT
           && event != XMLStreamConstants.END_DOCUMENT) {
            printEvent(parser);
          }
        }
      } while (event != XMLStreamConstants.END_DOCUMENT);
    }
    catch (XMLStreamException ex) {
      System.out.println(ex);
    }
    catch (IOException ex) {
      System.out.println("IOException while parsing " + args[0]);
      ex.printStackTrace();
    }
    finally {
      // Closing the reader does not close the stream, so close both.
      if (parser != null) {
        try {
          parser.close();
        }
        catch (XMLStreamException ignored) {
          // nothing left to report
        }
      }
      if (in != null) {
        try {
          in.close();
        }
        catch (IOException ignored) {
          // nothing left to report
        }
      }
    }

  }

  /**
   * Maps an event code to the heading printed for it.
   *
   * @param event one of the XMLStreamConstants event codes
   * @return the heading, or null for event codes that are not listed
   */
  private static String describe(int event) {
    switch (event) {
      case XMLStreamConstants.START_ELEMENT:
        return "Start tag: ";
      case XMLStreamConstants.END_ELEMENT:
        return "End tag";
      case XMLStreamConstants.START_DOCUMENT:
        return "Start document";
      case XMLStreamConstants.CHARACTERS:
        return "Text";
      case XMLStreamConstants.CDATA:
        return "CDATA Section";
      case XMLStreamConstants.COMMENT:
        return "Comment";
      case XMLStreamConstants.DTD:
        return "Document type declaration";
      case XMLStreamConstants.ENTITY_REFERENCE:
        return "Entity Reference";
      case XMLStreamConstants.SPACE:
        return "Ignorable white space";
      case XMLStreamConstants.PROCESSING_INSTRUCTION:
        return "Processing Instruction";
      case XMLStreamConstants.END_DOCUMENT:
        return "End Document";
      default:
        return null;
    }
  }

  /**
   * Prints whatever name information the current event has, followed by
   * a blank line. The name getters are only called in states where they
   * are valid: getLocalName() throws IllegalStateException on text,
   * comment and similar events, so those print the blank line only.
   */
  private static void printEvent(XMLStreamReader parser) {
      int type = parser.getEventType();
      boolean isElement = type == XMLStreamConstants.START_ELEMENT
       || type == XMLStreamConstants.END_ELEMENT;

      String localName = null;
      String prefix = null;
      String uri = null;
      if (isElement || type == XMLStreamConstants.ENTITY_REFERENCE) {
        localName = parser.getLocalName();
      }
      if (isElement) {
        prefix = parser.getPrefix();
        uri = parser.getNamespaceURI();
      }

      if (localName != null) {
        System.out.println("\tLocal name: " + localName);
      }
      if (prefix != null) {
        System.out.println("\tPrefix: " + prefix);
      }
      if (uri != null) {
        System.out.println("\tNamespace URI: " + uri);
      }
      System.out.println();
  }

}

Names Example

import javax.xml.stream.*;
import java.net.*;
import java.io.*;

 
/**
 * Lists every parse event of a document and, for the events that carry a
 * name, prints the local name, prefix and namespace URI.
 */
public class NamePrinter {

  /**
   * Lists the events and names of one document on System.out.
   *
   * @param args args[0] is a URL or, if it is not a valid URL, a file name
   */
  public static void main(String[] args) {

    if (args.length == 0) {
      System.err.println("Usage: java NamePrinter url" );
      return;
    }

    InputStream in = null;
    XMLStreamReader parser = null;
    try {

      try {
        URL u = new URL(args[0]);
        in = u.openStream();
      }
      catch (MalformedURLException ex) {
        // Maybe it's a file name
        in = new FileInputStream(args[0]);
      }

      XMLInputFactory factory = XMLInputFactory.newInstance();
      parser = factory.createXMLStreamReader(in);

      int event;
      do {
        event = parser.next();
        String label = describe(event);
        if (label != null) {
          System.out.println(label);
          // The document boundaries are announced without any details
          if (event != XMLStreamConstants.START_DOCUMENT
           && event != XMLStreamConstants.END_DOCUMENT) {
            printEvent(parser);
          }
        }
      } while (event != XMLStreamConstants.END_DOCUMENT);
    }
    catch (XMLStreamException ex) {
      System.out.println(ex);
    }
    catch (IOException ex) {
      System.out.println("IOException while parsing " + args[0]);
      ex.printStackTrace();
    }
    finally {
      // Closing the reader does not close the stream, so close both.
      if (parser != null) {
        try {
          parser.close();
        }
        catch (XMLStreamException ignored) {
          // nothing left to report
        }
      }
      if (in != null) {
        try {
          in.close();
        }
        catch (IOException ignored) {
          // nothing left to report
        }
      }
    }

  }

  /**
   * Maps an event code to the heading printed for it.
   *
   * @param event one of the XMLStreamConstants event codes
   * @return the heading, or null for event codes that are not listed
   */
  private static String describe(int event) {
    switch (event) {
      case XMLStreamConstants.START_ELEMENT:
        return "Start tag: ";
      case XMLStreamConstants.END_ELEMENT:
        return "End tag";
      case XMLStreamConstants.START_DOCUMENT:
        return "Start document";
      case XMLStreamConstants.CHARACTERS:
        return "Text";
      case XMLStreamConstants.CDATA:
        return "CDATA Section";
      case XMLStreamConstants.COMMENT:
        return "Comment";
      case XMLStreamConstants.DTD:
        return "Document type declaration";
      case XMLStreamConstants.ENTITY_REFERENCE:
        return "Entity Reference";
      case XMLStreamConstants.SPACE:
        return "Ignorable white space";
      case XMLStreamConstants.PROCESSING_INSTRUCTION:
        return "Processing Instruction";
      case XMLStreamConstants.END_DOCUMENT:
        return "End Document";
      default:
        return null;
    }
  }

  /**
   * Prints whatever name information the current event has, followed by
   * a blank line. The name getters are only called in states where they
   * are valid: getLocalName() throws IllegalStateException on text,
   * comment and similar events, so those print the blank line only.
   */
  private static void printEvent(XMLStreamReader parser) {
      int type = parser.getEventType();
      boolean isElement = type == XMLStreamConstants.START_ELEMENT
       || type == XMLStreamConstants.END_ELEMENT;

      String localName = null;
      String prefix = null;
      String uri = null;
      if (isElement || type == XMLStreamConstants.ENTITY_REFERENCE) {
        localName = parser.getLocalName();
      }
      if (isElement) {
        prefix = parser.getPrefix();
        uri = parser.getNamespaceURI();
      }

      if (localName != null) {
        System.out.println("\tLocal name: " + localName);
      }
      if (prefix != null) {
        System.out.println("\tPrefix: " + prefix);
      }
      if (uri != null) {
        System.out.println("\tNamespace URI: " + uri);
      }
      System.out.println();
  }

}

Names Example Output

[146:sd2004west/stax/examples] elharo% java -classpath .:bea.jar:stax.jar NamePrinter hotcop.xml
Ignorable white space

Start tag: 
        Local name: SONG
        Namespace URI: 

Text

Start tag: 
        Local name: TITLE
        Namespace URI: 

Text

End tag
        Local name: TITLE
        Namespace URI: 

Text

Start tag: 
        Local name: COMPOSER
        Namespace URI: 

Text

End tag
        Local name: COMPOSER
        Namespace URI: 

Text

Start tag: 
        Local name: COMPOSER
        Namespace URI: 

Text

End tag
        Local name: COMPOSER
        Namespace URI: 

Text

Start tag: 
        Local name: COMPOSER
        Namespace URI: 

Text

End tag
        Local name: COMPOSER
        Namespace URI: 

Text

Start tag: 
        Local name: PRODUCER
        Namespace URI: 

Text

End tag
        Local name: PRODUCER
        Namespace URI: 

Text

Start tag: 
        Local name: PUBLISHER
        Namespace URI: 

Text

End tag
        Local name: PUBLISHER
        Namespace URI: 

Text

Start tag: 
        Local name: LENGTH
        Namespace URI: 

Text

End tag
        Local name: LENGTH
        Namespace URI: 

Text

Start tag: 
        Local name: YEAR
        Namespace URI: 

Text

End tag
        Local name: YEAR
        Namespace URI: 

Text

Start tag: 
        Local name: ARTIST
        Namespace URI: 

Text

End tag
        Local name: ARTIST
        Namespace URI: 

Text

End tag
        Local name: SONG
        Namespace URI: 

Ignorable white space

End Document

RSSLister

import javax.xml.stream.*;
import java.net.*;
import java.io.*;

 
/**
 * Prints the text found inside every title element of an RSS document,
 * wherever in the document that element occurs.
 */
public class RSSLister {

  /**
   * Lists the titles of one feed on System.out.
   *
   * @param args args[0] is a URL or, if it is not a valid URL, a file name
   */
  public static void main(String[] args) {

    if (args.length < 1) {
      System.err.println("Usage: java RSSLister url" );
      return;
    }

    try {
      InputStream source = openInput(args[0]);
      XMLStreamReader reader
       = XMLInputFactory.newInstance().createXMLStreamReader(source);

      // True while the reader is between <title> and </title>
      boolean insideTitle = false;
      int type = reader.next();
      while (reader.hasNext()) {
        switch (type) {
          case XMLStreamConstants.START_ELEMENT:
            if (reader.getLocalName().equals("title")) {
              insideTitle = true;
            }
            break;
          case XMLStreamConstants.END_ELEMENT:
            if (reader.getLocalName().equals("title")) {
              insideTitle = false;
            }
            break;
          case XMLStreamConstants.COMMENT:
            // comments have text but are never part of a title
            break;
          default:
            if (reader.hasText() && insideTitle) {
              System.out.println(reader.getText());
            }
            break;
        }
        type = reader.next();
      }
      reader.close();

    }
    catch (XMLStreamException ex) {
      System.out.println(ex);
    }
    catch (IOException ex) {
      System.out.println("IOException while parsing " + args[0]);
    }

  }

  /** Opens the argument as a URL, falling back to a local file name. */
  private static InputStream openInput(String location) throws IOException {
    URL u;
    try {
      u = new URL(location);
    }
    catch (MalformedURLException ex) {
      // Maybe it's a file name
      return new FileInputStream(location);
    }
    return u.openStream();
  }

}

RSSLister Output

$ java -classpath stax.jar:.:bea.jar RSSLister ananova.rss
Ananova:
Archeology
Powered by News Is Free
Britain's earliest leprosy victim may have been found
20th anniversary of Mary Rose recovery
'Proof of Jesus' burial box damaged on way to Canada
Remains of four woolly rhinos give new insight into Ice Age
Experts solve crop lines mystery

Improved RSSLister

Print only item titles:

import javax.xml.stream.*;
import java.net.*;
import java.io.*;

 
/**
 * Prints only the titles that belong to items of an RSS document,
 * skipping channel and image titles.
 */
public class BetterRSSLister {

  /**
   * Lists the item titles of one feed on System.out.
   *
   * @param args args[0] is a URL or, if it is not a valid URL, a file name
   */
  public static void main(String[] args) {

    if (args.length == 0) {
      System.err.println("Usage: java BetterRSSLister url" );
      return;
    }

    InputStream in = null;
    XMLStreamReader parser = null;
    try {

      try {
        URL u = new URL(args[0]);
        in = u.openStream();
      }
      catch (MalformedURLException ex) {
        // Maybe it's a file name
        in = new FileInputStream(args[0]);
      }

      XMLInputFactory factory = XMLInputFactory.newInstance();
      parser = factory.createXMLStreamReader(in);

      // Depth counters rather than booleans, so that an item or title
      // nested inside another one does not switch printing off when the
      // inner element ends.
      int itemDepth = 0;
      int titleDepth = 0;
      for (int event = parser.nextTag();
           parser.hasNext();
           event = parser.next()) {
        if (event == XMLStreamConstants.START_ELEMENT) {
          String name = parser.getLocalName();
          if (name.equals("title")) {
            titleDepth++;
          }
          else if (name.equals("item")) {
            itemDepth++;
          }
        }
        else if (event == XMLStreamConstants.END_ELEMENT) {
          String name = parser.getLocalName();
          if (name.equals("title")) {
            titleDepth--;
          }
          else if (name.equals("item")) {
            itemDepth--;
          }
        }
        else if (parser.hasText() && event != XMLStreamConstants.COMMENT) {
          if (itemDepth > 0 && titleDepth > 0) {
            System.out.println(parser.getText());
          }
        }
      }

    }
    catch (XMLStreamException ex) {
      System.out.println(ex);
    }
    catch (IOException ex) {
      System.out.println("IOException while parsing " + args[0]);
    }
    finally {
      // Closing the reader does not close the stream, so close both.
      if (parser != null) {
        try {
          parser.close();
        }
        catch (XMLStreamException ignored) {
          // nothing left to report
        }
      }
      if (in != null) {
        try {
          in.close();
        }
        catch (IOException ignored) {
          // nothing left to report
        }
      }
    }

  }

}

RSSLister Output

$ java -classpath stax.jar:.:bea.jar RSSLister ananova.rss
Archeology
Powered by News Is Free
Britain's earliest leprosy victim may have been found
20th anniversary of Mary Rose recovery
'Proof of Jesus' burial box damaged on way to Canada
Remains of four woolly rhinos give new insight into Ice Age
Experts solve crop lines mystery

The nextTag() method


Attributes


Attributes Example: PullSpider (an XLink spider)

import javax.xml.stream.*;
import java.net.*;
import java.io.*;
import java.util.*;

public class PullSpider {

  // Need to keep track of where we've been 
  // so we don't get stuck in an infinite loop
  private List spideredURIs = new Vector();

  // This linked list keeps track of where we're going.
  // Although the LinkedList class does not guarantee queue like
  // access, I always access it in a first-in/first-out fashion.
  private LinkedList queue = new LinkedList();
  
  private URL currentURL;
  private XMLInputFactory factory;
  
  public PullSpider() {
      this.factory = XMLInputFactory.newInstance();
  }

  private void processStartTag(XMLStreamReader parser) {
    
    String type 
     = parser.getAttributeValue("http://www.w3.org/1999/xlink", "type");
    if (type != null) {
      String href 
       = parser.getAttributeValue("http://www.w3.org/1999/xlink", "href");
          if (href != null) {
            try {
              URL foundURL = new URL(currentURL, href);
              if (!spideredURIs.contains(foundURL)) {
                queue.addFirst(foundURL);
              }
            }
           catch (MalformedURLException ex) {
             // skip this URL  
           }
        }
    }
  }
  
  public void spider(URL url) {
      
    System.out.println("Spidering " + url);
    currentURL = url;
    try {
      XMLStreamReader parser = factory.createXMLStreamReader(currentURL.openStream());
      spideredURIs.add(currentURL);
      
      for (int event = parser.next(); 
           parser.hasNext(); 
           event = parser.next()) {
         if (event == XMLStreamConstants.START_ELEMENT) {
             processStartTag(parser);
         }
       }  // end for
       parser.close();
       
       while (!queue.isEmpty()) {
         URL nextURL = (URL) queue.removeLast();
         spider(nextURL);
       }
      
    }
    catch (Exception ex) {
       // skip this document
    }
    
  }

  public static void main(String[] args) throws Exception {
        
    if (args.length == 0) {
      System.err.println("Usage: java PullSpider url" );
       return;  
    }
        
    PullSpider spider = new PullSpider();
    spider.spider(new URL(args[0]));
        
  } // end main

} // end PullSpider


Output from the PullSpider

$ java -classpath stax.jar:.:bea.jar PullSpider http://www.rddl.org
Spidering http://www.rddl.org
Spidering http://www.rddl.org/natures
Spidering http://www.rddl.org/purposes
Spidering http://www.rddl.org/xrd.css
Spidering http://www.rddl.org/rddl-xhtml.dtd
Spidering http://www.rddl.org/rddl-qname-1.mod
Spidering http://www.rddl.org/rddl-resource-1.mod
Spidering http://www.rddl.org/xhtml-arch-1.mod
Spidering http://www.rddl.org/xhtml-attribs-1.mod
Spidering http://www.rddl.org/xhtml-base-1.mod
Spidering http://www.rddl.org/xhtml-basic-form-1.mod
Spidering http://www.rddl.org/xhtml-basic-table-1.mod
Spidering http://www.rddl.org/xhtml-blkphras-1.mod
Spidering http://www.rddl.org/xhtml-blkstruct-1.mod
Spidering http://www.rddl.org/xhtml-charent-1.mod
Spidering http://www.rddl.org/xhtml-datatypes-1.mod
Spidering http://www.rddl.org/xhtml-framework-1.mod
Spidering http://www.rddl.org/xhtml-hypertext-1.mod
Spidering http://www.rddl.org/xhtml-image-1.mod
Spidering http://www.rddl.org/xhtml-inlphras-1.mod
Spidering http://www.rddl.org/xhtml-inlstruct-1.mod
Spidering http://www.rddl.org/xhtml-lat1.ent
Spidering http://www.rddl.org/xhtml-link-1.mod
Spidering http://www.rddl.org/xhtml-meta-1.mod
Spidering http://www.rddl.org/xhtml-notations-1.mod
Spidering http://www.rddl.org/xhtml-object-1.mod
Spidering http://www.rddl.org/xhtml-param-1.mod
Spidering http://www.rddl.org/xhtml-qname-1.mod
Spidering http://www.rddl.org/xhtml-rddl-model-1.mod
Spidering http://www.rddl.org/xhtml-special.ent
Spidering http://www.rddl.org/xhtml-struct-1.mod
Spidering http://www.rddl.org/xhtml-symbol.ent
Spidering http://www.rddl.org/xhtml-text-1.mod
Spidering http://www.rddl.org/xlink-module-1.mod
Spidering http://www.rddl.org/rddl.rdfs
Spidering http://www.rddl.org/rddl-integration.rxg
Spidering http://www.rddl.org/modules/rddl-1.rxm

Processing Instructions


Pull Processing Instructions Example

import javax.xml.stream.*;
import java.net.*;
import java.io.*;

 
public class PILister {

  public static void main(String[] args) {
        
    if (args.length == 0) {
      System.err.println("Usage: java PILister url" );
      return;    
    }
        
    try {

      InputStream in;
      try {
        URL u = new URL(args[0]);
        in = u.openStream();
      }
      catch (MalformedURLException ex) {
          // Maybe it's a file name
          in = new FileInputStream(args[0]);
      }
      XMLInputFactory factory = XMLInputFactory.newInstance();
      XMLStreamReader parser = factory.createXMLStreamReader(in);
        
      for (int event = parser.next(); 
           parser.hasNext(); 
           event = parser.next()) {
         if (event == XMLStreamConstants.PROCESSING_INSTRUCTION) {
             String target = parser.getPITarget();
             String data = parser.getPIData();
             System.out.println("<?" + target + " " + data + "?>");
         }
      }  
      parser.close();
         
    }
    catch (XMLStreamException ex) {
       System.out.println(ex);  
    }
    catch (IOException e) {
      System.out.println("IOException while parsing " + args[0]);   
    }
        
  }

}

Comments

import javax.xml.stream.*;
import java.net.*;
import java.io.*;

 
/**
 * Prints the text of every comment in a document.
 */
public class CommentPuller {

  /**
   * Lists the comments of one document on System.out.
   *
   * @param args args[0] is a URL or, if it is not a valid URL, a file name
   */
  public static void main(String[] args) {

    if (args.length == 0) {
      System.err.println("Usage: java CommentPuller url" );
      return;
    }

    InputStream in = null;
    XMLStreamReader parser = null;
    try {

      try {
        URL u = new URL(args[0]);
        in = u.openStream();
      }
      catch (MalformedURLException ex) {
        // Maybe it's a file name
        in = new FileInputStream(args[0]);
      }

      XMLInputFactory factory = XMLInputFactory.newInstance();
      parser = factory.createXMLStreamReader(in);

      for (int event = parser.next();
           parser.hasNext();
           event = parser.next()) {
        if (event == XMLStreamConstants.COMMENT) {
          System.out.println(parser.getText());
        }
      }

    }
    catch (XMLStreamException ex) {
      System.out.println(ex);
    }
    catch (IOException ex) {
      System.out.println("IOException while parsing " + args[0]);
    }
    finally {
      // Closing the reader does not close the stream, so close both.
      if (parser != null) {
        try {
          parser.close();
        }
        catch (XMLStreamException ignored) {
          // nothing left to report
        }
      }
      if (in != null) {
        try {
          in.close();
        }
        catch (IOException ignored) {
          // nothing left to report
        }
      }
    }

  }

}

Output from CommentPuller

$ java -classpath stax.jar:.:bea.jar CommentPuller http://www.cafeaulait.org/oldnews/news2004January8.html

        /* Only sunsites are allowed to mirror this page and then
        only with explicit, prior permission. For details,
        send email to elharo@metalab.unc.edu */
        if (location.protocol.toLowerCase().indexOf("file") != 0 ) {
           if (0 > location.host.toLowerCase().indexOf("cafeconleche.org")
           && 0 > location.host.toLowerCase().indexOf("cafeaulait.org")
           && 0 > location.host.toLowerCase().indexOf("sunsite")
           && 0 > location.host.toLowerCase().indexOf("ibiblio.org")) {
            location.href="http://www.cafeconleche.org/oldnews/news2004January8.html";
          }
        }
        //

XML Declaration


Example: PullDeclaration

import javax.xml.stream.*;
import java.net.*;
import java.io.*;

 
/**
 * Reconstructs and prints the XML declaration of the document named on
 * the command line, using the values the parser exposes while it is
 * positioned on the start of the document.
 */
public class PullDeclaration {

  /**
   * @param args args[0] is a URL; if it is not a well-formed URL it is
   *             treated as a local file name
   */
  public static void main(String[] args) {

    if (args.length == 0) {
      System.err.println("Usage: java PullDeclaration url" );
      return;
    }

    // Declared outside the try so the finally clause can release it.
    InputStream in = null;
    try {

      try {
        URL u = new URL(args[0]);
        in = u.openStream();
      }
      catch (MalformedURLException ex) {
        // Maybe it's a file name
        in = new FileInputStream(args[0]);
      }

      XMLInputFactory factory = XMLInputFactory.newInstance();
      XMLStreamReader parser = factory.createXMLStreamReader(in);

      try {
        // The declaration is read before the first call to next(),
        // so no event loop is needed.
        System.out.println(buildDeclaration(parser));
      }
      finally {
        parser.close();
      }

    }
    catch (XMLStreamException ex) {
      System.out.println(ex);
    }
    catch (IOException ex) {
      System.out.println("IOException while parsing " + args[0]);
    }
    finally {
      // Closing the parser does not close the stream it reads from.
      if (in != null) {
        try {
          in.close();
        }
        catch (IOException ignored) {
          // nothing useful can be done if close fails
        }
      }
    }

  }

  /**
   * Builds the text of an XML declaration from the parser's current state.
   * Missing values fall back to version "1.0" and encoding "UTF-8"; the
   * standalone pseudo-attribute is emitted only when the document set it.
   */
  private static String buildDeclaration(XMLStreamReader parser) {

    String encoding = parser.getCharacterEncodingScheme();
    if (encoding == null) encoding = parser.getEncoding();
    if (encoding == null) encoding = "UTF-8";
    String version = parser.getVersion();
    if (version == null) version = "1.0";

    StringBuffer declaration = new StringBuffer("<?xml version=\"");
    declaration.append(version);
    declaration.append("\" encoding=\"");
    declaration.append(encoding);
    if (parser.standaloneSet()) {
      declaration.append("\" standalone=\"");
      declaration.append(parser.isStandalone() ? "yes" : "no");
    }
    declaration.append("\"?>");
    return declaration.toString();

  }

}

Output from PullDeclaration

% java -classpath stax.jar:.:bea.jar PullDeclaration http://www.cafeconleche.org/
<?xml version="1.0" encoding="UTF-8"?>

Namespaces


The NamespaceContext Class

package javax.xml.namespace;

/**
 * Lookup interface over prefix/namespace bindings. It offers three
 * queries: prefix to namespace URI, namespace URI to a single prefix, and
 * namespace URI to all of its prefixes.
 */
public interface NamespaceContext {

  /** Looks up the namespace URI for the given prefix. */
  String getNamespaceURI(String prefix);

  /** Looks up a prefix for the given namespace URI. */
  String getPrefix(String namespaceURI);

  /** Iterates over the prefixes for the given namespace URI (raw Iterator). */
  Iterator getPrefixes(String namespaceURI);

}


Requirements


XMLInputFactory Properties

Property Name Behavior Return type Default Value Required
javax.xml.stream.isValidating Validate? Boolean False No
javax.xml.stream.isNamespaceAware Support namespaces? Boolean True True (required) / False (optional)
javax.xml.stream.isCoalescing Always return maximum contiguous run of text in a CHARACTERS event Boolean False Yes
javax.xml.stream.isReplacingEntityReferences resolve internal entity references Boolean True Yes
javax.xml.stream.isSupportingExternalEntities resolve external entity references Boolean Unspecified Yes
javax.xml.stream.reporter class used to report errors javax.xml.stream.XMLReporter Null Yes
javax.xml.stream.resolver class used to resolve URIs javax.xml.stream.XMLResolver Null Yes
javax.xml.stream.allocator class used to allocate events javax.xml.stream.util.XMLEventAllocator Null Yes
public void setProperty(String name, Object value)
  throws XMLStreamException;
public Object getProperty(String name);
public boolean isPropertySupported(String name)

XMLReporter

package javax.xml.stream;

/**
 * Callback used to report errors. An implementation is handed to the
 * input factory through the javax.xml.stream.reporter property.
 */
public interface XMLReporter {

  /**
   * Receives one report.
   *
   * @param message            the text of the report
   * @param errorType          a string classifying the report
   * @param relatedInformation extra data attached to the report; its type
   *                           is not fixed by this signature
   * @param location           where in the input the report applies
   * @throws XMLStreamException if the implementation chooses to raise one
   */
  void report(String message,
              String errorType,
              Object relatedInformation,
              Location location)
      throws XMLStreamException;

}

Example: PullValidator

import javax.xml.stream.*;
import java.net.*;
import java.io.*;

 
/**
 * Parses the document named on the command line with validation switched
 * on and reports whether it is valid, invalid, or not well-formed.
 */
public class PullValidator {

  // Cleared by the reporter callback when the parser reports a problem.
  private static boolean valid;

  /**
   * @param args args[0] is a URL; if it is not a well-formed URL it is
   *             treated as a local file name
   */
  public static void main(String[] args) {

    if (args.length == 0) {
      System.err.println("Usage: java PullValidator url" );
      return;
    }

    // Declared outside the try so the finally clause can release it.
    InputStream in = null;
    try {

      try {
        URL u = new URL(args[0]);
        in = u.openStream();
      }
      catch (MalformedURLException ex) {
        // Maybe it's a file name
        in = new FileInputStream(args[0]);
      }

      XMLInputFactory factory = XMLInputFactory.newInstance();
      if (!factory.isPropertySupported("javax.xml.stream.isValidating")) {
        System.err.println("This StAX implementation does not support validation.");
        return;
      }
      try {
        factory.setProperty("javax.xml.stream.isValidating", Boolean.TRUE);
      }
      catch (IllegalArgumentException ex) {
        // An implementation may know the property name yet still
        // refuse to turn validation on.
        System.err.println("This StAX implementation does not support validation.");
        return;
      }

      // The reporter has to be installed BEFORE the reader is created;
      // a property set on the factory afterwards cannot be relied on to
      // reach a reader that already exists.
      valid = true;
      factory.setProperty("javax.xml.stream.reporter", new XMLReporter() {
        public void report(String message, String errorType, Object relatedInformation, Location location) {
          System.err.println(message);
          valid = false;
        }
      });

      XMLStreamReader parser = factory.createXMLStreamReader(in);
      try {
        while (parser.next() != XMLStreamConstants.END_DOCUMENT) {
          // nothing to do; the events are pulled only to drive the parser
        }
      }
      finally {
        parser.close();
      }

      // If we get here there are no exceptions
      if (valid) System.out.println(args[0] + " is valid.");
      else System.out.println(args[0] + " is not valid.");
    }
    catch (XMLStreamException ex) {
       System.out.println(args[0] + " is not well-formed.");
       System.out.println(ex);
    }
    catch (IOException ex) {
      System.out.println(args[0] + " could not be checked due to an "
       + ex.getClass().getName());
      ex.printStackTrace();
    }
    finally {
      // Closing the parser does not close the stream it reads from.
      if (in != null) {
        try {
          in.close();
        }
        catch (IOException ignored) {
          // nothing useful can be done if close fails
        }
      }
    }

  }

}

Location

package javax.xml.stream;

/**
 * Position information with four read-only accessors. An instance is
 * passed to XMLReporter.report as its location argument.
 */
public interface Location {

  /** Line number of the position. NOTE(review): numbering base not shown here. */
  int getLineNumber();
  /** Column number of the position. NOTE(review): numbering base not shown here. */
  int getColumnNumber();
  /** Character offset of the position. */
  int getCharacterOffset();
  /** URI associated with the position. */
  String getLocationURI();
  
}




Locating an Implementation of XMLInputFactory


XMLStreamWriter

package javax.xml.stream;

/**
 * Call-per-construct interface for writing XML: one method per kind of
 * markup, plus namespace bookkeeping, flush/close and property lookup.
 * Every write method may fail with an XMLStreamException.
 *
 * Watch the argument order of the namespace-aware overloads: the
 * element methods take (prefix, localName, namespaceURI) while the
 * attribute method takes (prefix, namespaceURI, localName, value).
 */
public interface XMLStreamWriter {

  // ----- document boundaries -----

  void writeStartDocument() throws XMLStreamException;

  void writeStartDocument(String version) throws XMLStreamException;

  /** Note that the encoding comes first and the version second. */
  void writeStartDocument(String encoding, String version)
      throws XMLStreamException;

  void writeEndDocument() throws XMLStreamException;

  // ----- elements -----

  void writeStartElement(String localName) throws XMLStreamException;

  void writeStartElement(String namespaceURI, String localName)
      throws XMLStreamException;

  void writeStartElement(String prefix, String localName, String namespaceURI)
      throws XMLStreamException;

  void writeEmptyElement(String localName) throws XMLStreamException;

  void writeEmptyElement(String namespaceURI, String localName)
      throws XMLStreamException;

  void writeEmptyElement(String prefix, String localName, String namespaceURI)
      throws XMLStreamException;

  /** Takes no name; there is no argument to say which element is ended. */
  void writeEndElement() throws XMLStreamException;

  // ----- attributes and namespace declarations -----

  void writeAttribute(String localName, String value)
      throws XMLStreamException;

  void writeAttribute(String namespaceURI, String localName, String value)
      throws XMLStreamException;

  void writeAttribute(String prefix, String namespaceURI, String localName, String value)
      throws XMLStreamException;

  void writeNamespace(String prefix, String namespaceURI)
      throws XMLStreamException;

  void writeDefaultNamespace(String namespaceURI) throws XMLStreamException;

  // ----- character data and other markup -----

  void writeCharacters(String text) throws XMLStreamException;

  /** Writes a slice of a character array given its start index and length. */
  void writeCharacters(char[] text, int start, int len)
      throws XMLStreamException;

  void writeCData(String data) throws XMLStreamException;

  void writeComment(String data) throws XMLStreamException;

  void writeProcessingInstruction(String target) throws XMLStreamException;

  void writeProcessingInstruction(String target, String data)
      throws XMLStreamException;

  void writeDTD(String dtd) throws XMLStreamException;

  void writeEntityRef(String name) throws XMLStreamException;

  // ----- namespace context -----

  String getPrefix(String uri) throws XMLStreamException;

  void setPrefix(String prefix, String uri) throws XMLStreamException;

  void setDefaultNamespace(String uri) throws XMLStreamException;

  void setNamespaceContext(NamespaceContext context)
      throws XMLStreamException;

  NamespaceContext getNamespaceContext();

  // ----- lifecycle and configuration -----

  void flush() throws XMLStreamException;

  void close() throws XMLStreamException;

  Object getProperty(java.lang.String name) throws IllegalArgumentException;

}




XMLStreamWriter Example: Convert RDDL to XHTML


Example: RDDLStripper

import javax.xml.stream.*;
import java.net.*;
import java.io.*;

 
/**
 * Copies an XML document to System.out, leaving out the start-tags and
 * end-tags of elements in the RDDL namespace and any declaration of that
 * namespace. The content of the dropped elements is still copied.
 */
public class RDDLStripper {

  /** Namespace URI whose elements and declarations are left out. */
  public final static String RDDL_NS = "http://www.rddl.org/";

  /**
   * @param args args[0] is a URL; if it is not a well-formed URL it is
   *             treated as a local file name
   */
  public static void main(String[] args) {

    if (args.length == 0) {
      System.err.println("Usage: java RDDLStripper url" );
      return;
    }

    // Declared outside the try so the finally clause can release it.
    InputStream in = null;
    try {
      try {
        URL u = new URL(args[0]);
        in = u.openStream();
      }
      catch (MalformedURLException ex) {
        // Maybe it's a file name
        in = new FileInputStream(args[0]);
      }

      XMLStreamReader parser = XMLInputFactory.newInstance().createXMLStreamReader(in);
      XMLStreamWriter serializer = XMLOutputFactory.newInstance().createXMLStreamWriter(System.out);

      try {
        copyWithoutRddl(parser, serializer);
      }
      finally {
        try {
          serializer.close();
        }
        finally {
          parser.close();
        }
      }

    }
    // Diagnostics go to System.err because System.out carries the XML.
    catch (XMLStreamException ex) {
       System.err.println(ex);
    }
    catch (IOException ex) {
      System.err.println("IOException while parsing " + args[0]);
    }
    finally {
      // Closing the parser does not close the stream it reads from.
      if (in != null) {
        try {
          in.close();
        }
        catch (IOException ignored) {
          // nothing useful can be done if close fails
        }
      }
    }

  }

  /** Pulls events until the end of the document, echoing each one that is kept. */
  private static void copyWithoutRddl(XMLStreamReader parser, XMLStreamWriter serializer)
      throws XMLStreamException {

    while (true) {
      int event = parser.next();
      switch (event) {
        case XMLStreamConstants.START_ELEMENT:
          // Constant on the left: an element in no namespace may report null
          if (!RDDL_NS.equals(parser.getNamespaceURI())) {
            writeStartTag(parser, serializer);
          }
          break;
        case XMLStreamConstants.END_ELEMENT:
          if (!RDDL_NS.equals(parser.getNamespaceURI())) {
            serializer.writeEndElement();
          }
          break;
        case XMLStreamConstants.CHARACTERS:
        case XMLStreamConstants.SPACE:
          serializer.writeCharacters(parser.getText());
          break;
        case XMLStreamConstants.CDATA:
          serializer.writeCData(parser.getText());
          break;
        case XMLStreamConstants.COMMENT:
          serializer.writeComment(parser.getText());
          break;
        case XMLStreamConstants.DTD:
          serializer.writeDTD(parser.getText());
          break;
        case XMLStreamConstants.ENTITY_REFERENCE:
          serializer.writeEntityRef(parser.getLocalName());
          break;
        case XMLStreamConstants.PROCESSING_INSTRUCTION:
          if (parser.getPIData() == null) {
            serializer.writeProcessingInstruction(parser.getPITarget());
          }
          else {
            serializer.writeProcessingInstruction(parser.getPITarget(), parser.getPIData());
          }
          break;
        case XMLStreamConstants.END_DOCUMENT:
          serializer.flush();
          return;
        default:
          break;
      }
    }

  }

  /** Writes the current start-tag: name, then namespace declarations, then attributes. */
  private static void writeStartTag(XMLStreamReader parser, XMLStreamWriter serializer)
      throws XMLStreamException {

    String namespaceURI = parser.getNamespaceURI();
    if (isBlank(namespaceURI)) {
      serializer.writeStartElement(parser.getLocalName());
    }
    else {
      // Pass the prefix explicitly so the writer does not need an
      // existing binding for the namespace.
      serializer.writeStartElement(
        normalizePrefix(parser.getPrefix()), parser.getLocalName(), namespaceURI);
    }

    // add namespace declarations
    for (int i = 0; i < parser.getNamespaceCount(); i++) {
      String uri = parser.getNamespaceURI(i);
      if (RDDL_NS.equals(uri)) continue;
      String prefix = parser.getNamespacePrefix(i);
      if (isBlank(prefix)) {
        serializer.writeDefaultNamespace(uri == null ? "" : uri);
      }
      else {
        serializer.writeNamespace(prefix, uri);
      }
    }

    // add attributes
    for (int i = 0; i < parser.getAttributeCount(); i++) {
      String attributeNamespace = parser.getAttributeNamespace(i);
      if (isBlank(attributeNamespace)) {
        serializer.writeAttribute(
          parser.getAttributeLocalName(i), parser.getAttributeValue(i));
      }
      else {
        serializer.writeAttribute(
          normalizePrefix(parser.getAttributePrefix(i)),
          attributeNamespace,
          parser.getAttributeLocalName(i),
          parser.getAttributeValue(i));
      }
    }

  }

  /** True for null and for the empty string. */
  private static boolean isBlank(String s) {
    return s == null || s.length() == 0;
  }

  /** Maps a missing prefix to the empty-string literal. */
  private static String normalizePrefix(String prefix) {
    return isBlank(prefix) ? "" : prefix;
  }

}

One of my favorite features


StreamFilter


XMLIterator

package javax.xml.stream;

/**
 * Iterator-shaped interface over XMLEvent objects. Both operations are
 * allowed to fail with an XMLStreamException.
 */
public interface XMLIterator {

  /** Tells whether another event is available. */
  boolean hasNext() throws XMLStreamException;

  /** Returns the next event. */
  XMLEvent next() throws XMLStreamException;

}




15 Kinds of XMLEvents


XMLEvent Methods


StartElement

package javax.xml.stream.events;

/**
 * Event object for the start of an element. It exposes the element's
 * qualified name, its attributes, its namespaces, and namespace lookup.
 */
public interface StartElement extends XMLEvent {

  /** Qualified name of the element. */
  QName getName();

  /** Iterates over the attributes (raw Iterator; element type not shown here). */
  Iterator getAttributes();

  /** Iterates over the namespaces (raw Iterator; element type not shown here). */
  Iterator getNamespaces();

  /** Finds a single attribute by its qualified name. */
  Attribute getAttributeByName(QName name);

  /** Looks up the namespace URI for the given prefix. */
  String getNamespaceURI(String prefix);

  /** Namespace context associated with this element. */
  NamespaceContext getNamespaceContext();

}

EndElement

package javax.xml.stream.events;

/**
 * Event object for the end of an element. It exposes the element's
 * qualified name and an iterator over namespaces.
 */
public interface EndElement extends XMLEvent {

  /** Qualified name of the element. */
  QName getName();

  /** Iterates over the namespaces (raw Iterator; element type not shown here). */
  Iterator getNamespaces();

}

XMLEventReader

package javax.xml.stream;

/**
 * Event-object reader. It inherits next() and hasNext() from XMLIterator
 * and adds look-ahead, text extraction, tag skipping and property lookup.
 */
public interface XMLEventReader extends XMLIterator {

  /** Returns an event without the reader being asked to advance via next(). */
  XMLEvent peek() throws XMLStreamException;

  /** Returns element text as a single String. */
  String getElementText() throws XMLStreamException;

  /** Advances to a tag event and returns it. */
  XMLEvent nextTag() throws XMLStreamException;

  /** Reads a named property of the reader. */
  Object getProperty(java.lang.String name) throws IllegalArgumentException;

}


Attribute

package javax.xml.stream.events;

/**
 * Event object describing one attribute: its qualified name, value,
 * DTD type and whether it was specified.
 */
public interface Attribute extends XMLEvent {

  /** Qualified name of the attribute. */
  QName getName();

  /** Value of the attribute. */
  String getValue();

  /** DTD type of the attribute, expressed as a QName in this listing. */
  QName getDTDType();

  /** Whether the attribute was specified. */
  boolean isSpecified();

}

Characters

package javax.xml.stream.events;

/**
 * Event object for character data, with three flags that classify the
 * text as white space, CDATA, or ignorable white space.
 */
public interface Characters extends XMLEvent {

  /** The character data as a String. */
  String getData();

  /** Whether the data is white space. */
  boolean isWhiteSpace();

  /** Whether the data is CDATA. */
  boolean isCData();

  /** Whether the data is ignorable white space. */
  boolean isIgnorableWhiteSpace();

}

DTD

package javax.xml.stream.events;

/**
 * Event object for a document type declaration. It offers the
 * declaration as text, a processed form, and lists of notations and
 * entities.
 */
public interface DTD extends XMLEvent {

  /** The document type declaration as a String. */
  String getDocumentTypeDeclaration();

  /** A processed form of the DTD; the concrete type is not fixed by this signature. */
  Object getProcessedDTD();

  /** The notations (raw List; element type not shown here). */
  List getNotations();

  /** The entities (raw List; element type not shown here). */
  List getEntities();

}

Comment

package javax.xml.stream.events;

/** Event object for a comment; its only accessor returns the comment text. */
public interface Comment extends XMLEvent {

  /** Text of the comment. */
  String getText();

}

ProcessingInstruction

package javax.xml.stream.events;

/** Event object for a processing instruction, split into target and data. */
public interface ProcessingInstruction extends XMLEvent {

  /** Target of the processing instruction. */
  String getTarget();

  /** Data of the processing instruction. */
  String getData();

}

StartDocument

package javax.xml.stream.events;

/**
 * Event object for the start of a document. It carries the system ID
 * plus the version, encoding and standalone information. Encoding and
 * standalone each have a companion "...Set" flag.
 */
public interface StartDocument extends XMLEvent {

  /** System ID of the document. */
  String getSystemId();

  /** XML version. */
  String getVersion();

  /** Character encoding scheme. */
  String getCharacterEncodingScheme();

  /** Whether the encoding was set. */
  boolean encodingSet();

  /** The standalone value. */
  boolean isStandalone();

  /** Whether the standalone value was set. */
  boolean standaloneSet();

}


EndDocument

package javax.xml.stream.events;

/** Event type that declares no methods of its own; everything comes from XMLEvent. */
public interface EndDocument extends XMLEvent {

}

StartEntity

package javax.xml.stream.events;

/** Event object for the start of an entity; its only accessor is the name. */
public interface StartEntity extends XMLEvent {

  /** Name of the entity. */
  String getName();

}


EndEntity

package javax.xml.stream.events;

/** Event object for the end of an entity; its only accessor is the name. */
public interface EndEntity extends XMLEvent {

  /** Name of the entity. */
  String getName();

}


EntityDeclaration

package javax.xml.stream.events;

/**
 * Event object for an entity declaration: the entity's name, its public
 * and system identifiers, and a notation name.
 */
public interface EntityDeclaration extends XMLEvent {

  /** Name of the entity. */
  String getName();

  /** Public identifier. */
  String getPublicId();

  /** System identifier. */
  String getSystemId();

  /** Name of the notation. */
  String getNotationName();

}

EntityReference

package javax.xml.stream.events;

/**
 * Event object for an entity reference: the entity's name, base URI,
 * public and system identifiers, and replacement text.
 */
public interface EntityReference extends XMLEvent {

  /** Name of the entity. */
  String getName();

  /** Base URI. */
  String getBaseUri();

  /** Public identifier. */
  String getPublicId();

  /** System identifier. */
  String getSystemId();

  /** Replacement text of the entity. */
  String getReplacementText();

}

Namespace

package javax.xml.stream.events;

/**
 * A namespace modelled as a specialised Attribute. It adds the prefix,
 * the namespace URI, and a flag for default-namespace declarations.
 */
public interface Namespace extends Attribute {

  /** Prefix of the namespace. */
  String getPrefix();

  /** URI of the namespace. */
  String getNamespaceURI();

  /** Whether this is a declaration of the default namespace. */
  boolean isDefaultNamespaceDeclaration();

}

NotationDeclaration

package javax.xml.stream.events;

/** Event object for a notation declaration: name, public ID and system ID. */
public interface NotationDeclaration extends XMLEvent {

  /** Name of the notation. */
  String getName();

  /** Public identifier. */
  String getPublicId();

  /** System identifier. */
  String getSystemId();

}

XMLEventWriter

package javax.xml.stream;

/**
 * Event-object writer. It accepts single events or a whole
 * XMLEventReader, and carries the same namespace bookkeeping and
 * flush/close operations as the stream writer.
 */
public interface XMLEventWriter extends XMLEventConsumer {

  // ----- adding events -----

  /** Adds one event. */
  void add(XMLEvent event) throws XMLStreamException;

  /** Adds events taken from a reader. */
  void add(XMLEventReader reader) throws XMLStreamException;

  // ----- namespace context -----

  String getPrefix(String uri) throws XMLStreamException;

  void setPrefix(String prefix, String uri) throws XMLStreamException;

  void setDefaultNamespace(String uri) throws XMLStreamException;

  void setNamespaceContext(NamespaceContext context)
      throws XMLStreamException;

  NamespaceContext getNamespaceContext();

  // ----- lifecycle -----

  void flush() throws XMLStreamException;

  void close() throws XMLStreamException;

}

EventFilter


To Learn More


Index | Cafe con Leche

Copyright 2000-2004 Elliotte Rusty Harold
elharo@metalab.unc.edu
Last Modified March 19, 2004