// Implementation as DOM

/*--

 Copyright 2001 Elliotte Rusty Harold.
 All rights reserved.

 I haven't yet decided on a license.
 It will be some form of open source.

 THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESSED OR IMPLIED
 WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 DISCLAIMED.  IN NO EVENT SHALL ELLIOTTE RUSTY HAROLD OR ANY
 OTHER CONTRIBUTORS TO THIS PACKAGE
 BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
 USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 SUCH DAMAGE.

 */

package com.macfaq.xml;

import java.net.URL;
import java.net.URLConnection;
import java.net.MalformedURLException;
import java.util.Stack;
import java.util.List;
import java.util.ArrayList;

import java.io.File;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.io.InputStreamReader;
import java.io.BufferedInputStream;
import java.io.InputStream;

import org.xml.sax.SAXException;
import org.xml.sax.SAXParseException;

import org.w3c.dom.Element;
import org.w3c.dom.Document;
import org.w3c.dom.Comment;
import org.w3c.dom.ProcessingInstruction;
import org.w3c.dom.DocumentType;
import org.w3c.dom.Text;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.w3c.dom.DOMImplementation;

import org.apache.xerces.parsers.DOMParser;
import org.apache.xml.serialize.OutputFormat;
import org.apache.xml.serialize.XMLSerializer;

/**
 * <p><code>DOMXIncluder</code> provides methods to
 * resolve DOM elements and documents to produce
 * a new <code>Document</code> or <code>Element</code> with all
 * XInclude references resolved.
 * </p>
 *
 * <p>
 * It does not yet handle the merging of unparsed entity
 * and notation information items from the included infosets.
 * Furthermore it does not include the source document's doctype 
 * declaration if that contains an internal DTD subset.
 * This may be the result of a Xerces bug. 
 * </p>
 *
 *
 * @author Elliotte Rusty Harold
 * @version 1.0d8
 */
public class DOMXIncluder {

  /** Namespace URI an element must be in to be treated as an XInclude <code>include</code> element. */
  public final static String XINCLUDE_NAMESPACE
   = "http://www.w3.org/2001/XInclude";
  /** Namespace URI used when looking up the <code>xml:base</code> attribute. */
  public final static String XML_NAMESPACE
   = "http://www.w3.org/XML/1998/namespace";

  // No instances allowed; every member of this class is static
  private DOMXIncluder() {}

  // Parser used to load documents referenced by include elements that are
  // parsed as XML. It is a single static instance shared by every call.
  // NOTE(review): nothing in this file synchronizes access to it -- confirm
  // that callers never resolve documents from several threads at once.
  private static DOMParser parser = new DOMParser();

  /**
    * <p>
    * Builds a copy of a DOM <code>Document</code> in which every
    * XInclude reference has been replaced by the content it refers to.
    * All work is done on a deep clone, so the <code>Document</code>
    * passed in is never modified.
    * </p>
    *
    * <p>
    * The nodes that the root element resolves to must consist of exactly
    * one element, optionally surrounded by comments and processing
    * instructions. Those nodes replace the cloned root, in order.
    * </p>
    *
    * <p>
    * Cloning an entire <code>Document</code> is not supported equally well
    * by all DOM implementations. Xerces-J 1.3.1 has a bug in this area
    * which is fixed in Xerces-J 1.4.0.
    * </p>
    *
    * @param original <code>Document</code> to process; it is not changed
    * @param base     <code>String</code> form of the base URI against which
    *                 relative URLs are resolved. May be null if the
    *                 document supplies an <code>xml:base</code> attribute.
    * @return Document a new <code>Document</code> in which all
    *                  XInclude elements have been replaced.
    * @throws XIncludeException if the document, though namespace well-formed,
    *                           violates one of the rules of XInclude; for
    *                           example if the root resolves to text or to
    *                           anything other than a single root element.
    * @throws NullPointerException  if the original argument is null.
    */
    public static Document resolve(Document original, String base)
      throws XIncludeException, NullPointerException {

        if (original == null) {
          throw new NullPointerException("Document must not be null");
        }

        // The author observed that this clone loses the DOCTYPE when there
        // is an internal DTD subset (reported as a bug in Xerces 1.4.3).
        Document resultDocument = (Document) original.cloneNode(true);
        Element oldRoot = resultDocument.getDocumentElement();

        NodeList replacement = resolve(oldRoot, base, resultDocument);
        final int total = replacement.getLength();

        // Validate: only comments, processing instructions and exactly one
        // element may appear at document level.
        int rootCount = 0;
        for (int i = 0; i < total; i++) {
            Node candidate = replacement.item(i);
            if (candidate instanceof Comment
              || candidate instanceof ProcessingInstruction) {
                continue;
            }
            if (candidate instanceof Element) {
                rootCount++;
                continue;
            }
            if (candidate instanceof Text) {
                throw new XIncludeException(
                  "Tried to include text node outside document element");
            }
            throw new XIncludeException(
              "Cannot include a " + candidate.getNodeType() + " node");
        }
        if (rootCount != 1) {
            throw new XIncludeException("Tried to include multiple roots");
        }

        // Everything that precedes the new root goes in front of the old one
        int position = 0;
        for (; position < total; position++) {
            Node leading = replacement.item(position);
            if (leading instanceof Element) {
                break;
            }
            resultDocument.insertBefore(leading, oldRoot);
        }

        // Swap the old root for the new one
        resultDocument.replaceChild(replacement.item(position), oldRoot);
        position++;

        // Everything that follows the new root goes directly after it,
        // ahead of whatever already followed the old root
        Node following = resultDocument.getDocumentElement().getNextSibling();
        for (; position < total; position++) {
            Node trailing = replacement.item(position);
            if (following == null) {
                resultDocument.appendChild(trailing);
            }
            else {
                resultDocument.insertBefore(trailing, following);
            }
        }

        return resultDocument;

    }

  /**
    * <p>
    * This method resolves a DOM <code>Element</code>
    * and merges in all XInclude references. This process is recursive.
    * The nodes returned contain no XInclude elements and belong to the
    * <code>resolved</code> document.
    * If a referenced document cannot be loaded an exception is thrown.
    * The original <code>Element</code> is not changed.
    * </p>
    *
    * @param original <code>Element</code> that will be processed
    * @param base     <code>String</code> form of the base URI against which
    *                 relative URLs will be resolved. This can be null if the
    *                 element includes an <code>xml:base</code> attribute.
    * @param resolved <code>Document</code> into which the resolved element will be placed.
    * @return NodeList the infoset that this element resolves to
    * @throws CircularIncludeException if this <code>Element</code> contains an XInclude element
    *                                  that attempts to include a document in which
    *                                  this element is directly or indirectly included.
    * @throws XIncludeException if the element, though namespace well-formed,
    *                           violates one of the rules of XInclude or a
    *                           referenced resource cannot be loaded.
    * @throws NullPointerException  if the <code>original</code> or
    *                               <code>resolved</code> argument is null.
    */
    public static NodeList resolve(Element original, String base, Document resolved)
      throws XIncludeException, NullPointerException {

        if (original == null) {
          throw new NullPointerException(
           "You can't XInclude a null element."
          );
        }
        if (resolved == null) {
          // Fail up front instead of somewhere deep inside the recursion
          throw new NullPointerException(
           "The document to resolve into must not be null."
          );
        }

        // The stack starts empty when no base is supplied. It is local to
        // this call, so there is nothing to unwind afterwards; popping it
        // unconditionally would throw EmptyStackException for a null base.
        Stack bases = new Stack();
        if (base != null) bases.push(base);

        return resolve(original, bases, resolved);

    }

    /**
      * <p>
      * Tests whether an element is an XInclude <code>include</code> element,
      * that is, whether its local name is <code>include</code> and its
      * namespace URI is {@link #XINCLUDE_NAMESPACE}.
      * </p>
      *
      * @param element the <code>Element</code> to test; must not be null
      * @return true if the element is an XInclude include element,
      *         false otherwise (including elements in no namespace)
      */
    private static boolean isIncludeElement(Element element) {

        // Compare constant-first: getNamespaceURI() returns null for an
        // element in no namespace and getLocalName() returns null for nodes
        // created with DOM Level 1 methods, so calling equals() on either
        // result would throw a NullPointerException.
        return XINCLUDE_NAMESPACE.equals(element.getNamespaceURI())
          && "include".equals(element.getLocalName());

    }


  /**
    * <p>
    * This method resolves a DOM <code>Element</code> into an infoset
    * and merges in all XInclude references. This process is recursive.
    * The returned infoset contains no XInclude elements and every node
    * in it belongs to the <code>resolved</code> document.
    * If a referenced document cannot be loaded an exception is thrown.
    * The <code>NodeList</code> object returned is new.
    * The original <code>Element</code> is not changed.
    * </p>
    *
    * <p>
    * An include element resolves to the top-level nodes of the referenced
    * document (without its document type declaration) when it is parsed
    * as XML, or to a single text node when <code>parse="text"</code>.
    * Any other element resolves to a shallow copy of itself whose children
    * are the resolved forms of the original children.
    * </p>
    *
    * @param original <code>Element</code> that will be processed
    * @param bases    <code>Stack</code> containing the string forms of
    *                 all the URIs of documents which contain this element
    *                 through XIncludes. This is used to detect if a circular
    *                 reference is being used. The top of the stack is the
    *                 base URI against which relative URLs are resolved.
    * @param resolved <code>Document</code> into which the resolved element will be placed.
    * @return NodeList the infoset into which this element resolves. This is just a copy
    *                  of the element if the element is not an XInclude element and does
    *                  not contain any XInclude elements.
    * @throws CircularIncludeException if this <code>Element</code> contains an XInclude element
    *                                  that attempts to include a document in which
    *                                  this element is directly or indirectly included.
    * @throws MissingHrefException if the <code>href</code> attribute is missing from an include element.
    * @throws MalformedResourceException if an included document is not namespace well-formed
    * @throws BadParseAttributeException if an <code>include</code> element has a <code>parse</code> attribute
    *                                    with any value other than <code>text</code> or <code>xml</code>
    * @throws UnavailableResourceException if the URL in the include element's
    *                                      <code>href</code> attribute cannot be loaded.
    * @throws XIncludeException if this document, though namespace well-formed,
    *                           violates one of the rules of XInclude.
    */
    private static NodeList resolve(Element original, Stack bases, Document resolved)
      throws CircularIncludeException, MissingHrefException, MalformedResourceException,
      BadParseAttributeException, UnavailableResourceException, XIncludeException {

        XIncludeNodeList result = new XIncludeNodeList();
        String base = null;
        if (bases.size() != 0) base = (String) bases.peek();

        if (isIncludeElement(original)) {

            // Verify that there is an href attribute
            if (!original.hasAttribute("href")) {
                throw new MissingHrefException("Missing href attribute");
            }
            String href = original.getAttribute("href");

            // An xml:base attribute on the include element overrides the
            // base inherited from the containing document
            String baseAttribute
              = original.getAttributeNS(XML_NAMESPACE, "base");
            if (baseAttribute != null && !baseAttribute.equals("")) {
                base = baseAttribute;
            }

            // Absolutize the href against the base, if there is one
            String remote;
            if (base != null) {
                try {
                    URL context = new URL(base);
                    URL u = new URL(context, href);
                    remote = u.toExternalForm();
                }
                catch (MalformedURLException ex) {
                    XIncludeException xex = new UnavailableResourceException(
                      "Unresolvable URL " + base + "/" + href);
                    xex.setRootCause(ex);
                    throw xex;
                }
            }
            else {
                remote = href;
            }

            // check for parse attribute; default is true (parse as XML)
            boolean parse = true;
            if (original.hasAttribute("parse")) {
                String parseAttribute = original.getAttribute("parse");
                if (parseAttribute.equals("text")) {
                    parse = false;
                }
                else if (!parseAttribute.equals("xml")) {
                    throw new BadParseAttributeException(
                      parseAttribute + " is not a legal value for the parse attribute"
                    );
                }
            }

            if (parse) {
                // Stack.contains() compares with equals(), so this detects any
                // document that is already being included further up the chain
                if (bases.contains(remote)) {
                    // bases is non-empty here; its top is the document that
                    // holds the offending include element
                    throw new CircularIncludeException(
                      "Circular XInclude Reference to "
                      + remote + " in " + bases.peek());
                }

                try {
                    parser.parse(remote);
                    Document doc = parser.getDocument();
                    bases.push(remote);
                    NodeList docChildren = doc.getChildNodes();
                    for (int i = 0; i < docChildren.getLength(); i++) {
                        Node node = docChildren.item(i);
                        if (node instanceof Element) {
                            // the recursive call returns nodes that already
                            // belong to the resolved document
                            result.add(resolve((Element) node, bases, resolved));
                        }
                        else if (node instanceof DocumentType) {
                            // the included document's DOCTYPE is dropped
                            continue;
                        }
                        else {
                            // Comments and processing instructions are owned by
                            // the included document; they must be imported or
                            // attaching them later fails with WRONG_DOCUMENT_ERR
                            result.add(resolved.importNode(node, true));
                        }
                    }
                    bases.pop();
                }
                catch (SAXParseException e) {
                    int line = e.getLineNumber();
                    int column = e.getColumnNumber();
                    // No usable line number is taken to mean that the
                    // document could not be read at all
                    if (line <= 0) {
                        XIncludeException ex = new UnavailableResourceException("Document "
                          + remote + " was not found.");
                        ex.setRootCause(e);
                        throw ex;
                    }
                    else {
                        XIncludeException ex = new MalformedResourceException("Document "
                          + remote + " is not well-formed at line " + line + ", column " + column);
                        ex.setRootCause(e);
                        throw ex;
                    }
                }
                catch (SAXException e) {
                    XIncludeException ex = new MalformedResourceException("Document "
                      + remote + " is not well-formed.");
                    ex.setRootCause(e);
                    throw ex;
                }
                catch (IOException e) {
                    XIncludeException ex
                      = new UnavailableResourceException("Document not found: "
                      + remote);
                    ex.setRootCause(e);
                    throw ex;
                }
            }
            else { // insert text
                String encoding = original.getAttribute("encoding");
                String s = downloadTextDocument(remote, encoding);
                result.add(resolved.createTextNode(s));
            }

        }
        // not an include element
        else { // recursively process children
            // TODO: xml:base attributes on ordinary elements are not yet
            // taken into account when resolving nested includes.
            // A shallow import copies the element and its attributes but
            // none of its children; those are rebuilt below.
            Element copy = (Element) resolved.importNode(original, false);
            NodeList children = original.getChildNodes();
            for (int i = 0; i < children.getLength(); i++) {
                Node n = children.item(i);
                if (n instanceof Element) {
                    Element e = (Element) n;
                    NodeList kids = resolve(e, bases, resolved);
                    for (int j = 0; j < kids.getLength(); j++) {
                        copy.appendChild(kids.item(j));
                    }
                }
                else {
                    copy.appendChild(resolved.importNode(n, true));
                }
            }
            result.add(copy);
        }

        return result;

    }

  /**
    * <p>
    * This utility method reads a document at a specified URL
    * and returns the contents of that document as a <code>String</code>.
    * It's used to include files with <code>parse="text"</code>
    * </p>
    *
    * <p>
    * The character encoding is chosen as follows: a <code>charset</code>
    * parameter in the Content-Type header wins; otherwise, for XML media
    * types, the encoding is detected from the stream itself; otherwise the
    * <code>encoding</code> argument is used.
    * </p>
    *
    * @param url      URL of the document that will be stored in
    *                 <code>String</code>.
    * @param  encoding Encoding of the document; e.g. UTF-8,
    *                  ISO-8859-1, etc. If this is null or the empty string
    *                  then UTF-8 is guessed.
    * @return String  The document retrieved from the source <code>URL</code>
    * @throws UnavailableResourceException if the requested document cannot
    *                                      be downloaded from the specified URL
    *                                      or its encoding is not supported.
    */
    private static String downloadTextDocument(String url, String encoding)
      throws UnavailableResourceException {

        if (encoding == null || encoding.equals("")) {
            encoding = "UTF-8";
        }
        URL source;
        try {
            source = new URL(url);
        }
        catch (MalformedURLException e) {
            UnavailableResourceException ex =
              new UnavailableResourceException("Unresolvable URL " + url);
            ex.setRootCause(e);
            throw ex;
        }

        StringBuffer s = new StringBuffer();
        InputStream in = null;
        try {
            URLConnection uc = source.openConnection();
            in = new BufferedInputStream(uc.getInputStream());
            String contentType = uc.getContentType();

            // The character set travels in the Content-Type header. The
            // Content-Encoding header names a transfer coding such as gzip
            // and must not be used as a charset name.
            String charset = charsetFromContentType(contentType);
            if (charset != null) {
                encoding = charset;
            }
            else if (isXMLMediaType(contentType)) {
                String detected = EncodingHeuristics.readEncodingFromStream(in);
                // keep the current choice if detection yields nothing usable
                if (detected != null && !detected.equals("")) {
                    encoding = detected;
                }
            }

            // Read characters through the decoding reader; reading the raw
            // stream would turn every byte into a char and ignore the encoding
            InputStreamReader reader = new InputStreamReader(in, encoding);
            int c;
            while ((c = reader.read()) != -1) {
                s.append((char) c);
            }
            return s.toString();
        }
        catch (UnsupportedEncodingException e) {
            UnavailableResourceException ex = new UnavailableResourceException(
              "Encoding not recognized for document " + source.toExternalForm());
            ex.setRootCause(e);
            throw ex;
        }
        catch (IOException e) {
            UnavailableResourceException ex = new UnavailableResourceException(
              "Document not found: " + source.toExternalForm());
            ex.setRootCause(e);
            throw ex;
        }
        finally {
            if (in != null) {
                try {
                    in.close();
                }
                catch (IOException ignored) {
                    // best effort: the content has been read or a more
                    // meaningful exception is already on its way out
                }
            }
        }

    }

    // Extracts the value of the charset parameter from a Content-Type header
    // value; returns null if the header is null or has no such parameter.
    private static String charsetFromContentType(String contentType) {

        if (contentType == null) return null;

        int semicolon = contentType.indexOf(';');
        while (semicolon >= 0) {
            int next = contentType.indexOf(';', semicolon + 1);
            String parameter = (next < 0)
              ? contentType.substring(semicolon + 1)
              : contentType.substring(semicolon + 1, next);
            parameter = parameter.trim();
            // parameter names are case-insensitive
            if (parameter.regionMatches(true, 0, "charset=", 0, 8)) {
                String value = parameter.substring(8).trim();
                if (value.length() >= 2
                  && value.startsWith("\"") && value.endsWith("\"")) {
                    value = value.substring(1, value.length() - 1);
                }
                if (value.length() > 0) return value;
            }
            semicolon = next;
        }
        return null;

    }

    // Tells whether a Content-Type header value names an XML media type
    // (text/xml, application/xml, or any text/ or application/ type ending
    // in +xml), ignoring case and any parameters.
    private static boolean isXMLMediaType(String contentType) {

        if (contentType == null) return false;

        int semicolon = contentType.indexOf(';');
        String mediaType = (semicolon < 0)
          ? contentType : contentType.substring(0, semicolon);
        // fixed locale so that case folding does not depend on the platform
        mediaType = mediaType.trim().toLowerCase(java.util.Locale.ENGLISH);

        return mediaType.equals("text/xml")
          || mediaType.equals("application/xml")
          || ((mediaType.startsWith("text/") || mediaType.startsWith("application/"))
              && mediaType.endsWith("+xml"));

    }

    /**
      * <p>
      * The driver method for the XIncluder program.
      * I'll probably move this to a separate class soon.
      * Each argument is parsed, its XIncludes are resolved, and the result
      * is written to <code>System.out</code> encoded as ISO-8859-1.
      * A failure on one argument is reported on <code>System.err</code>
      * and processing continues with the next argument.
      * </p>
      *
      * @param args  contains the URLs and/or filenames
      *              of the documents to be processed.
      */
    public static void main(String[] args) {

        // a separate parser for the input documents; named differently from
        // the static field so that it does not shadow it
        DOMParser inputParser = new DOMParser();
        for (int i = 0; i < args.length; i++) {
            try {
                inputParser.parse(args[i]);
                Document input = inputParser.getDocument();

                // absolutize URL: anything that is not already a well-formed
                // URL (including Windows paths such as C:\dir\file.xml) is
                // treated as a file name
                String base = args[i];
                try {
                    new URL(base);
                }
                catch (MalformedURLException notAUrl) {
                    File f = new File(base);
                    // go through a URI so that characters that are illegal
                    // in URLs, such as spaces, get escaped
                    base = f.toURI().toURL().toExternalForm();
                }

                Document output = resolve(input, base);
                // need to set encoding on this to Latin-1 and check what
                // happens to UTF-8 curly quotes
                OutputFormat format = new OutputFormat("XML", "ISO-8859-1", false);
                format.setPreserveSpace(true);
                XMLSerializer serializer
                 = new XMLSerializer(System.out, format);
                serializer.serialize(output);
            }
            catch (Exception e) {
                System.err.println(e);
                e.printStackTrace();
            }
        }

    }

}


// I need to create NodeLists in a parser independent fashion
class XIncludeNodeList implements NodeList {

    private List data = new ArrayList();

// could easily expose more List methods if they seem useful
    public void add(int index, Node node) {
        data.add(index, node);
    }

    public void add(Node node) {
        data.add(node);
    }

    public void add(NodeList nodes) {
        for (int i = 0; i < nodes.getLength(); i++) {
            data.add(nodes.item(i));
        }
    }

    public Node item(int index) {
        return (Node) data.get(index);
    }

// copy DOM JavaDoc
    public int getLength() {
        return data.size();
    }

}

// Previous | Next | Top | Cafe con Leche
//
// Copyright 2000, 2001 Elliotte Rusty Harold
// elharo@metalab.unc.edu
// Last Modified January 13, 2001