Implementation as DOM

/*--

 Copyright 2001 Elliotte Rusty Harold.
 All rights reserved.

 I haven't yet decided on a license.
 It will be some form of open source.

 THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESSED OR IMPLIED
 WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 DISCLAIMED.  IN NO EVENT SHALL ELLIOTTE RUSTY HAROLD OR ANY
 OTHER CONTRIBUTORS TO THIS PACKAGE
 BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
 USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 SUCH DAMAGE.

 */

package com.macfaq.xml;

import java.net.URL;
import java.net.MalformedURLException;
import java.util.Stack;
import java.util.List;
import java.util.ArrayList;
import org.xml.sax.SAXException;
import java.io.File;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.BufferedInputStream;
import java.io.InputStream;

import org.w3c.dom.Element;
import org.w3c.dom.Document;
import org.w3c.dom.Comment;
import org.w3c.dom.ProcessingInstruction;
import org.w3c.dom.DocumentType;
import org.w3c.dom.Text;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;

import org.apache.xerces.parsers.DOMParser;
import org.apache.xml.serialize.OutputFormat;
import org.apache.xml.serialize.XMLSerializer;

/**
 * <p><code>DOMXIncluder</code> provides methods to
 * resolve DOM elements and documents to produce
 * a new <code>Document</code> or <code>Element</code> with all
 * XInclude references resolved.
 * </p>
 *
 *
 * @author Elliotte Rusty Harold
 * @version 1.0d6
 */
public class DOMXIncluder {

  /** Namespace URI that identifies XInclude elements. */
  public final static String XINCLUDE_NAMESPACE
   = "http://www.w3.org/2001/XInclude";
  /**
   * Namespace URI bound to the reserved <code>xml</code> prefix;
   * used in this class to look up <code>xml:base</code> attributes.
   */
  public final static String XML_NAMESPACE
   = "http://www.w3.org/XML/1998/namespace";

  // No instances allowed: every member of this class is static.
  private DOMXIncluder() {}

  // Single parser instance shared by every include that is resolved
  // as parsed XML.
  // NOTE(review): because this is static and unsynchronized, the class is
  // presumably not safe for concurrent use -- confirm before calling it
  // from several threads.
  private static DOMParser parser = new DOMParser();

  /**
    * <p>
    * This method resolves a DOM <code>Document</code>
    * and merges in all XInclude references.
    * The <code>Document</code> object returned is a new document.
    * The original <code>Document</code> object is not changed.
    * </p>
    *
    * <p>
    * This method depends on the ability to clone a DOM <code>Document</code>
    * which not all DOM parsers may be able to do.
    * It definitely exercises a bug in Xerces-J 1.3.1.
    * This bug is fixed in Xerces-J 1.4.0.
    * </p>
    *
    * <p>
    * The root element is resolved into a list of nodes. That list must
    * contain exactly one element; comments and processing instructions
    * that precede or follow it are placed before or after the new root.
    * Text nodes are always rejected, even when they contain only white
    * space, because a DOM <code>Document</code> cannot have
    * <code>Text</code> children.
    * </p>
    *
    * @param original <code>Document</code> that will be processed
    * @param base     <code>String</code> form of the base URI against which
    *                 relative URLs will be resolved. This can be null if the
    *                 document includes an <code>xml:base</code> attribute.
    * @return Document new <code>Document</code> object in which all
    *                  XInclude elements have been replaced.
    * @throws XIncludeException if this document, though namespace well-formed,
    *                           violates one of the rules of XInclude; in
    *                           particular if the resolved content has no
    *                           root element, more than one root element,
    *                           or a node that may not be a child of a document.
    * @throws NullPointerException  if the original argument is null.
    */
    public static Document resolve(Document original, String base)
      throws XIncludeException, NullPointerException {

        if (original == null) {
          throw new NullPointerException("Document must not be null");
        }

        Document resultDocument = (Document) original.cloneNode(true);

        Element resultRoot = resultDocument.getDocumentElement();

        NodeList resolved = resolve(resultRoot, base, resultDocument);

        // Validate the replacement for the root before touching the tree:
        // exactly one element, and nothing that cannot live at document level.
        int numberRoots = 0;
        for (int i = 0; i < resolved.getLength(); i++) {
            Node node = resolved.item(i);
            if (node instanceof Comment
              || node instanceof ProcessingInstruction
              || node instanceof DocumentType) {
                continue;
            }
            else if (node instanceof Element) {
                numberRoots++;
            }
            else if (node instanceof Text) {
                throw new XIncludeException(
                  "Tried to include text node outside document element");
            }
            else {
                throw new XIncludeException(
                  "Cannot include a node of type "
                  + nodeTypeName(node.getNodeType()));
            }
        }
        if (numberRoots == 0) {
            throw new XIncludeException(
              "Tried to include content with no root element");
        }
        if (numberRoots > 1) {
            throw new XIncludeException("Tried to include multiple roots");
        }

        // insert nodes before the root
        int nodeIndex = 0;
        while (nodeIndex < resolved.getLength()) {
            if (resolved.item(nodeIndex) instanceof Element) break;
            resultDocument.insertBefore(resolved.item(nodeIndex), resultRoot);
            nodeIndex++;
        }

        // insert new root
        resultDocument.replaceChild(
          resolved.item(nodeIndex), resultRoot
        );
        nodeIndex++;

        // insert nodes after new root; insertBefore() with a null reference
        // node appends, so one loop covers both "root is last child" and
        // "root is followed by other nodes".
        Node refNode = resultDocument.getDocumentElement().getNextSibling();
        while (nodeIndex < resolved.getLength()) {
            resultDocument.insertBefore(resolved.item(nodeIndex), refNode);
            nodeIndex++;
        }

        return resultDocument;

    }

    /**
      * Returns a human readable name for a DOM node type code,
      * for use in error messages.
      *
      * @param type one of the <code>Node.*_NODE</code> constants
      * @return a short English name, or the numeric code if it is not
      *         one of the types named here
      */
    private static String nodeTypeName(short type) {
        switch (type) {
            case Node.ATTRIBUTE_NODE:         return "attribute";
            case Node.CDATA_SECTION_NODE:     return "CDATA section";
            case Node.DOCUMENT_NODE:          return "document";
            case Node.DOCUMENT_FRAGMENT_NODE: return "document fragment";
            case Node.ENTITY_NODE:            return "entity";
            case Node.ENTITY_REFERENCE_NODE:  return "entity reference";
            case Node.NOTATION_NODE:          return "notation";
            default:                          return "code " + type;
        }
    }

  /**
    * <p>
    * This method resolves a DOM <code>Element</code>
    * and merges in all XInclude references. This process is recursive.
    * The nodes returned contain no XInclude elements.
    * If a referenced document cannot be loaded an exception is thrown.
    * The nodes returned are new nodes owned by <code>resolved</code>.
    * The original <code>Element</code> is not changed.
    * </p>
    *
    * @param original <code>Element</code> that will be processed
    * @param base     <code>String</code> form of the base URI against which
    *                 relative URLs will be resolved. This can be null if the
    *                 element includes an <code>xml:base</code> attribute.
    * @param resolved <code>Document</code> into which the resolved element will be placed.
    * @return NodeList the infoset that this element resolves to
    * @throws CircularIncludeException if this <code>Element</code> contains an XInclude element
    *                                  that attempts to include a document in which
    *                                  this element is directly or indirectly included.
    * @throws XIncludeException if the element violates one of the rules
    *                           of XInclude or an included resource cannot be used.
    * @throws NullPointerException  if the <code>original</code> or
    *                               <code>resolved</code> argument is null.
    */
    public static NodeList resolve(Element original, String base, Document resolved)
      throws XIncludeException, NullPointerException {

        if (original == null) {
          throw new NullPointerException(
           "You can't XInclude a null element."
          );
        }
        // Fail fast instead of after remote documents have been fetched.
        if (resolved == null) {
          throw new NullPointerException(
           "The target document must not be null."
          );
        }

        // The stack is private to this call, so there is nothing to unwind
        // afterwards. (Popping it unconditionally would throw
        // EmptyStackException whenever base is null.)
        Stack bases = new Stack();
        if (base != null) bases.push(base);

        return resolve(original, bases, resolved);

    }

    /**
      * Tests whether an element is an XInclude <code>include</code> element,
      * i.e. its local name is <code>include</code> and its namespace URI is
      * {@link #XINCLUDE_NAMESPACE}.
      *
      * @param element the element to test; must not be null
      * @return true if the element is an XInclude include element
      */
    private static boolean isIncludeElement(Element element) {

        // Compare from the constant side: getNamespaceURI() is null for
        // elements in no namespace and getLocalName() is null for nodes
        // created with DOM Level 1 methods.
        return XINCLUDE_NAMESPACE.equals(element.getNamespaceURI())
            && "include".equals(element.getLocalName());

    }


  /**
    * <p>
    * This method resolves a DOM <code>Element</code> into an infoset
    * and merges in all XInclude references. This process is recursive.
    * The returned infoset contains no XInclude elements.
    * If a referenced document cannot be loaded or parsed an exception
    * is thrown. The <code>NodeList</code> object returned is new and
    * every node in it is owned by <code>resolved</code>.
    * The original <code>Element</code> is not changed.
    * </p>
    *
    * <p>
    * Known limitation: an <code>xml:base</code> attribute is only honored
    * on the include element itself, not on its ancestors.
    * </p>
    *
    * @param original <code>Element</code> that will be processed
    * @param bases    <code>Stack</code> containing the string forms of
    *                 all the URIs of documents which contain this element
    *                 through XIncludes. This is used to detect if a circular
    *                 reference is being used. The stack has the same
    *                 contents on exit as on entry, whether the method
    *                 returns or throws.
    * @param resolved <code>Document</code> into which the resolved element will be placed.
    * @return NodeList the infoset into which this element resolves. This is just a copy
    *                  of the element if the element is not an XInclude element and does
    *                  not contain any XInclude elements.
    * @throws CircularIncludeException if this <code>Element</code> contains an XInclude element
    *                                  that attempts to include a document in which
    *                                  this element is directly or indirectly included.
    * @throws MissingHrefException if the <code>href</code> attribute is missing from an include element.
    * @throws MalformedResourceException if an included document is not namespace well-formed.
    * @throws UnavailableResourceException if the URL in the include element's
    *                                      <code>href</code> attribute cannot be loaded.
    * @throws XIncludeException if this document, though namespace well-formed,
    *                           violates one of the rules of XInclude.
    */
    private static NodeList resolve(Element original, Stack bases, Document resolved)
      throws CircularIncludeException, MissingHrefException, MalformedResourceException,
             UnavailableResourceException, XIncludeException {

      XIncludeNodeList result = new XIncludeNodeList();
      String base = null;
      if (bases.size() != 0) base = (String) bases.peek();

      if (isIncludeElement(original)) {

        // Verify that there is an href attribute
        if (!original.hasAttribute("href")) {
          throw new MissingHrefException("Missing href attribute");
        }
        String href = original.getAttribute("href");

        // An xml:base attribute on the include element overrides the
        // base inherited from the containing document
        String baseAttribute
          = original.getAttributeNS(XML_NAMESPACE, "base");
        if (baseAttribute != null && !baseAttribute.equals("")) {
            base = baseAttribute;
        }

        String remote;
        if (base != null) {
            try {
                URL context = new URL(base);
                URL u = new URL(context, href);
                remote = u.toExternalForm();
            }
            catch (MalformedURLException ex) {
                XIncludeException xex = new UnavailableResourceException(
                  "Unresolvable URL " + base + "/" + href);
                xex.setRootCause(ex);
                throw xex;
            }
        }
        else {
            remote = href;
        }

        // check for parse attribute; the default is to parse as XML and
        // only the value "text" selects plain text inclusion
        boolean parse = true;
        if (original.hasAttribute("parse")) {
            String parseAttribute = original.getAttribute("parse");
            if (parseAttribute.equals("text")) {
                parse = false;
            }
        }

        if (parse) {
            // Stack.contains() compares with equals(), so this detects any
            // document that is already being processed further up the chain.
            // The stack is non-empty here, hence base is non-null.
            if (bases.contains(remote)) {
                throw new CircularIncludeException(
                  "Circular XInclude Reference to "
                  + remote + " in " + base);
            }

            try {
                parser.parse(remote);
                Document doc = parser.getDocument();
                bases.push(remote);
                try {
                    // Copy every top-level node except the DOCTYPE
                    NodeList docChildren = doc.getChildNodes();
                    for (int i = 0; i < docChildren.getLength(); i++) {
                        Node node = docChildren.item(i);
                        if (node instanceof Element) {
                            result.add(resolve((Element) node, bases, resolved));
                        }
                        else if (node instanceof DocumentType) continue;
                        else {
                            // Comments and processing instructions still
                            // belong to the included document; they must be
                            // imported before they can be attached to the
                            // target document.
                            result.add(resolved.importNode(node, true));
                        }
                    }
                }
                finally {
                    // keep the stack balanced even if a nested include fails
                    bases.pop();
                }
            }
            // Make this configurable
            catch (SAXException e) {
               XIncludeException ex = new MalformedResourceException("Document "
                 + remote + " is not well-formed.");
               ex.setRootCause(e);
               throw ex;
            }
            catch (IOException e) {
                XIncludeException ex
                  = new UnavailableResourceException("Document not found: "
                  + remote);
                ex.setRootCause(e);
                throw ex;
            }
        }
        else { // insert text
            String s = downloadTextDocument(remote);
            result.add(resolved.createTextNode(s));
        }

      }
      // not an include element
      else { // recursively process children
         // A shallow import still copies the element's attributes, so only
         // the children have to be rebuilt by hand.
         Element copy = (Element) resolved.importNode(original, false);
         NodeList children = original.getChildNodes();
         for (int i = 0; i < children.getLength(); i++) {
           Node n = children.item(i);
           if (n instanceof Element) {
             Element e = (Element) n;
             NodeList kids = resolve(e, bases, resolved);
             for (int j = 0; j < kids.getLength(); j++) {
                 copy.appendChild(kids.item(j));
             }
           }
           else {
             copy.appendChild(resolved.importNode(n, true));
           }
         }
         result.add(copy);
      }

      return result;

    }

  /**
    * <p>
    * This utility method reads a document at a specified URL
    * and returns the contents of that document as a <code>String</code>.
    * It's used to include files with <code>parse="text"</code>.
    * The bytes are always decoded as ISO-8859-1 (Latin-1).
    * </p>
    *
    * @param url      URL of the document that will be stored in
    *                 <code>String</code>.
    * @return String  The text of the document retrieved from the
    *                 source <code>URL</code>.
    * @throws UnavailableResourceException if the URL is malformed or the
    *                                      requested document cannot
    *                                      be downloaded from it.
    */
    private static String downloadTextDocument(String url)
      throws UnavailableResourceException {

        URL source;
        try {
            source = new URL(url);
        }
        catch (MalformedURLException e) {
            UnavailableResourceException ex =
              new UnavailableResourceException("Unresolvable URL " + url);
            ex.setRootCause(e);
            throw ex;
        }

        StringBuffer s = new StringBuffer();
        InputStream in = null;
        try {
            in = new BufferedInputStream(source.openStream());
            // NOTE(review): the encoding is fixed; an encoding requested by
            // the include element is not passed down to this method.
            InputStreamReader reader = new InputStreamReader(in, "8859_1");
            // Read through the reader so the declared encoding is the one
            // that is actually applied.
            char[] buffer = new char[4096];
            int count;
            while ((count = reader.read(buffer)) != -1) {
                s.append(buffer, 0, count);
            }
            return s.toString();
        }
        catch (IOException e) {
            UnavailableResourceException ex = new UnavailableResourceException(
              "Document not found: " + source.toExternalForm());
            ex.setRootCause(e);
            throw ex;
        }
        finally {
            // Always release the connection, on success and on failure.
            if (in != null) {
                try {
                    in.close();
                }
                catch (IOException ignored) {
                    // The content has been read or the read already failed;
                    // a failure to close adds nothing the caller can act on.
                }
            }
        }

    }

    /**
      * <p>
      * Command line entry point. Each argument is parsed, its XIncludes
      * are resolved, and the merged document is written to
      * <code>System.out</code> encoded as ISO-8859-1. A failure on one
      * argument is reported on <code>System.err</code> and does not stop
      * the processing of the remaining arguments.
      * </p>
      *
      * @param args  contains the URLs and/or filenames
      *              of the documents to be processed.
      */
    public static void main(String[] args) {

        DOMParser inputParser = new DOMParser();
        int argIndex = 0;
        while (argIndex < args.length) {
            String location = args[argIndex];
            argIndex++;
            try {
                inputParser.parse(location);
                Document source = inputParser.getDocument();

                // An argument without a colon is taken to be a file name
                // and is turned into an absolute file: URL.
                String baseUrl = location;
                boolean hasScheme = baseUrl.indexOf(':') >= 0;
                if (!hasScheme) {
                  File localFile = new File(baseUrl);
                  baseUrl = localFile.toURL().toExternalForm();
                }

                Document merged = resolve(source, baseUrl);

                // Serialize as Latin-1, unindented, keeping white space
                OutputFormat latin1
                 = new OutputFormat("XML", "ISO-8859-1", false);
                latin1.setPreserveSpace(true);
                XMLSerializer writer = new XMLSerializer(System.out, latin1);
                writer.serialize(merged);
            }
            catch (Exception problem) {
                System.err.println(problem);
                problem.printStackTrace();
            }
        }

    }

}


// I need to create NodeLists in a parser independent fashion
class XIncludeNodeList implements NodeList {
   
    private List data = new ArrayList();     

// could easily expose more List methods if they seem useful
    public void add(int index, Node node) {
        data.add(index, node);
    }

    public void add(Node node) {
        data.add(node);
    }

    public void add(NodeList nodes) {
        for (int i = 0; i < nodes.getLength(); i++) {
            data.add(nodes.item(i));
        }
    }

    public Node item(int index) {
        return (Node) data.get(index);  
    }

// copy DOM JavaDoc
    public int getLength() {
        return data.size();
    }      
        
}

Previous | Next | Top | Cafe con Leche

Copyright 2000, 2001 Elliotte Rusty Harold
elharo@metalab.unc.edu
Last Modified January 13, 2001