Implementation as JDOM

/*--

 Copyright 2000 Elliotte Rusty Harold.
 All rights reserved.

 I haven't yet decided on a license.
 It will be some form of open source.

 THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESSED OR IMPLIED
 WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 DISCLAIMED.  IN NO EVENT SHALL ELLIOTTE RUSTY HAROLD OR ANY
 OTHER CONTRIBUTORS TO THIS PACKAGE
 BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
 USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 SUCH DAMAGE.

 */

package com.macfaq.xml;

import java.net.URL;
import java.net.MalformedURLException;
import java.util.Stack;
import java.util.Iterator;
import java.util.List;
import java.io.File;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.BufferedInputStream;
import java.io.InputStream;
import org.jdom.Namespace;
import org.jdom.Comment;
import org.jdom.CDATA;
import org.jdom.JDOMException;
import org.jdom.Attribute;
import org.jdom.Element;
import org.jdom.ProcessingInstruction;
import org.jdom.Document;
import org.jdom.input.SAXBuilder;
import org.jdom.output.XMLOutputter;

/**
 * <p><code>XIncluder</code> provides methods to
 * resolve JDOM elements and documents to produce
 * a new Document or Element with all
 * XInclude references resolved.
 * </p>
 *
 *
 * @author Elliotte Rusty Harold
 * @version 1.0d2
 */
public class XIncluder {

  /**
    * The namespace an element must be in (with local name
    * <code>include</code>) to be treated as an XInclude element.
    */
  public static final Namespace XINCLUDE_NAMESPACE
    = Namespace.getNamespace("xinclude", "http://www.w3.org/1999/XML/xinclude");

  // No instances allowed
  private XIncluder() {}

  // Parser used to load every document included with parse="xml".
  // NOTE(review): this single instance is shared by all calls; confirm
  // SAXBuilder is safe for concurrent use before calling resolve()
  // from several threads.
  private static final SAXBuilder builder = new SAXBuilder();

  /**
    * <p>
    * This method resolves a JDOM <code>Document</code>
    * and merges in all XInclude references.
    * If a referenced document cannot be found it is replaced with
    * an error message. The Document object returned is a new document.
    * The original <code>Document</code> is not changed.
    * </p>
    *
    * <p>
    * Comments and processing instructions that are direct children of
    * the original document are cloned into the result. They are added
    * after the result has been constructed around its root element.
    * </p>
    *
    * @param original <code>Document</code> that will be processed
    * @param base     <code>String</code> form of the base URI against which
    *                 relative URLs will be resolved. This can be null if the
    *                 document includes an <code>xml:base</code> attribute.
    * @return Document new <code>Document</code> object in which all
    *                  XInclude elements have been replaced.
    * @throws NullPointerException if <code>original</code> is null.
    * @throws ClassCastException if the root element is itself an include
    *                            element that resolves to text (or to an
    *                            error message), since text cannot be the
    *                            root of a document.
    * @throws CircularIncludeException if this document possesses a cycle of
    *                                  XIncludes.
    * @throws MalformedURLException declared for callers; the resolution code
    *                               in this class catches malformed URLs itself
    *                               and substitutes an error message.
    */
  public static Document resolve(Document original, String base)
    throws CircularIncludeException, MalformedURLException {

    if (original == null) {
      throw new NullPointerException("Document must not be null");
    }

    Object resolvedRoot = resolve(original.getRootElement(), base);

    // An include element used as the root can resolve to a String.
    // Keep the exception type the unchecked cast used to produce, but
    // say what actually went wrong.
    if (!(resolvedRoot instanceof Element)) {
      String preview = String.valueOf(resolvedRoot);
      if (preview.length() > 200) preview = preview.substring(0, 200) + "...";
      throw new ClassCastException(
        "The root element was replaced by text and cannot be used as a"
        + " document root: " + preview);
    }

    Document result
      = new Document((Element) resolvedRoot, original.getDocType());

    Iterator iterator = original.getMixedContent().iterator();
    while (iterator.hasNext()) {
      Object o = iterator.next();
      if (o instanceof Comment) {
        result.addContent((Comment) ((Comment) o).clone());
      }
      else if (o instanceof ProcessingInstruction) {
        result.addContent(
          (ProcessingInstruction) ((ProcessingInstruction) o).clone());
      }
    }

    return result;

  }

  /**
    * <p>
    * This method resolves a JDOM <code>Element</code>
    * and merges in all XInclude references. This process is recursive.
    * The element returned contains no XInclude elements.
    * If a referenced document cannot be found it is replaced with
    * an error message. The <code>Element</code> object returned is a new element.
    * The original <code>Element</code> is not changed.
    * </p>
    *
    * @param original <code>Element</code> that will be processed
    * @param base     <code>String</code> form of the base URI against which
    *                 relative URLs will be resolved. This can be null if the
    *                 element includes an <code>xml:base</code> attribute.
    * @return Object  Either an <code>Element</code>
    *                 (ordinary elements and <code>parse="xml"</code>
    *                 includes) or a <code>String</code>
    *                 (<code>parse="text"</code> includes and error messages)
    * @throws NullPointerException if <code>original</code> is null.
    * @throws CircularIncludeException if this <code>Element</code> contains an XInclude element
    *                                  that attempts to include a document in which
    *                                  this element is directly or indirectly included.
    * @throws MalformedURLException declared for callers; the resolution code
    *                               in this class catches malformed URLs itself
    *                               and substitutes an error message.
    */
  public static Object resolve(Element original, String base)
    throws CircularIncludeException, MalformedURLException {

    if (original == null) {
      throw new NullPointerException("You can't XInclude a null element.");
    }

    Stack bases = new Stack();
    if (base != null) bases.push(base);

    // The stack is local to this call, so there is nothing to pop
    // afterwards. (Popping unconditionally failed with an
    // EmptyStackException whenever base was null.)
    return resolve(original, bases);

  }

  // True if the element is named "include" and lives in XINCLUDE_NAMESPACE.
  private static boolean isIncludeElement(Element element) {
    return element.getName().equals("include")
        && element.getNamespace().equals(XINCLUDE_NAMESPACE);
  }

  /**
    * <p>
    * This method resolves a JDOM <code>Element</code>
    * and merges in all XInclude references. This process is recursive.
    * The element returned contains no XInclude elements.
    * If a referenced document cannot be found it is replaced with
    * an error message. The <code>Element</code> object returned is a new element.
    * The original <code>Element</code> is not changed.
    * </p>
    *
    * @param original <code>Element</code> that will be processed
    * @param bases    <code>Stack</code> containing the string forms of
    *                 all the URIs of documents which contain this element
    *                 through XIncludes. This is used to detect if a circular
    *                 reference is being used. The top of the stack is the
    *                 base against which relative URLs are resolved; the
    *                 stack may be empty if no base is known.
    * @return Object  Either an <code>Element</code>
    *                 (ordinary elements and <code>parse="xml"</code>
    *                 includes) or a <code>String</code>
    *                 (<code>parse="text"</code> includes and error messages)
    * @throws IllegalArgumentException if an include element has no
    *                                  <code>href</code> attribute.
    * @throws CircularIncludeException if this <code>Element</code> contains an XInclude element
    *                                  that attempts to include a document in which
    *                                  this element is directly or indirectly included.
    */
  protected static Object resolve(Element original, Stack bases)
    throws CircularIncludeException {

    if (isIncludeElement(original)) {
      return resolveInclude(original, bases);
    }
    return copyResolvingChildren(original, bases);

  }

  // Replaces a single include element with the content it points to:
  // an Element for parse="xml" (the default), a String for parse="text"
  // or when the target cannot be located or parsed.
  private static Object resolveInclude(Element include, Stack bases)
    throws CircularIncludeException {

    Attribute href = include.getAttribute("href");
    if (href == null) { // illegal, what kind of exception????
      throw new IllegalArgumentException("Missing href attribute");
    }

    // null (not "") means "no base known", so that an absolute href can
    // still be resolved on its own.
    String base = bases.isEmpty() ? null : (String) bases.peek();
    Attribute baseAttribute
      = include.getAttribute("base", Namespace.XML_NAMESPACE);
    if (baseAttribute != null) base = baseAttribute.getValue();

    // Anything other than parse="text" is treated as an XML include.
    Attribute parseAttribute = include.getAttribute("parse");
    boolean parseAsXml = parseAttribute == null
                      || !"text".equals(parseAttribute.getValue());

    URL remote;
    try {
      if (base != null) remote = new URL(new URL(base), href.getValue());
      else remote = new URL(href.getValue());
    }
    catch (MalformedURLException ex) {
      if (base != null) {
        return "Unresolvable URL " + base + "/" + href.getValue();
      }
      return "Unresolvable URL " + href.getValue();
    }

    if (!parseAsXml) { // insert text
      return downloadTextDocument(remote);
    }

    String location = remote.toExternalForm();

    // Stack.contains compares with equals(), so this matches URLs by
    // their string form, not by object identity.
    if (bases.contains(location)) {
      // need to figure out how to get the line number where
      // the bad include occurs
      throw new CircularIncludeException(
        "Circular XInclude Reference to "
        + location + " in " + bases.peek());
    }

    Document doc;
    try {
      doc = builder.build(remote);
    }
    // Make this configurable
    catch (JDOMException e) {
      return "Document not found: " + location
        + "\r\n" + e.getMessage();
    }

    bases.push(location);
    try {
      // The included root may itself be an include that resolves to
      // text, so pass the result through without casting it.
      return resolve(doc.getRootElement(), bases);
    }
    finally {
      bases.pop();
    }

  }

  // Builds a new element with the same name, namespace and (cloned)
  // attributes as the original, then copies its content across,
  // resolving child elements recursively.
  private static Element copyResolvingChildren(Element original, Stack bases)
    throws CircularIncludeException {

    Element copy = new Element(original.getName(), original.getNamespace());

    Iterator attributes = original.getAttributes().iterator();
    while (attributes.hasNext()) {
      Attribute a = (Attribute) attributes.next();
      copy.addAttribute((Attribute) a.clone());
    }

    Iterator children = original.getMixedContent().iterator();
    while (children.hasNext()) {
      Object child = children.next();
      if (child instanceof Element) {
        Object resolved = resolve((Element) child, bases);
        if (resolved instanceof String) {
          copy.addContent((String) resolved);
        }
        else copy.addContent((Element) resolved);
      }
      else if (child instanceof String) {
        copy.addContent((String) child);
      }
      // Clone the remaining node types rather than handing the
      // original's own objects to the copy, so the original tree is
      // left untouched (as the document-level resolve already does).
      else if (child instanceof Comment) {
        copy.addContent((Comment) ((Comment) child).clone());
      }
      else if (child instanceof CDATA) {
        copy.addContent((CDATA) ((CDATA) child).clone());
      }
      else if (child instanceof ProcessingInstruction) {
        copy.addContent(
          (ProcessingInstruction) ((ProcessingInstruction) child).clone());
      }
    }

    return copy;

  }

  /**
    * <p>
    * This utility method reads a document at a specified URL
    * and returns the contents of that document as a <code>String</code>.
    * It's used to include files with <code>parse="text"</code>
    * </p>
    *
    * <p>
    * The bytes are decoded as ISO 8859-1 (Latin-1), and every
    * <code>&lt;</code> and <code>&amp;</code> in the document is replaced
    * by <code>&amp;lt;</code> and <code>&amp;amp;</code> respectively.
    * The stream is always closed before this method returns.
    * </p>
    *
    * <p>
    * If the document cannot be located due to an IOException,
    * then an error message string is returned. I'm not yet convinced this
    * is the right behavior. Perhaps I should pass on the exception?
    * </p>
    *
    * @param source   <code>URL</code> of the document that will be stored in
    *                 <code>String</code>.
    * @return String  The document retrieved from the source <code>URL</code>
    *                 or an error message if the document can't be retrieved.
    *                 Note: throwing an exception might be better here. I should
    *                 at least allow the setting of the error message.
    */
  public static String downloadTextDocument(URL source) {

    StringBuffer s = new StringBuffer();
    InputStream in = null;
    try {
      in = new BufferedInputStream(source.openStream());
      // does XInclude give you anything to specify the character set????
      InputStreamReader reader = new InputStreamReader(in, "8859_1");
      int c;
      // Read decoded characters from the reader, not raw bytes from the
      // underlying stream.
      while ((c = reader.read()) != -1) {
        // NOTE(review): if the outputter used by the caller also escapes
        // text content, these characters end up escaped twice - confirm
        // against the JDOM version in use before changing this.
        if (c == '<') s.append("&lt;");
        else if (c == '&') s.append("&amp;");
        else s.append((char) c);
      }
      return s.toString();
    }
    catch (IOException e) {
      return "Document not found: " + source.toExternalForm();
    }
    finally {
      if (in != null) {
        try {
          in.close();
        }
        catch (IOException ignored) {
          // Nothing useful can be done if closing fails; the content
          // (or the error message) has already been determined.
        }
      }
    }

  }

  /**
    * <p>
    * The driver method for the XIncluder program.
    * I'll probably move this to a separate class soon.
    * </p>
    *
    * <p>
    * Each argument is parsed, resolved and written to
    * <code>System.out</code> in turn. A failure on one argument is
    * reported on <code>System.err</code> and does not stop the
    * remaining arguments from being processed.
    * </p>
    *
    * @param args  each entry contains the URL or file name
    *              of a document to be processed.
    */
  public static void main(String[] args) {

    SAXBuilder parser = new SAXBuilder();
    XMLOutputter outputter = new XMLOutputter();
    for (int i = 0; i < args.length; i++) {
      try {
        Document input = parser.build(args[i]);
        // absolutize URL: an argument without a colon is taken to be
        // a file name rather than a URL
        String base = args[i];
        if (base.indexOf(':') < 0) {
          File f = new File(base);
          base = f.toURL().toExternalForm();
        }
        Document output = resolve(input, base);
        // need to set encoding on this to Latin-1 and check what
        // happens to UTF-8 curly quotes
        outputter.output(output, System.out);
      }
      catch (Exception e) {
        System.err.println(e);
        e.printStackTrace();
      }
    }

  }

}

Previous | Next | Top | Cafe con Leche

Copyright 2000, 2001 Elliotte Rusty Harold
elharo@metalab.unc.edu
Last Modified January 13, 2001