// Implementation as DOM
/*--
Copyright 2001 Elliotte Rusty Harold.
All rights reserved.
I haven't yet decided on a license.
It will be some form of open source.
THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESSED OR IMPLIED
WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL ELLIOTTE RUSTY HAROLD OR ANY
OTHER CONTRIBUTORS TO THIS PACKAGE
BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
SUCH DAMAGE.
*/
package com.macfaq.xml;
import java.net.URL;
import java.net.MalformedURLException;
import java.util.Stack;
import java.util.List;
import java.util.ArrayList;
import org.xml.sax.SAXException;
import java.io.File;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.BufferedInputStream;
import java.io.InputStream;
import org.w3c.dom.Element;
import org.w3c.dom.Document;
import org.w3c.dom.Comment;
import org.w3c.dom.ProcessingInstruction;
import org.w3c.dom.DocumentType;
import org.w3c.dom.Text;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.apache.xerces.parsers.DOMParser;
import org.apache.xml.serialize.OutputFormat;
import org.apache.xml.serialize.XMLSerializer;
/**
* <p><code>DOMXIncluder</code> provides methods to
* resolve DOM elements and documents to produce
* a new <code>Document</code> or <code>Element</code> with all
* XInclude references resolved.
* </p>
*
*
* @author Elliotte Rusty Harold
* @version 1.0d6
*/
public class DOMXIncluder {
/** Namespace URI an element must have to be treated as an XInclude element. */
public static final String XINCLUDE_NAMESPACE = "http://www.w3.org/2001/XInclude";

/** Namespace URI used to look up the <code>xml:base</code> attribute. */
public static final String XML_NAMESPACE = "http://www.w3.org/XML/1998/namespace";

/** Parser used to load every document that is included as parsed XML. */
private static DOMParser parser = new DOMParser();

/** All members are static, so no instances are allowed. */
private DOMXIncluder() {}
/**
* <p>
* This method resolves a DOM <code>Document</code>
* and merges in all XInclude references.
* The <code>Document</code> object returned is a new document.
* The original <code>Document</code> object is not changed.
* </p>
*
* <p>
* This method depends on the ability to clone a DOM <code>Document</code>
* which not all DOM parsers may be able to do.
* It definitely exercises a bug in Xerces-J 1.3.1.
* This bug is fixed in Xerces-J 1.4.0.
* </p>
*
* @param original <code>Document</code> that will be processed
* @param base <code>String</code> form of the base URI against which
* relative URLs will be resolved. This can be null if the
* document includes an <code>xml:base</code> attribute.
* @return Document new <code>Document</code> object in which all
* XInclude elements have been replaced.
* @throws XIncludeException if this document, though namespace well-formed,
* violates one of the rules of XInclude.
* @throws NullPointerException if the original argument is null.
*/
public static Document resolve(Document original, String base)
throws XIncludeException, NullPointerException {
    if (original == null) {
        throw new NullPointerException("Document must not be null");
    }

    // All work happens on a deep copy so the caller's document is untouched.
    Document resultDocument = (Document) original.cloneNode(true);
    Element oldRoot = resultDocument.getDocumentElement();
    NodeList infoset = resolve(oldRoot, base, resultDocument);
    int total = infoset.getLength();

    // Validate what the root resolved to: exactly one element, plus any
    // number of comments and processing instructions, and nothing else.
    int rootCount = 0;
    for (int i = 0; i < total; i++) {
        Node candidate = infoset.item(i);
        if (candidate instanceof Comment
         || candidate instanceof ProcessingInstruction
         || candidate instanceof DocumentType) {
            continue;
        }
        if (candidate instanceof Element) {
            rootCount++;
            continue;
        }
        if (candidate instanceof Text) {
            // Whitespace-only text is rejected as well.
            throw new XIncludeException(
              "Tried to include text node outside document element");
        }
        // The node type is reported as its numeric DOM constant.
        throw new XIncludeException(
          "Cannot include a " + candidate.getNodeType() + " node");
    }
    if (rootCount != 1) {
        throw new XIncludeException("Tried to include multiple roots");
    }

    // Everything that precedes the new root goes in front of the old root.
    int position = 0;
    for (; position < total; position++) {
        Node leading = infoset.item(position);
        if (leading instanceof Element) {
            break;
        }
        resultDocument.insertBefore(leading, oldRoot);
    }

    // Swap the old root for the resolved one.
    resultDocument.replaceChild(infoset.item(position), oldRoot);
    position++;

    // Whatever is left belongs directly after the new root, ahead of any
    // node that already followed the old root.
    Node following = resultDocument.getDocumentElement().getNextSibling();
    for (; position < total; position++) {
        Node trailing = infoset.item(position);
        if (following == null) {
            resultDocument.appendChild(trailing);
        }
        else {
            resultDocument.insertBefore(trailing, following);
        }
    }
    return resultDocument;
}
/**
* <p>
* This method resolves a DOM <code>Element</code>
* and merges in all XInclude references. This process is recursive.
* The element returned contains no XInclude elements.
* If a referenced document cannot be loaded an
* <code>UnavailableResourceException</code> is thrown. The <code>NodeList</code> object returned is new.
* The original <code>Element</code> is not changed.
* </p>
*
* @param original <code>Element</code> that will be processed
* @param base <code>String</code> form of the base URI against which
* relative URLs will be resolved. This can be null if the
* element includes an <code>xml:base</code> attribute.
* @param resolved <code>Document</code> into which the resolved element will be placed.
* @return NodeList the infoset that this element resolves to
* @throws CircularIncludeException if this <code>Element</code> contains an XInclude element
* that attempts to include a document in which
* this element is directly or indirectly included.
* @throws NullPointerException if the <code>original</code> argument is null.
*/
public static NodeList resolve(Element original, String base, Document resolved)
throws XIncludeException, NullPointerException {
    if (original == null) {
        throw new NullPointerException(
          "You can't XInclude a null element."
        );
    }
    // The stack records the URIs of the documents currently being
    // processed; it starts with the caller's base URI when one is given.
    Stack bases = new Stack();
    if (base != null) {
        bases.push(base);
    }
    // The stack is local to this call and simply discarded afterwards.
    // It must not be popped here: with a null base nothing was pushed,
    // and popping the empty stack would throw EmptyStackException.
    return resolve(original, bases, resolved);
}
/**
 * Tells whether an element is an XInclude <code>include</code> element,
 * i.e. its local name is <code>include</code> and its namespace URI is
 * {@link #XINCLUDE_NAMESPACE}.
 *
 * @param element the element to test
 * @return true if the element is an XInclude include element
 */
private static boolean isIncludeElement(Element element) {
    // Compare constant-first: the DOM returns null from getNamespaceURI()
    // for elements in no namespace, and null from getLocalName() for nodes
    // created with DOM Level 1 methods. Such elements are simply not
    // include elements; they must not trigger a NullPointerException.
    return "include".equals(element.getLocalName())
        && XINCLUDE_NAMESPACE.equals(element.getNamespaceURI());
}
/**
* <p>
* This method resolves a DOM <code>Element</code> into an infoset
* and merges in all XInclude references. This process is recursive.
* The returned infoset contains no XInclude elements.
* If a referenced document cannot be loaded an
* <code>UnavailableResourceException</code> is thrown. The <code>NodeList</code> object returned is new.
* The original <code>Element</code> is not changed.
* </p>
*
* @param original <code>Element</code> that will be processed
* @param bases <code>Stack</code> containing the string forms of
* all the URIs of documents which contain this element
* through XIncludes. This is used to detect if a circular
* reference is being used.
* @param resolved <code>Document</code> into which the resolved element will be placed.
* @return NodeList the infoset into which this element resolves. This is just a copy
* of the element if the element is not an XInclude element and does
* not contain any XInclude elements.
* @throws CircularIncludeException if this <code>Element</code> contains an XInclude element
* that attempts to include a document in which
* this element is directly or indirectly included.
* @throws MissingHrefException if the <code>href</code> attribute is missing from an include element.
* @throws MalformedResourceException if an included document is not namespace well-formed.
* @throws UnavailableResourceException if the URL in the include element's
<code>href</code> attribute cannot be loaded.
* @throws XIncludeException if this document, though namespace well-formed,
* violates one of the rules of XInclude.
*/
private static NodeList resolve(Element original, Stack bases, Document resolved)
throws CircularIncludeException, MissingHrefException, MalformedResourceException,
UnavailableResourceException, XIncludeException {
    XIncludeNodeList result = new XIncludeNodeList();

    // The innermost base URI is on top of the stack; it may be absent.
    String base = null;
    if (bases.size() != 0) base = (String) bases.peek();

    if (isIncludeElement(original)) {
        // Verify that there is an href attribute
        if (!original.hasAttribute("href")) {
            throw new MissingHrefException("Missing href attribute");
        }
        String href = original.getAttribute("href");

        // An xml:base attribute on the include element overrides the
        // inherited base URI.
        String baseAttribute
          = original.getAttributeNS(XML_NAMESPACE, "base");
        if (baseAttribute != null && !baseAttribute.equals("")) {
            base = baseAttribute;
        }

        // Absolutize the href against the base when there is one;
        // otherwise the href is used exactly as written.
        String remote;
        if (base != null) {
            try {
                URL context = new URL(base);
                URL u = new URL(context, href);
                remote = u.toExternalForm();
            }
            catch (MalformedURLException ex) {
                XIncludeException xex = new UnavailableResourceException(
                  "Unresolvable URL " + base + "/" + href);
                xex.setRootCause(ex);
                throw xex;
            }
        }
        else {
            remote = href;
        }

        // parse="text" requests inclusion as plain text; any other value,
        // or no parse attribute at all, means the target is parsed as XML.
        boolean parse = true;
        if (original.hasAttribute("parse")) {
            String parseAttribute = original.getAttribute("parse");
            if (parseAttribute.equals("text")) {
                parse = false;
            }
        }

        if (parse) {
            // A document that is already being processed further up the
            // include chain must not be included again. Stack.contains
            // compares with equals(), so equal URL strings match.
            if (bases.contains(remote)) {
                throw new CircularIncludeException(
                  "Circular XInclude Reference to "
                  + remote + " in " );
            }

            try {
                parser.parse(remote);
                Document doc = parser.getDocument();
                bases.push(remote);
                // The included document contributes its top-level nodes,
                // minus any document type declaration.
                NodeList docChildren = doc.getChildNodes();
                for (int i = 0; i < docChildren.getLength(); i++) {
                    Node node = docChildren.item(i);
                    if (node instanceof Element) {
                        // Elements are copied into the target document by
                        // the recursive call.
                        result.add(resolve((Element) node, bases, resolved));
                    }
                    else if (node instanceof DocumentType) continue;
                    else {
                        // Comments and processing instructions still belong
                        // to the included document. They have to be imported
                        // into the target document first; attaching a node
                        // owned by another document raises a DOMException
                        // (WRONG_DOCUMENT_ERR).
                        result.add(resolved.importNode(node, true));
                    }
                }
                bases.pop();
            }
            catch (SAXException e) {
                XIncludeException ex = new MalformedResourceException("Document "
                  + remote + " is not well-formed.");
                ex.setRootCause(e);
                throw ex;
            }
            catch (IOException e) {
                XIncludeException ex
                  = new UnavailableResourceException("Document not found: "
                  + remote);
                ex.setRootCause(e);
                throw ex;
            }
        }
        else { // insert text
            String s = downloadTextDocument(remote);
            result.add(resolved.createTextNode(s));
        }
    }
    // not an include element
    else { // recursively process children
        // Known limitation: an xml:base attribute on a non-include element
        // is not taken into account for the includes nested inside it.
        //
        // A shallow import copies the element together with its specified
        // attributes; the children are handled one by one below.
        Element copy = (Element) resolved.importNode(original, false);
        NodeList children = original.getChildNodes();
        for (int i = 0; i < children.getLength(); i++) {
            Node n = children.item(i);
            if (n instanceof Element) {
                // A child element may expand to several nodes
                // (or to a single text node) when it is an include.
                Element e = (Element) n;
                NodeList kids = resolve(e, bases, resolved);
                for (int j = 0; j < kids.getLength(); j++) {
                    copy.appendChild(kids.item(j));
                }
            }
            else {
                copy.appendChild(resolved.importNode(n, true));
            }
        }
        result.add(copy);
    }
    return result;
}
/**
* <p>
* This utility method reads a document at a specified URL
* and returns the contents of that document as a <code>String</code>.
* It's used to include files with <code>parse="text"</code>
* </p>
*
* @param url URL of the document that will be stored in
* <code>String</code>.
* @return String the contents of the document retrieved from the
* source <code>URL</code>.
* Note: if the document can't be retrieved an exception is thrown;
* no error message is returned in its place.
* @throws UnavailableResourceException if the requested document cannot
be downloaded from the specified URL.
*/
private static String downloadTextDocument(String url)
throws UnavailableResourceException {
    URL source;
    try {
        source = new URL(url);
    }
    catch (MalformedURLException e) {
        UnavailableResourceException ex =
          new UnavailableResourceException("Unresolvable URL " + url);
        ex.setRootCause(e);
        throw ex;
    }

    StringBuffer s = new StringBuffer();
    InputStream in = null;
    try {
        in = new BufferedInputStream(source.openStream());
        // The document is decoded as Latin-1; no other character set can
        // be requested through this method.
        InputStreamReader reader = new InputStreamReader(in, "8859_1");
        // Read decoded characters through the reader, in chunks, instead
        // of pulling raw bytes from the underlying stream.
        char[] buffer = new char[4096];
        int count;
        while ((count = reader.read(buffer)) != -1) {
            s.append(buffer, 0, count);
        }
        return s.toString();
    }
    catch (IOException e) {
        UnavailableResourceException ex = new UnavailableResourceException(
          "Document not found: " + source.toExternalForm());
        ex.setRootCause(e);
        throw ex;
    }
    finally {
        // Always release the connection, on success and on failure alike.
        if (in != null) {
            try {
                in.close();
            }
            catch (IOException ignored) {
                // The content was either read completely or a more
                // informative exception is already on its way out.
            }
        }
    }
}
/**
* <p>
* The driver method for the XIncluder program.
* I'll probably move this to a separate class soon.
* </p>
*
* @param args contains the URLs and/or filenames
* of the documents to be processed.
*/
public static void main(String[] args) {
    // A parser of its own for the input documents (named so that it does
    // not shadow the class-level parser used for included documents).
    DOMParser inputParser = new DOMParser();
    for (int i = 0; i < args.length; i++) {
        try {
            inputParser.parse(args[i]);
            Document input = inputParser.getDocument();

            // Absolutize the base: an argument without a colon is taken to
            // be a file name and converted to a file: URL. Going through
            // toURI() escapes characters that are illegal in URLs (spaces,
            // for instance), which the deprecated File.toURL() does not.
            String base = args[i];
            if (base.indexOf(':') < 0) {
                File f = new File(base);
                base = f.toURI().toURL().toExternalForm();
            }

            Document output = resolve(input, base);

            // Serialize to System.out as ISO-8859-1 without indentation,
            // preserving white space.
            OutputFormat format = new OutputFormat("XML", "ISO-8859-1", false);
            format.setPreserveSpace(true);
            XMLSerializer serializer
              = new XMLSerializer(System.out, format);
            serializer.serialize(output);
        }
        catch (Exception e) {
            // Report the failure and carry on with the next argument.
            System.err.println(e);
            e.printStackTrace();
        }
    }
}
}
// I need to create NodeLists in a parser independent fashion
/**
 * A simple, non-live <code>NodeList</code> backed by a
 * <code>java.util.List</code>, so that node lists can be built
 * without relying on any particular parser's implementation classes.
 */
class XIncludeNodeList implements NodeList {

    // Holds the nodes in list order; every element is a Node (or null).
    private List data = new ArrayList();

    // could easily expose more List methods if they seem useful

    /**
     * Inserts a node at the given position, shifting later nodes right.
     *
     * @param index position at which the node is inserted
     * @param node  the node to insert
     * @throws IndexOutOfBoundsException if the index is negative or
     *         greater than the current length
     */
    public void add(int index, Node node) {
        data.add(index, node);
    }

    /**
     * Appends a node to the end of the list.
     *
     * @param node the node to append
     */
    public void add(Node node) {
        data.add(node);
    }

    /**
     * Appends every node of another list, in order.
     *
     * @param nodes the list whose nodes are appended
     */
    public void add(NodeList nodes) {
        // Take the length once up front: if a list is added to itself the
        // length would otherwise grow with every append and the loop
        // would never terminate.
        int count = nodes.getLength();
        for (int i = 0; i < count; i++) {
            data.add(nodes.item(i));
        }
    }

    /**
     * Returns the node at the given position.
     *
     * @param index zero-based position in the list
     * @return the node at that position, or <code>null</code> if the index
     *         is not a valid position, as the <code>NodeList</code>
     *         contract requires
     */
    public Node item(int index) {
        if (index < 0 || index >= data.size()) {
            return null;
        }
        return (Node) data.get(index);
    }

    /**
     * Returns the number of nodes in the list.
     *
     * @return the number of nodes; valid indices are 0 to length-1
     */
    public int getLength() {
        return data.size();
    }
}