The ContentHandler

RandomizingHandler works like any other ContentHandler that writes out a document (e.g. David Megginson's XMLWriter), except that it calls XMLRandomizer to shuffle everything before it writes it out.

/* Copyright 2005 Elliotte Rusty Harold
   
   This library is free software; you can redistribute it and/or modify
   it under the terms of version 2.1 of the GNU General Public 
   License as published by the Free Software Foundation.
   
   This library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
   GNU Lesser General Public License for more details.
   
   You should have received a copy of the GNU General Public
   License along with this library; if not, write to the 
   Free Software Foundation, Inc., 59 Temple Place, Suite 330, 
   Boston, MA 02111-1307  USA
   
   You can contact Elliotte Rusty Harold by sending e-mail to
   elharo@metalab.unc.edu. 
*/

package com.elharo.xml;

import java.io.IOException;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.Writer;

import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.DTDHandler;
import org.xml.sax.Locator;
import org.xml.sax.SAXException;
import org.xml.sax.ext.DeclHandler;
import org.xml.sax.ext.LexicalHandler;

// ???? I need a cleaner interface for this that doesn't expose ContentHandler

/**
 * <p>
 * <code>RandomizingHandler</code> converts an XML document into an obscured form
 * that can be safely distributed without exposing private information.
 * The transformation is random and irreversible. The document is not 
 * merely encrypted. Content is shuffled randomly with no key. Randomizing
 * the same document twice will produce two different documents. The goal 
 * is to produce a document that shares the same performance characteristics
 * and will expose the same bugs as the original document, without revealing
 * the original document's contents. In other words, it attempts to keep the
 * structure of the document the same while completely erasing the contents. 
 * To this end, certain properties of the document remain invariant.
 * Specifically: 
 * </p>
 * <ol>
 * <li>ASCII characters remain ASCII.</li>
 * <li>White space is not changed.</li>
 * <li>&amp;, &lt;, >, and " are not changed.</li>
 * <li>Plane 0 characters remain in Plane 0. The other planes may be shuffled.</li>
 * <li>ISO-8859-1 remains ISO-8859-1.</li>
 * <li>C1 controls remain C1 controls.</li>
 * <li>Plane 0 Unicode characters stay within the same block (e.g. Arabic stays Arabic;
 * it doesn't change to Thai or vice versa).</li>
 * <li>Element and attribute names and attribute values can be randomized at 
 *     user option but identical names stay identical. The same name becomes the same
 *     randomized name.</li>
 * <li><code>xml:space</code>, <code>xml:lang</code>, <code>xml:base</code>, 
 *     and other attributes in the XML namespace are not changed.</li>
 * <li>Namespace names and prefixes are randomized at 
 *     user option. However, the prefixes and
 *     the bindings still match up.</li>
 * <li>Non-ASCII, non-name characters in Plane 0 are mostly unchanged. </li>
 * <li>CDATA sections remain CDATA sections </li>
 * </ol>
 * 
 * <p>
 * This doesn't achieve military grade security, but it should be
 * sufficient to allow people to submit their sensitive documents
 * for benchmarks and bug reports with a reasonable expectation of
 * privacy. 
 * </p>
 * 
 * <p>
 * The handler must be registered as the parser's <code>ContentHandler</code>,
 * <code>LexicalHandler</code>, <code>DTDHandler</code> and <code>DeclHandler</code>.
 * Text and attribute values are written exactly as <code>XMLRandomizer</code>
 * returns them; this class performs no escaping of its own.
 * </p>
 * 
 * @author Elliotte Rusty Harold
 */
public class RandomizingHandler 
  implements ContentHandler, LexicalHandler, DTDHandler, DeclHandler {
    
    /** Pseudo-entity name SAX uses for the external DTD subset. */
    private static final String EXTERNAL_SUBSET = "[dtd]";
    
    /** Prefix SAX puts in front of parameter entity names. */
    private static final String PARAMETER_ENTITY_PREFIX = "%";
    
    private final Writer out;
    private final XMLRandomizer randomizer;
    /** True between startEntity("[dtd]") and endEntity("[dtd]"). */
    private boolean inExternalSubset;
    /** True between startDTD and endDTD. */
    private boolean inDTD;
    /** True once the "[" opening the internal DTD subset has been written. */
    private boolean hasInternalSubset = false;
    /** True before the root start-tag and after the root end-tag. */
    private boolean outsideRoot = true;
    /** Nesting depth of entities (other than the external subset) being expanded. */
    private int entityDepth;
    /** Nesting depth of currently open elements. */
    private int elementDepth = 0;
    
    /**
     * <p>
     * Create a new RandomizingHandler that shuffles names and content.
     * </p>
     * 
     * @param out the OutputStream to write the randomized document to. This stream will
     *     be flushed but not closed.
     * @throws IOException if an I/O error occurs when writing to <code>out</code>
     */
    public RandomizingHandler(OutputStream out) throws IOException {
        this(out, false);
    }

    
    /**
     * <p>
     * Create a new RandomizingHandler that shuffles content and optionally
     * names and namespace URIs.
     * </p>
     * 
     * @param out the OutputStream to write the randomized document to. This stream will
     *     be flushed but not closed.
     * @param preserveNames if true element and attribute names and namespace URIs
     *     are not shuffled. If false, they are.
     * @throws IOException if an I/O error occurs when writing to <code>out</code>
     */
    public RandomizingHandler(OutputStream out, boolean preserveNames) throws IOException {
        
        // XXX need to preserve encoding
        this.randomizer = new XMLRandomizer(preserveNames);
        this.out = new OutputStreamWriter(out, "UTF-8");
        
    }

    
    /**
     * Does nothing; location information is not used.
     * 
     * @param locator ignored
     */
    public void setDocumentLocator(Locator locator) {}

    
    /**
     * Resets all per-document state and writes the XML declaration.
     * 
     * @throws SAXException wrapping any IOException raised by the output
     */
    public void startDocument() throws SAXException {
        
        hasInternalSubset = false;
        outsideRoot = true;
        entityDepth = 0;
        elementDepth = 0;
        inDTD = false;
        // Reset this too, so that a handler reused after an aborted parse
        // does not stay stuck "inside" the external subset.
        inExternalSubset = false;
        try {
            out.flush();
            // XXX preserve encoding and standalone
            out.write("<?xml version='1.0' encoding='UTF-8'?>\n");
        }
        catch (IOException ex) {
            throw new SAXException(ex);
        }
        
    }


    /**
     * Flushes the output. The underlying stream is not closed.
     * 
     * @throws SAXException wrapping any IOException raised by the flush
     */
    public void endDocument() throws SAXException {
        
        try {
            out.flush();
        }
        catch (IOException ex) {
            throw new SAXException(ex);
        }
        
    }


    /**
     * Passes the prefix to the randomizer; nothing is written here because
     * namespace declarations are output as attributes in
     * <code>startElement</code>.
     * 
     * @param prefix the namespace prefix being declared
     * @param uri the namespace URI; currently unused
     */
    public void startPrefixMapping(String prefix, String uri) {
        // fix the value
        // NOTE(review): the result is discarded; presumably this call only
        // primes the randomizer's name mapping -- confirm against XMLRandomizer.
        randomizer.randomizeName(prefix);
        // change the URL too????
        // could explicitly store a mapping
    }


    /**
     * Does nothing.
     * 
     * @param prefix ignored
     */
    public void endPrefixMapping(String prefix) {}


    /**
     * Writes a start-tag with a randomized name and randomized attributes.
     * Elements that occur inside an entity are counted but not written,
     * because the entity reference itself has already been written.
     * 
     * @param namespaceURI unused
     * @param localName unused
     * @param qName the qualified element name
     * @param attributes the element's attributes, including namespace declarations
     *     if the parser reports them
     * @throws SAXException wrapping any IOException raised by the output
     */
    public void startElement(String namespaceURI, String localName, String qName,
      Attributes attributes) throws SAXException {

        outsideRoot = false;
        elementDepth++;
        if (entityDepth > 0) return;
        write("<" + randomizer.randomizeQName(qName));
        for (int i = 0; i < attributes.getLength(); i++) {
            writeAttribute(attributes.getQName(i), attributes.getValue(i), 
              attributes.getType(i));
        }
        write(">");
        
    }

    
    /**
     * Writes one attribute, preceded by a space. Attributes in the XML
     * namespace are copied verbatim, namespace declarations get a randomized
     * prefix and URI, and everything else is randomized according to its
     * declared type.
     */
    private void writeAttribute(String name, String value, String type) 
      throws SAXException {
        
        write(" ");
        if (name.startsWith("xml:")) {
            write(name);
            write("=\"");
            write(value);
        }
        else if (name.equals("xmlns")) {
            write("xmlns");
            write("=\"");
            write(randomizer.randomizeNamespaceURI(value));
        }
        else if (name.startsWith("xmlns:")) {
            write("xmlns:");
            write(randomizer.randomizeName(name.substring(6)));
            write("=\"");
            write(randomizer.randomizeNamespaceURI(value));
        }
        else {
            // randomizeQName, as used for element names and in attributeDecl,
            // so a prefixed attribute stays consistent with its declarations.
            write(randomizer.randomizeQName(name));
            write("=\"");
            if (type.equals("CDATA")) {
                write(randomizer.randomize(value));
            }
            else if ( type.equals("ID") || type.equals("NMTOKEN")
                   || type.equals("IDREF") || type.equals("ENTITY")
                   || type.equals("NOTATION") ) {
                write(randomizer.randomizeToken(value));
            }
            else {
                write(randomizer.randomizeTokens(value));
            }
        }
        write("\"");
        
    }


    /**
     * Writes an end-tag with the randomized name, followed by a line break
     * when the root element has just been closed.
     * 
     * @param namespaceURI unused
     * @param localName unused
     * @param qName the qualified element name
     * @throws SAXException wrapping any IOException raised by the output
     */
    public void endElement(String namespaceURI, String localName, String qName)
     throws SAXException {

        elementDepth--;
        if (entityDepth > 0) return;
        write("</" + randomizer.randomizeQName(qName) + ">");
        if (elementDepth == 0) {
            write("\n");
            outsideRoot = true;
        }

    }


    /**
     * Writes randomized character data unless it comes from inside an entity.
     * 
     * @param text the buffer holding the characters
     * @param start index of the first character to use
     * @param length number of characters to use
     * @throws SAXException wrapping any IOException raised by the output
     */
    public void characters(char[] text, int start, int length)
      throws SAXException {
        if (entityDepth > 0) return;
        write(randomizer.randomize(text, start, length));
    }


    /**
     * Treats ignorable white space exactly like other character data.
     * 
     * @param text the buffer holding the characters
     * @param start index of the first character to use
     * @param length number of characters to use
     * @throws SAXException wrapping any IOException raised by the output
     */
    public void ignorableWhitespace(char[] text, int start, int length) 
      throws SAXException {
        characters(text, start, length);
    }


    /**
     * Writes a processing instruction with randomized target and data.
     * Processing instructions in the external DTD subset or inside an entity
     * are dropped. One found in the DTD opens the internal subset if it is
     * not open yet.
     * 
     * @param target the processing instruction target
     * @param data the processing instruction data
     * @throws SAXException wrapping any IOException raised by the output
     */
    public void processingInstruction(String target, String data)
      throws SAXException {

        if (inExternalSubset) return;
        // Same guard as comment(): content of an entity is represented by
        // the entity reference and must not be written a second time.
        if (entityDepth > 0) return;
        
        if (inDTD && !hasInternalSubset) {
            startInternalSubset();
        }
        write("<?");
        write(randomizer.randomizeName(target));
        write(" ");
        write(randomizer.randomize(data));
        write("?>");
        if (outsideRoot) write("\n");
        
    }


    /**
     * Writes a reference to a general entity the parser chose not to expand.
     * The external DTD subset and parameter entities, which SAX reports here
     * as <code>[dtd]</code> and with a leading <code>%</code>, cannot be
     * written as general entity references and are ignored.
     * 
     * @param name the name of the skipped entity
     * @throws SAXException wrapping any IOException raised by the output
     */
    public void skippedEntity(String name) throws SAXException {
        if (entityDepth > 0) return;
        if (name.equals(EXTERNAL_SUBSET) 
          || name.startsWith(PARAMETER_ENTITY_PREFIX)) {
            return;
        }
        write("&" + randomizer.randomizeQName(name) + ";");
    }
    
    
    /**
     * Writes the start of the document type declaration. The root name is
     * randomized; the public and system identifiers are copied unchanged.
     * 
     * @param root the declared root element name
     * @param publicID the public identifier of the external subset, or null
     * @param systemID the system identifier of the external subset, or null
     * @throws SAXException wrapping any IOException raised by the output
     */
    public void startDTD(String root, String publicID, String systemID) 
      throws SAXException {
        inDTD = true;
        // randomizeQName so the declared root matches the root start-tag,
        // which startElement also writes with randomizeQName.
        write("<!DOCTYPE " + randomizer.randomizeQName(root) + " ");
        if (publicID != null) {
            write("PUBLIC \"" + publicID + "\" \"" + systemID + "\"");
        }
        else if (systemID != null) {
            write("SYSTEM \"" + systemID + "\"");
        }
    }

    
    /**
     * Closes the internal subset, if one was opened, and the document type
     * declaration.
     * 
     * @throws SAXException wrapping any IOException raised by the output
     */
    public void endDTD() throws SAXException {
        if (hasInternalSubset) write("]");
        write(">\n");
        inDTD = false;
    }
    
    
    /**
     * Tracks entity nesting and writes a reference for each outermost general
     * entity. The five predefined entity names are kept as is; all other
     * names are randomized. The external DTD subset only sets a flag, and
     * parameter entities are counted but never written, since a reference of
     * the form <code>&amp;name;</code> is not legal inside a DTD.
     * 
     * @param name the entity name; <code>[dtd]</code> for the external subset,
     *     prefixed with <code>%</code> for parameter entities
     * @throws SAXException wrapping any IOException raised by the output
     */
    public void startEntity(String name) throws SAXException {
        
        if (name.equals(EXTERNAL_SUBSET)) {
            inExternalSubset = true;
            return;
        }
        
        entityDepth++;
        if (entityDepth != 1) return;
        if (name.startsWith(PARAMETER_ENTITY_PREFIX)) return;
        
        write("&");
        if ("amp".equals(name) || "lt".equals(name) ||
            "gt".equals(name) || "quot".equals(name) ||
            "apos".equals(name)) {
            write(name);
        }
        else {
            write(randomizer.randomizeQName(name));
        }
        write(";");
      
    }
    
    
    /**
     * Undoes the bookkeeping done by <code>startEntity</code>.
     * 
     * @param name the entity name, as passed to <code>startEntity</code>
     */
    public void endEntity(String name) {
        if (name.equals(EXTERNAL_SUBSET)) inExternalSubset = false; 
        else entityDepth--;
    }

    
    /**
     * Opens a CDATA section unless it occurs inside an entity.
     * 
     * @throws SAXException wrapping any IOException raised by the output
     */
    public void startCDATA() throws SAXException {
        if (entityDepth > 0) return;
        write("<![CDATA[");
    }

    
    /**
     * Closes a CDATA section unless it occurs inside an entity.
     * 
     * @throws SAXException wrapping any IOException raised by the output
     */
    public void endCDATA() throws SAXException {
        if (entityDepth > 0) return;
        write("]]>"); 
    }
    
    
    /**
     * Writes a string to the output, converting an IOException into the
     * SAXException the handler interfaces allow.
     */
    private void write(String s) throws SAXException {
        
        try {
            out.write(s);
        }
        catch (IOException ex) {
            throw new SAXException(ex);
        }
    }
    

    /**
     * Writes a comment with randomized content. Comments in the external DTD
     * subset or inside an entity are dropped. One found in the DTD opens the
     * internal subset if it is not open yet.
     * 
     * @param text the buffer holding the comment characters
     * @param start index of the first character to use
     * @param length number of characters to use
     * @throws SAXException wrapping any IOException raised by the output
     */
    public void comment(char[] text, int start, int length) 
      throws SAXException {

        if (inExternalSubset) return;
        if (entityDepth > 0) return;
        
        if (inDTD && !hasInternalSubset) {
            startInternalSubset();
        }
        write("<!--");
        write(randomizer.randomize(text, start, length));
        write("-->");
        if (outsideRoot) write("\n");
        
    }


    /** Writes the "[" that opens the internal DTD subset and records that it is open. */
    private void startInternalSubset() throws SAXException {

        hasInternalSubset = true;
        write("[\n");
        
    }

    
    /**
     * Returns an entity name in the form it takes inside an entity
     * declaration: a parameter entity, reported by SAX as <code>%name</code>,
     * becomes <code>% </code> followed by the randomized name; any other name
     * is simply randomized.
     */
    private String declaredEntityName(String name) {
        
        if (name.startsWith(PARAMETER_ENTITY_PREFIX)) {
            String bareName = name.substring(PARAMETER_ENTITY_PREFIX.length());
            return "% " + randomizer.randomizeQName(bareName);
        }
        return randomizer.randomizeQName(name);
        
    }
    
    
    /**
     * Writes the external identifier of an entity declaration, either
     * <code>PUBLIC "publicID" "systemID"</code> or <code>SYSTEM "systemID"</code>.
     * The identifiers are copied unchanged.
     */
    private void writeExternalID(String publicID, String systemID) 
      throws SAXException {
        
        if (publicID != null) {
            write("PUBLIC \"" + publicID + "\" ");
        }
        else {
            write("SYSTEM ");
        }
        write("\"" + systemID + "\"");
        
    }


    /**
     * Writes a notation declaration from the internal DTD subset. The
     * notation name is randomized; the identifiers are copied unchanged.
     * 
     * @param name the notation name
     * @param publicID the public identifier, or null
     * @param systemID the system identifier, or null
     * @throws SAXException wrapping any IOException raised by the output
     */
    public void notationDecl(String name, String publicID, String systemID) 
      throws SAXException {

        if (!inExternalSubset) {
            if (!hasInternalSubset) startInternalSubset();
            write("  <!NOTATION ");
            write(randomizer.randomizeQName(name));
            write(" ");
            if (publicID != null) {
                write("PUBLIC \"" + publicID + "\"");
                // A notation may carry a system identifier as well as a
                // public one; keep it instead of silently dropping it.
                if (systemID != null) {
                    write(" \"" + systemID + "\"");
                }
            }
            else if (systemID != null) {
                write("SYSTEM \"" + systemID + "\"");
            }
            write(">\n");
        }
        
    }


    /**
     * Writes an unparsed entity declaration from the internal DTD subset.
     * The entity and notation names are randomized; the identifiers are
     * copied unchanged.
     * 
     * @param name the entity name
     * @param publicID the public identifier, or null
     * @param systemID the system identifier
     * @param notation the name of the associated notation
     * @throws SAXException wrapping any IOException raised by the output
     */
    public void unparsedEntityDecl(String name, String publicID, String systemID, String notation) 
      throws SAXException {

        if (!inExternalSubset) {
            if (!hasInternalSubset) startInternalSubset();
            write("  <!ENTITY ");
            // Randomized like every other entity name so the original name
            // does not leak into the output.
            // NOTE(review): ENTITY-typed attribute values go through
            // randomizeToken -- confirm both map a name the same way.
            write(randomizer.randomizeQName(name));
            write(" ");
            writeExternalID(publicID, systemID);
            write(" NDATA ");
            // The notation is a bare name; quoting it is not legal XML.
            write(randomizer.randomizeQName(notation));
            write(">\n");
        }
        
    }


    /**
     * Writes an element declaration from the internal DTD subset. Only the
     * element name is randomized; the content model is copied unchanged.
     * 
     * @param name the element name
     * @param model the content model
     * @throws SAXException wrapping any IOException raised by the output
     */
    public void elementDecl(String name, String model) throws SAXException {

        // XXX need to parse the model and randomize all its QNames
        if (!inExternalSubset) {
            if (!hasInternalSubset) startInternalSubset();
            write("  <!ELEMENT ");
            write(randomizer.randomizeQName(name));
            write(" ");
            write(model);
            write(">\n");
        }
        
    }


    /**
     * Writes an attribute declaration from the internal DTD subset. Element
     * and attribute names, enumerated types and the default value are
     * randomized; other types and the default mode are copied unchanged.
     * 
     * @param elementName the name of the element the attribute belongs to
     * @param attributeName the attribute name
     * @param type the attribute type, or a parenthesized enumeration
     * @param defaultValue the default mode such as <code>#IMPLIED</code>, or null
     * @param value the default attribute value, or null
     * @throws SAXException wrapping any IOException raised by the output
     */
    public void attributeDecl(String elementName, String attributeName, 
      String type, String defaultValue, String value) 
      throws SAXException {

        if (!inExternalSubset) {
            if (!hasInternalSubset) {
                startInternalSubset();
            }
            write("  <!ATTLIST ");
            write(randomizer.randomizeQName(elementName));
            write(" ");
            write(randomizer.randomizeQName(attributeName));
            write(" ");
            
            if (type.startsWith("(")) {
                write(randomizer.randomizeEnumeratedList(type));
            }
            else {
                // NOTE(review): a NOTATION enumeration is copied as is, while
                // notationDecl randomizes notation names -- confirm.
                write(type);
            }
            if (defaultValue != null) {
                write(" ");
                write(defaultValue);
            }
            if (value != null) {
                write(" \"");
                write(randomizer.randomize(value));
                write("\"");
            }
            write(">\n");
        }
        
    }


    /**
     * Writes an internal entity declaration from the internal DTD subset
     * with a randomized name and a randomized replacement text.
     * 
     * @param name the entity name, prefixed with <code>%</code> for a
     *     parameter entity
     * @param value the replacement text
     * @throws SAXException wrapping any IOException raised by the output
     */
    public void internalEntityDecl(String name, String value) throws SAXException {

        if (!inExternalSubset) {
            if (!hasInternalSubset) startInternalSubset();
            write("  <!ENTITY " + declaredEntityName(name) 
                + " \"" + randomizer.randomize(value) + "\">\n");
        }
        
    }


    /**
     * Writes an external parsed entity declaration from the internal DTD
     * subset. The name is randomized; the identifiers are copied unchanged.
     * 
     * @param name the entity name, prefixed with <code>%</code> for a
     *     parameter entity
     * @param publicID the public identifier, or null
     * @param systemID the system identifier
     * @throws SAXException wrapping any IOException raised by the output
     */
    public void externalEntityDecl(String name, String publicID, String systemID) 
      throws SAXException {

        if (!inExternalSubset) {
            if (!hasInternalSubset) startInternalSubset();
            write("  <!ENTITY ");
            // References written by startEntity and skippedEntity use the
            // randomized name, so the declaration has to use it as well.
            write(declaredEntityName(name));
            write(" ");
            writeExternalID(publicID, systemID);
            write(">\n");
        }
        
    }


    /**
     * Returns the randomizer this handler uses.
     * 
     * @return the XMLRandomizer created by the constructor
     */
    public XMLRandomizer getRandomizer() {
        return this.randomizer;
    }

}

Previous | Next | Top | Cafe con Leche

Copyright 2005 Elliotte Rusty Harold
elharo@metalab.unc.edu
Last Modified August 2, 2005