// The Randomizer

/* Copyright 2005 Elliotte Rusty Harold
   
   This library is free software; you can redistribute it and/or modify
   it under the terms of version 2.1 of the GNU General Public 
   License as published by the Free Software Foundation.
   
   This library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
   GNU Lesser General Public License for more details.
   
   You should have received a copy of the GNU General Public
   License along with this library; if not, write to the 
   Free Software Foundation, Inc., 59 Temple Place, Suite 330, 
   Boston, MA 02111-1307  USA
   
   You can contact Elliotte Rusty Harold by sending e-mail to
   elharo@metalab.unc.edu. 
*/

package com.elharo.xml;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.security.NoSuchAlgorithmException;
import java.security.SecureRandom;
import java.util.HashMap;
import java.util.Map;
import java.util.Random;
import java.util.StringTokenizer;

import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.XMLReaderFactory;

/**
 * <p>
 * The <code>XMLRandomizer</code> class converts strings into obscured
 * forms by replacing each character with a randomly chosen character
 * of the same kind (upper case letter, lower case letter, digit, and
 * so on). It maintains maps of the names and tokens it has produced
 * previously so it can reproduce the same output for the same input
 * string.
 * This doesn't achieve military grade security, but it should be
 * sufficient to allow people to submit their sensitive documents
 * for benchmarks and bug reports with a reasonable expectation of
 * privacy. 
 * </p>
 * 
 * <p>
 * Note that nothing checks whether two different names (or tokens) 
 * happen to be mapped to the same random result. Instances keep
 * their lookup maps in unsynchronized <code>HashMap</code>s.
 * </p>
 * 
 * @author Elliotte Rusty Harold
 */
public class XMLRandomizer {

    // non-name punctuation characters that have no significance in XML
    private static final char[] ASCII_PUNCTUATION = {
      '!', '$', '%', '(', ')', '*', '+', ',', ';', '=', '?', 
      '@', '[', ']', '\\', '^', '`', '{', '}', '|', '~'};

    /**
     * Inclusive {low, high} ranges of non-ASCII characters. A character
     * that falls inside one of these ranges is replaced by a random 
     * character from the same range; a non-ASCII character outside all
     * of them is passed through unchanged. The ranges do not overlap.
     * NOTE(review): the letter and combining/digit/extender ranges look
     * like the multi-character ranges of the XML 1.0 character class 
     * productions; confirm against the specification before editing.
     */
    private static final int[][] RANGES = {
        // Latin-1 punctuation and letters
        {0x00A1, 0x00BF}, {0x00C0, 0x00D6}, {0x00D8, 0x00F6}, {0x00F8, 0x00FF},
        // CJK ideographs
        {0x4E00, 0x9FA5},
        // letters
        {0x0100, 0x0131}, {0x0134, 0x013E}, {0x0141, 0x0148}, {0x014A, 0x017E},
        {0x0180, 0x01C3}, {0x01CD, 0x01F0}, {0x01F4, 0x01F5}, {0x01FA, 0x0217},
        {0x0250, 0x02A8}, {0x02BB, 0x02C1}, {0x0388, 0x038A}, {0x038E, 0x03A1},
        {0x03A3, 0x03CE}, {0x03D0, 0x03D6}, {0x03E2, 0x03F3}, {0x0401, 0x040C},
        {0x040E, 0x044F}, {0x0451, 0x045C}, {0x045E, 0x0481}, {0x0490, 0x04C4},
        {0x04C7, 0x04C8}, {0x04CB, 0x04CC}, {0x04D0, 0x04EB}, {0x04EE, 0x04F5},
        {0x04F8, 0x04F9}, {0x0531, 0x0556}, {0x0561, 0x0586}, {0x05D0, 0x05EA},
        {0x05F0, 0x05F2}, {0x0621, 0x063A}, {0x0641, 0x064A}, {0x0671, 0x06B7},
        {0x06BA, 0x06BE}, {0x06C0, 0x06CE}, {0x06D0, 0x06D3}, {0x06E5, 0x06E6},
        {0x0905, 0x0939}, {0x0958, 0x0961}, {0x0985, 0x098C}, {0x098F, 0x0990},
        {0x0993, 0x09A8}, {0x09AA, 0x09B0}, {0x09B6, 0x09B9}, {0x09DC, 0x09DD},
        {0x09DF, 0x09E1}, {0x09F0, 0x09F1}, {0x0A05, 0x0A0A}, {0x0A0F, 0x0A10},
        {0x0A13, 0x0A28}, {0x0A2A, 0x0A30}, {0x0A32, 0x0A33}, {0x0A35, 0x0A36},
        {0x0A38, 0x0A39}, {0x0A59, 0x0A5C}, {0x0A72, 0x0A74}, {0x0A85, 0x0A8B},
        {0x0A8F, 0x0A91}, {0x0A93, 0x0AA8}, {0x0AAA, 0x0AB0}, {0x0AB2, 0x0AB3},
        {0x0AB5, 0x0AB9}, {0x0B05, 0x0B0C}, {0x0B0F, 0x0B10}, {0x0B13, 0x0B28},
        {0x0B2A, 0x0B30}, {0x0B32, 0x0B33}, {0x0B36, 0x0B39}, {0x0B5C, 0x0B5D},
        {0x0B5F, 0x0B61}, {0x0B85, 0x0B8A}, {0x0B8E, 0x0B90}, {0x0B92, 0x0B95},
        {0x0B99, 0x0B9A}, {0x0B9E, 0x0B9F}, {0x0BA3, 0x0BA4}, {0x0BA8, 0x0BAA},
        {0x0BAE, 0x0BB5}, {0x0BB7, 0x0BB9}, {0x0C05, 0x0C0C}, {0x0C0E, 0x0C10},
        {0x0C12, 0x0C28}, {0x0C2A, 0x0C33}, {0x0C35, 0x0C39}, {0x0C60, 0x0C61},
        {0x0C85, 0x0C8C}, {0x0C8E, 0x0C90}, {0x0C92, 0x0CA8}, {0x0CAA, 0x0CB3},
        {0x0CB5, 0x0CB9}, {0x0CE0, 0x0CE1}, {0x0D05, 0x0D0C}, {0x0D0E, 0x0D10},
        {0x0D12, 0x0D28}, {0x0D2A, 0x0D39}, {0x0D60, 0x0D61}, {0x0E01, 0x0E2E},
        {0x0E32, 0x0E33}, {0x0E40, 0x0E45}, {0x0E81, 0x0E82}, {0x0E87, 0x0E88},
        {0x0E94, 0x0E97}, {0x0E99, 0x0E9F}, {0x0EA1, 0x0EA3}, {0x0EAA, 0x0EAB},
        {0x0EAD, 0x0EAE}, {0x0EB2, 0x0EB3}, {0x0EC0, 0x0EC4}, {0x0F40, 0x0F47},
        {0x0F49, 0x0F69}, {0x10A0, 0x10C5}, {0x10D0, 0x10F6}, {0x1102, 0x1103},
        {0x1105, 0x1107}, {0x110B, 0x110C}, {0x110E, 0x1112}, {0x1154, 0x1155},
        {0x115F, 0x1161}, {0x116D, 0x116E}, {0x1172, 0x1173}, {0x11AE, 0x11AF},
        {0x11B7, 0x11B8}, {0x11BC, 0x11C2}, {0x1E00, 0x1E9B}, {0x1EA0, 0x1EF9},
        {0x1F00, 0x1F15}, {0x1F18, 0x1F1D}, {0x1F20, 0x1F45}, {0x1F48, 0x1F4D},
        {0x1F50, 0x1F57}, {0x1F5F, 0x1F7D}, {0x1F80, 0x1FB4}, {0x1FB6, 0x1FBC},
        {0x1FC2, 0x1FC4}, {0x1FC6, 0x1FCC}, {0x1FD0, 0x1FD3}, {0x1FD6, 0x1FDB},
        {0x1FE0, 0x1FEC}, {0x1FF2, 0x1FF4}, {0x1FF6, 0x1FFC}, {0x212A, 0x212B},
        {0x2180, 0x2182}, {0x3041, 0x3094}, {0x30A1, 0x30FA}, {0x3105, 0x312C},
        // combining characters, digits, and extenders
        {0x0300, 0x0345}, {0x0360, 0x0361}, {0x0483, 0x0486}, {0x0591, 0x05A1},
        {0x05A3, 0x05B9}, {0x05BB, 0x05BD}, {0x05C1, 0x05C2}, {0x064B, 0x0652},
        {0x0660, 0x0669}, {0x06D6, 0x06DC}, {0x06DD, 0x06DF}, {0x06E0, 0x06E4},
        {0x06E7, 0x06E8}, {0x06EA, 0x06ED}, {0x06F0, 0x06F9}, {0x0901, 0x0903},
        {0x093E, 0x094C}, {0x0951, 0x0954}, {0x0962, 0x0963}, {0x0966, 0x096F},
        {0x0981, 0x0983}, {0x09C0, 0x09C4}, {0x09C7, 0x09C8}, {0x09CB, 0x09CD},
        {0x09E2, 0x09E3}, {0x09E6, 0x09EF}, {0x0A40, 0x0A42}, {0x0A47, 0x0A48},
        {0x0A4B, 0x0A4D}, {0x0A66, 0x0A6F}, {0x0A70, 0x0A71}, {0x0A81, 0x0A83},
        {0x0ABE, 0x0AC5}, {0x0AC7, 0x0AC9}, {0x0ACB, 0x0ACD}, {0x0AE6, 0x0AEF},
        {0x0B01, 0x0B03}, {0x0B3E, 0x0B43}, {0x0B47, 0x0B48}, {0x0B4B, 0x0B4D},
        {0x0B56, 0x0B57}, {0x0B66, 0x0B6F}, {0x0B82, 0x0B83}, {0x0BBE, 0x0BC2},
        {0x0BC6, 0x0BC8}, {0x0BCA, 0x0BCD}, {0x0BE7, 0x0BEF}, {0x0C01, 0x0C03},
        {0x0C3E, 0x0C44}, {0x0C46, 0x0C48}, {0x0C4A, 0x0C4D}, {0x0C55, 0x0C56},
        {0x0C66, 0x0C6F}, {0x0C82, 0x0C83}, {0x0CBE, 0x0CC4}, {0x0CC6, 0x0CC8},
        {0x0CCA, 0x0CCD}, {0x0CD5, 0x0CD6}, {0x0CE6, 0x0CEF}, {0x0D02, 0x0D03},
        {0x0D3E, 0x0D43}, {0x0D46, 0x0D48}, {0x0D4A, 0x0D4D}, {0x0D66, 0x0D6F},
        {0x0E34, 0x0E3A}, {0x0E47, 0x0E4E}, {0x0E50, 0x0E59}, {0x0EB4, 0x0EB9},
        {0x0EBB, 0x0EBC}, {0x0EC8, 0x0ECD}, {0x0ED0, 0x0ED9}, {0x0F18, 0x0F19},
        {0x0F20, 0x0F29}, {0x0F71, 0x0F84}, {0x0F86, 0x0F8B}, {0x0F90, 0x0F95},
        {0x0F99, 0x0FAD}, {0x0FB1, 0x0FB7}, {0x20D0, 0x20DC}, {0x302A, 0x302F},
        {0x3031, 0x3035}, {0x309D, 0x309E}, {0x30FC, 0x30FE},
        // high surrogates
        {0xD800, 0xDBFF},
        // low surrogates
        {0xDC00, 0xDFFF},
        // C1 controls
        {0x0080, 0x009F}
    };

    // XXX should I clear these two between documents?
    private final Map names = new HashMap();
    private final Map tokens = new HashMap();
    private final Random random;
    private final boolean preserveNames;

    // should all preserve Name functionality be part of the handler????
    // probably
    
    /**
     * Creates a new randomizer backed by a secure random number 
     * generator.
     * 
     * @param preserveNames true if names, qualified names, and 
     *     namespace URIs should be returned unchanged; false if they
     *     should be randomized like everything else
     */
    public XMLRandomizer(boolean preserveNames) {
        this.preserveNames = preserveNames;
        Random generator;
        try {
            generator = SecureRandom.getInstance("SHA1PRNG");
        }
        catch (NoSuchAlgorithmException ex) {
            // Fall back on the platform's default secure generator 
            // rather than downgrading to an insecure java.util.Random.
            generator = new SecureRandom();
        }
        this.random = generator;
    }


    /**
     * Obscures a namespace URI. The result is not cached, so the same
     * URI produces a different result on each call.
     * 
     * @param uri the namespace URI
     * @return the URI itself if names are preserved; 
     *     otherwise a randomized string
     */
    public String randomizeNamespaceURI(String uri) {
        if (preserveNames) return uri;
        // XXX preserve scheme
        return randomize(uri);
    }


    /**
     * Obscures a qualified name. A name containing a colon is split at
     * the first colon, and the two parts are randomized separately with
     * {@link #randomizeName(String)} and rejoined with a colon.
     * 
     * @param qName the qualified name, with or without a prefix
     * @return the name itself if names are preserved; 
     *     otherwise the randomized qualified name
     */
    public String randomizeQName(String qName) {

        if (preserveNames) return qName;
        int colon = qName.indexOf(':');
        if (colon == -1) return randomizeName(qName);
        
        String prefix = qName.substring(0, colon);
        String localName = qName.substring(colon+1);
        return randomizeName(prefix) + ':' + randomizeName(localName);
        
    }


    /**
     * Obscures arbitrary text without consulting or updating any 
     * lookup table. 
     * 
     * @param text the text to obscure
     * @return the escaped, randomized text
     * @see #randomize(char[], int, int)
     */
    public String randomize(String text) {
        return randomize(text.toCharArray(), 0, text.length());
    }

    
    /**
     * Obscures a name, returning the same result every time this
     * object is asked about the same name.
     * 
     * @param name the name to obscure
     * @return the name itself if names are preserved; 
     *     otherwise the randomized name
     */
    public String randomizeName(String name) {
        if (preserveNames) return name;
        String cachedName = (String) (names.get(name));
        if (cachedName != null) return cachedName;
        String result = randomize(name.toCharArray(), 0, name.length());
        names.put(name, result);
        return result;
    }

    
    /**
     * Obscures a single token, returning the same result every time 
     * this object is asked about the same token. Tokens are randomized
     * even when names are preserved, and are remembered separately 
     * from names.
     * 
     * @param token the token to obscure
     * @return the randomized token
     */
    public String randomizeToken(String token) {
        String cachedToken = (String) (tokens.get(token));
        if (cachedToken != null) return cachedToken;
        String result = randomize(token.toCharArray(), 0, token.length());
        tokens.put(token, result);
        return result;
    }

    
    /**
     * Obscures a white space separated list of tokens. Each token is
     * passed through {@link #randomizeToken(String)}, and the results
     * are joined with single spaces, so each run of white space in 
     * the input becomes one space in the output.
     * 
     * @param value the white space separated list
     * @return the list of randomized tokens
     */
    public String randomizeTokens(String value) {
        
        StringBuffer result = new StringBuffer();
        String[] parts = value.split("\\s+");
        
        for (int i = 0; i < parts.length; i++) {
            // the separator goes between tokens; without it the 
            // tokens would run together into a single token
            if (i > 0) result.append(' ');
            result.append(randomizeToken(parts[i]));
        }
        
        return result.toString();
    }

    
    /**
     * Obscures part of a character array. The characters 
     * <code>&amp;</code>, <code>&lt;</code>, <code>&gt;</code>, and
     * <code>"</code> are replaced by the corresponding predefined 
     * entity references. Every other character is replaced by a random
     * character of the same kind, or passed through unchanged if it
     * does not belong to a kind this class knows about.
     * 
     * @param text the array containing the characters to obscure
     * @param start the index of the first character to obscure
     * @param length the number of characters to obscure
     * @return the escaped, randomized text
     */
    public String randomize(char[] text, int start, int length) {

        StringBuffer sb = new StringBuffer();
        for (int i = start; i < start+length; i++) {
            char c = text[i];
            switch (c) {
                case '&':
                    sb.append("&amp;");
                    break;
                case '<':
                    sb.append("&lt;");
                    break;
                case '>':
                    sb.append("&gt;");
                    break;
                case '"':
                    sb.append("&quot;");
                    break;
                default:
                    sb.append(randomize(c));
            }
        }
            
        return sb.toString();
        
    }
    
    // need a randomize without lookup table method????

    /**
     * Replaces one character with a random character of the same kind.
     * The colon and white space are always kept as is. ASCII letters 
     * and digits stay within their own group, and the punctuation 
     * characters in <code>ASCII_PUNCTUATION</code> are exchanged among
     * themselves. Any other ASCII character is returned unchanged. 
     * A non-ASCII character is exchanged within the <code>RANGES</code> 
     * entry that contains it, or returned unchanged if there is none.
     */
    private char randomize(char c) {

        if (c == ':' || c == ' ' || c == '\t' || c == '\n' || c == '\r') {
            return c;
        }
        if (c >= 'A' && c <= 'Z') return randomChar('A', 'Z');
        if (c >= 'a' && c <= 'z') return randomChar('a', 'z');
        if (c >= '0' && c <= '9') return randomChar('0', '9');
        if (isASCIIPunctuationCharacter(c)) return getRandomAsciiPunctuation();
        if (c <= 127) return c;
        
        for (int i = 0; i < RANGES.length; i++) {
            int low = RANGES[i][0];
            int high = RANGES[i][1];
            if (c >= low && c <= high) return randomChar(low, high);
        }
        
        return c;
    }

    /** Tests whether c is one of the replaceable punctuation marks. */
    private boolean isASCIIPunctuationCharacter(char c) {

        for (int i = 0; i < ASCII_PUNCTUATION.length; i++) {
            if (c == ASCII_PUNCTUATION[i]) return true;
        }
        return false;
        
    }


    /** Picks a random character between low and high, both inclusive. */
    private char randomChar(int low, int high) {

        int n = random.nextInt(high-low+1);
        return (char) (n+low);
    }

    
    /** Picks a random member of <code>ASCII_PUNCTUATION</code>. */
    private char getRandomAsciiPunctuation() {
     
        int index = random.nextInt(ASCII_PUNCTUATION.length);
        return ASCII_PUNCTUATION[index];
        
    }


    /**
     * Obscures an enumerated type such as <code>(a|b|c)</code>. The 
     * string is broken apart at parentheses and vertical bars; each 
     * piece is passed through {@link #randomizeQName(String)}; and the
     * results are rejoined with vertical bars inside one pair of
     * parentheses.
     */
    String randomizeEnumeratedList(String type) {

        StringBuffer sb = new StringBuffer(type.length());
        sb.append("(");
        StringTokenizer st = new StringTokenizer(type, "(|)");
        while (st.hasMoreTokens()) {
            String token = st.nextToken();
            sb.append(randomizeQName(token));
            if (st.hasMoreTokens()) {
               sb.append('|');
            }
        }
        sb.append(")");
        return sb.toString();
    }


    /**
     * Parses the document named on the command line and sends it 
     * through a <code>RandomizingHandler</code> that writes to 
     * <code>System.out</code>. The document is named by a URL or, 
     * failing that, a file name. An optional leading 
     * <code>-preservenames</code> flag asks for names to be left alone.
     * 
     * @param args <code>[-preservenames] url_or_file</code>
     * @throws SAXException if no parser can be loaded or configured, 
     *     or if the parser reports an error in the document
     */
    public static void main(String[] args) throws SAXException  {
    
        boolean preserveNames = args.length > 0 
          && args[0].equals("-preservenames");
        int inputIndex = preserveNames ? 1 : 0;
        // also catches -preservenames with no document after it
        if (args.length <= inputIndex) {
            System.out.println(
              "Usage: java com.elharo.xml.XMLRandomizer [-preservenames] url_or_file");
            return;
        }
        String input = args[inputIndex];
        
        FileInputStream fileIn = null;
        try {
            InputSource source;
            try {
                URL u = new URL(input);
                source = new InputSource(u.toExternalForm());
            }
            catch (MalformedURLException ex) {
                // not a URL so treat the argument as a file name
                File f = new File(input);
                fileIn = new FileInputStream(f);
                source = new InputSource(fileIn);
                // toURI() escapes characters that are illegal in URLs
                source.setSystemId(f.toURI().toURL().toExternalForm());
            }
            OutputStream out = System.out;
            RandomizingHandler randomizer = new RandomizingHandler(out, preserveNames);
            XMLReader reader;
            try {
                reader = XMLReaderFactory.createXMLReader("org.apache.xerces.parsers.SAXParser");
            }
            catch (SAXException ex) {
                // Xerces is not available; use the default parser
                reader = XMLReaderFactory.createXMLReader();
            }
            reader.setFeature(
              "http://xml.org/sax/features/namespace-prefixes", true);
            reader.setContentHandler(randomizer);
            reader.setProperty("http://xml.org/sax/properties/lexical-handler", randomizer);
            reader.parse(source);
            
        }
        catch (IOException ex) {
            System.err.println(ex.getMessage());
        }
        finally {
            if (fileIn != null) {
                try {
                    fileIn.close();
                }
                catch (IOException ignored) {
                    // the input has already been read (or has already 
                    // failed); nothing useful can be done here
                }
            }
        }
        
    }

}

// Previous | Next | Top | Cafe con Leche
//
// Copyright 2005 Elliotte Rusty Harold
// elharo@metalab.unc.edu
// Last Modified August 2, 2005