blob: bf1f96331bd4fc6fc9ddb7b6cdd0d954734092ee [file] [log] [blame]
/*******************************************************************************
* Copyright (c) 2004 IBM Corporation and others.
* All rights reserved. This program and the accompanying materials
* are made available under the terms of the Eclipse Public License v1.0
* which accompanies this distribution, and is available at
* http://www.eclipse.org/legal/epl-v10.html
*
* Contributors:
* IBM Corporation - initial API and implementation
*******************************************************************************/
package org.eclipse.wst.xml.tests.encoding;
import java.io.BufferedWriter;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.PrintStream;
import java.io.UnsupportedEncodingException;
import java.io.Writer;
import java.nio.charset.Charset;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.CodingErrorAction;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.eclipse.core.runtime.content.IContentDescription;
/**
 * Creates small "XML files" in various, known encodings and writes them to
 * disk, so that later tests can verify that the appropriate encoding is
 * detected and that the characters are read back intact.
 * <p>
 * One file is attempted for every charset the current VM reports as
 * available. Charsets that look like EBCDIC, or that cannot encode the sample
 * text, are skipped. A summary file ("testMethods.text") listing one test
 * method per generated file is written at the end.
 */
public class GenerateXMLFiles extends GenerateFiles {
    private static final String LF = "\n";
    private static final String CR = "\r";
    private static final String CRLF = CR + LF;
    // different text strings for comparisons
    private static final String TEXT_US_ASCII_LF = "abcdefghijklmnopqrstuvwxyz\n1234567890\nABCDEFGHIJKLMNOPQRSTUVWXYZ";
    private static final String TEXT_US_ASCII_CRLF = "abcdefghijklmnopqrstuvwxyz\r\n1234567890\r\nABCDEFGHIJKLMNOPQRSTUVWXYZ";
    // UTF-32 byte order marks (U+FEFF as a four-byte unit in each byte order)
    private static final byte[] BOM_UTF_32BE = {0x00, 0x00, (byte) 0xFE, (byte) 0xFF};
    private static final byte[] BOM_UTF_32LE = {(byte) 0xFF, (byte) 0xFE, 0x00, 0x00};
    private static final boolean DEBUG = true;
    private static final boolean DEBUGCRLF = false;
    private static final boolean DEBUGINFO = true;

    public GenerateXMLFiles() {
        super();
    }

    /**
     * Command line entry point: generates the sample files for every charset
     * of the running VM. An IOException that escapes generation is printed
     * to standard error.
     * 
     * @param args
     *            ignored
     */
    public static void main(String[] args) {
        GenerateXMLFiles thisApp = new GenerateXMLFiles();
        try {
            thisApp.generateAllFilesForCurrentVM();
        }
        catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Collects the names of all charsets available in the current VM and
     * generates the CRLF variant of the sample file for each of them.
     */
    private void generateAllFilesForCurrentVM() throws IOException {
        Set allNames = Charset.availableCharsets().keySet();
        String[] allcharsetNames = (String[]) allNames.toArray(new String[allNames.size()]);
        // call createFiles(allcharsetNames, false) to generate the LF
        // variant instead
        createFiles(allcharsetNames, true);
    }

    /**
     * Attempts to write one sample file per charset name and then writes the
     * summary of what was actually written.
     * <p>
     * Generation is deliberately best-effort: a failure for one charset is
     * reported (when DEBUGINFO is on) and the loop continues with the next
     * charset.
     * 
     * @param charsetnames
     *            names of the charsets to try
     * @param useCRLF
     *            true to use CRLF line delimiters in the sample text, false
     *            to use LF
     * @throws IOException
     *             if the summary file cannot be written
     */
    private void createFiles(String[] charsetnames, boolean useCRLF) throws FileNotFoundException, IOException {
        String mainDirectory = getMainDirectoryBasedOnVMNameAndFileExtension();
        List charsetFilesWritten = new ArrayList();
        for (int i = 0; i < charsetnames.length; i++) {
            String charsetName = charsetnames[i];
            try {
                // only record the charset if we made it all the way through
                // with no exceptions, i.e. the file was really written.
                if (writeSampleFile(mainDirectory, charsetName, useCRLF)) {
                    charsetFilesWritten.add(charsetName);
                }
            }
            catch (Exception e) {
                // broad on purpose: encoders may fail with unchecked
                // exceptions too, and one bad charset must not stop the run
                if (DEBUGINFO) {
                    System.out.println(" ***** could not generate for " + charsetName);
                    String msg = e.getMessage();
                    if (msg == null) {
                        msg = "";
                    }
                    System.out.println(" due to " + e.getClass().getName() + " " + msg);
                }
            }
        }
        writeSummary(mainDirectory, charsetFilesWritten);
    }

    /**
     * Writes the sample XML file for a single charset.
     * <p>
     * The text is encoded exactly once, into memory, and those bytes are then
     * written to the file. This keeps the reported byte count identical to
     * the file content and avoids driving one stateful encoder from two
     * writers (which lets the first writer consume an encoder-generated byte
     * order mark, so the second one never emits it).
     * 
     * @return true if the file was written, false if the charset was skipped
     *         (EBCDIC-like, or unable to encode the sample text)
     */
    private boolean writeSampleFile(String mainDirectory, String charsetName, boolean useCRLF) throws IOException {
        Charset charset = Charset.forName(charsetName);
        CharsetEncoder charsetEncoder = charset.newEncoder();
        // report problems instead of silently substituting characters
        charsetEncoder.onMalformedInput(CodingErrorAction.REPORT);
        charsetEncoder.onUnmappableCharacter(CodingErrorAction.REPORT);
        String header = getHeaderStart() + charsetName + getHeaderEnd();
        String fulltext = header + (useCRLF ? TEXT_US_ASCII_CRLF : TEXT_US_ASCII_LF);
        if (isEbcidic(charsetName, charsetEncoder)) {
            return false;
        }
        if (!charsetEncoder.canEncode(fulltext)) {
            if (DEBUGINFO) {
                System.out.println(" *** could not convert sample ascii text for " + charsetName);
            }
            return false;
        }
        String outputfilename = "test-" + charsetName + ".xml";
        File outFile = FileUtil.makeFileFor(mainDirectory, outputfilename, null);
        byte[] encodedText = encode(fulltext, charsetEncoder);
        // complete file content: optional byte order mark + encoded text
        ByteArrayOutputStream bytesout = new ByteArrayOutputStream();
        supplyBOM(charsetName, encodedText, bytesout);
        bytesout.write(encodedText, 0, encodedText.length);
        OutputStream outputStream = new FileOutputStream(outFile);
        try {
            bytesout.writeTo(outputStream);
        }
        finally {
            outputStream.close();
        }
        if (DEBUG) {
            printDebugInfo(useCRLF, header, outputfilename, bytesout);
        }
        return true;
    }

    /**
     * Encodes the text with the given encoder and returns the resulting
     * bytes. The writer is closed so that any bytes the encoder produces on
     * its final flush are included.
     */
    private byte[] encode(String text, CharsetEncoder encoder) throws IOException {
        ByteArrayOutputStream buffer = new ByteArrayOutputStream();
        Writer writer = new OutputStreamWriter(buffer, encoder);
        writer.write(text);
        writer.close();
        return buffer.toByteArray();
    }

    /**
     * Writes the summary file: the number of files written, followed by one
     * generated test method per charset, suitable to paste into a test
     * class.
     */
    private void writeSummary(String mainDirectory, List charsetFilesWritten) throws IOException {
        File outFile = FileUtil.makeFileFor(mainDirectory, "testMethods.text", null);
        FileWriter outproperties = new FileWriter(outFile);
        try {
            outproperties.write(charsetFilesWritten.size() + CRLF);
            int n = 0;
            for (Iterator items = charsetFilesWritten.iterator(); items.hasNext();) {
                String itemCreated = (String) items.next();
                outproperties.write(createMethod(n++, itemCreated) + CRLF);
            }
        }
        finally {
            outproperties.close();
        }
    }

    /**
     * Writes the byte order mark for the charset to the target stream, unless
     * the charset has no known BOM or the encoded text already starts with
     * it (some encoders, such as Java's "UTF-16", emit the mark themselves).
     * 
     * @param charsetName
     *            name of the charset the text was encoded with
     * @param encodedText
     *            the bytes produced by the encoder
     * @param target
     *            stream that receives the BOM, if one is needed
     */
    private void supplyBOM(String charsetName, byte[] encodedText, OutputStream target) throws IOException {
        byte[] bom = getBOM(charsetName);
        if (bom != null && !startsWith(encodedText, bom)) {
            target.write(bom);
        }
    }

    /**
     * Returns the byte order mark that matches the byte order the named
     * charset encodes with, or null if no BOM is supplied for that charset.
     * Plain "UTF-16" and "UTF-32" are treated as big-endian.
     */
    private byte[] getBOM(String charsetName) {
        if (charsetName.equals("UTF-16") || charsetName.equals("UTF-16BE") || charsetName.equals("X-UnicodeBig")) {
            return IContentDescription.BOM_UTF_16BE;
        }
        if (charsetName.equals("UTF-16LE") || charsetName.equals("X-UnicodeLittle")) {
            return IContentDescription.BOM_UTF_16LE;
        }
        if (charsetName.equals("UTF-32") || charsetName.equals("UTF-32BE")) {
            return BOM_UTF_32BE;
        }
        if (charsetName.equals("UTF-32LE")) {
            return BOM_UTF_32LE;
        }
        return null;
    }

    /**
     * @return true if data begins with all bytes of prefix
     */
    private static boolean startsWith(byte[] data, byte[] prefix) {
        if (data.length < prefix.length) {
            return false;
        }
        for (int i = 0; i < prefix.length; i++) {
            if (data[i] != prefix[i]) {
                return false;
            }
        }
        return true;
    }

    /**
     * Builds the source text of one test method for the summary file.
     * 
     * @param i
     *            sequence number used in the generated method name
     * @param itemCreated
     *            charset name the generated method will test
     */
    private String createMethod(int i, String itemCreated) {
        String template = " public void testFile" + i + "() throws CoreException, IOException {\r\n" + " String charsetName = \"" + itemCreated + "\";\r\n" + " doGenTest(charsetName);\r\n" + " }";
        return template;
    }

    /**
     * Prints how many characters and bytes were written to the named file.
     */
    private void printDebugInfo(boolean useCRLF, String header, String outputfilename, ByteArrayOutputStream bytesout) {
        int nBytes = bytesout.size();
        int nChars = header.length() + (useCRLF ? TEXT_US_ASCII_CRLF.length() : TEXT_US_ASCII_LF.length());
        System.out.println("Wrote " + nChars + " characters and " + nBytes + " bytes to " + outputfilename);
    }

    /**
     * Encodes the text with the named charset and decodes the resulting
     * bytes as ASCII, to see whether the encoding is ASCII-compatible for
     * this text.
     * 
     * @throws Error
     *             if either encoding name is not supported
     */
    private String encodeThenDecodeAsAscii(String text, String charsetName) {
        try {
            return new String(text.getBytes(charsetName), "ascii");
        }
        catch (UnsupportedEncodingException e) {
            // impossible, since checked already
            throw new Error(e);
        }
    }

    // TODO: never used
    /**
     * This method was added since some encoders don't report that they can't
     * encode something, but they obviously can't, at least in the normal
     * meaning of the word. This seems to mostly apply to some IBM varieties
     * where, apparently, the input can't be interpreted at all without
     * knowing the encoding (that is, could not be used for content based
     * encoding detection).
     * 
     * @return true if the string survives an encode/decode-as-ASCII round
     *         trip; otherwise true only for multi-byte encoders whose
     *         maximum is not four bytes per character
     */
    boolean canEncodeSimpleString(String charsetName, CharsetEncoder charsetEncocder, String simpleString) {
        if (simpleString.equals(encodeThenDecodeAsAscii(simpleString, charsetName))) {
            return true;
        }
        // Single byte encoders that do not round trip are rejected.
        // Multi-byte encoders are not checked, just assumed fine (for now),
        // except four byte streams, which are excluded (for now).
        // NOTE(review): the original comments suggest UTF-16 (max 4 bytes,
        // average 2) may have been meant as an exception to that exclusion,
        // but the code has always returned false for it; behavior is kept.
        float maxBytes = charsetEncocder.maxBytesPerChar();
        return maxBytes != 1 && maxBytes != 4;
    }

    /**
     * A very heuristic method. Should have table, someday.
     * <p>
     * Experimenting/debugging showed that the known EBCDIC charsets always
     * "mis"-translate the characters "&lt;?" to the ASCII characters "Lo".
     */
    private boolean isEbcidic(String charsetName, CharsetEncoder charsetEncocder) {
        boolean result = "Lo".equals(encodeThenDecodeAsAscii("<?", charsetName));
        if (result) {
            System.out.println(charsetName + " assumed to be Edcidic");
        }
        return result;
    }

    /**
     * Checks whether the encoder can encode LF, CR and CRLF. All three are
     * always checked, so that debug output (when DEBUGCRLF is on) covers each
     * delimiter that fails.
     * 
     * @param charsetName
     *            charset name, used for debug output only
     * @param charsetEncoder
     *            encoder to check
     * @return true only if all three line delimiters can be encoded
     */
    boolean canEncodeCRLF(String charsetName, CharsetEncoder charsetEncoder) {
        boolean lfOk = canEncodeDelimiter(charsetName, charsetEncoder, "LF", LF);
        boolean crOk = canEncodeDelimiter(charsetName, charsetEncoder, "CR", CR);
        boolean crlfOk = canEncodeDelimiter(charsetName, charsetEncoder, "CRLF", CRLF);
        return lfOk && crOk && crlfOk;
    }

    /**
     * Checks one line delimiter and, when DEBUGCRLF is on, reports a failure.
     */
    private boolean canEncodeDelimiter(String charsetName, CharsetEncoder charsetEncoder, String stringName, String testString) {
        if (charsetEncoder.canEncode(testString)) {
            return true;
        }
        if (DEBUGCRLF) {
            exploreConversion(charsetName, stringName, testString);
            System.out.println("can not encode " + stringName + " for " + charsetEncoder.charset().name());
        }
        return false;
    }

    /**
     * Debug aid: prints the decimal character values of the test string
     * before and after encoding it with the named charset and decoding the
     * bytes with the platform default charset.
     */
    private void exploreConversion(String charsetName, String stringName, String testString) throws Error {
        try {
            String newLF = new String(testString.getBytes(charsetName));
            System.out.print("old " + stringName + " (dec): ");
            dumpString(System.out, testString);
            System.out.println();
            System.out.print("new " + stringName + " (dec): ");
            dumpString(System.out, newLF);
            System.out.println();
        }
        catch (UnsupportedEncodingException e) {
            //should never happen, already checked
            throw new Error(e);
        }
    }

    /**
     * Prints the decimal value of every character, each followed by a space.
     * 
     * @param out
     *            stream to print to
     * @param lf2
     *            string whose characters are dumped
     */
    private void dumpString(PrintStream out, String lf2) {
        for (int i = 0; i < lf2.length(); i++) {
            out.print((int) lf2.charAt(i) + " ");
        }
    }

    /**
     * @return the VM specific main directory with "/xml" appended
     */
    public final static String getMainDirectoryBasedOnVMNameAndFileExtension() {
        String mainDirectory = getMainDirectoryBasedOnVMName() + "/xml";
        return mainDirectory;
    }

    private String getHeaderStart() {
        return "<?xml version=\"1.0\" encoding=\"";
    }

    private String getHeaderEnd() {
        return "\"?>";
    }
}