| /******************************************************************************* |
| * Copyright (c) 2004, 2005 IBM Corporation and others. |
| * All rights reserved. This program and the accompanying materials |
| * are made available under the terms of the Eclipse Public License v1.0 |
| * which accompanies this distribution, and is available at |
| * http://www.eclipse.org/legal/epl-v10.html |
| * |
| * Contributors: |
| * IBM Corporation - initial API and implementation |
| *******************************************************************************/ |
| package org.eclipse.wst.xml.tests.encoding; |
| |
| import java.io.BufferedWriter; |
| import java.io.ByteArrayOutputStream; |
| import java.io.File; |
| import java.io.FileNotFoundException; |
| import java.io.FileOutputStream; |
| import java.io.FileWriter; |
| import java.io.IOException; |
| import java.io.OutputStream; |
| import java.io.OutputStreamWriter; |
| import java.io.PrintStream; |
| import java.io.UnsupportedEncodingException; |
| import java.io.Writer; |
| import java.nio.charset.Charset; |
| import java.nio.charset.CharsetEncoder; |
| import java.nio.charset.CodingErrorAction; |
| import java.util.ArrayList; |
| import java.util.Iterator; |
| import java.util.List; |
| import java.util.Map; |
| import java.util.Set; |
| |
| import org.eclipse.core.runtime.content.IContentDescription; |
| |
| /** |
| * The purpose and logic of this class is to create small "XML files" of |
| * various, known encodings, write them to files, and in later tests, be sure |
| * appropriate encoding can be detected, and read in and intact characters. |
| */ |
| public class GenerateXMLFiles extends GenerateFiles { |
| private String LF = "\n"; |
| private String CR = "\r"; |
| private String CRLF = CR + LF; |
| // different text strings for comparisons |
| private String textUS_ASCII_LF = "abcdefghijklmnopqrstuvwxyz\n1234567890\nABCDEFGHIJKLMNOPQRSTUVWXYZ"; |
| private String textUS_ASCII_CRLF = "abcdefghijklmnopqrstuvwxyz\r\n1234567890\r\nABCDEFGHIJKLMNOPQRSTUVWXYZ"; |
| private boolean DEBUG = true; |
| private boolean DEBUGCRLF = false; |
| private boolean DEBUGINFO = true; |
| |
| |
| public GenerateXMLFiles() { |
| super(); |
| } |
| |
| public static void main(String[] args) { |
| //junit.textui.TestRunner.run(GenerateXMLFiles.class); |
| GenerateXMLFiles thisApp = new GenerateXMLFiles(); |
| try { |
| //thisApp.generateOriginalFiles(); |
| thisApp.generateAllFilesForCurrentVM(); |
| } |
| catch (IOException e) { |
| |
| e.printStackTrace(); |
| } |
| } |
| |
| private void generateAllFilesForCurrentVM() throws IOException { |
| Map allCharsetMap = Charset.availableCharsets(); |
| Set allKeys = allCharsetMap.keySet(); |
| Object[] allcharsets = allKeys.toArray(); |
| String[] allcharsetNames = new String[allcharsets.length]; |
| for (int i = 0; i < allcharsets.length; i++) { |
| allcharsetNames[i] = allcharsets[i].toString(); |
| |
| } |
| //createFiles(allcharsetNames, false); |
| createFiles(allcharsetNames, true); |
| |
| } |
| |
| private void createFiles(String[] charsetnames, boolean useCRLF) throws FileNotFoundException, IOException { |
| |
| String charsetName = null; |
| Writer out = null; |
| String mainDirectory = getMainDirectoryBasedOnVMNameAndFileExtension(); |
| List charsetFilesWritten = new ArrayList(); |
| for (int i = 0; i < charsetnames.length; i++) { |
| try { |
| |
| |
| charsetName = charsetnames[i]; |
| |
| Charset charset = Charset.forName(charsetName); |
| CharsetEncoder charsetEncoder = charset.newEncoder(); |
| charsetEncoder.onMalformedInput(CodingErrorAction.REPORT); |
| charsetEncoder.onUnmappableCharacter(CodingErrorAction.REPORT); |
| |
| String header = getHeaderStart() + charsetName + getHeaderEnd(); |
| String fulltext = null; |
| if (useCRLF) { |
| fulltext = header + textUS_ASCII_CRLF; |
| } |
| else { |
| fulltext = header + textUS_ASCII_LF; |
| } |
| |
| if (!isEbcidic(charsetName, charsetEncoder)) { |
| if (charsetEncoder.canEncode(fulltext)) { |
| // if (canEncodeCRLF(charsetName, charsetEncoder) |
| // && |
| // canEncodeSimpleString(charsetName, |
| // charsetEncoder, "<?") && |
| // charsetEncoder.canEncode(fulltext)) { |
| String outputfilename = "test-" + charsetName + ".xml"; |
| File outFile = FileUtil.makeFileFor(mainDirectory, outputfilename, null); |
| //System.out.println(outFile.getAbsolutePath()); |
| OutputStream outputStream = new FileOutputStream(outFile); |
| ByteArrayOutputStream bytesout = new ByteArrayOutputStream(); |
| |
| Writer fileWriter = new OutputStreamWriter(outputStream, charsetEncoder); |
| // this byte writer is created just to be able to |
| // count precise bytes. |
| Writer byteWriter = new OutputStreamWriter(bytesout, charsetEncoder); |
| |
| supplyBOMs(charsetName, outputStream, bytesout); |
| |
| out = new BufferedWriter(fileWriter); |
| |
| |
| out.write(fulltext); |
| byteWriter.write(fulltext); |
| out.close(); |
| byteWriter.flush(); |
| // if we made is this far, with no exceptions, |
| // etc., |
| // then |
| // must have been |
| // really written. |
| String writtenRecord = charsetName; |
| charsetFilesWritten.add(writtenRecord); |
| if (DEBUG) { |
| printDebugInfo(useCRLF, header, outputfilename, bytesout); |
| } |
| } |
| else { |
| if (DEBUGINFO) { |
| System.out.println(" *** could not convert sample ascii text for " + charsetName); |
| } |
| } |
| } |
| } |
| |
| catch (IOException e) { |
| if (DEBUGINFO) { |
| System.out.println(" ***** could not generate for " + charsetName); |
| String msg = e.getMessage(); |
| if (msg == null) |
| msg = ""; |
| System.out.println(" due to " + e.getClass().getName() + " " + msg); |
| } |
| } |
| catch (Exception e) { |
| if (DEBUGINFO) { |
| System.out.println(" ***** could not generate for " + charsetName); |
| String msg = e.getMessage(); |
| if (msg == null) |
| msg = ""; |
| System.out.println(" due to " + e.getClass().getName() + " " + msg); |
| } |
| } |
| finally { |
| if (out != null) { |
| out.close(); |
| } |
| } |
| } |
| |
| |
| // now create file that summarizes what was written |
| // suitable to paste as method in test class |
| File outFile = FileUtil.makeFileFor(mainDirectory, "testMethods.text", null); |
| FileWriter outproperties = new FileWriter(outFile); |
| outproperties.write(charsetFilesWritten.size() + CRLF); |
| Iterator items = charsetFilesWritten.iterator(); |
| int n = 0; |
| while (items.hasNext()) { |
| String itemCreated = (String) items.next(); |
| String testMethod = createMethod(n++, itemCreated); |
| outproperties.write(testMethod + CRLF); |
| } |
| outproperties.close(); |
| |
| } |
| |
| /** |
| * I thought this used to be automatic, but doesn't seem to be now?! |
| */ |
| private void supplyBOMs(String charsetName, OutputStream outputStream, ByteArrayOutputStream bytesout) throws IOException { |
| byte[] nullBytes = new byte[]{0x00, 0x00}; |
| if (charsetName.equals("UTF-16")) { |
| outputStream.write(IContentDescription.BOM_UTF_16LE); |
| bytesout.write(IContentDescription.BOM_UTF_16LE); |
| } |
| if (charsetName.equals("UTF-16LE")) { |
| outputStream.write(IContentDescription.BOM_UTF_16LE); |
| bytesout.write(IContentDescription.BOM_UTF_16LE); |
| } |
| if (charsetName.equals("X-UnicodeLittle")) { |
| outputStream.write(IContentDescription.BOM_UTF_16LE); |
| bytesout.write(IContentDescription.BOM_UTF_16LE); |
| } |
| if (charsetName.equals("UTF-16BE")) { |
| outputStream.write(IContentDescription.BOM_UTF_16BE); |
| bytesout.write(IContentDescription.BOM_UTF_16BE); |
| } |
| if (charsetName.equals("X-UnicodeBig")) { |
| outputStream.write(IContentDescription.BOM_UTF_16BE); |
| bytesout.write(IContentDescription.BOM_UTF_16BE); |
| } |
| if (charsetName.equals("UTF-32")) { |
| outputStream.write(nullBytes); |
| outputStream.write(IContentDescription.BOM_UTF_16LE); |
| bytesout.write(nullBytes); |
| bytesout.write(IContentDescription.BOM_UTF_16LE); |
| } |
| if (charsetName.equals("UTF-32LE")) { |
| outputStream.write(nullBytes); |
| outputStream.write(IContentDescription.BOM_UTF_16LE); |
| bytesout.write(nullBytes); |
| bytesout.write(IContentDescription.BOM_UTF_16LE); |
| } |
| if (charsetName.equals("UTF-32BE")) { |
| outputStream.write(nullBytes); |
| outputStream.write(IContentDescription.BOM_UTF_16BE); |
| bytesout.write(nullBytes); |
| bytesout.write(IContentDescription.BOM_UTF_16BE); |
| } |
| } |
| |
| /** |
| * @param i |
| * @param itemCreated |
| */ |
| private String createMethod(int i, String itemCreated) { |
| String template = " public void testFile" + i + "() throws CoreException, IOException {\r\n" + " String charsetName = \"" + itemCreated + "\";\r\n" + " doGenTest(charsetName);\r\n" + " }"; |
| return template; |
| } |
| |
| private void printDebugInfo(boolean useCRLF, String header, String outputfilename, ByteArrayOutputStream bytesout) { |
| byte[] bytes = bytesout.toByteArray(); |
| int nBytes = bytes.length; |
| int nChars = 0; |
| if (useCRLF) { |
| nChars = header.length() + textUS_ASCII_CRLF.length(); |
| } |
| else { |
| nChars = header.length() + textUS_ASCII_LF.length(); |
| } |
| |
| System.out.println("Wrote " + nChars + " characters and " + nBytes + " bytes to " + outputfilename); |
| } |
| |
| // TODO: never used |
| boolean canEncodeSimpleString(String charsetName, CharsetEncoder charsetEncocder, String simpleString) { |
| // this method added since some encoders don't report that they can't |
| // encode something, but they obviously |
| // can't, at least in the normal meaning of the word. |
| // This seems to mostly apply to some IBM varieties where, apparently, |
| // the input can't be interpreted at all without knowing encoding |
| // (that is |
| // could not be used for content based encoding). |
| boolean result = false; |
| |
| String newAsciiString = null; |
| byte[] translatedBytes = null; |
| try { |
| translatedBytes = simpleString.getBytes(charsetName); |
| newAsciiString = new String(translatedBytes, "ascii"); |
| } |
| catch (UnsupportedEncodingException e) { |
| // impossible, since checked already |
| throw new Error(e); |
| } |
| result = simpleString.equals(newAsciiString); |
| if (!result) { |
| if (charsetEncocder.maxBytesPerChar() != 1) { |
| // don't check mulitbyte encoders, just assume true (for now). |
| result = true; |
| if (charsetEncocder.maxBytesPerChar() == 4) { |
| //except, let's just exclude four byte streams, for now. |
| result = false; |
| if (charsetEncocder.averageBytesPerChar() == 2) { |
| // except, for some reason UTF has max bytes of 4 |
| // (average bytes of 2). |
| result = false; |
| } |
| } |
| } |
| } |
| |
| return result; |
| } |
| |
| /** |
| * A very heuristic method. Should have table, someday. |
| */ |
| private boolean isEbcidic(String charsetName, CharsetEncoder charsetEncocder) { |
| boolean result = false; |
| String simpleString = "<?"; |
| String newAsciiString = null; |
| byte[] translatedBytes = null; |
| try { |
| translatedBytes = simpleString.getBytes(charsetName); |
| newAsciiString = new String(translatedBytes, "ascii"); |
| } |
| catch (UnsupportedEncodingException e) { |
| // impossible, since checked already |
| throw new Error(e); |
| } |
| // experimenting/debugging showed the known ebcidic onces always |
| // "mis" tranlated to characters L and o. |
| result = "Lo".equals(newAsciiString); |
| if (result) { |
| System.out.println(charsetName + " assumed to be Edcidic"); |
| } |
| return result; |
| } |
| |
| /** |
| * @param charset |
| */ |
| boolean canEncodeCRLF(String charsetName, CharsetEncoder charsetEncoder) { |
| boolean result = true; |
| //String charsetCononicalName = charsetEncoder.charset().name(); |
| if (!charsetEncoder.canEncode(LF)) { |
| if (DEBUGCRLF) { |
| String stringName = "LF"; |
| String testString = LF; |
| exploreConversion(charsetName, stringName, testString); |
| System.out.println("can not encode LF for " + charsetEncoder.charset().name()); |
| } |
| result = false; |
| } |
| if (!charsetEncoder.canEncode(CR)) { |
| if (DEBUGCRLF) { |
| String stringName = "CR"; |
| String testString = CR; |
| exploreConversion(charsetName, stringName, testString); |
| System.out.println("can not encode CR for " + charsetEncoder.charset().name()); |
| } |
| result = false; |
| } |
| if (!charsetEncoder.canEncode(CRLF)) { |
| if (DEBUGCRLF) { |
| String stringName = "CRLF"; |
| String testString = CRLF; |
| exploreConversion(charsetName, stringName, testString); |
| System.out.println("can not encode CRLF for " + charsetEncoder.charset().name()); |
| } |
| result = false; |
| } |
| return result; |
| |
| } |
| |
| private void exploreConversion(String charsetName, String stringName, String testString) throws Error { |
| try { |
| String newLF = new String(testString.getBytes(charsetName)); |
| System.out.print("old " + stringName + " (dec): "); |
| dumpString(System.out, testString); |
| System.out.println(); |
| System.out.print("new " + stringName + " (dec): "); |
| dumpString(System.out, newLF); |
| System.out.println(); |
| } |
| catch (UnsupportedEncodingException e) { |
| //should never happen, already checked |
| throw new Error(e); |
| } |
| } |
| |
| /** |
| * @param out |
| * @param lf2 |
| */ |
| private void dumpString(PrintStream out, String lf2) { |
| for (int i = 0; i < lf2.length(); i++) { |
| out.print((int) lf2.charAt(i) + " "); |
| } |
| |
| } |
| |
| public final static String getMainDirectoryBasedOnVMNameAndFileExtension() { |
| String mainDirectory = getMainDirectoryBasedOnVMName() + "/xml"; |
| return mainDirectory; |
| } |
| |
| private String getHeaderStart() { |
| return "<?xml version=\"1.0\" encoding=\""; |
| } |
| |
| private String getHeaderEnd() { |
| return "\"?>"; |
| } |
| |
| } |