blob: ac1224c631b19b1420d332ba70f3c84f08a326f0 [file] [log] [blame]
/*******************************************************************************
* Copyright (c) 2001, 2005 IBM Corporation and others.
* All rights reserved. This program and the accompanying materials
* are made available under the terms of the Eclipse Public License v1.0
* which accompanies this distribution, and is available at
* http://www.eclipse.org/legal/epl-v10.html
*
* Contributors:
* IBM Corporation - initial API and implementation
* Jens Lukowski/Innoopract - initial renaming/restructuring
*
*******************************************************************************/
package org.eclipse.wst.sse.core.internal.encoding;
import java.io.BufferedReader;
import java.io.ByteArrayOutputStream;
import java.io.CharArrayReader;
import java.io.IOException;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.Reader;
import java.io.StringReader;
import java.nio.charset.Charset;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.CodingErrorAction;
import java.nio.charset.UnmappableCharacterException;
import org.eclipse.core.resources.IFile;
import org.eclipse.core.runtime.CoreException;
import org.eclipse.core.runtime.IProgressMonitor;
import org.eclipse.core.runtime.IStatus;
import org.eclipse.core.runtime.Platform;
import org.eclipse.core.runtime.Status;
import org.eclipse.core.runtime.content.IContentDescription;
import org.eclipse.core.runtime.content.IContentTypeManager;
import org.eclipse.core.runtime.jobs.Job;
import org.eclipse.wst.sse.core.internal.SSECoreMessages;
import org.eclipse.wst.sse.core.internal.SSECorePlugin;
import org.eclipse.wst.sse.core.internal.encoding.util.Assert;
import org.eclipse.wst.sse.core.internal.encoding.util.Logger;
import org.eclipse.wst.sse.core.internal.exceptions.CharConversionErrorWithDetail;
import org.eclipse.wst.sse.core.internal.exceptions.MalformedOutputExceptionWithDetail;
import org.eclipse.wst.sse.core.internal.exceptions.UnsupportedCharsetExceptionWithDetail;
/**
 * Creates an encoded (byte) form of character content, deciding the charset
 * from the content itself, the resource settings, or workbench defaults, and
 * writing any required Byte Order Mark ahead of the encoded bytes.
 */
public class CodedStreamCreator extends CodedIO {

	// the 32 bytes used by default by ByteArrayOutputStream is a little
	// small for typical documents, so start larger
	private final static int INITIAL_BUFFER_SIZE = 1024 * 16;

	private static final String PROGRAM_ERROR__FAILED_TO_FIND_ANY_CHARSET_ANYWHERE_ = "Program error: failed to find any charset anywhere!"; //$NON-NLS-1$

	private static final String UTF_16BE_CHARSET_NAME = "UTF-16BE"; //$NON-NLS-1$
	private static final String UTF_16LE_CHARSET_NAME = "UTF-16LE"; //$NON-NLS-1$
	// private static final String UTF_16_CHARSET_NAME = "UTF-16";
	// //$NON-NLS-1$
	private static final String UTF_8_CHARSET_NAME = "UTF-8"; //$NON-NLS-1$

	// true only when the client handed us a Reader via set(String, Reader);
	// content-based detection is only attempted in that case
	private boolean fClientSuppliedReader;

	// future_TODO: this 'checkConversion' can be a little pricey for large
	// files, could be a user preference, or something.
	// private static final boolean checkConversion = true;

	// result of the most recent analyze(); non-null once analyzed
	private EncodingMemento fCurrentEncodingMemento;
	// memento derived from the platform content description, if any
	private EncodingMemento fEncodingMemento;
	private String fFilename;
	// guards analyze() so detection runs at most once per set(...) cycle
	private boolean fHasBeenAnalyzed;
	private IFile fIFile;
	// memento from a previous load; used to carry over UTF-8 BOM "state"
	private EncodingMemento fPreviousEncodingMemento;
	private Reader fReader;
	// mark()-capable wrapper around fReader, created lazily
	private Reader fResettableReader;

	// Byte Order Marks written ahead of the encoded content when required
	private final byte[] UTF16BEBOM = new byte[]{(byte) 0xFE, (byte) 0xFF};
	private final byte[] UTF16LEBOM = new byte[]{(byte) 0xFF, (byte) 0xFE};
	private final byte[] UTF3BYTEBOM = new byte[]{(byte) 0xEF, (byte) 0xBB, (byte) 0xBF};

	public CodedStreamCreator() {
		super();
	}

	public CodedStreamCreator(String filename, char[] characterArray) {
		super();
		fFilename = filename;
		fReader = new CharArrayReader(characterArray);
	}

	public CodedStreamCreator(String filename, Reader reader) {
		super();
		fFilename = filename;
		fReader = reader;
	}

	public CodedStreamCreator(String filename, String textString) {
		super();
		fFilename = filename;
		fReader = new StringReader(textString);
	}

	/**
	 * The primary method which contains the highest level rules for how to
	 * decide appropriate decoding rules: 1. first check for unicode stream
	 * 2. then look for encoding specified in content (according to the type
	 * of content it is ... xml, html, jsp, etc.) 3. then check for various
	 * settings: file settings first, if null check project settings, if
	 * null, check user preferences. 4. lastly (or, whatever the last user
	 * preference is) use "workbench defaults".
	 */
	private void analyze() throws CoreException, IOException {
		Reader resettableReader = getResettableReader();
		try {
			if (fCurrentEncodingMemento == null) {
				resettableReader.reset();
				fCurrentEncodingMemento = checkForEncodingInContents();
			}
			// if encoding strategy doesn't provide an answer, then try file
			// settings, project settings, user preferences, and finally
			// workbench default.
			if (fCurrentEncodingMemento == null || fCurrentEncodingMemento.getDetectedCharsetName() == null) {
				resettableReader.reset();
				fCurrentEncodingMemento = getEncodingMementoFromResourceAndPreference();
			}
			// use DefaultNameRules from NonContentBasedEncodingRules as the
			// final default. Note: this must test fCurrentEncodingMemento
			// (the field the fallback populates), not fEncodingMemento --
			// otherwise fCurrentEncodingMemento could remain null and trip
			// the post condition assert in getCurrentEncodingMemento().
			if (fCurrentEncodingMemento == null) {
				handleNotProvidedFromContentCase();
			}
			fHasBeenAnalyzed = true;
		} finally {
			if (resettableReader != null) {
				resettableReader.reset();
			}
		}
	}

	/**
	 * Need to check conversion early on. There's some danger that old
	 * contents of a file are set to empty, if an exception occurs. Re-walks
	 * the content character by character to locate the first character the
	 * target charset cannot encode, and throws an exception carrying that
	 * position.
	 * NOTE(review): canEncode is applied to single chars, so supplementary
	 * characters (surrogate pairs) may be misreported -- confirm before
	 * relying on the reported position for such content.
	 *
	 * @param memento source of the java and detected charset names
	 * @param encodingRule when IGNORE_CONVERSION_ERROR, problems are logged
	 *            instead of thrown
	 * @throws java.io.UnsupportedEncodingException
	 * @throws MalformedOutputExceptionWithDetail
	 * @deprecated - we need to find a "cheaper" way to do this
	 *             functionality so likely to go away in future
	 */
	private void checkConversion(EncodingMemento memento, EncodingRule encodingRule) throws IOException {
		String javaEncoding = memento.getJavaCharsetName();
		String detectedEncoding = memento.getDetectedCharsetName();
		Charset charset = Charset.forName(javaEncoding);
		CharsetEncoder charsetEncoder = charset.newEncoder();
		charsetEncoder.onMalformedInput(CodingErrorAction.REPORT);
		charsetEncoder.onUnmappableCharacter(CodingErrorAction.REPORT);
		Reader reader = getResettableReader();
		reader.reset();
		int currentChar = reader.read();
		int currentPos = 1;
		try {
			while (currentChar != -1) {
				// note: this can probably be made more efficient later, to
				// check buffer by buffer, instead of character by
				// character.
				try {
					boolean canConvert = charsetEncoder.canEncode((char) currentChar);
					if (!canConvert) {
						if (encodingRule == EncodingRule.IGNORE_CONVERSION_ERROR) {
							// if we're told to ignore the encoding
							// conversion error, notice we still want to
							// detect and log it. We simply don't throw the
							// exception, and we do continue with the save.
							Logger.log(Logger.ERROR, "Encoding Conversion Error during save"); //$NON-NLS-1$
						} else {
							throw new MalformedOutputExceptionWithDetail(javaEncoding, detectedEncoding, currentPos);
						}
					}
					currentChar = reader.read();
					currentPos++;
				}
				// IBM's JRE seems to throw NPE when a DBCS char is given to
				// an SBCS charsetEncoder
				catch (NullPointerException e) {
					throw new CharConversionErrorWithDetail(javaEncoding);
				}
			}
			// if we get all the way through the loop without throwing an
			// exception, then there must be an error not detectable when
			// going character by character.
			throw new CharConversionErrorWithDetail(javaEncoding);
		} finally {
			reader.reset();
		}
	}

	/**
	 * Computes an encoding memento from the content itself, using the
	 * platform content-type manager to describe the client supplied reader.
	 * Falls back to a simple UTF-8 memento when no content description is
	 * available.
	 */
	private EncodingMemento checkForEncodingInContents() throws CoreException, IOException {
		EncodingMemento result = null;
		// if encoding memento is already set, no need to get it again.
		if (fEncodingMemento != null) {
			result = fEncodingMemento;
		} else {
			if (fClientSuppliedReader) {
				fReader.reset();
				IContentTypeManager contentTypeManager = Platform.getContentTypeManager();
				try {
					IContentDescription contentDescription = contentTypeManager.getDescriptionFor(fReader, fFilename, IContentDescription.ALL);
					if (contentDescription != null) {
						fEncodingMemento = createMemento(contentDescription);
					} else {
						fEncodingMemento = CodedIO.createEncodingMemento(UTF_8_CHARSET_NAME);
					}
				} catch (NullPointerException e) {
					// TODO: work around for 5/14 bug in base, should be
					// removed when move up to 5/21
					// just create a simple default one
					fEncodingMemento = CodedIO.createEncodingMemento(UTF_8_CHARSET_NAME);
				}
				result = fEncodingMemento;
			} else {
				throw new IllegalStateException("unexpected state: encodingMemento was null but no input stream supplied"); //$NON-NLS-1$
			}
		}
		return result;
	}

	/**
	 * Encodes the character content to the given output stream, writing the
	 * appropriate Byte Order Mark first when the target charset calls for
	 * one.
	 *
	 * @param outputStream stream to write to; the caller retains ownership
	 *            and must close it (this method only flushes)
	 * @param encodingRule how strictly to apply the detected encoding
	 * @param use3ByteBOMifUTF8 whether a 3-byte BOM should be written for
	 *            UTF-8 output
	 */
	private void dump(OutputStream outputStream, EncodingRule encodingRule, boolean use3ByteBOMifUTF8) throws CoreException, IOException {
		getCurrentEncodingMemento();
		String javaEncodingName = null;
		if (encodingRule == EncodingRule.CONTENT_BASED) {
			if (fCurrentEncodingMemento.isValid()) {
				javaEncodingName = fCurrentEncodingMemento.getJavaCharsetName();
			} else {
				throw new UnsupportedCharsetExceptionWithDetail(fCurrentEncodingMemento);
			}
		} else if (encodingRule == EncodingRule.IGNORE_CONVERSION_ERROR) {
			javaEncodingName = fCurrentEncodingMemento.getJavaCharsetName();
		} else if (encodingRule == EncodingRule.FORCE_DEFAULT) {
			javaEncodingName = fCurrentEncodingMemento.getAppropriateDefault();
		}
		// write appropriate "header" unicode BOM bytes.
		// Note: Java seems to write an appropriate header for UTF-16, but
		// not UTF-8 nor UTF-16BE. This may vary by JRE version, so needs to
		// be tested well.
		// Note: javaEncodingName can be null in invalid cases, so we need
		// to skip the whole check if that's the case.
		if (javaEncodingName != null) {
			if ((javaEncodingName.equals(UTF_8_CHARSET_NAME) && use3ByteBOMifUTF8) || (javaEncodingName.equals(UTF_8_CHARSET_NAME) && fCurrentEncodingMemento.isUTF83ByteBOMUsed())) {
				outputStream.write(UTF3BYTEBOM);
			} else if (javaEncodingName.equals(UTF_16LE_CHARSET_NAME)) {
				outputStream.write(UTF16LEBOM);
			} else if (javaEncodingName.equals(UTF_16BE_CHARSET_NAME)) {
				outputStream.write(UTF16BEBOM);
			}
		}
		// TODO add back in line delimiter handling the "right" way
		// (updating markers, not requiring string, etc. ... may need to
		// move to document level)
		// allTextBuffer = handleLineDelimiter(allTextBuffer, document);
		Reader reader = getResettableReader();
		// be sure to test large "readers" ... we'll need to make sure they
		// all can reset to initial position (StringReader, CharArrayReader,
		// and DocumentReader should all work ok).
		reader.reset();
		// There must be cleaner logic somehow, but the idea is that
		// javaEncodingName can be null if the original detected encoding is
		// not valid (and FORCE_DEFAULT was not specified). Hence, we WANT
		// the first Charset.forName to throw the appropriate exception.
		Charset charset = null;
		// this call checks the "override" properties file
		javaEncodingName = CodedIO.getAppropriateJavaCharset(javaEncodingName);
		if (javaEncodingName == null) {
			charset = Charset.forName(fCurrentEncodingMemento.getDetectedCharsetName());
		} else {
			charset = Charset.forName(javaEncodingName);
		}
		CharsetEncoder charsetEncoder = charset.newEncoder();
		if (!(encodingRule == EncodingRule.IGNORE_CONVERSION_ERROR)) {
			charsetEncoder.onMalformedInput(CodingErrorAction.REPORT);
			charsetEncoder.onUnmappableCharacter(CodingErrorAction.REPORT);
		} else {
			charsetEncoder.onMalformedInput(CodingErrorAction.REPLACE);
			charsetEncoder.onUnmappableCharacter(CodingErrorAction.REPLACE);
		}
		OutputStreamWriter outputStreamWriter = new OutputStreamWriter(outputStream, charsetEncoder);
		char[] charbuf = new char[CodedIO.MAX_BUF_SIZE];
		int nRead = 0;
		try {
			while (nRead != -1) {
				nRead = reader.read(charbuf, 0, MAX_BUF_SIZE);
				if (nRead > 0) {
					// note: previously flushed before every write, which
					// only added overhead; the single flush in 'finally'
					// produces identical output.
					outputStreamWriter.write(charbuf, 0, nRead);
				}
			}
		} catch (UnmappableCharacterException e) {
			// re-walk the content character by character to produce a
			// detailed exception carrying the offending position
			checkConversion(fCurrentEncodingMemento, encodingRule);
		} finally {
			// since we don't own the original output stream, we won't
			// close it; the caller who passed it to us must close the
			// original one when appropriate. (but we do flush to be sure
			// all is up-to-date)
			outputStreamWriter.flush();
		}
	}

	private boolean get3ByteBOMPreference() {
		return SSECorePlugin.getDefault().getPluginPreferences().getBoolean(CommonEncodingPreferenceNames.USE_3BYTE_BOM_WITH_UTF8);
	}

	/**
	 * Convenience form of
	 * {@link #getCodedByteArrayOutputStream(EncodingRule)} using
	 * EncodingRule.CONTENT_BASED.
	 */
	public ByteArrayOutputStream getCodedByteArrayOutputStream() throws CoreException, IOException {
		return getCodedByteArrayOutputStream(EncodingRule.CONTENT_BASED);
	}

	/**
	 * Returns the character content encoded per the given rule as an
	 * in-memory byte stream, including any leading BOM bytes.
	 */
	public ByteArrayOutputStream getCodedByteArrayOutputStream(EncodingRule encodingRule) throws CoreException, IOException {
		ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream(INITIAL_BUFFER_SIZE);
		dump(byteArrayOutputStream, encodingRule, get3ByteBOMPreference());
		return byteArrayOutputStream;
	}

	/**
	 * Returns the encoding memento for the current content, running the
	 * analysis on first call.
	 */
	public EncodingMemento getCurrentEncodingMemento() throws CoreException, IOException {
		if (!fHasBeenAnalyzed) {
			analyze();
		}
		// post condition
		Assert.isNotNull(fCurrentEncodingMemento, "illegal post condition state"); //$NON-NLS-1$
		// be sure to carry over appropriate encoding "state" that may be
		// relevant.
		if (fPreviousEncodingMemento != null) {
			fCurrentEncodingMemento.setUTF83ByteBOMUsed(fPreviousEncodingMemento.isUTF83ByteBOMUsed());
		}
		return fCurrentEncodingMemento;
	}

	/*
	 * This method is called only when encoding is not detected in the file.
	 *
	 * Here is the encoding lookup order we will try: - try resource content
	 * description (Eclipse Text file encoding) - try resource content
	 * properties (for JSP only) - try content type encoding preferences
	 * (for HTML only) - try resource content description (Eclipse Text file
	 * encoding, implicit check)
	 *
	 * Note: This method appears in both CodedReaderCreator and
	 * CodedStreamCreator (with just a minor difference). They should be
	 * kept the same.
	 */
	private EncodingMemento getEncodingMementoFromResourceAndPreference() throws IOException, CoreException {
		EncodingMemento encodingMemento = fEncodingMemento;
		// Follow Eclipse Platform's direction. Get the charset from IFile.
		if (fIFile != null) {
			String charset = fIFile.getCharset();
			encodingMemento = CodedIO.createEncodingMemento(charset);
		}
		return encodingMemento;
	}

	/**
	 * Returns a reader over the content that supports reset() back to the
	 * beginning, wrapping fReader in a BufferedReader when it does not
	 * support mark itself.
	 * NOTE(review): when a client supplied reader already supports mark, it
	 * is used as-is without an explicit mark(); this assumes reset() on
	 * such a reader returns to the start (true for StringReader and
	 * CharArrayReader) -- confirm for other client readers.
	 */
	private Reader getResettableReader() {
		if (fResettableReader == null) {
			if (fReader.markSupported()) {
				fResettableReader = fReader;
			} else {
				fResettableReader = new BufferedReader(fReader);
				try {
					fResettableReader.mark(MAX_MARK_SIZE);
				} catch (IOException e) {
					// impossible, since we just made it markable
					throw new Error(e);
				}
			}
		}
		return fResettableReader;
	}

	/**
	 * Last-resort fallback: derives a charset from the
	 * non-content-based default name rules and records it as the current
	 * memento.
	 */
	protected void handleNotProvidedFromContentCase() {
		// move to "detectors" if not already
		String specDefault = null;
		// specDefault = getEncodingDetector().getSpecDefaultEncoding();
		String charset = NonContentBasedEncodingRules.useDefaultNameRules(specDefault);
		Assert.isNotNull(charset, PROGRAM_ERROR__FAILED_TO_FIND_ANY_CHARSET_ANYWHERE_);
		fCurrentEncodingMemento = CodedIO.createEncodingMemento(charset);
	}

	// TODO We just copy the content properties encoding to current
	// resource's encoding for now. May improve the UI later by setting an
	// informational message and/or disable the content properties encoding
	// field.
	// TODO make private if needed, else remove
	void migrateContentPropertiesEncoding(String encoding) throws CoreException {
		// NOTE(review): the charset is set twice -- once synchronously here
		// and once from the background job below. The direct call looks
		// redundant (and can fail while the resource tree is locked);
		// confirm before removing it.
		if (fIFile != null)
			fIFile.setCharset(encoding, null);
		final IFile file = fIFile;
		final String charset = encoding;
		Job migrater = new Job(SSECoreMessages.Migrate_Charset) {
			protected IStatus run(IProgressMonitor monitor) {
				if (file != null) {
					try {
						file.setCharset(charset, null);
					} catch (CoreException e) {
						Logger.logException(e);
					}
				}
				return Status.OK_STATUS;
			}
		};
		// system job: no UI progress feedback needed
		migrater.setSystem(true);
		migrater.schedule();
	}

	/**
	 * Returns all per-content state to its initial value so this instance
	 * can be reused via one of the set(...) methods. Note: this must also
	 * clear fResettableReader, fEncodingMemento and fIFile -- previously
	 * these were left stale, so a reused instance could read from a
	 * buffered wrapper around an old reader, or reuse an old file handle or
	 * content-derived memento.
	 */
	private void resetAll() {
		fFilename = null;
		fReader = null;
		fResettableReader = null;
		fPreviousEncodingMemento = null;
		fCurrentEncodingMemento = null;
		fEncodingMemento = null;
		fHasBeenAnalyzed = false;
		fClientSuppliedReader = false;
		fIFile = null;
	}

	public void set(IFile file, Reader reader) {
		// delegate first: set(String, Reader) calls resetAll(), which
		// clears fIFile, so the file must be assigned afterwards.
		set(file.getName(), reader);
		fIFile = file;
	}

	public void set(String filename, char[] characterArray) {
		resetAll();
		fFilename = filename;
		fReader = new CharArrayReader(characterArray);
	}

	public void set(String filename, Reader reader) {
		resetAll();
		fFilename = filename;
		fReader = reader;
		fClientSuppliedReader = true;
	}

	public void set(String filename, String textString) {
		set(filename, new StringReader(textString));
	}

	/**
	 * Records the memento from a previous load of the same content, so BOM
	 * "state" (UTF-8 3-byte BOM usage) can be carried over on save.
	 */
	public void setPreviousEncodingMemento(EncodingMemento previousEncodingMemento) {
		fPreviousEncodingMemento = previousEncodingMemento;
	}
}