blob: e3c9d995d453e12ec9e90403f968fa973acf18f0 [file] [log] [blame]
/*******************************************************************************
* Copyright (c) 2001, 2005 IBM Corporation and others.
* All rights reserved. This program and the accompanying materials
* are made available under the terms of the Eclipse Public License v1.0
* which accompanies this distribution, and is available at
* http://www.eclipse.org/legal/epl-v10.html
*
* Contributors:
* IBM Corporation - initial API and implementation
* Jens Lukowski/Innoopract - initial renaming/restructuring
*
*******************************************************************************/
package org.eclipse.wst.sse.core.internal.encoding.util;
import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.nio.charset.Charset;
import org.eclipse.core.resources.IStorage;
import org.eclipse.core.runtime.CoreException;
import org.eclipse.wst.sse.core.internal.encoding.CodedIO;
import org.eclipse.wst.sse.core.internal.encoding.EncodingMemento;
import org.eclipse.wst.sse.core.internal.encoding.IResourceCharsetDetector;
/**
* Detects whether an input stream begins with a Unicode byte order mark
* (BOM) and, if it does, which Unicode encoding that BOM indicates.
*/
public class UnicodeBOMEncodingDetector implements IResourceCharsetDetector {

    /**
     * Thrown internally when a byte needed for BOM detection cannot be
     * obtained, either because the stream reports no bytes available or
     * because end of stream was reached.
     */
    public static class NotEnoughInputForBOMException extends IOException {
        /**
         * Default <code>serialVersionUID</code>
         */
        private static final long serialVersionUID = 1L;

        public NotEnoughInputForBOMException() {
            super();
        }

        public NotEnoughInputForBOMException(String s) {
            super(s);
        }
    }

    // Individual bytes that make up the recognized byte order marks.
    private static final byte BB = (byte) 0xBB;
    private static final byte BF = (byte) 0xBF;
    private static final byte EF = (byte) 0xEF;
    private static final byte FE = (byte) 0xFE;
    private static final byte FF = (byte) 0xFF;

    private static final String UTF_16BE_CHARSET_NAME = "UTF-16BE"; //$NON-NLS-1$
    private static final String UTF_16LE_CHARSET_NAME = "UTF-16LE"; //$NON-NLS-1$
    private static final String UTF_8_CHARSET_NAME = "UTF-8"; //$NON-NLS-1$

    /** Stream to inspect; null until one of the set methods supplies it. */
    private InputStream fInputStream = null;

    /** True when the current input cannot be inspected for a BOM. */
    private boolean fNoBOMPossible;

    /**
     * Reads the leading bytes of the stream and matches them against the
     * UTF-16BE (FE FF), UTF-16LE (FF FE) and UTF-8 (EF BB BF) byte order
     * marks.
     *
     * Two bytes are read first; a third is read only when those two are not
     * a UTF-16 BOM. The bytes read are consumed from the stream whether or
     * not a BOM is recognized.
     *
     * @param inputStream
     *            the stream to read from
     * @return a memento describing the detected encoding, or null if no BOM
     *         was recognized or too few bytes were available
     */
    private EncodingMemento checkForBOM(InputStream inputStream) {
        EncodingMemento result = null;
        try {
            byte b1 = getNextByte(inputStream);
            byte b2 = getNextByte(inputStream);
            if (b1 == FE && b2 == FF) {
                result = createEncodingMemento(UTF_16BE_CHARSET_NAME);
                result.setUnicodeStream(true);
            } else if (b1 == FF && b2 == FE) {
                result = createEncodingMemento(UTF_16LE_CHARSET_NAME);
                result.setUnicodeStream(true);
            } else {
                byte b3 = getNextByte(inputStream);
                if (b1 == EF && b2 == BB && b3 == BF) {
                    // createEncodingMemento already records that the
                    // 3-byte UTF-8 BOM was used.
                    result = createEncodingMemento(UTF_8_CHARSET_NAME);
                }
            }
        } catch (NotEnoughInputForBOMException e) {
            // Expected for empty streams, and for short non-BOM streams
            // (for example a two-byte stream whose bytes match the first
            // two bytes of the UTF-8 BOM). Simply report "no BOM".
            result = null;
        } catch (IOException e) {
            // Other I/O errors are not expected here. The Error type is
            // kept so that what callers may observe does not change.
            throw new Error(e);
        }
        return result;
    }

    /**
     * Builds a memento for the given charset name, recording both the name
     * as given and the canonical name reported by
     * {@link Charset#forName(String)}. For UTF-8 the memento is also flagged
     * as using the 3-byte BOM.
     *
     * @param javaEncodingName
     *            charset name understood by {@link Charset#forName(String)}
     * @return the populated memento, never null
     */
    private EncodingMemento createEncodingMemento(String javaEncodingName) {
        EncodingMemento encodingMemento = new EncodingMemento();
        encodingMemento.setJavaCharsetName(javaEncodingName);
        String ianaName = Charset.forName(javaEncodingName).name();
        encodingMemento.setDetectedCharsetName(ianaName);
        if (javaEncodingName.equals(UTF_8_CHARSET_NAME)) {
            encodingMemento.setUTF83ByteBOMUsed(true);
        }
        return encodingMemento;
    }

    /**
     * Returns the charset name indicated by the BOM at the start of the
     * current input. Each call reads from the stream again, see
     * {@link #getEncodingMemento()}.
     *
     * @return the detected charset name, or null if no BOM was detected
     * @throws IOException
     *             declared for interface compatibility
     */
    public String getEncoding() throws IOException {
        EncodingMemento memento = getEncodingMemento();
        // No BOM (or no BOM possible) yields a null memento; report that as
        // "no encoding detected" instead of failing with a
        // NullPointerException.
        if (memento == null) {
            return null;
        }
        return memento.getDetectedCharsetName();
    }

    /**
     * Returns an encoding memento if a BOM is detected at the start of the
     * current input, otherwise null.
     *
     * Recognized marks: 0xFEFF UTF-16 big-endian, 0xFFFE UTF-16
     * little-endian, 0xEFBBBF UTF-8.
     *
     * If a BOM is detected, the stream is left positioned after the BOM. If
     * no BOM is detected, up to three bytes have been consumed. This method
     * does not itself call mark() or reset() on the stream; it only checks
     * that mark is supported. NOTE(review): presumably the caller marks the
     * stream beforehand and resets it afterwards - confirm against callers.
     *
     * @return the memento for the detected encoding, or null if no BOM was
     *         found or the input was supplied as a reader that cannot be
     *         inspected for a BOM
     * @throws IllegalStateException
     *             if no input has been set
     * @throws IllegalArgumentException
     *             if the input stream does not support mark
     */
    public EncodingMemento getEncodingMemento() {
        if (fNoBOMPossible) {
            return null;
        }
        if (fInputStream == null) {
            throw new IllegalStateException("input must be set before use"); //$NON-NLS-1$
        }
        if (!fInputStream.markSupported()) {
            throw new IllegalArgumentException("inputStream must be resetable"); //$NON-NLS-1$
        }
        return checkForBOM(fInputStream);
    }

    /**
     * Reads a single byte, but only if the stream reports that at least one
     * byte can be read without blocking.
     *
     * @param inputStream
     *            the stream to read from
     * @return the byte read
     * @throws NotEnoughInputForBOMException
     *             if no byte is reported as available or end of stream is
     *             reached
     * @throws IOException
     *             if the underlying stream fails
     */
    private byte getNextByte(InputStream inputStream) throws IOException {
        // be sure we won't block
        if (inputStream.available() <= 0) {
            throw new NotEnoughInputForBOMException("typically not an error"); //$NON-NLS-1$
        }
        int value = inputStream.read();
        // Test for end of stream before narrowing: -1 narrowed or masked to
        // a byte is indistinguishable from the data byte 0xFF.
        if (value == -1) {
            throw new NotEnoughInputForBOMException("typically not an error"); //$NON-NLS-1$
        }
        return (byte) value;
    }

    /**
     * @return always null; there is no default encoding for this detector
     */
    public String getSpecDefaultEncoding() {
        // There is no default for this case
        return null;
    }

    /**
     * Clears all state left by a previous set call.
     */
    private void resetAll() {
        fNoBOMPossible = false;
        fInputStream = null;
    }

    /**
     * Sets the stream to inspect, discarding any previous input.
     *
     * @param inputStream
     *            the stream to inspect; must support mark for
     *            {@link #getEncodingMemento()} to accept it
     */
    public void set(InputStream inputStream) {
        resetAll();
        fInputStream = inputStream;
    }

    /**
     * Sets the input to the contents of the given storage, wrapped in a
     * buffered stream of size CodedIO.MAX_BUF_SIZE.
     *
     * NOTE(review): the stream opened here is not closed by this class -
     * confirm who is responsible for closing it.
     *
     * @param iStorage
     *            the storage whose contents are inspected
     * @throws CoreException
     *             if the storage contents cannot be obtained
     */
    public void set(IStorage iStorage) throws CoreException {
        set(new BufferedInputStream(iStorage.getContents(), CodedIO.MAX_BUF_SIZE));
    }

    /**
     * Sets the input from a reader, discarding any previous input. Only a
     * ByteReader gives access to an underlying byte stream; for any other
     * reader BOM detection is not possible and
     * {@link #getEncodingMemento()} returns null.
     *
     * @param reader
     *            the reader to take input from
     */
    public void set(Reader reader) {
        // Clear earlier state first, as set(InputStream) does, so that a
        // previous "no BOM possible" flag or stale stream cannot leak into
        // this input.
        resetAll();
        if (reader instanceof ByteReader) {
            ByteReader byteReader = (ByteReader) reader;
            fInputStream = byteReader.fInputStream;
        } else {
            fNoBOMPossible = true;
        }
    }
}