bundles/org.eclipse.wst.sse.core/src-encoding/org/eclipse/wst/sse/core/internal/encoding/util/UnicodeBOMEncodingDetector.java - sourceediting/webtools.sourceediting - Git at Google

 /*******************************************************************************
  * Copyright (c) 2001, 2005 IBM Corporation and others.
  * All rights reserved. This program and the accompanying materials
  * are made available under the terms of the Eclipse Public License v1.0
  * which accompanies this distribution, and is available at
  * http://www.eclipse.org/legal/epl-v10.html
  *
  * Contributors:
  *     IBM Corporation - initial API and implementation
  *     Jens Lukowski/Innoopract - initial renaming/restructuring
  *
  *******************************************************************************/
 package org.eclipse.wst.sse.core.internal.encoding.util;

 import java.io.BufferedInputStream;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.Reader;
 import java.nio.charset.Charset;

 import org.eclipse.core.resources.IStorage;
 import org.eclipse.core.runtime.CoreException;
 import org.eclipse.wst.sse.core.internal.encoding.CodedIO;
 import org.eclipse.wst.sse.core.internal.encoding.EncodingMemento;
 import org.eclipse.wst.sse.core.internal.encoding.IResourceCharsetDetector;


 /**
  * Detects a Unicode byte order mark (BOM) at the current position of a byte
  * stream and reports the charset that mark implies.
  * <p>
  * Recognized marks are <code>FE FF</code> (UTF-16BE), <code>FF FE</code>
  * (UTF-16LE) and <code>EF BB BF</code> (UTF-8).
  * </p>
  * <p>
  * Usage: supply the input through one of the <code>set</code> methods, then
  * call {@link #getEncodingMemento()} or {@link #getEncoding()}. This class
  * never marks, resets or closes the stream it is given.
  * </p>
  */
 public class UnicodeBOMEncodingDetector implements IResourceCharsetDetector {

 	/**
 	 * Signals that the stream had no byte available while the BOM was still
 	 * being read. It is caught inside this class and treated as "no BOM".
 	 */
 	public static class NotEnoughInputForBOMException extends IOException {

 		/**
 		 * Default <code>serialVersionUID</code>
 		 */
 		private static final long serialVersionUID = 1L;

 		public NotEnoughInputForBOMException() {
 			super();
 		}

 		public NotEnoughInputForBOMException(String s) {
 			super(s);
 		}

 	}

 	// The individual bytes that make up the recognized byte order marks.
 	private final static byte BB = (byte) 0xBB;
 	private final static byte BF = (byte) 0xBF;
 	private final static byte EF = (byte) 0xEF;
 	private final static byte FE = (byte) 0xFE;
 	private final static byte FF = (byte) 0xFF;

 	private static final String UTF_16BE_CHARSET_NAME = "UTF-16BE"; //$NON-NLS-1$
 	private static final String UTF_16LE_CHARSET_NAME = "UTF-16LE"; //$NON-NLS-1$
 	private static final String UTF_8_CHARSET_NAME = "UTF-8"; //$NON-NLS-1$

 	/** Stream to inspect; <code>null</code> until a <code>set</code> call provides one. */
 	private InputStream fInputStream = null;

 	/** True when the input was a reader that exposes no bytes, so no BOM can be seen. */
 	private boolean fNoBOMPossible;

 	/**
 	 * Reads the leading bytes of the stream and matches them against the
 	 * known byte order marks.
 	 * <p>
 	 * Two bytes are always requested; a third is requested only when the
 	 * first two are not a UTF-16 mark. Every byte read stays consumed,
 	 * whether or not a BOM was recognized.
 	 * </p>
 	 * 
 	 * @param inputStream
 	 *            stream positioned where a BOM would start
 	 * @return memento for the detected charset, or <code>null</code> when
 	 *         no BOM matched or the stream ran out of available bytes
 	 */
 	private EncodingMemento checkForBOM(InputStream inputStream) {
 		EncodingMemento result = null;

 		try {
 			byte b1 = getNextByte(inputStream);
 			byte b2 = getNextByte(inputStream);
 			if (b1 == FE && b2 == FF) {
 				result = createEncodingMemento(UTF_16BE_CHARSET_NAME);
 				result.setUnicodeStream(true);
 			}
 			else if (b1 == FF && b2 == FE) {
 				result = createEncodingMemento(UTF_16LE_CHARSET_NAME);
 				result.setUnicodeStream(true);
 			}
 			else {
 				byte b3 = getNextByte(inputStream);
 				if (b1 == EF && b2 == BB && b3 == BF) {
 					// createEncodingMemento already records the 3 byte
 					// UTF-8 BOM on the memento, so no extra flag is set here
 					result = createEncodingMemento(UTF_8_CHARSET_NAME);
 				}
 			}
 		}
 		catch (NotEnoughInputForBOMException e) {
 			// Expected for empty streams, and for short non-BOM streams
 			// (for example two bytes that happen to equal the start of
 			// the UTF-8 BOM). Either way there is no BOM to report.
 			return null;
 		}
 		catch (IOException e) {
 			// other errors should be impossible
 			throw new Error(e);
 		}

 		return result;
 	}

 	/**
 	 * Builds a memento for the given charset. The detected charset name is
 	 * the canonical name reported by {@link Charset}, and the 3 byte BOM
 	 * flag is set when the charset is UTF-8.
 	 * 
 	 * @param javaEncodingName
 	 *            one of the charset name constants of this class
 	 * @return newly created memento
 	 */
 	private EncodingMemento createEncodingMemento(String javaEncodingName) {
 		EncodingMemento encodingMemento = new EncodingMemento();
 		encodingMemento.setJavaCharsetName(javaEncodingName);
 		String ianaName = Charset.forName(javaEncodingName).name();
 		encodingMemento.setDetectedCharsetName(ianaName);
 		if (javaEncodingName.equals(UTF_8_CHARSET_NAME)) {
 			encodingMemento.setUTF83ByteBOMUsed(true);
 		}
 		return encodingMemento;
 	}

 	/**
 	 * Returns the charset name implied by the BOM of the current input.
 	 * 
 	 * @return detected charset name, or <code>null</code> when
 	 *         {@link #getEncodingMemento()} finds no BOM
 	 * @throws IOException
 	 *             declared by the signature; nothing in this method's own
 	 *             body throws it
 	 */
 	public String getEncoding() throws IOException {
 		EncodingMemento memento = getEncodingMemento();
 		// no BOM (or a non-byte reader) yields no memento; report that as
 		// "nothing detected" instead of failing with a NullPointerException
 		if (memento == null) {
 			return null;
 		}
 		return memento.getDetectedCharsetName();
 	}

 	/**
 	 * Checks the current input for a byte order mark.
 	 * 
 	 * 0xFEFF UTF-16, big-endian 0xFFFE UTF-16, little-endian 0xEFBBBF UTF-8
 	 * (BOM is optional)
 	 * <p>
 	 * The stream must report <code>markSupported()</code>, but this method
 	 * does not itself call <code>mark()</code> or <code>reset()</code>: the
 	 * two or three bytes examined remain consumed. NOTE(review): callers
 	 * presumably mark and reset the stream themselves - confirm at the call
 	 * sites.
 	 * </p>
 	 * 
 	 * @return memento describing the detected charset, or <code>null</code>
 	 *         if no BOM was found or the input was a reader that exposes no
 	 *         byte stream
 	 * @throws IllegalStateException
 	 *             if no input stream has been set
 	 * @throws IllegalArgumentException
 	 *             if the input stream does not support mark
 	 */
 	public EncodingMemento getEncodingMemento() {
 		if (fNoBOMPossible) {
 			return null;
 		}

 		if (fInputStream == null) {
 			throw new IllegalStateException("input must be set before use"); //$NON-NLS-1$
 		}

 		if (!fInputStream.markSupported()) {
 			throw new IllegalArgumentException("inputStream must be resetable"); //$NON-NLS-1$
 		}

 		return checkForBOM(fInputStream);
 	}

 	/**
 	 * Reads one byte, but only if the stream reports data as available, so
 	 * the call cannot block.
 	 * 
 	 * @param inputStream
 	 *            stream to read from
 	 * @return the byte read
 	 * @throws NotEnoughInputForBOMException
 	 *             if nothing is available or end of stream is reached
 	 * @throws IOException
 	 *             if the underlying stream fails
 	 */
 	private byte getNextByte(InputStream inputStream) throws IOException {
 		int next = -1;
 		// be sure we won't block
 		if (inputStream.available() > 0) {
 			next = inputStream.read();
 		}
 		// Test for end of stream on the raw int. Masking with 0xFF first
 		// would turn -1 into 0xFF, which is itself a BOM byte.
 		if (next == -1) {
 			throw new NotEnoughInputForBOMException("typically not an error"); //$NON-NLS-1$
 		}
 		// read() yields 0..255, so the narrowing cast keeps the byte value
 		return (byte) next;
 	}

 	/**
 	 * @return always <code>null</code>; there is no default for this case
 	 */
 	public String getSpecDefaultEncoding() {
 		return null;
 	}

 	/**
 	 * Clears the state left by any previous <code>set</code> call.
 	 */
 	private void resetAll() {
 		fNoBOMPossible = false;
 		fInputStream = null;
 	}

 	/**
 	 * Sets the stream to inspect, discarding any earlier input.
 	 * 
 	 * @param inputStream
 	 *            stream positioned where a BOM would start
 	 */
 	public void set(InputStream inputStream) {
 		resetAll();
 		fInputStream = inputStream;
 	}

 	/**
 	 * Sets the input to the contents of the given storage, wrapped in a
 	 * buffered stream of <code>CodedIO.MAX_BUF_SIZE</code> bytes.
 	 * 
 	 * @param iStorage
 	 *            storage whose contents are inspected
 	 * @throws CoreException
 	 *             if the storage contents cannot be obtained
 	 */
 	public void set(IStorage iStorage) throws CoreException {
 		set(new BufferedInputStream(iStorage.getContents(), CodedIO.MAX_BUF_SIZE));
 	}

 	/**
 	 * Sets the input from a reader. Only a <code>ByteReader</code> gives
 	 * access to underlying bytes; for any other reader no BOM can be seen
 	 * and {@link #getEncodingMemento()} will return <code>null</code>.
 	 * 
 	 * @param reader
 	 *            reader to take the input from
 	 */
 	public void set(Reader reader) {
 		// Start clean, as set(InputStream) does. Otherwise a flag left by an
 		// earlier non-byte reader would disable detection for this input.
 		resetAll();
 		if (reader instanceof ByteReader) {
 			fInputStream = ((ByteReader) reader).fInputStream;
 		}
 		else {
 			fNoBOMPossible = true;
 		}
 	}

 }
	/*******************************************************************************
	* Copyright (c) 2001, 2005 IBM Corporation and others.
	* All rights reserved. This program and the accompanying materials
	* are made available under the terms of the Eclipse Public License v1.0
	* which accompanies this distribution, and is available at
	* http://www.eclipse.org/legal/epl-v10.html
	*
	* Contributors:
	* IBM Corporation - initial API and implementation
	* Jens Lukowski/Innoopract - initial renaming/restructuring
	*
	*******************************************************************************/
	package org.eclipse.wst.sse.core.internal.encoding.util;

	import java.io.BufferedInputStream;
	import java.io.IOException;
	import java.io.InputStream;
	import java.io.Reader;
	import java.nio.charset.Charset;

	import org.eclipse.core.resources.IStorage;
	import org.eclipse.core.runtime.CoreException;
	import org.eclipse.wst.sse.core.internal.encoding.CodedIO;
	import org.eclipse.wst.sse.core.internal.encoding.EncodingMemento;
	import org.eclipse.wst.sse.core.internal.encoding.IResourceCharsetDetector;


	/**
	 * Detects a Unicode byte order mark (BOM) at the current position of a byte
	 * stream and reports the charset that mark implies.
	 * <p>
	 * Recognized marks are <code>FE FF</code> (UTF-16BE), <code>FF FE</code>
	 * (UTF-16LE) and <code>EF BB BF</code> (UTF-8).
	 * </p>
	 * <p>
	 * Usage: supply the input through one of the <code>set</code> methods, then
	 * call {@link #getEncodingMemento()} or {@link #getEncoding()}. This class
	 * never marks, resets or closes the stream it is given.
	 * </p>
	 */
	public class UnicodeBOMEncodingDetector implements IResourceCharsetDetector {

		/**
		 * Signals that the stream had no byte available while the BOM was still
		 * being read. It is caught inside this class and treated as "no BOM".
		 */
		public static class NotEnoughInputForBOMException extends IOException {

			/**
			 * Default <code>serialVersionUID</code>
			 */
			private static final long serialVersionUID = 1L;

			public NotEnoughInputForBOMException() {
				super();
			}

			public NotEnoughInputForBOMException(String s) {
				super(s);
			}

		}

		// The individual bytes that make up the recognized byte order marks.
		private final static byte BB = (byte) 0xBB;
		private final static byte BF = (byte) 0xBF;
		private final static byte EF = (byte) 0xEF;
		private final static byte FE = (byte) 0xFE;
		private final static byte FF = (byte) 0xFF;

		private static final String UTF_16BE_CHARSET_NAME = "UTF-16BE"; //$NON-NLS-1$
		private static final String UTF_16LE_CHARSET_NAME = "UTF-16LE"; //$NON-NLS-1$
		private static final String UTF_8_CHARSET_NAME = "UTF-8"; //$NON-NLS-1$

		/** Stream to inspect; <code>null</code> until a <code>set</code> call provides one. */
		private InputStream fInputStream = null;

		/** True when the input was a reader that exposes no bytes, so no BOM can be seen. */
		private boolean fNoBOMPossible;

		/**
		 * Reads the leading bytes of the stream and matches them against the
		 * known byte order marks.
		 * <p>
		 * Two bytes are always requested; a third is requested only when the
		 * first two are not a UTF-16 mark. Every byte read stays consumed,
		 * whether or not a BOM was recognized.
		 * </p>
		 * 
		 * @param inputStream
		 *            stream positioned where a BOM would start
		 * @return memento for the detected charset, or <code>null</code> when
		 *         no BOM matched or the stream ran out of available bytes
		 */
		private EncodingMemento checkForBOM(InputStream inputStream) {
			EncodingMemento result = null;

			try {
				byte b1 = getNextByte(inputStream);
				byte b2 = getNextByte(inputStream);
				if (b1 == FE && b2 == FF) {
					result = createEncodingMemento(UTF_16BE_CHARSET_NAME);
					result.setUnicodeStream(true);
				}
				else if (b1 == FF && b2 == FE) {
					result = createEncodingMemento(UTF_16LE_CHARSET_NAME);
					result.setUnicodeStream(true);
				}
				else {
					byte b3 = getNextByte(inputStream);
					if (b1 == EF && b2 == BB && b3 == BF) {
						// createEncodingMemento already records the 3 byte
						// UTF-8 BOM on the memento, so no extra flag is set here
						result = createEncodingMemento(UTF_8_CHARSET_NAME);
					}
				}
			}
			catch (NotEnoughInputForBOMException e) {
				// Expected for empty streams, and for short non-BOM streams
				// (for example two bytes that happen to equal the start of
				// the UTF-8 BOM). Either way there is no BOM to report.
				return null;
			}
			catch (IOException e) {
				// other errors should be impossible
				throw new Error(e);
			}

			return result;
		}

		/**
		 * Builds a memento for the given charset. The detected charset name is
		 * the canonical name reported by {@link Charset}, and the 3 byte BOM
		 * flag is set when the charset is UTF-8.
		 * 
		 * @param javaEncodingName
		 *            one of the charset name constants of this class
		 * @return newly created memento
		 */
		private EncodingMemento createEncodingMemento(String javaEncodingName) {
			EncodingMemento encodingMemento = new EncodingMemento();
			encodingMemento.setJavaCharsetName(javaEncodingName);
			String ianaName = Charset.forName(javaEncodingName).name();
			encodingMemento.setDetectedCharsetName(ianaName);
			if (javaEncodingName.equals(UTF_8_CHARSET_NAME)) {
				encodingMemento.setUTF83ByteBOMUsed(true);
			}
			return encodingMemento;
		}

		/**
		 * Returns the charset name implied by the BOM of the current input.
		 * 
		 * @return detected charset name, or <code>null</code> when
		 *         {@link #getEncodingMemento()} finds no BOM
		 * @throws IOException
		 *             declared by the signature; nothing in this method's own
		 *             body throws it
		 */
		public String getEncoding() throws IOException {
			EncodingMemento memento = getEncodingMemento();
			// no BOM (or a non-byte reader) yields no memento; report that as
			// "nothing detected" instead of failing with a NullPointerException
			if (memento == null) {
				return null;
			}
			return memento.getDetectedCharsetName();
		}

		/**
		 * Checks the current input for a byte order mark.
		 * 
		 * 0xFEFF UTF-16, big-endian 0xFFFE UTF-16, little-endian 0xEFBBBF UTF-8
		 * (BOM is optional)
		 * <p>
		 * The stream must report <code>markSupported()</code>, but this method
		 * does not itself call <code>mark()</code> or <code>reset()</code>: the
		 * two or three bytes examined remain consumed. NOTE(review): callers
		 * presumably mark and reset the stream themselves - confirm at the call
		 * sites.
		 * </p>
		 * 
		 * @return memento describing the detected charset, or <code>null</code>
		 *         if no BOM was found or the input was a reader that exposes no
		 *         byte stream
		 * @throws IllegalStateException
		 *             if no input stream has been set
		 * @throws IllegalArgumentException
		 *             if the input stream does not support mark
		 */
		public EncodingMemento getEncodingMemento() {
			if (fNoBOMPossible) {
				return null;
			}

			if (fInputStream == null) {
				throw new IllegalStateException("input must be set before use"); //$NON-NLS-1$
			}

			if (!fInputStream.markSupported()) {
				throw new IllegalArgumentException("inputStream must be resetable"); //$NON-NLS-1$
			}

			return checkForBOM(fInputStream);
		}

		/**
		 * Reads one byte, but only if the stream reports data as available, so
		 * the call cannot block.
		 * 
		 * @param inputStream
		 *            stream to read from
		 * @return the byte read
		 * @throws NotEnoughInputForBOMException
		 *             if nothing is available or end of stream is reached
		 * @throws IOException
		 *             if the underlying stream fails
		 */
		private byte getNextByte(InputStream inputStream) throws IOException {
			int next = -1;
			// be sure we won't block
			if (inputStream.available() > 0) {
				next = inputStream.read();
			}
			// Test for end of stream on the raw int. Masking with 0xFF first
			// would turn -1 into 0xFF, which is itself a BOM byte.
			if (next == -1) {
				throw new NotEnoughInputForBOMException("typically not an error"); //$NON-NLS-1$
			}
			// read() yields 0..255, so the narrowing cast keeps the byte value
			return (byte) next;
		}

		/**
		 * @return always <code>null</code>; there is no default for this case
		 */
		public String getSpecDefaultEncoding() {
			return null;
		}

		/**
		 * Clears the state left by any previous <code>set</code> call.
		 */
		private void resetAll() {
			fNoBOMPossible = false;
			fInputStream = null;
		}

		/**
		 * Sets the stream to inspect, discarding any earlier input.
		 * 
		 * @param inputStream
		 *            stream positioned where a BOM would start
		 */
		public void set(InputStream inputStream) {
			resetAll();
			fInputStream = inputStream;
		}

		/**
		 * Sets the input to the contents of the given storage, wrapped in a
		 * buffered stream of <code>CodedIO.MAX_BUF_SIZE</code> bytes.
		 * 
		 * @param iStorage
		 *            storage whose contents are inspected
		 * @throws CoreException
		 *             if the storage contents cannot be obtained
		 */
		public void set(IStorage iStorage) throws CoreException {
			set(new BufferedInputStream(iStorage.getContents(), CodedIO.MAX_BUF_SIZE));
		}

		/**
		 * Sets the input from a reader. Only a <code>ByteReader</code> gives
		 * access to underlying bytes; for any other reader no BOM can be seen
		 * and {@link #getEncodingMemento()} will return <code>null</code>.
		 * 
		 * @param reader
		 *            reader to take the input from
		 */
		public void set(Reader reader) {
			// Start clean, as set(InputStream) does. Otherwise a flag left by an
			// earlier non-byte reader would disable detection for this input.
			resetAll();
			if (reader instanceof ByteReader) {
				fInputStream = ((ByteReader) reader).fInputStream;
			}
			else {
				fNoBOMPossible = true;
			}
		}

	}