| /******************************************************************************* |
| * Copyright (c) 2001, 2004 IBM Corporation and others. |
| * All rights reserved. This program and the accompanying materials |
| * are made available under the terms of the Eclipse Public License v1.0 |
| * which accompanies this distribution, and is available at |
| * http://www.eclipse.org/legal/epl-v10.html |
| * |
| * Contributors: |
| * IBM Corporation - initial API and implementation |
| * Jens Lukowski/Innoopract - initial renaming/restructuring |
| * |
| *******************************************************************************/ |
| package org.eclipse.wst.sse.core.internal.encoding.util; |
| |
| import java.io.BufferedInputStream; |
| import java.io.IOException; |
| import java.io.InputStream; |
| import java.io.Reader; |
| import java.nio.charset.Charset; |
| |
| import org.eclipse.core.resources.IStorage; |
| import org.eclipse.core.runtime.CoreException; |
| import org.eclipse.wst.sse.core.internal.encoding.CodedIO; |
| import org.eclipse.wst.sse.core.internal.encoding.EncodingMemento; |
| import org.eclipse.wst.sse.core.internal.encoding.IResourceCharsetDetector; |
| |
| |
| /** |
| * This is a "common function" class to decide if an input stream, is a |
| * unicode stream. |
| */ |
| public class UnicodeBOMEncodingDetector implements IResourceCharsetDetector { |
| |
| //private static final String UTF_16_CHARSET_NAME = "UTF-16"; |
| // //$NON-NLS-1$ |
| |
| public static class NotEnoughInputForBOMException extends IOException { |
| |
| /** |
| * Default <code>serialVersionUID</code> |
| */ |
| private static final long serialVersionUID = 1L; |
| |
| public NotEnoughInputForBOMException() { |
| super(); |
| } |
| |
| public NotEnoughInputForBOMException(String s) { |
| super(s); |
| } |
| |
| } |
| |
| private final static byte BB = (byte) 0xBB; |
| private final static byte BF = (byte) 0xBF; |
| private final static byte EF = (byte) 0xEF; |
| private final static byte FE = (byte) -2; |
| |
| private final static byte FF = (byte) -1; |
| private static final String UTF_16BE_CHARSET_NAME = "UTF-16BE"; //$NON-NLS-1$ |
| private static final String UTF_16LE_CHARSET_NAME = "UTF-16LE"; //$NON-NLS-1$ |
| |
| private static final String UTF_8_CHARSET_NAME = "UTF-8"; //$NON-NLS-1$ |
| |
| private InputStream fInputStream = null; |
| private boolean fNoBOMPossible; |
| |
| private EncodingMemento checkForBOM(InputStream inputStream) { |
| EncodingMemento result = null; |
| |
| try { |
| byte b1 = getNextByte(inputStream); |
| byte b2 = getNextByte(inputStream); |
| if (b1 == FE && b2 == FF) { |
| result = createEncodingMemento(UTF_16BE_CHARSET_NAME); |
| result.setUnicodeStream(true); |
| } else { |
| if (b1 == FF && b2 == FE) { |
| result = createEncodingMemento(UTF_16LE_CHARSET_NAME); |
| result.setUnicodeStream(true); |
| } else { |
| byte b3 = getNextByte((inputStream)); |
| if (b1 == EF && b2 == BB && b3 == BF) { |
| result = createEncodingMemento(UTF_8_CHARSET_NAME); |
| result.setUTF83ByteBOMUsed(true); |
| } |
| } |
| } |
| } catch (NotEnoughInputForBOMException e) { |
| // This is sort of unexpected for normal cases, but can occur for |
| // empty |
| // streams. And, this can occur "normally" for non-BOM streams |
| // that |
| // have only two |
| // bytes, and for which those two bytes match the first two bytes |
| // of UTF-8 |
| // BOM In any case, we'll simply return null; |
| result = null; |
| } catch (IOException e) { |
| // other errors should be impossible |
| throw new Error(e); |
| } |
| |
| return result; |
| } |
| |
| private EncodingMemento createEncodingMemento(String javaEncodingName) { |
| EncodingMemento encodingMemento = new EncodingMemento(); |
| encodingMemento.setJavaCharsetName(javaEncodingName); |
| String ianaName = Charset.forName(javaEncodingName).name(); |
| encodingMemento.setDetectedCharsetName(ianaName); |
| if (javaEncodingName.equals(UTF_8_CHARSET_NAME)) { |
| encodingMemento.setUTF83ByteBOMUsed(true); |
| } |
| return encodingMemento; |
| } |
| |
| public String getEncoding() throws IOException { |
| |
| return getEncodingMemento().getDetectedCharsetName(); |
| } |
| |
| /** |
| * Returns IANA encoding name if BOM detected in stream. If a BOM is |
| * detected, the stream is left positioned after readying the BOM. If a |
| * BOM is not detected, the steam is reset. |
| * |
| * 0xFEFF UTF-16, big-endian 0xFFFE UTF-16, little-endian 0xEFBBBF UTF-8 |
| * (BOM is optional) |
| * |
| * @param inputStream - |
| * must be a resetable (mark supported) stream so it can be |
| * reset, if not BOM encoded stream |
| * @return String - IANA encodingname (may not work well on 1.3, but 1.4 |
| * seems to have good support for IANA names) |
| */ |
| public EncodingMemento getEncodingMemento() { |
| |
| EncodingMemento result = null; |
| if (!fNoBOMPossible) { |
| |
| if (fInputStream == null) |
| throw new IllegalStateException("input must be set before use"); //$NON-NLS-1$ |
| |
| if (!fInputStream.markSupported()) { |
| throw new IllegalArgumentException("inputStream must be resetable"); //$NON-NLS-1$ |
| } |
| |
| result = checkForBOM(fInputStream); |
| } |
| |
| return result; |
| |
| } |
| |
| private byte getNextByte(InputStream inputStream) throws IOException { |
| |
| int byteCharAsInt = -1; |
| // be sure we won't block |
| if (inputStream.available() > 0) { |
| byteCharAsInt = inputStream.read(); |
| byteCharAsInt = byteCharAsInt & 0XFF; |
| } |
| // to avoid confustion over meaning of returned byte, |
| // throw exception if EOF reached. |
| if (byteCharAsInt == -1) |
| throw new NotEnoughInputForBOMException("typically not an error"); //$NON-NLS-1$ |
| return (byte) byteCharAsInt; |
| } |
| |
| /** |
| * |
| */ |
| |
| public String getSpecDefaultEncoding() { |
| // There is no default for this case |
| return null; |
| } |
| |
| /** |
| * |
| */ |
| private void resetAll() { |
| fNoBOMPossible = false; |
| fInputStream = null; |
| |
| } |
| |
| /** |
| * |
| */ |
| |
| public void set(InputStream inputStream) { |
| resetAll(); |
| fInputStream = inputStream; |
| } |
| |
| public void set(IStorage iStorage) throws CoreException { |
| set(new BufferedInputStream(iStorage.getContents(), CodedIO.MAX_BUF_SIZE)); |
| |
| } |
| |
| public void set(Reader reader) { |
| if (reader instanceof ByteReader) { |
| ByteReader byteReader = (ByteReader) reader; |
| fInputStream = byteReader.fInputStream; |
| } else { |
| fNoBOMPossible = true; |
| } |
| |
| } |
| |
| } |