blob: 6aeba07cc68850d9fc8eefc65e42bc4b96076788 [file] [log] [blame]
/*******************************************************************************
* Copyright (c) 2001, 2004 IBM Corporation and others.
* All rights reserved. This program and the accompanying materials
* are made available under the terms of the Eclipse Public License v1.0
* which accompanies this distribution, and is available at
* http://www.eclipse.org/legal/epl-v10.html
*
* Contributors:
* IBM Corporation - initial API and implementation
* Jens Lukowski/Innoopract - initial renaming/restructuring
*
*******************************************************************************/
package org.eclipse.wst.sse.core.internal.encoding;
import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.util.StringTokenizer;
/**
* EncodingHelper is used to determine the IANA tag / java encoding from the
* processing instruction. From this processing instruction: <?xml
* version="1.0" encoding="UTF-8"?>getEncodingTag() would return "UTF-8" and
* getEncoding() would return UTF8. The processing instruction is searched for
* via an input stream, or a byte array, depending on the constructor used.
*
* @deprecated - users of this class should move to base support: see
* iFile.getCharset (and related functions with bom detection in
* IContentDescription
*
*/
public class EncodingHelper {
protected static final int encodingNameSearchLimit = 1000;
protected static final org.eclipse.wst.sse.core.internal.encoding.util.SupportedJavaEncoding javaEncodingConverterHelper = new org.eclipse.wst.sse.core.internal.encoding.util.SupportedJavaEncoding();
/**
* @deprecated Encoding preferences are stored on a per content type
* basis. See the EncodingHelper.java class in b2butil for an
* example of how to extract the default encoding for a given
* content type.
*/
static public String getDefaultEncoding() {
return javaEncodingConverterHelper.getJavaConverterName(getDefaultEncodingTag());
}
/**
* @deprecated Encoding preferences are stored on a per content type
* basis. See the EncodingHelper.java class in b2butil for an
* example of how to extract the default encoding for a given
* content type.
*/
static public String getDefaultEncodingTag() {
return "UTF-8"; //$NON-NLS-1$
}
private String encodingName = null;
private String encodingTag;
/**
* Method EncodingHelper. Determine the encoding based on the byte array
* passed in.
*
* @param headerBytes
* @param length
*/
public EncodingHelper(byte[] headerBytes, int length) {
determineEncoding(headerBytes, length);
}
/**
* Method EncodingHelper. Determine the encoding based on the input
* stream. Only the first 1000 bytes will be searched
*
* @param inStream
*/
public EncodingHelper(InputStream inStream) {
determineEncoding(inStream);
}
private void determineEncoding(byte[] headerBytes, int length) {
try {
if (headerBytes.length >= 4) {
if ((headerBytes[0] == -2) && (headerBytes[1] == -1)) // Byte
// Order
// Mark
// ==
// 0xFEFF
{
// UTF-16, big-endian
encodingName = "UnicodeBig"; //$NON-NLS-1$
// encodingTag = "UTF-16";//$NON-NLS-1$
} else if ((headerBytes[0] == -1) && (headerBytes[1] == -2)) // Byte
// Order
// Mark
// ==
// 0xFFFE
{
// UTF-16, little-endian
encodingName = "UnicodeLittle"; //$NON-NLS-1$
// encodingTag = "UTF-16";//$NON-NLS-1$
} else if ((headerBytes[0] == -17) && (headerBytes[1] == -69) && (headerBytes[2] == -65)) // Byte
// Order
// Mark
// ==
// 0xEF
// BB
// BF
{
// UTF-8
encodingName = "UTF8"; //$NON-NLS-1$
}
// // Otherwise, we need to check the document's content.
// // ( UTF-8, US-ASCII, ISO-8859, Shift-JIS, EUC, ... )
// if ((encodingName != null) && (encodingName.length() == 0))
// // special
// {
// encodingName = getEncodingNameByAuto(smallBuffer,
// smallBuffer.length);
// }
// else
// XMLEncodingPlugin.getPlugin().getMsgLogger().write("encoding
// name from BOM = " + encodingName);
// System.out.println("encoding name from BOM = " +
// encodingName);
// Now determine the encoding tag
encodingTag = getEncodingName(headerBytes, headerBytes.length);
if (encodingName == null) {
if (encodingTag != null) {
encodingName = javaEncodingConverterHelper.getJavaConverterName(encodingTag);
} else {
encodingName = "UTF8"; //$NON-NLS-1$
}
}
if (encodingTag == null || encodingTag.trim().equals("")) { //$NON-NLS-1$
encodingTag = javaEncodingConverterHelper.getIANAEncodingName(encodingName);
}
// System.out.println("encodingTag = " + encodingTag);
}
} catch (UnsupportedEncodingException ex) {
// if this classn't deprecated, this should not be ignored
}
}
protected void determineEncoding(InputStream inStream) {
try {
// try and get at least first four bytes for auto encoding
// detection
byte[] headerBytes = getBytes(inStream, encodingNameSearchLimit);
determineEncoding(headerBytes, encodingNameSearchLimit);
} catch (IOException exception) {
// if exception, no bytes}
}
}
private byte[] getBytes(InputStream inputStream, int max) throws IOException {
byte[] allHeaderBytes = new byte[max];
int nRead = inputStream.read(allHeaderBytes, 0, max);
byte[] headerBytes = null;
if (nRead != max) {
headerBytes = new byte[nRead];
System.arraycopy(allHeaderBytes, 0, headerBytes, 0, nRead);
} else {
headerBytes = allHeaderBytes;
}
return headerBytes;
}
/**
* <code>getEncoding</code> Retrieve the java converter name from the
* document.
*
* @return a <code>String</code> value for the java converter
*/
public String getEncoding() {
return encodingName;
}
/**
* Use the encoding information in the document.
*/
protected String getEncodingName(byte[] string, int length) throws UnsupportedEncodingException {
String enc = null;
enc = getEncodingNameInDocument(string, length);
return (enc);
}
protected String getEncodingNameInDocument(byte[] string, int length) throws UnsupportedEncodingException {
String encoding = null;
final String content;
if (encodingName != null) {
content = new String(string, encodingName);
} else {
content = new String(string); // use default Java encoder
}
String prologTag = "<?xml"; //$NON-NLS-1$
String encodingKeyword = "encoding"; //$NON-NLS-1$
int indexStart = content.indexOf(prologTag);
if (indexStart != -1) {
int indexEnd = content.indexOf(">", indexStart); //$NON-NLS-1$
if (indexEnd != -1) {
String prolog = content.substring(indexStart, indexEnd);
int encodingStart = prolog.indexOf(encodingKeyword);
if (encodingStart != -1) {
String encodingString = prolog.substring(encodingStart + encodingKeyword.length());
String delimiter = encodingString.indexOf("'") != -1 ? "'" : "\""; //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$
StringTokenizer tokenizer = new StringTokenizer(encodingString, delimiter);
String equalSign = "="; //$NON-NLS-1$
while (tokenizer.hasMoreElements()) {
String currToken = tokenizer.nextToken();
if (currToken.trim().equals(equalSign)) {
if (tokenizer.hasMoreElements()) {
encoding = tokenizer.nextToken();
}
break;
}
}
}
}
}
if (encoding != null) {
final int len = encoding.length();
if (len > 0) {
if ((encoding.charAt(0) == '"') && (encoding.charAt(len - 1) == '"')) {
encoding = encoding.substring(1, len - 1);
} else if ((encoding.charAt(0) == '\'') && (encoding.charAt(len - 1) == '\'')) {
encoding = encoding.substring(1, len - 1);
}
}
}
return encoding;
}
/**
* <code>getEncodingTag</code> Retrieve the encoding tag from the
* document.
*
* @return a <code>String</code> value for the encoding tag
*/
public String getEncodingTag() {
return encodingTag;
}
public boolean isSameEncoding(String oldEncoding) {
if (oldEncoding == null) {
return false;
}
if (oldEncoding.equals(getEncoding())) {
return true;
}
return false;
}
}