bundles/org.eclipse.wst.sse.core/src-encoding/org/eclipse/wst/sse/core/internal/encoding/EncodingMemento.java - sourceediting/webtools.sourceediting - Git at Google

 /*******************************************************************************
  * Copyright (c) 2001, 2005 IBM Corporation and others.
  * All rights reserved. This program and the accompanying materials
  * are made available under the terms of the Eclipse Public License v1.0
  * which accompanies this distribution, and is available at
  * http://www.eclipse.org/legal/epl-v10.html
  *
  * Contributors:
  *     IBM Corporation - initial API and implementation
  *     Jens Lukowski/Innoopract - initial renaming/restructuring
  *
  *******************************************************************************/
 package org.eclipse.wst.sse.core.internal.encoding;

 import org.eclipse.core.runtime.content.IContentDescription;


 /**
  * Holds information and data about the type of encoding found for a
  * resource. It not only includes names, etc., but also gives hints about the
  * algorithm, or rule, by which the encoding was determined. Having all this
  * info in a central object, associated with the Document (technically,
  * IStructuredDocument), allows for better user error messages, and better
  * handling of knowing how to dump a file, given we know how it was loaded.
  *
  * Note: the data in this class is only valid if it has actually gone through
  * the loading or dumping sequence. It is not accurate, for example, if a
  * structuredDocument is simply created and then setText called. In this type
  * of case, accuracy for loading and dumping is not required, since it is all
  * re-discovered. One limitation is that structuredDocuments created "from
  * scratch" this way don't have any encoding information to count on, and
  * would have to arrange the processing to be done. (And it is done
  * automatically if going through loader or dumper, but perhaps not in future
  * new uses. TODO: this can be improved in future versions.)
  *
  * isInitialized is set when the loader or dumper processes have been used,
  * but even this can't be counted on 100% if the document has been modified
  * since. (NOTE(review): no isInitialized member is declared in this class;
  * confirm where that flag lives.)
  *
  * Instances are plain mutable holders; {@link #clone()} produces a
  * field-by-field copy.
  */
 public class EncodingMemento implements Cloneable {

 	public final static String CLONED = "cloned"; //$NON-NLS-1$
 	public final static String DEFAULTS_ASSUMED_FOR_EMPTY_INPUT = "DefaultsAssumedForEmptyInput"; //$NON-NLS-1$
 	public final static String DEFAULTS_USED_DUE_TO_SMALL_STREAM = "defaultsUsedDueToSmallStream"; //$NON-NLS-1$


 	/*
 	 * Strings to be used for tracing. TODO: need to clean this up, we no
 	 * longer use all of them
 	 */
 	public final static String DETECTED_STANDARD_UNICODE_BYTES = "detectedStandardUnicodeBytes"; //$NON-NLS-1$
 	public final static String FOUND_ENCODING_IN_CONTENT = "foundEncodingInContent"; //$NON-NLS-1$
 	public final static String FOUND_ENCODING_IN_STREAM = "foundEncodingInStream"; //$NON-NLS-1$
 	public final static String FOUND_ENCODING_IN_STRUCTURED_DOCUMENT = "foundEncodingInStructuredDocument"; //$NON-NLS-1$
 	public final static String GUESSED_ENCODING_FROM_STREAM = "GuessEncodingFromStream"; //$NON-NLS-1$
 	public final static String JAVA_NAME_FOUND_AS_IANA_NAME = "noMappingFoundButJavaNameFoundToBeIANAName"; //$NON-NLS-1$
 	public final static String JAVA_NAME_FOUND_IN_ALIAS_NAME = "noMappingFoundButJavaNameFoundInAliasTable"; //$NON-NLS-1$
 	public final static String NO_IANA_NAME_FOUND = "noMappingFoundFromJavaNameToIANAName"; //$NON-NLS-1$
 	public final static String USED_CONTENT_TYPE_DEFAULT = "UsedContentTypeDefault"; //$NON-NLS-1$
 	public final static String USED_JAVA_DEFAULT = "UsedJavaDefault"; //$NON-NLS-1$
 	public final static String USED_MEMENTO_FROM_LOAD = "usedMementoFromLoad"; //$NON-NLS-1$
 	public final static String USED_PROPERTY_SETTINGS = "USED_PROPERTY_SETTINGS"; //$NON-NLS-1$
 	public final static String USED_USER_SPECIFIED_PREFERENCE = "UsedUserSpecifiedPreference"; //$NON-NLS-1$
 	public final static String USED_WORKSPACE_DEFAULT = "UsedWorkspaceDefault"; //$NON-NLS-1$
 	public final static String USER_IS_USING_JAVA_ENCODING = "UserIsUsingJavaEncoding"; //$NON-NLS-1$

 	/** Fallback charset name; computed lazily by getAppropriateDefault(). */
 	private String fAppropriateDefault;
 	/** Charset name exactly as detected; see getDetectedCharsetName(). */
 	private String fDetectedCharsetName;
 	/** Detected charset name that was found to be invalid, or null. */
 	private String fInvalidEncoding;


 	/** Java charset name; see getJavaCharsetName(). */
 	private String fJavaCharsetName;
 	/** Unicode (UTF-16) stream flag; see isUnicodeStream(). */
 	private boolean fUnicodeStream;
 	/** 3 byte UTF-8 BOM flag; see isUTF83ByteBOMUsed(). */
 	private boolean fUTF83ByteBOMUsed;

 	/**
 	 * Creates a memento with all names unset (null) and both flags false.
 	 */
 	public EncodingMemento() {
 		super();
 	}

 	/**
 	 * Returns a clone of this object. The copy is the field-by-field copy
 	 * made by Object.clone(); all fields are Strings or booleans.
 	 *
 	 * @return the copy, never null
 	 */
 	public Object clone() {
 		try {
 			return super.clone();
 		}
 		catch (CloneNotSupportedException e) {
 			// Cannot happen, since this class implements Cloneable. Fail
 			// loudly rather than hand callers a null "clone".
 			throw new AssertionError(e);
 		}
 	}

 	/**
 	 * Returns the appropriateDefault: a charset appropriate to use as a
 	 * default value if, for example, the user decides to load the document
 	 * anyway, even though the charset was found to be invalid.
 	 *
 	 * If no value has been set, the result of
 	 * NonContentBasedEncodingRules.useDefaultNameRules(null) is stored on
 	 * first call and returned from then on.
 	 *
 	 * @return String
 	 */
 	public String getAppropriateDefault() {
 		if (fAppropriateDefault == null) {
 			fAppropriateDefault = NonContentBasedEncodingRules.useDefaultNameRules(null);
 		}
 		return fAppropriateDefault;
 	}

 	/**
 	 * Returns the charset name, if it is different from the charset name
 	 * found in getJavaCharsetName. This can happen, for example, if there are
 	 * differences in case. This method might return SHIFT_JIS, and the
 	 * getJavaCharsetName might return Shift_JIS -- if SHIFT_JIS was detected
 	 * in file/document. If the original file contained the correct case, then
 	 * this method would return null. The getJavaCharsetName is typically the
 	 * one that should always be used, and this one only used for certain
 	 * error conditions, or if, when creating a "duplicate" resource, it was
 	 * desired to use exactly the charset name as in the original document. As
 	 * an example of this latter case, the original document might contain
 	 * ISO-8859-9, but the detected charset name might contain ISO-8859-9-I.
 	 *
 	 * @return String
 	 */
 	public String getDetectedCharsetName() {
 		return fDetectedCharsetName;
 	}

 	/**
 	 * Returns a charset name that was detected, but not found to be a charset
 	 * supported by the VM.
 	 *
 	 * @return String, or null if no invalid encoding has been recorded
 	 */
 	public String getInvalidEncoding() {
 		return fInvalidEncoding;
 	}

 	/**
 	 * Returns the java canonical charset name.
 	 *
 	 * @return String, or null if it has not been set
 	 */
 	public String getJavaCharsetName() {
 		return fJavaCharsetName;
 	}

 	/**
 	 * Returns the byte order mark matching the recorded state: the UTF-8 BOM
 	 * if the 3 byte BOM was used; otherwise, for a unicode stream, the
 	 * UTF-16LE BOM when the Java charset name is "UTF-16" or "UTF-16LE" and
 	 * the UTF-16BE BOM when it is "UTF-16BE". In every other case (including
 	 * a unicode stream whose charset name is unset or unrecognized) null is
 	 * returned.
 	 *
 	 * The returned array is the IContentDescription constant itself, not a
 	 * copy, so callers must not modify it.
 	 *
 	 * @return the BOM bytes, or null if none applies
 	 * @deprecated we may be able to remove this method, if it turns out this
 	 *             work is done by "text" type.
 	 */
 	public byte[] getUnicodeBOM() {
 		byte[] bom = null;
 		if (isUTF83ByteBOMUsed()) {
 			bom = IContentDescription.BOM_UTF_8;
 		}
 		else if (isUnicodeStream()) {
 			// The charset name may not have been set yet; comparing
 			// constant-first avoids a NullPointerException in that case.
 			String charsetName = getJavaCharsetName();
 			if ("UTF-16".equals(charsetName) || "UTF-16LE".equals(charsetName)) { //$NON-NLS-1$ //$NON-NLS-2$
 				bom = IContentDescription.BOM_UTF_16LE;
 			}
 			else if ("UTF-16BE".equals(charsetName)) { //$NON-NLS-1$
 				bom = IContentDescription.BOM_UTF_16BE;
 			}
 		}
 		return bom;
 	}

 	/**
 	 * Note: in our implementation, the stream is a unicode stream if the
 	 * charset is UTF-16, UTF-16LE, or UTF-16BE. A stream with 3 byte BOM is
 	 * not considered unicode stream here.
 	 *
 	 * @return returns true if is a unicode (UTF-16) stream
 	 */
 	public boolean isUnicodeStream() {
 		return fUnicodeStream;
 	}

 	/**
 	 * Returns whether the 3 byte (UTF-8) BOM was used. A stream with 3 byte
 	 * BOM is not considered a unicode stream by isUnicodeStream().
 	 *
 	 * Set during load, can be used by dumper to write 3 byte BOM, which Java
 	 * does not normally do. This helps maintain compatibility with other
 	 * programs (those that wrote the 3 byte BOM there to begin with).
 	 *
 	 * @return boolean
 	 */
 	public boolean isUTF83ByteBOMUsed() {
 		return fUTF83ByteBOMUsed;
 	}

 	/**
 	 * Returns whether the encoding is considered valid, which is the case
 	 * exactly when no invalid encoding has been recorded.
 	 *
 	 * @return true if getInvalidEncoding() returns null
 	 */
 	public boolean isValid() {
 		return getInvalidEncoding() == null;
 	}

 	/**
 	 * Sets the appropriateDefault.
 	 *
 	 * @param appropriateDefault
 	 *            The appropriateDefault to set; if null, the value is
 	 *            computed again on the next getAppropriateDefault() call
 	 */
 	public void setAppropriateDefault(String appropriateDefault) {
 		fAppropriateDefault = appropriateDefault;
 	}


 	/**
 	 * Sets the detectedCharsetName.
 	 *
 	 * @param detectedCharsetName
 	 *            The detectedCharsetName to set
 	 */
 	public void setDetectedCharsetName(String detectedCharsetName) {
 		fDetectedCharsetName = detectedCharsetName;
 	}

 	/**
 	 * Sets the invalidEncoding. A non-null value makes isValid() return
 	 * false.
 	 *
 	 * @param invalidEncoding
 	 *            The invalidEncoding to set
 	 */
 	public void setInvalidEncoding(String invalidEncoding) {
 		fInvalidEncoding = invalidEncoding;
 	}

 	/**
 	 * Sets the javaCharsetName.
 	 *
 	 * @param javaCharsetName
 	 *            The javaCharsetName to set
 	 */
 	public void setJavaCharsetName(String javaCharsetName) {
 		fJavaCharsetName = javaCharsetName;
 	}

 	/**
 	 * Sets the unicodeStream flag.
 	 *
 	 * @param unicodeStream
 	 *            The unicodeStream flag to set
 	 */
 	public void setUnicodeStream(boolean unicodeStream) {
 		fUnicodeStream = unicodeStream;
 	}

 	/**
 	 * Sets the uTF83ByteBOMUsed flag.
 	 *
 	 * @param uTF83ByteBOMUsed
 	 *            The uTF83ByteBOMUsed flag to set
 	 */
 	public void setUTF83ByteBOMUsed(boolean uTF83ByteBOMUsed) {
 		fUTF83ByteBOMUsed = uTF83ByteBOMUsed;
 	}

 }
	/*******************************************************************************
	* Copyright (c) 2001, 2005 IBM Corporation and others.
	* All rights reserved. This program and the accompanying materials
	* are made available under the terms of the Eclipse Public License v1.0
	* which accompanies this distribution, and is available at
	* http://www.eclipse.org/legal/epl-v10.html
	*
	* Contributors:
	* IBM Corporation - initial API and implementation
	* Jens Lukowski/Innoopract - initial renaming/restructuring
	*
	*******************************************************************************/
	package org.eclipse.wst.sse.core.internal.encoding;

	import org.eclipse.core.runtime.content.IContentDescription;


	/**
	* This class is to simply hold information and data about the type of
	* encoding found for a resource. It not only includes names, etc., but also
	* gives hints about the algorithm, or rule, that the encodng was determined.
	* Having all this info in a central object, associated with the Document
	* (technically, IStructuredDocument), allows for better user error messages,
	* and better handling of knowing how to dump a file, given we know how it was
	* loaded.
	*
	* Note: the data in this class is only valid if its has actually gone through
	* the loading or dumping sequence. It is not accurate, for example, if a
	* structuredDocument is simply created and then setText called. In this type
	* of case, accuracy for loading and dumping is not required, since its all
	* re-discovered. One limitation is that structuredDocument's created "from
	* scratch" this way, don't have any encoding information to count on, and
	* would have to arrange the processing to be done. (And it is done,
	* automatically if going through loader or dumper, but perhaps not in future
	* new uses. TODO: this can be inproved in future versions.)
	*
	* isInitialized is set when the loader or dumper processes have been used,
	* but even this can't be counted on 100% if the document has been modified
	* since.
	*
	*/
	public class EncodingMemento implements Cloneable {

	public final static String CLONED = "cloned"; //$NON-NLS-1$
	public final static String DEFAULTS_ASSUMED_FOR_EMPTY_INPUT = "DefaultsAssumedForEmptyInput"; //$NON-NLS-1$
	public final static String DEFAULTS_USED_DUE_TO_SMALL_STREAM = "defaultsUsedDueToSmallStream"; //$NON-NLS-1$


	/*
	* Strings to be used for tracing. TODO: need to clean this up, we no
	* longer use all of them
	*/
	public final static String DETECTED_STANDARD_UNICODE_BYTES = "detectedStandardUnicodeBytes"; //$NON-NLS-1$
	public final static String FOUND_ENCODING_IN_CONTENT = "foundEncodingInContent"; //$NON-NLS-1$
	public final static String FOUND_ENCODING_IN_STREAM = "foundEncodingInStream"; //$NON-NLS-1$
	public final static String FOUND_ENCODING_IN_STRUCTURED_DOCUMENT = "foundEncodingInStructuredDocument"; //$NON-NLS-1$
	public final static String GUESSED_ENCODING_FROM_STREAM = "GuessEncodingFromStream"; //$NON-NLS-1$
	public final static String JAVA_NAME_FOUND_AS_IANA_NAME = "noMappingFoundButJavaNameFoundToBeIANAName"; //$NON-NLS-1$
	public final static String JAVA_NAME_FOUND_IN_ALIAS_NAME = "noMappingFoundButJavaNameFoundInAliasTable"; //$NON-NLS-1$
	public final static String NO_IANA_NAME_FOUND = "noMappingFoundFromJavaNameToIANAName"; //$NON-NLS-1$
	public final static String USED_CONTENT_TYPE_DEFAULT = "UsedContentTypeDefault"; //$NON-NLS-1$
	public final static String USED_JAVA_DEFAULT = "UsedJavaDefault"; //$NON-NLS-1$
	public final static String USED_MEMENTO_FROM_LOAD = "usedMementoFromLoad"; //$NON-NLS-1$
	public final static String USED_PROPERTY_SETTINGS = "USED_PROPERTY_SETTINGS"; //$NON-NLS-1$
	public final static String USED_USER_SPECIFIED_PREFERENCE = "UsedUserSpecifiedPreference"; //$NON-NLS-1$
	public final static String USED_WORKSPACE_DEFAULT = "UsedWorkspaceDefault"; //$NON-NLS-1$
	public final static String USER_IS_USING_JAVA_ENCODING = "UserIsUsingJavaEncoding"; //$NON-NLS-1$
	private String fAppropriateDefault;
	private String fDetectedCharsetName;
	private String fInvalidEncoding;


	private String fJavaCharsetName;
	private boolean fUnicodeStream;
	private boolean fUTF83ByteBOMUsed;

	public EncodingMemento() {
	super();
	}

	/**
	* Returns a clone of this object.
	*/
	public Object clone() {
	EncodingMemento object = null;
	try {
	object = (EncodingMemento) super.clone();
	}
	catch (CloneNotSupportedException e) {
	// impossible, since we're implementing here
	}

	return object;

	}

	/**
	* Returns the appropriateDefault. This is only set if an invalid encoding
	* was found, and contains an charset appropriate to use as a default
	* value, if, for example, the user decides to load the document anyway,
	* even though the charset was found to be invalid.
	*
	* @return String
	*/
	public String getAppropriateDefault() {
	if (fAppropriateDefault == null) {
	fAppropriateDefault = NonContentBasedEncodingRules.useDefaultNameRules(null);
	}
	return fAppropriateDefault;
	}

	/**
	* Returns the charset name, if it is different from the charset name
	* found in getJavaCharsetName. This can happen, for example, if there are
	* differences in case. This method might return SHIFT_JIS, and the the
	* getJavaCharsetName might return Shift_JIS -- if SHIFT_JIS was detected
	* in file/document. If the original file contained the correct case, then
	* this method would return null. The getJavaCharsetName is typically the
	* one that should always be used, and this one only used for certain
	* error conditions, or or if when creating a "duplicate" resource, it was
	* desired to use exactly the charset name as in the original document. As
	* an example of this later case, the original document might contain
	* ISO-8859-9, but the detected charset name might contain ISO-8859-9-I.
	*
	* @return String
	*/
	public String getDetectedCharsetName() {
	return fDetectedCharsetName;
	}

	/**
	* Returns a charset name that was detected, but not found to be a charset
	* suppoorted by the VM.
	*
	* @return String
	*/
	public String getInvalidEncoding() {
	return fInvalidEncoding;
	}

	/**
	* Returns the java cononical charset name.
	*
	* @return String
	*/
	public String getJavaCharsetName() {
	return fJavaCharsetName;
	}

	/**
	* Note: we may be able to remove this method, if it turns out this work
	* is done by "text" type.
	*
	* @deprecated -
	*/
	public byte[] getUnicodeBOM() {
	byte[] bom = null;
	if (isUTF83ByteBOMUsed())
	bom = IContentDescription.BOM_UTF_8;
	else if (isUnicodeStream()) {
	if (getJavaCharsetName().equals("UTF-16") \|\| getJavaCharsetName().equals("UTF-16LE")) { //$NON-NLS-1$ //$NON-NLS-2$
	bom = IContentDescription.BOM_UTF_16LE;
	}
	else if (getJavaCharsetName().equals("UTF-16BE")) { //$NON-NLS-1$
	bom = IContentDescription.BOM_UTF_16BE;
	}

	}
	return bom;
	}

	/**
	* Note: in our implementation, the stream is a unicode stream if the
	* charset is UTF-16, UTF-16LE, or UTF-16BE. A stream with 3 byte BOM is
	* not considered unicode stream here.
	*
	* @return returns true if is a unicode (UTF-16) stream
	*/
	public boolean isUnicodeStream() {
	return fUnicodeStream;
	}

	/**
	* Note: in our implementation, the stream is a unicode stream if the
	* charset is UTF-16, UTF-16LE, or UTF-16BE. A stream with 3 byte BOM is
	* not considered unicode stream here.
	*
	* Set during load, can be used by dumper to write 3 byte BOM, which Java
	* does not normally do. This helps maintain compatibility with other
	* programs (those that wrote the 3 byte BOM there to begin with.
	*
	* @return boolean
	*/
	public boolean isUTF83ByteBOMUsed() {
	return fUTF83ByteBOMUsed;
	}

	public boolean isValid() {
	return getInvalidEncoding() == null;
	}

	/**
	* Sets the appropriateDefault.
	*
	* @param appropriateDefault
	* The appropriateDefault to set
	*/
	public void setAppropriateDefault(String appropriateDefault) {
	fAppropriateDefault = appropriateDefault;
	}


	public void setDetectedCharsetName(String detectedCharsetName) {
	fDetectedCharsetName = detectedCharsetName;
	}

	public void setInvalidEncoding(String invalidEncoding) {
	fInvalidEncoding = invalidEncoding;
	}

	/**
	* Sets the javaEncodingName.
	*
	* @param javaEncodingName
	* The javaEncodingName to set
	*/
	public void setJavaCharsetName(String javaCharsetName) {
	fJavaCharsetName = javaCharsetName;
	}

	/**
	* @param b
	*/
	public void setUnicodeStream(boolean unicodeStream) {
	fUnicodeStream = unicodeStream;

	}

	/**
	* Sets the uTF83ByteBOMfound.
	*
	* @param uTF83ByteBOMfound
	* The uTF83ByteBOMfound to set
	*/
	public void setUTF83ByteBOMUsed(boolean uTF83ByteBOMUsed) {
	fUTF83ByteBOMUsed = uTF83ByteBOMUsed;
	}

	}