| /******************************************************************************* |
| * Copyright (c) 2004 IBM Corporation and others. |
| * All rights reserved. This program and the accompanying materials |
| * are made available under the terms of the Eclipse Public License v1.0 |
| * which accompanies this distribution, and is available at |
| * http://www.eclipse.org/legal/epl-v10.html |
| * |
| * Contributors: |
| * IBM Corporation - initial API and implementation |
| *******************************************************************************/ |
| package org.eclipse.wst.css.core.internal.contenttype; |
| |
| import java.io.BufferedInputStream; |
| import java.io.BufferedReader; |
| import java.io.IOException; |
| import java.io.InputStream; |
| import java.io.Reader; |
| import java.nio.charset.Charset; |
| import java.nio.charset.IllegalCharsetNameException; |
| import java.nio.charset.UnsupportedCharsetException; |
| |
| import org.eclipse.core.resources.IStorage; |
| import org.eclipse.core.runtime.CoreException; |
| import org.eclipse.wst.sse.core.internal.encoding.CodedIO; |
| import org.eclipse.wst.sse.core.internal.encoding.EncodingMemento; |
| import org.eclipse.wst.sse.core.internal.encoding.IResourceCharsetDetector; |
| import org.eclipse.wst.sse.core.internal.encoding.NonContentBasedEncodingRules; |
| import org.eclipse.wst.xml.core.internal.contenttype.EncodingParserConstants; |
| |
| |
| |
| public class CSSResourceEncodingDetector implements IResourceCharsetDetector { |
| class NullMemento extends EncodingMemento { |
| /** |
| * |
| */ |
| public NullMemento() { |
| super(); |
| String defaultCharset = NonContentBasedEncodingRules.useDefaultNameRules(null); |
| setJavaCharsetName(defaultCharset); |
| setAppropriateDefault(defaultCharset); |
| setDetectedCharsetName(null); |
| } |
| } |
| |
| |
| private CSSHeadTokenizer fTokenizer; |
| private EncodingMemento fEncodingMemento; |
| private boolean fHeaderParsed; |
| private Reader fReader; |
| |
| /** |
| * There is no spec defined encoding for CSS, so Null is returned. |
| */ |
| public String getSpecDefaultEncoding() { |
| // should match what's in plugin.xml (or look it up from there). |
| return null; |
| } |
| |
| private boolean canHandleAsUnicodeStream(String tokenType) { |
| boolean canHandleAsUnicodeStream = false; |
| if (tokenType == EncodingParserConstants.UTF83ByteBOM) { |
| canHandleAsUnicodeStream = true; |
| String enc = "UTF-8"; //$NON-NLS-1$ |
| createEncodingMemento(enc, EncodingMemento.DETECTED_STANDARD_UNICODE_BYTES); |
| fEncodingMemento.setUTF83ByteBOMUsed(true); |
| } |
| else if (tokenType == EncodingParserConstants.UTF16BE) { |
| canHandleAsUnicodeStream = true; |
| String enc = "UTF-16BE"; //$NON-NLS-1$ |
| createEncodingMemento(enc, EncodingMemento.DETECTED_STANDARD_UNICODE_BYTES); |
| } |
| else if (tokenType == EncodingParserConstants.UTF16LE) { |
| canHandleAsUnicodeStream = true; |
| String enc = "UTF-16"; //$NON-NLS-1$ |
| createEncodingMemento(enc, EncodingMemento.DETECTED_STANDARD_UNICODE_BYTES); |
| } |
| return canHandleAsUnicodeStream; |
| } |
| |
| /** |
| * @return Returns the tokenizer. |
| */ |
| private CSSHeadTokenizer getTokenizer() { |
| if (fTokenizer == null) { |
| fTokenizer = new CSSHeadTokenizer(); |
| } |
| return fTokenizer; |
| } |
| |
| private boolean isLegalString(String valueTokenType) { |
| boolean result = false; |
| if (valueTokenType != null) { |
| result = valueTokenType.equals(EncodingParserConstants.StringValue) || valueTokenType.equals(EncodingParserConstants.UnDelimitedStringValue) || valueTokenType.equals(EncodingParserConstants.InvalidTerminatedStringValue) || valueTokenType.equals(EncodingParserConstants.InvalidTermintatedUnDelimitedStringValue); |
| } |
| return result; |
| } |
| |
| private void parseInput() throws IOException { |
| checkInContents(); |
| if (fEncodingMemento == null) { |
| checkHeuristics(); |
| } |
| } |
| |
| private void checkInContents() throws IOException { |
| CSSHeadTokenizer tokenizer = getTokenizer(); |
| tokenizer.reset(fReader); |
| HeadParserToken token = null; |
| String tokenType = null; |
| do { |
| token = tokenizer.getNextToken(); |
| tokenType = token.getType(); |
| if (canHandleAsUnicodeStream(tokenType)) { |
| // side effect of canHandle is to create appropriate memento |
| } |
| else if (tokenType == CSSHeadTokenizerConstants.CHARSET_RULE) { |
| if (tokenizer.hasMoreTokens()) { |
| HeadParserToken valueToken = tokenizer.getNextToken(); |
| String valueTokenType = valueToken.getType(); |
| if (isLegalString(valueTokenType)) { |
| createEncodingMemento(valueToken.getText(), EncodingMemento.FOUND_ENCODING_IN_CONTENT); |
| |
| } |
| } |
| } |
| |
| } |
| while (tokenizer.hasMoreTokens()); |
| } |
| |
| /** |
| * |
| */ |
| private void checkHeuristics() throws IOException { |
| boolean noHeuristic = false; |
| String heuristicEncoding = null; |
| try { |
| fReader.reset(); |
| byte[] bytes = new byte[3]; |
| int nRead = 0; |
| for (int i = 0; i < bytes.length; i++) { |
| if (fReader.ready()) { |
| int oneByte = fReader.read(); |
| nRead++; |
| if (oneByte <= 0xFF) { |
| bytes[i] = (byte) oneByte; |
| } |
| else { |
| noHeuristic = true; |
| } |
| } |
| else { |
| noHeuristic = true; |
| break; |
| } |
| } |
| if (!noHeuristic && nRead == 3) { |
| heuristicEncoding = EncodingGuesser.guessEncoding(bytes, 3); |
| } |
| } |
| catch (IOException e) { |
| // if any IO exception, then not a heuristic case |
| } |
| finally { |
| fReader.reset(); |
| } |
| if (heuristicEncoding != null) { |
| createEncodingMemento(heuristicEncoding, EncodingMemento.GUESSED_ENCODING_FROM_STREAM); |
| } |
| |
| } |
| |
| /** |
| * Note: once this instance is created, trace info still needs to be |
| * appended by caller, depending on the context its created. |
| */ |
| private void createEncodingMemento(String detectedCharsetName) { |
| fEncodingMemento = new EncodingMemento(); |
| fEncodingMemento.setJavaCharsetName(getAppropriateJavaCharset(detectedCharsetName)); |
| fEncodingMemento.setDetectedCharsetName(detectedCharsetName); |
| // TODO: if detectedCharset and spec default is |
| // null, need to use "work |
| // bench based" defaults. |
| fEncodingMemento.setAppropriateDefault(getSpecDefaultEncoding()); |
| } |
| |
| /** |
| * convience method all subclasses can use (but not override) |
| * |
| * @param detectedCharsetName |
| * @param reason |
| */ |
| private void createEncodingMemento(String detectedCharsetName, String reason) { |
| createEncodingMemento(detectedCharsetName); |
| } |
| |
| /** |
| * convience method all subclasses can use (but not override) |
| */ |
| private final void ensureInputSet() { |
| if (fReader == null) { |
| throw new IllegalStateException("input must be set before use"); //$NON-NLS-1$ |
| } |
| } |
| |
| /** |
| * This method can return null, if invalid charset name (in which case |
| * "appropriateDefault" should be used, if a name is really need for some |
| * "save anyway" cases). |
| * |
| * @param detectedCharsetName |
| * @return |
| */ |
| private String getAppropriateJavaCharset(String detectedCharsetName) { |
| String result = null; |
| // 1. Check explicit mapping overrides from |
| // property file -- its here we pick up "rules" for cases |
| // that are not even in Java |
| result = CodedIO.checkMappingOverrides(detectedCharsetName); |
| // 2. Use the "canonical" name from JRE mappings |
| // Note: see Charset JavaDoc, the name you get one |
| // with can be alias, |
| // the name you get back is "standard" name. |
| Charset javaCharset = null; |
| try { |
| javaCharset = Charset.forName(detectedCharsetName); |
| } |
| catch (UnsupportedCharsetException e) { |
| // only set invalid, if result is same as detected -- they won't |
| // be equal if |
| // overridden |
| if (result != null && result.equals(detectedCharsetName)) { |
| fEncodingMemento.setInvalidEncoding(detectedCharsetName); |
| } |
| } |
| catch (IllegalCharsetNameException e) { |
| // only set invalid, if result is same as detected -- they won't |
| // be equal if |
| // overridden |
| if (result != null && result.equals(detectedCharsetName)) { |
| fEncodingMemento.setInvalidEncoding(detectedCharsetName); |
| } |
| } |
| // give priority to java cononical name, if present |
| if (javaCharset != null) { |
| result = javaCharset.name(); |
| // but still allow overrides |
| result = CodedIO.checkMappingOverrides(result); |
| } |
| return result; |
| } |
| |
| public String getEncoding() throws IOException { |
| return getEncodingMemento().getDetectedCharsetName(); |
| } |
| |
| public EncodingMemento getEncodingMemento() throws IOException { |
| ensureInputSet(); |
| if (!fHeaderParsed) { |
| parseInput(); |
| // we keep track of if header's already been |
| // parse, so can make |
| // multiple 'get' calls, without causing |
| // reparsing. |
| fHeaderParsed = true; |
| // Note: there is a "hidden assumption" here |
| // that an empty |
| // string in content should be treated same as |
| // not present. |
| } |
| if (fEncodingMemento == null) { |
| handleSpecDefault(); |
| } |
| if (fEncodingMemento == null) { |
| // safty net |
| fEncodingMemento = new NullMemento(); |
| } |
| return fEncodingMemento; |
| } |
| |
| public EncodingMemento getSpecDefaultEncodingMemento() { |
| resetAll(); |
| EncodingMemento result = null; |
| String enc = getSpecDefaultEncoding(); |
| if (enc != null) { |
| createEncodingMemento(enc, EncodingMemento.DEFAULTS_ASSUMED_FOR_EMPTY_INPUT); |
| fEncodingMemento.setAppropriateDefault(enc); |
| result = fEncodingMemento; |
| } |
| return result; |
| } |
| |
| private void handleSpecDefault() { |
| String encodingName; |
| encodingName = getSpecDefaultEncoding(); |
| if (encodingName != null) { |
| // createEncodingMemento(encodingName, |
| // EncodingMemento.USED_CONTENT_TYPE_DEFAULT); |
| fEncodingMemento = new EncodingMemento(); |
| fEncodingMemento.setJavaCharsetName(encodingName); |
| fEncodingMemento.setAppropriateDefault(encodingName); |
| } |
| } |
| |
| /** |
| * |
| */ |
| private void resetAll() { |
| fReader = null; |
| fHeaderParsed = false; |
| fEncodingMemento = null; |
| } |
| |
| /** |
| * |
| */ |
| public void set(InputStream inputStream) { |
| resetAll(); |
| fReader = new ByteReader(inputStream); |
| try { |
| fReader.mark(CodedIO.MAX_MARK_SIZE); |
| } |
| catch (IOException e) { |
| // impossible, since we know ByteReader |
| // supports marking |
| throw new Error(e); |
| } |
| } |
| |
| /** |
| * |
| */ |
| public void set(IStorage iStorage) throws CoreException { |
| resetAll(); |
| InputStream inputStream = iStorage.getContents(); |
| InputStream resettableStream = new BufferedInputStream(inputStream, CodedIO.MAX_BUF_SIZE); |
| resettableStream.mark(CodedIO.MAX_MARK_SIZE); |
| set(resettableStream); |
| // TODO we'll need to "remember" IFile, or |
| // get its (or its project's) settings, in case |
| // those are needed to handle cases when the |
| // encoding is not in the file stream. |
| } |
| |
| /** |
| * Note: this is not part of interface to help avoid confusion ... it |
| * expected this Reader is a well formed character reader ... that is, its |
| * all ready been determined to not be a unicode marked input stream. And, |
| * its assumed to be in the correct position, at position zero, ready to |
| * read first character. |
| */ |
| public void set(Reader reader) { |
| resetAll(); |
| fReader = reader; |
| if (!fReader.markSupported()) { |
| fReader = new BufferedReader(fReader); |
| } |
| try { |
| fReader.mark(CodedIO.MAX_MARK_SIZE); |
| } |
| catch (IOException e) { |
| // impossble, since we just checked if markable |
| throw new Error(e); |
| } |
| } |
| |
| } |