| /******************************************************************************* |
| * Copyright (c) 2004 IBM Corporation and others. |
| * All rights reserved. This program and the accompanying materials |
| * are made available under the terms of the Eclipse Public License v1.0 |
| * which accompanies this distribution, and is available at |
| * http://www.eclipse.org/legal/epl-v10.html |
| * |
| * Contributors: |
| * IBM Corporation - initial API and implementation |
| *******************************************************************************/ |
| package org.eclipse.jst.jsp.core.internal.contenttype; |
| |
| import java.io.BufferedInputStream; |
| import java.io.BufferedReader; |
| import java.io.IOException; |
| import java.io.InputStream; |
| import java.io.Reader; |
| import java.nio.charset.Charset; |
| import java.nio.charset.IllegalCharsetNameException; |
| import java.nio.charset.UnsupportedCharsetException; |
| import java.util.regex.Pattern; |
| |
| import org.eclipse.core.resources.IStorage; |
| import org.eclipse.core.runtime.CoreException; |
| import org.eclipse.wst.sse.core.internal.encoding.CodedIO; |
| import org.eclipse.wst.sse.core.internal.encoding.EncodingMemento; |
| import org.eclipse.wst.sse.core.internal.encoding.IResourceCharsetDetector; |
| import org.eclipse.wst.sse.core.internal.encoding.NonContentBasedEncodingRules; |
| import org.eclipse.wst.sse.core.internal.encoding.util.ByteReader; |
| import org.eclipse.wst.xml.core.internal.contenttype.EncodingParserConstants; |
| import org.eclipse.wst.xml.core.internal.contenttype.XMLHeadTokenizerConstants; |
| |
| public class JSPResourceEncodingDetector implements IResourceCharsetDetector { |
| |
| private String fCharset; |
| |
| private String fContentType; |
| |
| private String fContentTypeValue; |
| |
| private String fLanguage; |
| |
| private String fPageEncodingValue; |
| |
| private JSPHeadTokenizer fTokenizer; |
| |
| private String fXMLDecEncodingName; |
| |
| private boolean unicodeCase; |
| |
| private EncodingMemento fEncodingMemento; |
| |
| private boolean fHeaderParsed; |
| |
| private Reader fReader; |
| |
| |
| /** |
| * No Arg constructor. |
| */ |
| public JSPResourceEncodingDetector() { |
| super(); |
| } |
| |
| class NullMemento extends EncodingMemento { |
| /** |
| * |
| */ |
| public NullMemento() { |
| super(); |
| String defaultCharset = NonContentBasedEncodingRules.useDefaultNameRules(null); |
| setJavaCharsetName(defaultCharset); |
| setAppropriateDefault(defaultCharset); |
| setDetectedCharsetName(null); |
| } |
| |
| } |
| |
| /** |
| * @return Returns the contentType. |
| */ |
| public String getContentType() throws IOException { |
| ensureInputSet(); |
| if (!fHeaderParsed) { |
| parseInput(); |
| // we keep track of if header's already been parse, so can make |
| // multiple 'get' calls, without causing reparsing. |
| fHeaderParsed = true; |
| // Note: there is a "hidden assumption" here that an empty |
| // string in content should be treated same as not present. |
| } |
| return fContentType; |
| } |
| |
| public String getEncoding() throws IOException { |
| return getEncodingMemento().getDetectedCharsetName(); |
| } |
| |
| // to ensure consist overall rules used, we'll mark as |
| // final, |
| // and require subclasses to provide certain pieces of |
| // the |
| // implementation |
| public EncodingMemento getEncodingMemento() throws IOException { |
| ensureInputSet(); |
| if (!fHeaderParsed) { |
| parseInput(); |
| // we keep track of if header's already been |
| // parse, so can make |
| // multiple 'get' calls, without causing |
| // reparsing. |
| fHeaderParsed = true; |
| // Note: there is a "hidden assumption" here |
| // that an empty |
| // string in content should be treated same as |
| // not present. |
| } |
| if (fEncodingMemento == null) { |
| handleSpecDefault(); |
| } |
| if (fEncodingMemento == null) { |
| // safty net |
| fEncodingMemento = new NullMemento(); |
| } |
| return fEncodingMemento; |
| } |
| |
| public String getLanguage() throws IOException { |
| ensureInputSet(); |
| if (!fHeaderParsed) { |
| parseInput(); |
| fHeaderParsed = true; |
| } |
| return fLanguage; |
| } |
| |
| public String getSpecDefaultEncoding() { |
| // by JSP Spec |
| final String enc = "ISO-8859-1"; //$NON-NLS-1$ |
| return enc; |
| } |
| |
| public EncodingMemento getSpecDefaultEncodingMemento() { |
| resetAll(); |
| EncodingMemento result = null; |
| String enc = getSpecDefaultEncoding(); |
| if (enc != null) { |
| createEncodingMemento(enc, EncodingMemento.DEFAULTS_ASSUMED_FOR_EMPTY_INPUT); |
| fEncodingMemento.setAppropriateDefault(enc); |
| result = fEncodingMemento; |
| } |
| return result; |
| } |
| |
| /** |
| * |
| */ |
| public void set(InputStream inputStream) { |
| resetAll(); |
| fReader = new ByteReader(inputStream); |
| try { |
| fReader.mark(CodedIO.MAX_MARK_SIZE); |
| } |
| catch (IOException e) { |
| // impossible, since we know ByteReader |
| // supports marking |
| throw new Error(e); |
| } |
| } |
| |
| /** |
| * |
| */ |
| public void set(IStorage iStorage) throws CoreException { |
| resetAll(); |
| InputStream inputStream = iStorage.getContents(); |
| InputStream resettableStream = new BufferedInputStream(inputStream, CodedIO.MAX_BUF_SIZE); |
| resettableStream.mark(CodedIO.MAX_MARK_SIZE); |
| set(resettableStream); |
| // TODO we'll need to "remember" IFile, or |
| // get its (or its project's) settings, in case |
| // those are needed to handle cases when the |
| // encoding is not in the file stream. |
| } |
| |
| /** |
| * Note: this is not part of interface to help avoid confusion ... it |
| * expected this Reader is a well formed character reader ... that is, its |
| * all ready been determined to not be a unicode marked input stream. And, |
| * its assumed to be in the correct position, at position zero, ready to |
| * read first character. |
| */ |
| public void set(Reader reader) { |
| resetAll(); |
| fReader = reader; |
| if (!fReader.markSupported()) { |
| fReader = new BufferedReader(fReader); |
| } |
| try { |
| fReader.mark(CodedIO.MAX_MARK_SIZE); |
| } |
| catch (IOException e) { |
| // impossble, since we just checked if markable |
| throw new Error(e); |
| } |
| } |
| |
| private boolean canHandleAsUnicodeStream(String tokenType) { |
| boolean canHandleAsUnicode = false; |
| if (tokenType == EncodingParserConstants.UTF83ByteBOM) { |
| canHandleAsUnicode = true; |
| String enc = "UTF-8"; //$NON-NLS-1$ |
| createEncodingMemento(enc, EncodingMemento.DETECTED_STANDARD_UNICODE_BYTES); |
| fEncodingMemento.setUTF83ByteBOMUsed(true); |
| } |
| else if (tokenType == EncodingParserConstants.UTF16BE) { |
| canHandleAsUnicode = true; |
| String enc = "UTF-16BE"; //$NON-NLS-1$ |
| createEncodingMemento(enc, EncodingMemento.DETECTED_STANDARD_UNICODE_BYTES); |
| } |
| else if (tokenType == EncodingParserConstants.UTF16LE) { |
| canHandleAsUnicode = true; |
| String enc = "UTF-16"; //$NON-NLS-1$ |
| createEncodingMemento(enc, EncodingMemento.DETECTED_STANDARD_UNICODE_BYTES); |
| } |
| return canHandleAsUnicode; |
| } |
| |
| /** |
| * Note: once this instance is created, trace info still needs to be |
| * appended by caller, depending on the context its created. |
| */ |
| private void createEncodingMemento(String detectedCharsetName) { |
| fEncodingMemento = new EncodingMemento(); |
| fEncodingMemento.setJavaCharsetName(getAppropriateJavaCharset(detectedCharsetName)); |
| fEncodingMemento.setDetectedCharsetName(detectedCharsetName); |
| // TODO: if detectedCharset and spec default is |
| // null, need to use "work |
| // bench based" defaults. |
| fEncodingMemento.setAppropriateDefault(getSpecDefaultEncoding()); |
| } |
| |
| /** |
| * There can sometimes be mulitple 'encodings' specified in a file. This |
| * is an attempt to centralize the rules for deciding between them. |
| * Returns encoding according to priority: 1. XML Declaration 2. page |
| * directive pageEncoding name 3. page directive contentType charset name |
| */ |
| private String getAppropriateEncoding() { |
| String result = null; |
| if (fXMLDecEncodingName != null) |
| result = fXMLDecEncodingName; |
| else if (fPageEncodingValue != null) |
| result = fPageEncodingValue; |
| else if (fCharset != null) |
| result = fCharset; |
| return result; |
| } |
| |
| /** |
| * This method can return null, if invalid charset name (in which case |
| * "appropriateDefault" should be used, if a name is really need for some |
| * "save anyway" cases). |
| * |
| * @param detectedCharsetName |
| * @return |
| */ |
| private String getAppropriateJavaCharset(String detectedCharsetName) { |
| String result = null; |
| // 1. Check explicit mapping overrides from |
| // property file -- its here we pick up "rules" for cases |
| // that are not even in Java |
| result = CodedIO.checkMappingOverrides(detectedCharsetName); |
| // 2. Use the "canonical" name from JRE mappings |
| // Note: see Charset JavaDoc, the name you get one |
| // with can be alias, |
| // the name you get back is "standard" name. |
| Charset javaCharset = null; |
| try { |
| javaCharset = Charset.forName(detectedCharsetName); |
| } |
| catch (UnsupportedCharsetException e) { |
| // only set invalid, if result is same as detected -- they won't |
| // be equal if |
| // overridden |
| if (result != null && result.equals(detectedCharsetName)) { |
| fEncodingMemento.setInvalidEncoding(detectedCharsetName); |
| } |
| } |
| catch (IllegalCharsetNameException e) { |
| // only set invalid, if result is same as detected -- they won't |
| // be equal if |
| // overridden |
| if (result != null && result.equals(detectedCharsetName)) { |
| fEncodingMemento.setInvalidEncoding(detectedCharsetName); |
| } |
| } |
| // give priority to java cononical name, if present |
| if (javaCharset != null) { |
| result = javaCharset.name(); |
| // but still allow overrides |
| result = CodedIO.checkMappingOverrides(result); |
| } |
| return result; |
| } |
| |
| private JSPHeadTokenizer getTokinizer() { |
| if (fTokenizer == null) { |
| fTokenizer = new JSPHeadTokenizer(); |
| } |
| return fTokenizer; |
| } |
| |
| private void handleSpecDefault() { |
| String encodingName; |
| encodingName = getSpecDefaultEncoding(); |
| if (encodingName != null) { |
| // createEncodingMemento(encodingName, |
| // EncodingMemento.USED_CONTENT_TYPE_DEFAULT); |
| fEncodingMemento = new EncodingMemento(); |
| fEncodingMemento.setJavaCharsetName(encodingName); |
| fEncodingMemento.setAppropriateDefault(encodingName); |
| } |
| } |
| |
| private boolean isLegalString(String valueTokenType) { |
| boolean result = false; |
| if (valueTokenType != null) { |
| result = valueTokenType.equals(EncodingParserConstants.StringValue) || valueTokenType.equals(EncodingParserConstants.UnDelimitedStringValue) || valueTokenType.equals(EncodingParserConstants.InvalidTerminatedStringValue) || valueTokenType.equals(EncodingParserConstants.InvalidTermintatedUnDelimitedStringValue); |
| } |
| return result; |
| } |
| |
| private void parseContentTypeValue(String contentType) { |
| Pattern pattern = Pattern.compile(";\\s*charset\\s*=\\s*"); //$NON-NLS-1$ |
| String[] parts = pattern.split(contentType); |
| if (parts.length > 0) { |
| // if only one item, it can still be charset instead of |
| // contentType |
| if (parts.length == 1) { |
| if (parts[0].length() > 6) { |
| String checkForCharset = parts[0].substring(0, 7); |
| if (checkForCharset.equalsIgnoreCase("charset")) { //$NON-NLS-1$ |
| int eqpos = parts[0].indexOf('='); |
| eqpos = eqpos + 1; |
| if (eqpos < parts[0].length()) { |
| fCharset = parts[0].substring(eqpos); |
| fCharset = fCharset.trim(); |
| } |
| } |
| } |
| } |
| else { |
| fContentType = parts[0]; |
| } |
| } |
| if (parts.length > 1) { |
| fCharset = parts[1]; |
| } |
| } |
| |
| /** |
| * Looks for what ever encoding properties the tokenizer returns. Its the |
| * responsibility of the tokenizer to stop when appropriate and not go too |
| * far. |
| */ |
| private void parseHeader(JSPHeadTokenizer tokenizer) throws IOException { |
| fPageEncodingValue = null; |
| fCharset = null; |
| |
| HeadParserToken token = null; |
| do { |
| // don't use 'get' here (at least until reset issue fixed) |
| token = tokenizer.getNextToken(); |
| String tokenType = token.getType(); |
| if (canHandleAsUnicodeStream(tokenType)) |
| unicodeCase = true; |
| else { |
| |
| if (tokenType == XMLHeadTokenizerConstants.XMLDelEncoding) { |
| if (tokenizer.hasMoreTokens()) { |
| HeadParserToken valueToken = tokenizer.getNextToken(); |
| String valueTokenType = valueToken.getType(); |
| if (isLegalString(valueTokenType)) { |
| fXMLDecEncodingName = valueToken.getText(); |
| } |
| } |
| } |
| else if (tokenType == JSPHeadTokenizerConstants.PageEncoding) { |
| if (tokenizer.hasMoreTokens()) { |
| HeadParserToken valueToken = tokenizer.getNextToken(); |
| String valueTokenType = valueToken.getType(); |
| if (isLegalString(valueTokenType)) { |
| fPageEncodingValue = valueToken.getText(); |
| } |
| } |
| } |
| else if (tokenType == JSPHeadTokenizerConstants.PageContentType) { |
| if (tokenizer.hasMoreTokens()) { |
| HeadParserToken valueToken = tokenizer.getNextToken(); |
| String valueTokenType = valueToken.getType(); |
| if (isLegalString(valueTokenType)) { |
| fContentTypeValue = valueToken.getText(); |
| } |
| } |
| } |
| else if (tokenType == JSPHeadTokenizerConstants.PageLanguage) { |
| if (tokenizer.hasMoreTokens()) { |
| HeadParserToken valueToken = tokenizer.getNextToken(); |
| String valueTokenType = valueToken.getType(); |
| if (isLegalString(valueTokenType)) { |
| fLanguage = valueToken.getText(); |
| } |
| } |
| } |
| } |
| } |
| while (tokenizer.hasMoreTokens()); |
| if (fContentTypeValue != null) { |
| parseContentTypeValue(fContentTypeValue); |
| } |
| |
| } |
| |
| private void parseInput() throws IOException { |
| JSPHeadTokenizer tokenizer = getTokinizer(); |
| fReader.reset(); |
| tokenizer.reset(fReader); |
| parseHeader(tokenizer); |
| // unicode stream cases are created directly in parseHeader |
| if (!unicodeCase) { |
| String enc = getAppropriateEncoding(); |
| if (enc != null && enc.length() > 0) { |
| createEncodingMemento(enc, EncodingMemento.FOUND_ENCODING_IN_CONTENT); |
| } |
| } |
| } |
| |
| /** |
| * |
| */ |
| private void resetAll() { |
| fReader = null; |
| fHeaderParsed = false; |
| fEncodingMemento = null; |
| fCharset = null; |
| fContentTypeValue = null; |
| fPageEncodingValue = null; |
| fXMLDecEncodingName = null; |
| unicodeCase = false; |
| } |
| |
| |
| |
| /** |
| * convience method all subclasses can use (but not override) |
| * |
| * @param detectedCharsetName |
| * @param reason |
| */ |
| private void createEncodingMemento(String detectedCharsetName, String reason) { |
| createEncodingMemento(detectedCharsetName); |
| } |
| |
| /** |
| * convience method all subclasses can use (but not override) |
| */ |
| private void ensureInputSet() { |
| if (fReader == null) { |
| throw new IllegalStateException("input must be set before use"); //$NON-NLS-1$ |
| } |
| } |
| } |