blob: e883649a5253ac356425d3e6eca444f22399df7d [file] [log] [blame]
/*******************************************************************************
* Copyright (c) 2000, 2016 IBM Corporation and others.
*
* This program and the accompanying materials
* are made available under the terms of the Eclipse Public License 2.0
* which accompanies this distribution, and is available at
* https://www.eclipse.org/legal/epl-2.0/
*
* SPDX-License-Identifier: EPL-2.0
*
* Contributors:
* IBM Corporation - initial API and implementation
*******************************************************************************/
package org.eclipse.jdt.internal.ui.text.spelling.engine;
import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CodingErrorAction;
import java.nio.charset.MalformedInputException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import org.eclipse.core.runtime.IStatus;
import org.eclipse.core.runtime.Status;
import org.eclipse.core.resources.ResourcesPlugin;
import org.eclipse.jdt.internal.corext.util.Messages;
import org.eclipse.jdt.ui.JavaUI;
import org.eclipse.jdt.ui.PreferenceConstants;
import org.eclipse.jdt.internal.ui.JavaPlugin;
import org.eclipse.jdt.internal.ui.JavaUIMessages;
import org.eclipse.jdt.internal.core.manipulation.util.BasicElementLabels;
/**
* Partial implementation of a spell dictionary.
*
* @since 3.0
*/
public abstract class AbstractSpellDictionary implements ISpellDictionary {
/**
* Byte array wrapper
* @since 3.6
*/
private static class ByteArrayWrapper {
private static int hashCode(byte[] array) {
int prime= 31;
if (array == null)
return 0;
int result= 1;
for (int index= 0; index < array.length; index++) {
result= prime * result + array[index];
}
return result;
}
private byte[] byteArray;
public ByteArrayWrapper(byte[] byteArray) {
this.byteArray= byteArray;
}
@Override
public int hashCode() {
final int prime= 31;
int result= 1;
result= prime * result + ByteArrayWrapper.hashCode(byteArray);
return result;
}
@Override
public boolean equals(Object obj) {
if (this == obj)
return true;
if (obj == null)
return false;
if (!(obj instanceof ByteArrayWrapper))
return false;
ByteArrayWrapper other= (ByteArrayWrapper)obj;
if (!Arrays.equals(byteArray, other.byteArray))
return false;
return true;
}
}
/**
* Canonical name for UTF-8 encoding
* @since 3.6
*/
private static final String UTF_8= "UTF-8"; //$NON-NLS-1$
/** The bucket capacity */
protected static final int BUCKET_CAPACITY= 4;
/** The word buffer capacity */
protected static final int BUFFER_CAPACITY= 32;
/** The distance threshold */
protected static final int DISTANCE_THRESHOLD= 160;
/**
* The hash load factor
* @since 3.6
*/
protected static final float LOAD_FACTOR= 0.85f;
/** The phonetic distance algorithm */
private IPhoneticDistanceAlgorithm fDistanceAlgorithm= new DefaultPhoneticDistanceAlgorithm();
/** The mapping from phonetic hashes to word lists */
private final Map<ByteArrayWrapper, Object> fHashBuckets= new HashMap<>(getInitialSize(), LOAD_FACTOR);
/** The phonetic hash provider */
private IPhoneticHashProvider fHashProvider= new DefaultPhoneticHashProvider();
/** Is the dictionary already loaded? */
private boolean fLoaded= false;
/**
* Must the dictionary be loaded?
* @since 3.2
*/
private boolean fMustLoad= true;
/**
* Tells whether to strip non-letters at word boundaries.
* @since 3.3
*/
boolean fIsStrippingNonLetters= true;
/**
* Returns the initial size of dictionary.
*
* @return The initial size of dictionary.
* @since 3.6
*/
protected int getInitialSize() {
return 32;
}
/**
* Returns all candidates with the same phonetic hash.
*
* @param hash
* The hash to retrieve the candidates of
* @return Array of candidates for the phonetic hash
*/
protected final Object getCandidates(final String hash) {
ByteArrayWrapper hashBytes;
try {
hashBytes= new ByteArrayWrapper(hash.getBytes(UTF_8));
} catch (UnsupportedEncodingException e) {
JavaPlugin.log(e);
return null;
}
return fHashBuckets.get(hashBytes);
}
/**
* Returns all candidates that have a phonetic hash within a bounded
* distance to the specified word.
*
* @param word
* The word to find the nearest matches for
* @param sentence
* <code>true</code> iff the proposals start a new sentence,
* <code>false</code> otherwise
* @param hashs
* Array of close hashes to find the matches
* @return Set of ranked words with bounded distance to the specified word
*/
protected final Set<RankedWordProposal> getCandidates(final String word, final boolean sentence, final ArrayList<String> hashs) {
int distance= 0;
String hash= null;
final StringBuilder buffer= new StringBuilder(BUFFER_CAPACITY);
final HashSet<RankedWordProposal> result= new HashSet<>(BUCKET_CAPACITY * hashs.size());
for (int index= 0; index < hashs.size(); index++) {
hash= hashs.get(index);
final Object candidates= getCandidates(hash);
if (candidates == null)
continue;
else if (candidates instanceof byte[]) {
String candidate;
try {
candidate= new String((byte[])candidates, UTF_8);
} catch (UnsupportedEncodingException e) {
JavaPlugin.log(e);
return result;
}
distance= fDistanceAlgorithm.getDistance(word, candidate);
if (distance < DISTANCE_THRESHOLD) {
buffer.setLength(0);
buffer.append(candidate);
if (sentence)
buffer.setCharAt(0, Character.toUpperCase(buffer.charAt(0)));
result.add(new RankedWordProposal(buffer.toString(), -distance));
}
continue;
}
@SuppressWarnings("unchecked")
final ArrayList<byte[]> candidateList= (ArrayList<byte[]>)candidates;
int candidateSize= Math.min(500, candidateList.size()); // see https://bugs.eclipse.org/bugs/show_bug.cgi?id=195357
for (int offset= 0; offset < candidateSize; offset++) {
String candidate;
try {
candidate= new String(candidateList.get(offset), UTF_8);
} catch (UnsupportedEncodingException e) {
JavaPlugin.log(e);
return result;
}
distance= fDistanceAlgorithm.getDistance(word, candidate);
if (distance < DISTANCE_THRESHOLD) {
buffer.setLength(0);
buffer.append(candidate);
if (sentence)
buffer.setCharAt(0, Character.toUpperCase(buffer.charAt(0)));
result.add(new RankedWordProposal(buffer.toString(), -distance));
}
}
}
return result;
}
/**
* Returns all approximations that have a phonetic hash with smallest
* possible distance to the specified word.
*
* @param word
* The word to find the nearest matches for
* @param sentence
* <code>true</code> iff the proposals start a new sentence,
* <code>false</code> otherwise
* @param result
* Set of ranked words with smallest possible distance to the
* specified word
*/
protected final void getCandidates(final String word, final boolean sentence, final Set<RankedWordProposal> result) {
int distance= 0;
int minimum= Integer.MAX_VALUE;
StringBuilder buffer= new StringBuilder(BUFFER_CAPACITY);
final Object candidates= getCandidates(fHashProvider.getHash(word));
if (candidates == null)
return;
else if (candidates instanceof byte[]) {
String candidate;
try {
candidate= new String((byte[])candidates, UTF_8);
} catch (UnsupportedEncodingException e) {
JavaPlugin.log(e);
return;
}
distance= fDistanceAlgorithm.getDistance(word, candidate);
buffer.append(candidate);
if (sentence)
buffer.setCharAt(0, Character.toUpperCase(buffer.charAt(0)));
result.add(new RankedWordProposal(buffer.toString(), -distance));
return;
}
@SuppressWarnings("unchecked")
final ArrayList<byte[]> candidateList= (ArrayList<byte[]>)candidates;
final ArrayList<RankedWordProposal> matches= new ArrayList<>(candidateList.size());
for (int index= 0; index < candidateList.size(); index++) {
String candidate;
try {
candidate= new String(candidateList.get(index), UTF_8);
} catch (UnsupportedEncodingException e) {
JavaPlugin.log(e);
return;
}
distance= fDistanceAlgorithm.getDistance(word, candidate);
if (distance <= minimum) {
if (distance < minimum)
matches.clear();
buffer.setLength(0);
buffer.append(candidate);
if (sentence)
buffer.setCharAt(0, Character.toUpperCase(buffer.charAt(0)));
matches.add(new RankedWordProposal(buffer.toString(), -distance));
minimum= distance;
}
}
result.addAll(matches);
}
/**
* Tells whether this dictionary is empty.
*
* @return <code>true</code> if this dictionary is empty
* @since 3.3
*/
protected boolean isEmpty() {
return fHashBuckets.size() == 0;
}
/**
* Returns the used phonetic distance algorithm.
*
* @return The phonetic distance algorithm
*/
protected final IPhoneticDistanceAlgorithm getDistanceAlgorithm() {
return fDistanceAlgorithm;
}
/**
* Returns the used phonetic hash provider.
*
* @return The phonetic hash provider
*/
protected final IPhoneticHashProvider getHashProvider() {
return fHashProvider;
}
@Override
public Set<RankedWordProposal> getProposals(final String word, final boolean sentence) {
try {
if (!fLoaded) {
synchronized (this) {
fLoaded= load(getURL());
if (fLoaded)
compact();
}
}
} catch (MalformedURLException exception) {
// Do nothing
}
final String hash= fHashProvider.getHash(word);
final char[] mutators= fHashProvider.getMutators();
final ArrayList<String> neighborhood= new ArrayList<>((word.length() + 1) * (mutators.length + 2));
neighborhood.add(hash);
final Set<RankedWordProposal> candidates= getCandidates(word, sentence, neighborhood);
neighborhood.clear();
char previous= 0;
char next= 0;
char[] characters= word.toCharArray();
for (int index= 0; index < word.length() - 1; index++) {
next= characters[index];
previous= characters[index + 1];
characters[index]= previous;
characters[index + 1]= next;
neighborhood.add(fHashProvider.getHash(new String(characters)));
characters[index]= next;
characters[index + 1]= previous;
}
final String sentinel= word + " "; //$NON-NLS-1$
characters= sentinel.toCharArray();
int offset= characters.length - 1;
while (true) {
for (int index= 0; index < mutators.length; index++) {
characters[offset]= mutators[index];
neighborhood.add(fHashProvider.getHash(new String(characters)));
}
if (offset == 0)
break;
characters[offset]= characters[offset - 1];
--offset;
}
char mutated= 0;
characters= word.toCharArray();
for (int index= 0; index < word.length(); index++) {
mutated= characters[index];
for (int mutator= 0; mutator < mutators.length; mutator++) {
characters[index]= mutators[mutator];
neighborhood.add(fHashProvider.getHash(new String(characters)));
}
characters[index]= mutated;
}
characters= word.toCharArray();
final char[] deleted= new char[characters.length - 1];
for (int index= 0; index < deleted.length; index++)
deleted[index]= characters[index];
next= characters[characters.length - 1];
offset= deleted.length;
while (true) {
neighborhood.add(fHashProvider.getHash(new String(characters)));
if (offset == 0)
break;
previous= next;
next= deleted[offset - 1];
deleted[offset - 1]= previous;
--offset;
}
neighborhood.remove(hash);
final Set<RankedWordProposal> matches= getCandidates(word, sentence, neighborhood);
if (matches.size() == 0 && candidates.size() == 0)
getCandidates(word, sentence, candidates);
candidates.addAll(matches);
return candidates;
}
/**
* Returns the URL of the dictionary word list.
*
* @throws MalformedURLException
* if the URL could not be retrieved
* @return The URL of the dictionary word list
*/
protected abstract URL getURL() throws MalformedURLException;
/**
* Hashes the word into the dictionary.
*
* @param word
* The word to hash in the dictionary
*/
protected final void hashWord(final String word) {
final String hash= fHashProvider.getHash(word);
ByteArrayWrapper hashBytes;
byte[] wordBytes;
try {
hashBytes= new ByteArrayWrapper(hash.getBytes(UTF_8));
wordBytes= word.getBytes(UTF_8);
} catch (UnsupportedEncodingException e) {
JavaPlugin.log(e);
return;
}
Object bucket= fHashBuckets.get(hashBytes);
if (bucket == null) {
fHashBuckets.put(hashBytes, wordBytes);
} else if (bucket instanceof ArrayList) {
@SuppressWarnings("unchecked")
ArrayList<byte[]> bucketList= (ArrayList<byte[]>)bucket;
bucketList.add(wordBytes);
} else {
ArrayList<Object> list= new ArrayList<>(BUCKET_CAPACITY);
list.add(bucket);
list.add(wordBytes);
fHashBuckets.put(hashBytes, list);
}
}
@Override
public boolean isCorrect(String word) {
word= stripNonLetters(word);
try {
if (!fLoaded) {
synchronized (this) {
fLoaded= load(getURL());
if (fLoaded)
compact();
}
}
} catch (MalformedURLException exception) {
// Do nothing
}
final Object candidates= getCandidates(fHashProvider.getHash(word));
if (candidates == null)
return false;
else if (candidates instanceof byte[]) {
String candidate;
try {
candidate= new String((byte[])candidates, UTF_8);
} catch (UnsupportedEncodingException e) {
JavaPlugin.log(e);
return false;
}
if (candidate.equals(word) || candidate.equals(word.toLowerCase()))
return true;
return false;
}
@SuppressWarnings("unchecked")
final ArrayList<byte[]> candidateList= (ArrayList<byte[]>)candidates;
byte[] wordBytes;
byte[] lowercaseWordBytes;
try {
wordBytes= word.getBytes(UTF_8);
lowercaseWordBytes= word.toLowerCase().getBytes(UTF_8);
} catch (UnsupportedEncodingException e) {
JavaPlugin.log(e);
return false;
}
for (int index= 0; index < candidateList.size(); index++) {
byte[] candidate= candidateList.get(index);
if (Arrays.equals(candidate, wordBytes) || Arrays.equals(candidate, lowercaseWordBytes)) {
return true;
}
}
return false;
}
@Override
public void setStripNonLetters(boolean state) {
fIsStrippingNonLetters= state;
}
/**
* Strips non-letter characters from the given word.
* <p>
* This will only happen if the corresponding preference is enabled.
* </p>
*
* @param word the word to strip
* @return the stripped word
* @since 3.3
*/
protected String stripNonLetters(String word) {
if (!fIsStrippingNonLetters)
return word;
int i= 0;
int j= word.length() - 1;
while (i <= j && !Character.isLetter(word.charAt(i)))
i++;
if (i > j)
return ""; //$NON-NLS-1$
while (j > i && !Character.isLetter(word.charAt(j)))
j--;
return word.substring(i, j+1);
}
@Override
public synchronized final boolean isLoaded() {
return fLoaded || fHashBuckets.size() > 0;
}
/**
* Loads a dictionary word list from disk.
*
* @param url
* The URL of the word list to load
* @return <code>true</code> iff the word list could be loaded, <code>false</code>
* otherwise
*/
protected synchronized boolean load(final URL url) {
if (!fMustLoad)
return fLoaded;
if (url != null) {
InputStream stream= null;
int line= 0;
try {
stream= url.openStream();
if (stream != null) {
String word= null;
// Setup a reader with a decoder in order to read over malformed input if needed.
CharsetDecoder decoder= Charset.forName(getEncoding()).newDecoder();
decoder.onMalformedInput(CodingErrorAction.REPORT);
decoder.onUnmappableCharacter(CodingErrorAction.REPORT);
final BufferedReader reader= new BufferedReader(new InputStreamReader(stream, decoder));
boolean doRead= true;
while (doRead) {
try {
word= reader.readLine();
} catch (MalformedInputException ex) {
// Tell the decoder to replace malformed input in order to read the line.
decoder.onMalformedInput(CodingErrorAction.REPLACE);
decoder.reset();
word= reader.readLine();
decoder.onMalformedInput(CodingErrorAction.REPORT);
String message= Messages.format(JavaUIMessages.AbstractSpellingDictionary_encodingError, new String[] { word, decoder.replacement(), BasicElementLabels.getURLPart(url.toString()) });
IStatus status= new Status(IStatus.ERROR, JavaUI.ID_PLUGIN, IStatus.OK, message, ex);
JavaPlugin.log(status);
doRead= word != null;
continue;
}
doRead= word != null;
if (doRead)
hashWord(word);
}
return true;
}
} catch (FileNotFoundException ex) {
String urlString= url.toString();
String lowercaseUrlString= urlString.toLowerCase();
if (urlString.equals(lowercaseUrlString))
JavaPlugin.log(ex);
else
try {
return load(new URL(lowercaseUrlString));
} catch (MalformedURLException e) {
JavaPlugin.log(e);
}
} catch (IOException exception) {
if (line > 0) {
String message= Messages.format(JavaUIMessages.AbstractSpellingDictionary_encodingError, new Object[] { Integer.valueOf(line), BasicElementLabels.getURLPart(url.toString()) });
IStatus status= new Status(IStatus.ERROR, JavaUI.ID_PLUGIN, IStatus.OK, message, exception);
JavaPlugin.log(status);
} else
JavaPlugin.log(exception);
} finally {
fMustLoad= false;
try {
if (stream != null)
stream.close();
} catch (IOException x) {
}
}
}
return false;
}
/**
* Compacts the dictionary.
*
* @since 3.3.
*/
private void compact() {
Iterator<Object> iter= fHashBuckets.values().iterator();
while (iter.hasNext()) {
Object element= iter.next();
if (element instanceof ArrayList)
((ArrayList<?>)element).trimToSize();
}
}
/**
* Sets the phonetic distance algorithm to use.
*
* @param algorithm
* The phonetic distance algorithm
*/
protected final void setDistanceAlgorithm(final IPhoneticDistanceAlgorithm algorithm) {
fDistanceAlgorithm= algorithm;
}
/**
* Sets the phonetic hash provider to use.
*
* @param provider
* The phonetic hash provider
*/
protected final void setHashProvider(final IPhoneticHashProvider provider) {
fHashProvider= provider;
}
@Override
public synchronized void unload() {
fLoaded= false;
fMustLoad= true;
fHashBuckets.clear();
}
@Override
public boolean acceptsWords() {
return false;
}
@Override
public void addWord(final String word) {
// Do nothing
}
/**
* Returns the encoding of this dictionary.
*
* @return the encoding of this dictionary
* @since 3.3
*/
protected String getEncoding() {
String encoding= JavaPlugin.getDefault().getPreferenceStore().getString(PreferenceConstants.SPELLING_USER_DICTIONARY_ENCODING);
if (encoding == null || encoding.length() == 0)
encoding= ResourcesPlugin.getEncoding();
return encoding;
}
}