// core/org.eclipse.smila.solr/lib/source/org/apache/lucene/analysis/hunspell/Stemmer.java - smila/org.eclipse.smila.core - Git at Google

 package org.apache.lucene.analysis.hunspell;

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.List;

 import org.apache.lucene.analysis.util.CharArraySet;
 import org.apache.lucene.store.ByteArrayDataInput;
 import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.CharsRef;
 import org.apache.lucene.util.IntsRef;
 import org.apache.lucene.util.automaton.CharacterRunAutomaton;
 import org.apache.lucene.util.fst.FST;
 import org.apache.lucene.util.fst.Outputs;

 /**
  * Stemmer uses the affix rules declared in the Dictionary to generate one or more stems for a word.  It
  * follows the algorithm of the original hunspell implementation, including recursive suffix stripping.
  */
 final class Stemmer {
   // dictionary supplying the word list, affix rules and flags used by every lookup in this class
   private final Dictionary dictionary;
   // reusable target for dictionary.flagLookup.get(...); its content is overwritten by each lookup
   private final BytesRef scratch = new BytesRef();
   // receives the output of dictionary.cleanInput(...) when input cleaning is enabled
   private final StringBuilder segment = new StringBuilder();
   // reader over dictionary.affixData; always repositioned to 8 * affixId before an affix entry is read
   private final ByteArrayDataInput affixReader;

   // used for normalization (input cleaning in stem() and output cleaning in newStem())
   private final StringBuilder scratchSegment = new StringBuilder();
   // holds the cleaned input chars; replaced by a larger array when the cleaned word does not fit
   private char scratchBuffer[] = new char[32];

   // it's '1' if we have no stem exceptions, otherwise every other form
   // is really an ID pointing to the exception table (so forms are walked with a stride of 2)
   private final int formStep;

   /**
    * Constructs a new Stemmer which will use the provided Dictionary to create its stems.
    *
    * @param dictionary Dictionary that will be used to create the stems
    */
   public Stemmer(Dictionary dictionary) {
     this.dictionary = dictionary;
     this.affixReader = new ByteArrayDataInput(dictionary.affixData);
     for (int level = 0; level < 3; level++) {
       if (dictionary.prefixes != null) {
         prefixArcs[level] = new FST.Arc<>();
         prefixReaders[level] = dictionary.prefixes.getBytesReader();
       }
       if (dictionary.suffixes != null) {
         suffixArcs[level] = new FST.Arc<>();
         suffixReaders[level] = dictionary.suffixes.getBytesReader();
       }
     }
     formStep = dictionary.hasStemExceptions ? 2 : 1;
   }

   /**
    * Find the stem(s) of the provided word.
    * <p>
    * Convenience overload: copies the string into a char array and delegates to
    * {@link #stem(char[], int)} using the full length of the string.
    *
    * @param word Word to find the stems for
    * @return List of stems for the word
    */
   public List<CharsRef> stem(String word) {
     final char[] chars = word.toCharArray();
     return stem(chars, chars.length);
   }

   /**
    * Find the stem(s) of the provided word.
    * <p>
    * When the dictionary requires input cleaning, the word is first cleaned into an internal buffer.
    * The word is then stemmed as given and, depending on its casing, additionally in its title-case
    * and/or lower-case variants; the results are concatenated in that order (duplicates are kept).
    *
    * @param word Word to find the stems for
    * @param length number of valid chars in {@code word}
    * @return List of stems for the word
    */
   public List<CharsRef> stem(char word[], int length) {
     char[] chars = word;
     int len = length;

     if (dictionary.needsInputCleaning) {
       scratchSegment.setLength(0);
       scratchSegment.append(word, 0, length);
       final CharSequence cleaned = dictionary.cleanInput(scratchSegment, segment);
       scratchBuffer = ArrayUtil.grow(scratchBuffer, cleaned.length());
       len = segment.length();
       segment.getChars(0, len, scratchBuffer, 0);
       chars = scratchBuffer;
     }

     switch (caseOf(chars, len)) {
       case UPPER_CASE: {
         // upper: union exact, title, lower
         caseFoldTitle(chars, len);
         caseFoldLower(titleBuffer, len);
         final List<CharsRef> stems = doStem(chars, len, false);
         stems.addAll(doStem(titleBuffer, len, true));
         stems.addAll(doStem(lowerBuffer, len, true));
         return stems;
       }
       case TITLE_CASE: {
         // title: union exact, lower
         caseFoldLower(chars, len);
         final List<CharsRef> stems = doStem(chars, len, false);
         stems.addAll(doStem(lowerBuffer, len, true));
         return stems;
       }
       default:
         // exact match only
         return doStem(chars, len, false);
     }
   }

   // temporary buffers for case variants; filled by caseFoldLower / caseFoldTitle and replaced by a
   // larger array when a word does not fit
   private char[] lowerBuffer = new char[8];
   private char[] titleBuffer = new char[8];

   // result codes of caseOf(): which case variants stem() has to try in addition to the exact word
   private static final int EXACT_CASE = 0;  // only the word as given
   private static final int TITLE_CASE = 1;  // the word as given, plus its lower-case variant
   private static final int UPPER_CASE = 2;  // the word as given, plus title-case and lower-case variants

   /**
    * Classifies the casing of the word: returns EXACT_CASE, TITLE_CASE, or UPPER_CASE.
    * <p>
    * EXACT_CASE is returned when the dictionary ignores case, the word is empty, the first char is not
    * upper case, or the remaining chars mix upper-case and non-upper-case chars. A word whose first
    * char is upper case and whose remaining chars are all upper case (or absent) is UPPER_CASE; if
    * none of the remaining chars is upper case it is TITLE_CASE.
    */
   private int caseOf(char word[], int length) {
     if (dictionary.ignoreCase || length == 0 || !Character.isUpperCase(word[0])) {
       return EXACT_CASE;
     }

     // look at everything after the first char to tell title case from upper case
     // (anything mixed is treated as exact)
     boolean sawUpper = false;
     boolean sawNonUpper = false;
     for (int pos = 1; pos < length; pos++) {
       if (Character.isUpperCase(word[pos])) {
         sawUpper = true;
       } else {
         sawNonUpper = true;
       }
     }

     if (sawUpper && sawNonUpper) {
       return EXACT_CASE;
     }
     return sawNonUpper ? TITLE_CASE : UPPER_CASE;
   }

   /**
    * Writes the title-case variant of the word to titleBuffer: the first char is copied unchanged and
    * every following char is passed through the dictionary's case folding.
    */
   private void caseFoldTitle(char word[], int length) {
     titleBuffer = ArrayUtil.grow(titleBuffer, length);
     System.arraycopy(word, 0, titleBuffer, 0, length);
     int pos = 1;
     while (pos < length) {
       titleBuffer[pos] = dictionary.caseFold(titleBuffer[pos]);
       pos++;
     }
   }

   /**
    * Writes the lower-case variant of a title-cased word to lowerBuffer: the word is copied and only
    * its first char is passed through the dictionary's case folding.
    */
   private void caseFoldLower(char word[], int length) {
     lowerBuffer = ArrayUtil.grow(lowerBuffer, length);
     System.arraycopy(word, 0, lowerBuffer, 0, length);
     final char first = lowerBuffer[0];
     lowerBuffer[0] = dictionary.caseFold(first);
   }

   /**
    * Collects the stems for one concrete casing of the word.
    * <p>
    * First the word itself is looked up in the dictionary; each form found is added as a stem unless
    * its flags forbid it (KEEPCASE while searching a case variant, NEEDAFFIX, ONLYINCOMPOUND). Then the
    * stems obtained by affix stripping are appended.
    *
    * @param word buffer holding the word
    * @param length number of valid chars in {@code word}
    * @param caseVariant true if {@code word} is a case-folded variant of the original input
    * @return list of stems, possibly empty (never null)
    */
   private List<CharsRef> doStem(char word[], int length, boolean caseVariant) {
     List<CharsRef> stems = new ArrayList<>();
     IntsRef forms = dictionary.lookupWord(word, 0, length);
     if (forms != null) {
       // which flag checks apply depends only on the dictionary and on caseVariant, not on the form
       final boolean checkKeepCase = caseVariant && dictionary.keepcase != -1;
       final boolean checkNeedAffix = dictionary.needaffix != -1;
       final boolean checkOnlyInCompound = dictionary.onlyincompound != -1;
       final boolean checkFlags = checkKeepCase || checkNeedAffix || checkOnlyInCompound;

       for (int i = 0; i < forms.length; i += formStep) {
         if (checkFlags) {
           dictionary.flagLookup.get(forms.ints[forms.offset+i], scratch);
           char wordFlags[] = Dictionary.decodeFlags(scratch);
           // we are looking for a case variant, but this word does not allow it
           if (checkKeepCase && Dictionary.hasFlag(wordFlags, (char)dictionary.keepcase)) {
             continue;
           }
           // we can't add this form, its a pseudostem requiring an affix
           if (checkNeedAffix && Dictionary.hasFlag(wordFlags, (char)dictionary.needaffix)) {
             continue;
           }
           // we can't add this form, it only belongs inside a compound word
           if (checkOnlyInCompound && Dictionary.hasFlag(wordFlags, (char)dictionary.onlyincompound)) {
             continue;
           }
         }
         stems.add(newStem(word, length, forms, i));
       }
     }
     try {
       // top-level affix stripping: no previous affix or flags, depth 0, prefixes and suffixes allowed
       stems.addAll(stem(word, length, -1, -1, -1, 0, true, true, false, false, caseVariant));
     } catch (IOException bogus) {
       throw new RuntimeException(bogus);
     }
     return stems;
   }

   /**
    * Find the unique stem(s) of the provided word.
    * <p>
    * Same as {@link #stem(char[], int)}, but repeated stems are dropped; the first occurrence of each
    * stem is kept and the original order is preserved. Stems are compared case-insensitively when the
    * dictionary ignores case.
    *
    * @param word Word to find the stems for
    * @param length number of valid chars in {@code word}
    * @return List of stems for the word
    */
   public List<CharsRef> uniqueStems(char word[], int length) {
     final List<CharsRef> all = stem(word, length);
     if (all.size() < 2) {
       // nothing to deduplicate
       return all;
     }

     final CharArraySet seen = new CharArraySet(8, dictionary.ignoreCase);
     final List<CharsRef> unique = new ArrayList<>();
     for (CharsRef candidate : all) {
       if (seen.contains(candidate)) {
         continue;
       }
       unique.add(candidate);
       seen.add(candidate);
     }
     return unique;
   }

   /**
    * Builds the CharsRef returned to the caller for one dictionary form.
    * <p>
    * If the dictionary has stem exceptions and the form carries a positive exception id, the exception
    * text replaces the buffer content. When output cleaning is enabled, the output mappings are
    * applied to a copy; otherwise the result wraps either the exception string or {@code buffer}
    * itself (the array is not copied in that case).
    *
    * @param buffer chars of the stem candidate
    * @param length number of valid chars in {@code buffer}
    * @param forms forms returned by the dictionary lookup
    * @param formID index (relative to {@code forms.offset}) of the form being emitted
    * @return the stem
    */
   private CharsRef newStem(char buffer[], int length, IntsRef forms, int formID) {
     String exception = null;
     if (dictionary.hasStemExceptions) {
       // the slot right after the form holds its exception id
       final int exceptionID = forms.ints[forms.offset + formID + 1];
       if (exceptionID > 0) {
         exception = dictionary.getStemException(exceptionID);
       }
     }

     if (!dictionary.needsOutputCleaning) {
       return exception == null ? new CharsRef(buffer, 0, length) : new CharsRef(exception);
     }

     scratchSegment.setLength(0);
     if (exception == null) {
       scratchSegment.append(buffer, 0, length);
     } else {
       scratchSegment.append(exception);
     }
     try {
       Dictionary.applyMappings(dictionary.oconv, scratchSegment);
     } catch (IOException bogus) {
       throw new RuntimeException(bogus);
     }
     final int cleanedLength = scratchSegment.length();
     final char[] cleaned = new char[cleanedLength];
     scratchSegment.getChars(0, cleanedLength, cleaned, 0);
     return new CharsRef(cleaned, 0, cleanedLength);
   }

   // ================================================= Helper Methods ================================================

   // some state for traversing FSTs
   // one reader/arc per recursion depth (indexed by recursionDepth in stem()), so a recursive call
   // does not overwrite the arc its caller is still iterating with; entries are filled in by the
   // constructor only when the dictionary has the corresponding FST
   final FST.BytesReader prefixReaders[] = new FST.BytesReader[3];
   @SuppressWarnings("unchecked")
   final FST.Arc<IntsRef> prefixArcs[] = new FST.Arc[3];

   final FST.BytesReader suffixReaders[] = new FST.BytesReader[3];
   @SuppressWarnings("unchecked")
   final FST.Arc<IntsRef> suffixArcs[] = new FST.Arc[3];


   /**
    * Generates a list of stems for the provided word
    *
    * @param word Word to generate the stems for
    * @param length number of valid chars in {@code word}
    * @param previous previous affix that was removed (so we dont remove same one twice)
    * @param prevFlag Flag from a previous stemming step that need to be cross-checked with any affixes in this recursive step
    * @param prefixFlag flag of the most inner removed prefix, so that when removing a suffix, its also checked against the word
    * @param recursionDepth current recursiondepth
    * @param doPrefix true if we should remove prefixes
    * @param doSuffix true if we should remove suffixes
    * @param previousWasPrefix true if the previous removal was a prefix:
    *        if we are removing a suffix, and it has no continuation requirements, its ok.
    *        but two prefixes (COMPLEXPREFIXES) or two suffixes must have continuation requirements to recurse.
    * @param circumfix true if the previous prefix removal was signed as a circumfix
    *        this means inner most suffix must also contain circumfix flag.
    * @param caseVariant true if we are searching for a case variant. if the word has KEEPCASE flag it cannot succeed.
    * @return List of stems, or empty list if no stems are found
    */
   private List<CharsRef> stem(char word[], int length, int previous, int prevFlag, int prefixFlag, int recursionDepth, boolean doPrefix, boolean doSuffix, boolean previousWasPrefix, boolean circumfix, boolean caseVariant) throws IOException {

     // TODO: allow this stuff to be reused by tokenfilter
     List<CharsRef> stems = new ArrayList<>();

     if (doPrefix && dictionary.prefixes != null) {
       FST<IntsRef> fst = dictionary.prefixes;
       Outputs<IntsRef> outputs = fst.outputs;
       FST.BytesReader bytesReader = prefixReaders[recursionDepth];
       FST.Arc<IntsRef> arc = prefixArcs[recursionDepth];
       fst.getFirstArc(arc);
       IntsRef NO_OUTPUT = outputs.getNoOutput();
       IntsRef output = NO_OUTPUT;
       // i is the number of leading chars consumed as the prefix, so length - i chars remain.
       // Without FULLSTRIP at least one char must remain (i < length); with FULLSTRIP the whole
       // word may be consumed (i <= length). This matches the bounds of the suffix loop below.
       int limit = dictionary.fullStrip ? length + 1 : length;
       for (int i = 0; i < limit; i++) {
         if (i > 0) {
           // advance the FST by one more char of the word, read left to right
           int ch = word[i-1];
           if (fst.findTargetArc(ch, arc, arc, bytesReader) == null) {
             break;
           } else if (arc.output != NO_OUTPUT) {
             output = outputs.add(output, arc.output);
           }
         }
         if (!arc.isFinal()) {
           continue;
         }
         // ids of all prefix rules whose affix text is exactly word[0..i)
         IntsRef prefixes = outputs.add(output, arc.nextFinalOutput);

         for (int j = 0; j < prefixes.length; j++) {
           int prefix = prefixes.ints[prefixes.offset + j];
           if (prefix == previous) {
             continue;
           }
           // each affix entry is 8 bytes: flag, strip ordinal, condition (+ cross-product bit), append flags
           affixReader.setPosition(8 * prefix);
           affixReader.skipBytes(2); // flag: not needed here, applyAffix reads it again
           char stripOrd = (char) (affixReader.readShort() & 0xffff);
           int condition = (char) (affixReader.readShort() & 0xffff);
           boolean crossProduct = (condition & 1) == 1;
           condition >>>= 1;
           char append = (char) (affixReader.readShort() & 0xffff);

           // a second prefix always needs an explicit continuation flag, hence matchEmpty == false
           if (!isAffixCompatible(append, prevFlag, recursionDepth, crossProduct, false)) {
             continue;
           }

           int deAffixedStart = i;
           int deAffixedLength = length - deAffixedStart;

           int stripStart = dictionary.stripOffsets[stripOrd];
           int stripEnd = dictionary.stripOffsets[stripOrd+1];
           int stripLength = stripEnd - stripStart;

           // the condition is matched against strip + remainder, i.e. the word before the prefix was applied
           if (!checkCondition(condition, dictionary.stripData, stripStart, stripLength, word, deAffixedStart, deAffixedLength)) {
             continue;
           }

           char strippedWord[] = new char[stripLength + deAffixedLength];
           System.arraycopy(dictionary.stripData, stripStart, strippedWord, 0, stripLength);
           System.arraycopy(word, deAffixedStart, strippedWord, stripLength, deAffixedLength);

           stems.addAll(applyAffix(strippedWord, strippedWord.length, prefix, -1, recursionDepth, true, circumfix, caseVariant));
         }
       }
     }

     if (doSuffix && dictionary.suffixes != null) {
       FST<IntsRef> fst = dictionary.suffixes;
       Outputs<IntsRef> outputs = fst.outputs;
       FST.BytesReader bytesReader = suffixReaders[recursionDepth];
       FST.Arc<IntsRef> arc = suffixArcs[recursionDepth];
       fst.getFirstArc(arc);
       IntsRef NO_OUTPUT = outputs.getNoOutput();
       IntsRef output = NO_OUTPUT;
       // i is the number of leading chars that remain after the suffix word[i..length) is removed
       int limit = dictionary.fullStrip ? 0 : 1;
       for (int i = length; i >= limit; i--) {
         if (i < length) {
           // advance the FST by one more char of the word, read right to left
           int ch = word[i];
           if (fst.findTargetArc(ch, arc, arc, bytesReader) == null) {
             break;
           } else if (arc.output != NO_OUTPUT) {
             output = outputs.add(output, arc.output);
           }
         }
         if (!arc.isFinal()) {
           continue;
         }
         // ids of all suffix rules matching the chars consumed so far
         IntsRef suffixes = outputs.add(output, arc.nextFinalOutput);

         for (int j = 0; j < suffixes.length; j++) {
           int suffix = suffixes.ints[suffixes.offset + j];
           if (suffix == previous) {
             continue;
           }
           affixReader.setPosition(8 * suffix);
           affixReader.skipBytes(2); // flag: not needed here, applyAffix reads it again
           char stripOrd = (char) (affixReader.readShort() & 0xffff);
           int condition = (char) (affixReader.readShort() & 0xffff);
           boolean crossProduct = (condition & 1) == 1;
           condition >>>= 1;
           char append = (char) (affixReader.readShort() & 0xffff);

           // a suffix without continuation flags is acceptable only right after a prefix removal
           if (!isAffixCompatible(append, prevFlag, recursionDepth, crossProduct, previousWasPrefix)) {
             continue;
           }

           int appendLength = length - i;
           int deAffixedLength = length - appendLength;

           int stripStart = dictionary.stripOffsets[stripOrd];
           int stripEnd = dictionary.stripOffsets[stripOrd+1];
           int stripLength = stripEnd - stripStart;

           // the condition is matched against remainder + strip, i.e. the word before the suffix was applied
           if (!checkCondition(condition, word, 0, deAffixedLength, dictionary.stripData, stripStart, stripLength)) {
             continue;
           }

           char strippedWord[] = new char[stripLength + deAffixedLength];
           System.arraycopy(word, 0, strippedWord, 0, deAffixedLength);
           System.arraycopy(dictionary.stripData, stripStart, strippedWord, deAffixedLength, stripLength);

           stems.addAll(applyAffix(strippedWord, strippedWord.length, suffix, prefixFlag, recursionDepth, false, circumfix, caseVariant));
         }
       }
     }

     return stems;
   }

   /**
    * Decides whether an affix may be removed at the current recursion step.
    * <p>
    * At depth 0 the affix is accepted unless its continuation flags contain ONLYINCOMPOUND. At deeper
    * levels the affix must be a cross-product affix, must not be ONLYINCOMPOUND, and its continuation
    * flags must cross-check against the flag of the previously removed affix.
    *
    * @param append id of the affix's continuation flag set (resolved through the dictionary's flag lookup)
    * @param prevFlag flag of the previously removed affix; only consulted when recursionDepth &gt; 0
    * @param recursionDepth current recursion depth
    * @param crossProduct true if the affix allows combination with other affixes
    * @param matchEmpty true if an empty continuation flag set counts as a match
    * @return true if the affix may be removed
    */
   private boolean isAffixCompatible(char append, int prevFlag, int recursionDepth, boolean crossProduct, boolean matchEmpty) {
     if (recursionDepth == 0) {
       if (dictionary.onlyincompound == -1) {
         return true;
       }
       // check if affix is allowed in a non-compound word
       dictionary.flagLookup.get(append, scratch);
       char appendFlags[] = Dictionary.decodeFlags(scratch);
       return !Dictionary.hasFlag(appendFlags, (char) dictionary.onlyincompound);
     }
     if (!crossProduct) {
       return false;
     }
     // cross check incoming continuation class (flag of previous affix) against list.
     dictionary.flagLookup.get(append, scratch);
     char appendFlags[] = Dictionary.decodeFlags(scratch);
     assert prevFlag >= 0;
     boolean allowed = dictionary.onlyincompound == -1 ||
                       !Dictionary.hasFlag(appendFlags, (char) dictionary.onlyincompound);
     return allowed && hasCrossCheckedFlag((char)prevFlag, appendFlags, matchEmpty);
   }

   /**
    * Checks the affix condition against the concatenation of two char ranges.
    * <p>
    * A condition of 0 means "no condition" and always passes. Otherwise the condition's automaton is
    * fed first {@code c1[c1off .. c1off+c1len)} and then {@code c2[c2off .. c2off+c2len)}; the check
    * passes only if the automaton never rejects and ends in an accept state.
    */
   // note: this is pretty stupid, we really should subtract strip from the condition up front and just check the stem
   // but this is a little bit more complicated.
   private boolean checkCondition(int condition, char c1[], int c1off, int c1len, char c2[], int c2off, int c2len) {
     if (condition == 0) {
       return true;
     }

     final CharacterRunAutomaton automaton = dictionary.patterns.get(condition);
     int state = automaton.getInitialState();

     final int firstEnd = c1off + c1len;
     for (int pos = c1off; pos < firstEnd; pos++) {
       state = automaton.step(state, c1[pos]);
       if (state == -1) {
         return false;
       }
     }

     final int secondEnd = c2off + c2len;
     for (int pos = c2off; pos < secondEnd; pos++) {
       state = automaton.step(state, c2[pos]);
       if (state == -1) {
         return false;
       }
     }

     return automaton.isAccept(state);
   }

   /**
    * Applies the affix rule to the given word, producing a list of stems if any are found.
    * <p>
    * The stripped word is looked up in the dictionary and every form that carries the affix's flag
    * (and passes the prefix, circumfix, KEEPCASE and ONLYINCOMPOUND checks) is added as a stem.
    * Afterwards, for cross-product affixes, stemming recurses to remove one further affix where the
    * dictionary settings (COMPLEXPREFIXES, two-stage affixes) and the recursion depth allow it.
    *
    * @param strippedWord Word the affix has been removed and the strip added
    * @param length valid length of stripped word
    * @param affix id of the affix rule; its entry is read from the affix data at offset {@code 8 * affix}
    * @param prefixFlag when we already stripped a prefix, we cant simply recurse and check the suffix, unless both are compatible
    *                   so we must check dictionary form against both to add it as a stem!
    * @param recursionDepth current recursion depth
    * @param prefix true if we are removing a prefix (false if its a suffix)
    * @param circumfix true if a previously removed prefix carried the circumfix flag
    * @param caseVariant true if we are searching for a case variant; forms flagged KEEPCASE are then skipped
    * @return List of stems for the word, or an empty list if none are found
    */
   List<CharsRef> applyAffix(char strippedWord[], int length, int affix, int prefixFlag, int recursionDepth, boolean prefix, boolean circumfix, boolean caseVariant) throws IOException {
     // TODO: just pass this in from before, no need to decode it twice
     affixReader.setPosition(8 * affix);
     final char flag = (char) (affixReader.readShort() & 0xffff);
     affixReader.skipBytes(2); // strip
     // only the lowest bit of the condition field (the cross-product marker) is needed here
     final boolean crossProduct = (affixReader.readShort() & 1) == 1;
     final char append = (char) (affixReader.readShort() & 0xffff);

     final List<CharsRef> stems = new ArrayList<>();

     final IntsRef forms = dictionary.lookupWord(strippedWord, 0, length);
     if (forms != null) {
       // confusing: in this one exception, we already chained the first prefix against the second,
       // so it doesnt need to be checked against the word
       final boolean chainedPrefix = dictionary.complexPrefixes && recursionDepth == 1 && prefix;
       final boolean mustCheckPrefixFlag = !chainedPrefix && prefixFlag >= 0;

       for (int form = 0; form < forms.length; form += formStep) {
         dictionary.flagLookup.get(forms.ints[forms.offset + form], scratch);
         final char[] wordFlags = Dictionary.decodeFlags(scratch);
         if (!Dictionary.hasFlag(wordFlags, flag)) {
           // this form does not accept the affix at all
           continue;
         }

         if (mustCheckPrefixFlag && !Dictionary.hasFlag(wordFlags, (char) prefixFlag)) {
           // see if we can chain prefix thru the suffix continuation class (only if it has any!)
           dictionary.flagLookup.get(append, scratch);
           final char[] continuation = Dictionary.decodeFlags(scratch);
           if (!hasCrossCheckedFlag((char) prefixFlag, continuation, false)) {
             continue;
           }
         }

         // if circumfix was previously set by a prefix, we must check this suffix,
         // to ensure it has it, and vice versa
         if (dictionary.circumfix != -1) {
           dictionary.flagLookup.get(append, scratch);
           final char[] continuation = Dictionary.decodeFlags(scratch);
           if (circumfix != Dictionary.hasFlag(continuation, (char) dictionary.circumfix)) {
             continue;
           }
         }

         // we are looking for a case variant, but this word does not allow it
         if (caseVariant && dictionary.keepcase != -1 && Dictionary.hasFlag(wordFlags, (char) dictionary.keepcase)) {
           continue;
         }
         // we aren't decompounding (yet)
         if (dictionary.onlyincompound != -1 && Dictionary.hasFlag(wordFlags, (char) dictionary.onlyincompound)) {
           continue;
         }
         stems.add(newStem(strippedWord, length, forms, form));
       }
     }

     // if a circumfix flag is defined in the dictionary, and we are a prefix, we need to check if we have that flag
     boolean circumfixForRecursion = circumfix;
     if (dictionary.circumfix != -1 && !circumfix && prefix) {
       dictionary.flagLookup.get(append, scratch);
       final char[] continuation = Dictionary.decodeFlags(scratch);
       circumfixForRecursion = Dictionary.hasFlag(continuation, (char) dictionary.circumfix);
     }

     // only cross-product affixes combine with further affixes, and only up to two removals deep
     if (!crossProduct || (recursionDepth != 0 && recursionDepth != 1)) {
       return stems;
     }

     final int nextDepth = recursionDepth + 1;
     if (prefix) {
       if (recursionDepth == 0) {
         // we took away the first prefix.
         // COMPLEXPREFIXES = true:  combine with a second prefix and another suffix
         // COMPLEXPREFIXES = false: combine with a suffix
         final boolean alsoPrefixes = dictionary.complexPrefixes && dictionary.twoStageAffix;
         stems.addAll(stem(strippedWord, length, affix, flag, flag, nextDepth, alsoPrefixes, true, true, circumfixForRecursion, caseVariant));
       } else if (dictionary.complexPrefixes) {
         // we took away the second prefix: go look for another suffix
         stems.addAll(stem(strippedWord, length, affix, flag, flag, nextDepth, false, true, true, circumfixForRecursion, caseVariant));
       }
     } else if (dictionary.complexPrefixes == false && dictionary.twoStageAffix) {
       // we took away a suffix (at depth 1 possibly after a prefix).
       // COMPLEXPREFIXES = true: we don't recurse! only one suffix allowed
       // COMPLEXPREFIXES = false: combine with another suffix
       stems.addAll(stem(strippedWord, length, affix, flag, prefixFlag, nextDepth, false, true, false, circumfixForRecursion, caseVariant));
     }

     return stems;
   }

   /**
    * Checks if the given flag cross checks with the given array of flags
    *
    * @param flag Flag to cross check with the array of flags
    * @param flags Array of flags to cross check against.  Must not be {@code null}; it is searched with
    *              a binary search (presumably the flags are sorted ascending - confirm against
    *              Dictionary.decodeFlags)
    * @param matchEmpty if {@code true}, an empty {@code flags} array counts as a match
    * @return {@code true} if the flag is found in the array, or the array is empty and
    *         {@code matchEmpty} is set; {@code false} otherwise
    */
   private boolean hasCrossCheckedFlag(char flag, char[] flags, boolean matchEmpty) {
     if (matchEmpty && flags.length == 0) {
       return true;
     }
     return Arrays.binarySearch(flags, flag) >= 0;
   }
 }