| package org.apache.lucene.analysis.synonym; |
| |
| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| import org.apache.lucene.analysis.Token; |
| import org.apache.lucene.analysis.TokenFilter; |
| import org.apache.lucene.analysis.TokenStream; |
| import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; |
| import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; |
| import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; |
| import org.apache.lucene.analysis.tokenattributes.TypeAttribute; |
| import org.apache.lucene.util.AttributeSource; |
| |
| import java.io.IOException; |
| import java.util.ArrayList; |
| import java.util.Iterator; |
| import java.util.LinkedList; |
| |
/** SynonymFilter handles multi-token synonyms with variable position increment offsets.
 * <p>
 * The matched tokens from the input stream may be optionally passed through (includeOrig=true)
 * or discarded. If the original tokens are included, the position increments may be modified
 * to retain absolute positions after merging with the synonym tokenstream.
 * <p>
 * Generated synonyms will start at the same position as the first matched source token.
 * @deprecated (3.4) use {@link SynonymFilterFactory} instead. only for precise index backwards compatibility. this filter will be removed in Lucene 5.0
 */
| @Deprecated |
| final class SlowSynonymFilter extends TokenFilter { |
| |
| private final SlowSynonymMap map; // Map<String, SynonymMap> |
| private Iterator<AttributeSource> replacement; // iterator over generated tokens |
| |
| public SlowSynonymFilter(TokenStream in, SlowSynonymMap map) { |
| super(in); |
| if (map == null) |
| throw new IllegalArgumentException("map is required"); |
| |
| this.map = map; |
| // just ensuring these attributes exist... |
| addAttribute(CharTermAttribute.class); |
| addAttribute(PositionIncrementAttribute.class); |
| addAttribute(OffsetAttribute.class); |
| addAttribute(TypeAttribute.class); |
| } |
| |
| |
| /* |
| * Need to worry about multiple scenarios: |
| * - need to go for the longest match |
| * a b => foo #shouldn't match if "a b" is followed by "c d" |
| * a b c d => bar |
| * - need to backtrack - retry matches for tokens already read |
| * a b c d => foo |
| * b c => bar |
| * If the input stream is "a b c x", one will consume "a b c d" |
| * trying to match the first rule... all but "a" should be |
| * pushed back so a match may be made on "b c". |
| * - don't try and match generated tokens (thus need separate queue) |
| * matching is not recursive. |
| * - handle optional generation of original tokens in all these cases, |
| * merging token streams to preserve token positions. |
| * - preserve original positionIncrement of first matched token |
| */ |
| @Override |
| public boolean incrementToken() throws IOException { |
| while (true) { |
| // if there are any generated tokens, return them... don't try any |
| // matches against them, as we specifically don't want recursion. |
| if (replacement!=null && replacement.hasNext()) { |
| copy(this, replacement.next()); |
| return true; |
| } |
| |
| // common case fast-path of first token not matching anything |
| AttributeSource firstTok = nextTok(); |
| if (firstTok == null) return false; |
| CharTermAttribute termAtt = firstTok.addAttribute(CharTermAttribute.class); |
| SlowSynonymMap result = map.submap!=null ? map.submap.get(termAtt.buffer(), 0, termAtt.length()) : null; |
| if (result == null) { |
| copy(this, firstTok); |
| return true; |
| } |
| |
| // fast-path failed, clone ourselves if needed |
| if (firstTok == this) |
| firstTok = cloneAttributes(); |
| // OK, we matched a token, so find the longest match. |
| |
| matched = new LinkedList<>(); |
| |
| result = match(result); |
| |
| if (result==null) { |
| // no match, simply return the first token read. |
| copy(this, firstTok); |
| return true; |
| } |
| |
| // reuse, or create new one each time? |
| ArrayList<AttributeSource> generated = new ArrayList<>(result.synonyms.length + matched.size() + 1); |
| |
| // |
| // there was a match... let's generate the new tokens, merging |
| // in the matched tokens (position increments need adjusting) |
| // |
| AttributeSource lastTok = matched.isEmpty() ? firstTok : matched.getLast(); |
| boolean includeOrig = result.includeOrig(); |
| |
| AttributeSource origTok = includeOrig ? firstTok : null; |
| PositionIncrementAttribute firstPosIncAtt = firstTok.addAttribute(PositionIncrementAttribute.class); |
| int origPos = firstPosIncAtt.getPositionIncrement(); // position of origTok in the original stream |
| int repPos=0; // curr position in replacement token stream |
| int pos=0; // current position in merged token stream |
| |
| for (int i=0; i<result.synonyms.length; i++) { |
| Token repTok = result.synonyms[i]; |
| AttributeSource newTok = firstTok.cloneAttributes(); |
| CharTermAttribute newTermAtt = newTok.addAttribute(CharTermAttribute.class); |
| OffsetAttribute newOffsetAtt = newTok.addAttribute(OffsetAttribute.class); |
| PositionIncrementAttribute newPosIncAtt = newTok.addAttribute(PositionIncrementAttribute.class); |
| |
| OffsetAttribute lastOffsetAtt = lastTok.addAttribute(OffsetAttribute.class); |
| |
| newOffsetAtt.setOffset(newOffsetAtt.startOffset(), lastOffsetAtt.endOffset()); |
| newTermAtt.copyBuffer(repTok.buffer(), 0, repTok.length()); |
| repPos += repTok.getPositionIncrement(); |
| if (i==0) repPos=origPos; // make position of first token equal to original |
| |
| // if necessary, insert original tokens and adjust position increment |
| while (origTok != null && origPos <= repPos) { |
| PositionIncrementAttribute origPosInc = origTok.addAttribute(PositionIncrementAttribute.class); |
| origPosInc.setPositionIncrement(origPos-pos); |
| generated.add(origTok); |
| pos += origPosInc.getPositionIncrement(); |
| origTok = matched.isEmpty() ? null : matched.removeFirst(); |
| if (origTok != null) { |
| origPosInc = origTok.addAttribute(PositionIncrementAttribute.class); |
| origPos += origPosInc.getPositionIncrement(); |
| } |
| } |
| |
| newPosIncAtt.setPositionIncrement(repPos - pos); |
| generated.add(newTok); |
| pos += newPosIncAtt.getPositionIncrement(); |
| } |
| |
| // finish up any leftover original tokens |
| while (origTok!=null) { |
| PositionIncrementAttribute origPosInc = origTok.addAttribute(PositionIncrementAttribute.class); |
| origPosInc.setPositionIncrement(origPos-pos); |
| generated.add(origTok); |
| pos += origPosInc.getPositionIncrement(); |
| origTok = matched.isEmpty() ? null : matched.removeFirst(); |
| if (origTok != null) { |
| origPosInc = origTok.addAttribute(PositionIncrementAttribute.class); |
| origPos += origPosInc.getPositionIncrement(); |
| } |
| } |
| |
| // what if we replaced a longer sequence with a shorter one? |
| // a/0 b/5 => foo/0 |
| // should I re-create the gap on the next buffered token? |
| |
| replacement = generated.iterator(); |
| // Now return to the top of the loop to read and return the first |
| // generated token.. The reason this is done is that we may have generated |
| // nothing at all, and may need to continue with more matching logic. |
| } |
| } |
| |
| |
| // |
| // Defer creation of the buffer until the first time it is used to |
| // optimize short fields with no matches. |
| // |
| private LinkedList<AttributeSource> buffer; |
| private LinkedList<AttributeSource> matched; |
| |
| private boolean exhausted; |
| |
| private AttributeSource nextTok() throws IOException { |
| if (buffer!=null && !buffer.isEmpty()) { |
| return buffer.removeFirst(); |
| } else { |
| if (!exhausted && input.incrementToken()) { |
| return this; |
| } else { |
| exhausted = true; |
| return null; |
| } |
| } |
| } |
| |
| private void pushTok(AttributeSource t) { |
| if (buffer==null) buffer=new LinkedList<>(); |
| buffer.addFirst(t); |
| } |
| |
| private SlowSynonymMap match(SlowSynonymMap map) throws IOException { |
| SlowSynonymMap result = null; |
| |
| if (map.submap != null) { |
| AttributeSource tok = nextTok(); |
| if (tok != null) { |
| // clone ourselves. |
| if (tok == this) |
| tok = cloneAttributes(); |
| // check for positionIncrement!=1? if>1, should not match, if==0, check multiple at this level? |
| CharTermAttribute termAtt = tok.getAttribute(CharTermAttribute.class); |
| SlowSynonymMap subMap = map.submap.get(termAtt.buffer(), 0, termAtt.length()); |
| |
| if (subMap != null) { |
| // recurse |
| result = match(subMap); |
| } |
| |
| if (result != null) { |
| matched.addFirst(tok); |
| } else { |
| // push back unmatched token |
| pushTok(tok); |
| } |
| } |
| } |
| |
| // if no longer sequence matched, so if this node has synonyms, it's the match. |
| if (result==null && map.synonyms!=null) { |
| result = map; |
| } |
| |
| return result; |
| } |
| |
| private void copy(AttributeSource target, AttributeSource source) { |
| if (target != source) |
| source.copyTo(target); |
| } |
| |
| @Override |
| public void reset() throws IOException { |
| input.reset(); |
| replacement = null; |
| exhausted = false; |
| } |
| } |