blob: 51c349db042b63ee20c682cace22ef38f8001770 [file] [log] [blame]
package org.apache.lucene.analysis.synonym;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.AttributeSource;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.LinkedList;
/**
 * SynonymFilter handles multi-token synonyms with variable position increment offsets.
 * <p>
 * The matched tokens from the input stream may be optionally passed through (includeOrig=true)
 * or discarded. If the original tokens are included, the position increments may be modified
 * to retain absolute positions after merging with the synonym tokenstream.
 * <p>
 * Generated synonyms will start at the same position as the first matched source token.
 * @deprecated (3.4) use {@link SynonymFilterFactory} instead. Only for precise index backwards compatibility. This filter will be removed in Lucene 5.0
 */
@Deprecated
final class SlowSynonymFilter extends TokenFilter {

  /** Root synonym map; nested {@code submap}s are looked up by the term text of each successive token. */
  private final SlowSynonymMap map;

  /** Iterator over the tokens generated for the most recent match; {@code null} until a match occurs. */
  private Iterator<AttributeSource> replacement;

  //
  // Defer creation of the buffer until the first time it is used to
  // optimize short fields with no matches.
  //
  /** Tokens that were read ahead while attempting a match and then pushed back for re-reading. */
  private LinkedList<AttributeSource> buffer;

  /** Tokens (after the first one) consumed by the current match, in stream order. */
  private LinkedList<AttributeSource> matched;

  /** Set once the wrapped input has reported end of stream, so it is not polled again. */
  private boolean exhausted;

  /**
   * Creates a filter that applies {@code map} to the tokens of {@code in}.
   *
   * @param in the token stream to filter
   * @param map the synonym map to apply; must not be {@code null}
   * @throws IllegalArgumentException if {@code map} is {@code null}
   */
  public SlowSynonymFilter(TokenStream in, SlowSynonymMap map) {
    super(in);
    if (map == null) {
      throw new IllegalArgumentException("map is required");
    }
    this.map = map;
    // just ensuring these attributes exist...
    addAttribute(CharTermAttribute.class);
    addAttribute(PositionIncrementAttribute.class);
    addAttribute(OffsetAttribute.class);
    addAttribute(TypeAttribute.class);
  }

  /*
   * Need to worry about multiple scenarios:
   *  - need to go for the longest match
   *      a b => foo      #shouldn't match if "a b" is followed by "c d"
   *      a b c d => bar
   *  - need to backtrack - retry matches for tokens already read
   *      a b c d => foo
   *        b c => bar
   *    If the input stream is "a b c x", one will consume "a b c d"
   *    trying to match the first rule... all but "a" should be
   *    pushed back so a match may be made on "b c".
   *  - don't try and match generated tokens (thus need separate queue)
   *    matching is not recursive.
   *  - handle optional generation of original tokens in all these cases,
   *    merging token streams to preserve token positions.
   *  - preserve original positionIncrement of first matched token
   */
  /**
   * Advances to the next output token: a pending generated token if any, otherwise the next
   * input (or pushed-back) token, after attempting the longest synonym match starting at it.
   *
   * @return {@code true} if a token was produced, {@code false} at end of stream
   * @throws IOException if reading from the wrapped stream fails
   */
  @Override
  public boolean incrementToken() throws IOException {
    while (true) {
      // if there are any generated tokens, return them... don't try any
      // matches against them, as we specifically don't want recursion.
      if (replacement != null && replacement.hasNext()) {
        copy(this, replacement.next());
        return true;
      }

      // common case fast-path of first token not matching anything
      AttributeSource firstTok = nextTok();
      if (firstTok == null) {
        return false;
      }
      CharTermAttribute termAtt = firstTok.addAttribute(CharTermAttribute.class);
      SlowSynonymMap result =
          map.submap != null ? map.submap.get(termAtt.buffer(), 0, termAtt.length()) : null;
      if (result == null) {
        copy(this, firstTok);
        return true;
      }

      // fast-path failed, clone ourselves if needed: reading further tokens
      // from the input would otherwise overwrite the state held in "this".
      if (firstTok == this) {
        firstTok = cloneAttributes();
      }

      // OK, we matched a token, so find the longest match.
      matched = new LinkedList<>();
      result = match(result);
      if (result == null) {
        // no match, simply return the first token read.
        copy(this, firstTok);
        return true;
      }

      // reuse, or create new one each time?
      ArrayList<AttributeSource> generated =
          new ArrayList<>(result.synonyms.length + matched.size() + 1);

      //
      // there was a match... let's generate the new tokens, merging
      // in the matched tokens (position increments need adjusting)
      //
      AttributeSource lastTok = matched.isEmpty() ? firstTok : matched.getLast();
      // next original token still to be emitted; null when originals are dropped or used up
      AttributeSource origTok = result.includeOrig() ? firstTok : null;
      int origPos = positionIncrementOf(firstTok); // position of origTok in the original stream
      int repPos = 0; // curr position in replacement token stream
      int pos = 0; // current position in merged token stream

      for (int i = 0; i < result.synonyms.length; i++) {
        Token repTok = result.synonyms[i];

        // Each generated token starts as a copy of the first matched token and
        // spans from that token's start offset to the last matched token's end offset.
        AttributeSource newTok = firstTok.cloneAttributes();
        CharTermAttribute newTermAtt = newTok.addAttribute(CharTermAttribute.class);
        OffsetAttribute newOffsetAtt = newTok.addAttribute(OffsetAttribute.class);
        OffsetAttribute lastOffsetAtt = lastTok.addAttribute(OffsetAttribute.class);
        newOffsetAtt.setOffset(newOffsetAtt.startOffset(), lastOffsetAtt.endOffset());
        newTermAtt.copyBuffer(repTok.buffer(), 0, repTok.length());

        repPos += repTok.getPositionIncrement();
        if (i == 0) {
          repPos = origPos; // make position of first token equal to original
        }

        // if necessary, insert original tokens and adjust position increment
        while (origTok != null && origPos <= repPos) {
          pos += emit(origTok, origPos - pos, generated);
          origTok = pollMatched();
          if (origTok != null) {
            origPos += positionIncrementOf(origTok);
          }
        }

        pos += emit(newTok, repPos - pos, generated);
      }

      // finish up any leftover original tokens
      while (origTok != null) {
        pos += emit(origTok, origPos - pos, generated);
        origTok = pollMatched();
        if (origTok != null) {
          origPos += positionIncrementOf(origTok);
        }
      }

      // what if we replaced a longer sequence with a shorter one?
      // a/0 b/5 =>  foo/0
      // should I re-create the gap on the next buffered token?

      replacement = generated.iterator();
      // Now return to the top of the loop to read and return the first
      // generated token.. The reason this is done is that we may have generated
      // nothing at all, and may need to continue with more matching logic.
    }
  }

  /** Returns the current position increment of {@code tok}. */
  private static int positionIncrementOf(AttributeSource tok) {
    return tok.addAttribute(PositionIncrementAttribute.class).getPositionIncrement();
  }

  /**
   * Sets the position increment of {@code tok} to {@code increment}, appends the token to
   * {@code out}, and returns the increment as read back from the token.
   */
  private static int emit(AttributeSource tok, int increment, ArrayList<AttributeSource> out) {
    PositionIncrementAttribute posIncAtt = tok.addAttribute(PositionIncrementAttribute.class);
    posIncAtt.setPositionIncrement(increment);
    out.add(tok);
    return posIncAtt.getPositionIncrement();
  }

  /** Removes and returns the next matched original token, or {@code null} if none remain. */
  private AttributeSource pollMatched() {
    return matched.isEmpty() ? null : matched.removeFirst();
  }

  /**
   * Returns the next token to examine: a pushed-back token if one is buffered, otherwise
   * {@code this} after advancing the wrapped input, or {@code null} once the input is exhausted.
   */
  private AttributeSource nextTok() throws IOException {
    if (buffer != null && !buffer.isEmpty()) {
      return buffer.removeFirst();
    }
    if (!exhausted && input.incrementToken()) {
      return this;
    }
    exhausted = true;
    return null;
  }

  /** Pushes {@code t} back so that it is the next token returned by {@link #nextTok()}. */
  private void pushTok(AttributeSource t) {
    if (buffer == null) {
      buffer = new LinkedList<>();
    }
    buffer.addFirst(t);
  }

  /**
   * Tries to extend the current match by one more token, recursing for as long as tokens keep
   * matching deeper submaps. Tokens that end up in the match are prepended to {@link #matched};
   * tokens that do not are pushed back.
   *
   * @param map the synonym map node reached by the tokens matched so far
   * @return the deepest node reached that has synonyms, or {@code null} if there is none
   */
  private SlowSynonymMap match(SlowSynonymMap map) throws IOException {
    SlowSynonymMap result = null;

    if (map.submap != null) {
      AttributeSource tok = nextTok();
      if (tok != null) {
        // clone ourselves.
        if (tok == this) {
          tok = cloneAttributes();
        }
        // check for positionIncrement!=1?  if>1, should not match, if==0, check multiple at this level?
        CharTermAttribute termAtt = tok.getAttribute(CharTermAttribute.class);
        SlowSynonymMap subMap = map.submap.get(termAtt.buffer(), 0, termAtt.length());

        if (subMap != null) {
          // recurse
          result = match(subMap);
        }

        if (result != null) {
          matched.addFirst(tok);
        } else {
          // push back unmatched token
          pushTok(tok);
        }
      }
    }

    // if no longer sequence matched, so if this node has synonyms, it's the match.
    if (result == null && map.synonyms != null) {
      result = map;
    }

    return result;
  }

  /** Copies the attribute state of {@code source} into {@code target} unless they are the same object. */
  private void copy(AttributeSource target, AttributeSource source) {
    if (target != source) {
      source.copyTo(target);
    }
  }

  /**
   * Resets the wrapped input and discards all per-stream state, including tokens that were
   * read ahead and pushed back, so that nothing from a previous stream is emitted afterwards.
   */
  @Override
  public void reset() throws IOException {
    input.reset();
    replacement = null;
    exhausted = false;
    // Drop read-ahead tokens from a previous, possibly partially consumed, stream.
    if (buffer != null) {
      buffer.clear();
    }
    matched = null;
  }
}