| /******************************************************************************* |
| * Copyright (c) 2019 Paul Pazderski, Thomas Wolf, and others. |
| * |
| * This program and the accompanying materials |
| * are made available under the terms of the Eclipse Public License 2.0 |
| * which accompanies this distribution, and is available at |
| * https://www.eclipse.org/legal/epl-2.0/ |
| * |
| * SPDX-License-Identifier: EPL-2.0 |
| * |
| * Contributors: |
| * Paul Pazderski; Thomas Wolf - initial API and implementation |
| *******************************************************************************/ |
| package org.eclipse.jface.text; |
| |
| import java.util.HashMap; |
| import java.util.LinkedList; |
| import java.util.List; |
| import java.util.Map; |
| import java.util.Objects; |
| import java.util.stream.Collectors; |
| |
| /** |
| * Fast matcher to find the occurrences of any of a fixed set of constant strings. Supports finding |
| * all (possibly overlapping) matches, or only the leftmost longest match. |
| * |
| * @since 3.9 |
| */ |
| public class MultiStringMatcher { |
| |
| // An implementation of the Aho-Corasick algorithm (without the DFA construction from section 6 of the |
| // paper; just the failure and output links). |
| // |
| // See Aho, Alfred V.; Corasick, Margaret J.: "Efficient String Matching: An Aid to Bibliographic Search", |
| // CACM 18(6), 1975. |
| // |
| // The algorithm has been modified to support reporting either all matches or only leftmost longest matches. |
| |
| /** |
| * Describes a match result of {@link MultiStringMatcher#indexOf(CharSequence, int)}, giving |
| * access to the matched string and the offset in the text it was matched at. |
| */ |
| public static interface Match { |
| |
| /** |
| * Obtains the matched string. |
| * |
| * @return the text matched |
| */ |
| String getText(); |
| |
| /** |
| * Obtains the offset the {@link #getText() text} was matched at. |
| * |
| * @return the offset |
| */ |
| int getOffset(); |
| |
| } |
| |
| /** A Builder for creating a {@link MultiStringMatcher}. */ |
| public static interface Builder { |
| |
| /** |
| * Adds search strings to be looked for. {@code null} and empty strings in the arguments are |
| * ignored. |
| * |
| * @param searchStrings to add to be looked for by the matcher. |
| * @return this |
| * @throws IllegalStateException if the {@link MultiStringMatcher} was already built. |
| */ |
| Builder add(String... searchStrings); |
| |
| /** |
| * Returns the {@link MultiStringMatcher} built by this builder. |
| * <p> |
| * Note that a {@link Builder} instance can build only <em>one</em> |
| * {@link MultiStringMatcher} instance. This is by design; otherwise the builder would have |
| * to store all the searchStrings somewhere, which may be rather memory intensive if a lot |
| * of search strings are added. |
| * </p> |
| * |
| * @return the {@link MultiStringMatcher} |
| * @throws IllegalStateException if the {@link MultiStringMatcher} was already built. |
| */ |
| MultiStringMatcher build(); |
| } |
| |
| private static class BuilderImpl implements Builder { |
| |
| private MultiStringMatcher m; |
| |
| BuilderImpl() { |
| m= new MultiStringMatcher(); |
| } |
| |
| private void check() { |
| if (m == null) { |
| throw new IllegalStateException("Builder.build() was already called"); //$NON-NLS-1$ |
| } |
| } |
| |
| @Override |
| public Builder add(String... searchStrings) { |
| check(); |
| m.add(searchStrings); |
| return this; |
| } |
| |
| @Override |
| public MultiStringMatcher build() { |
| check(); |
| MultiStringMatcher result= m; |
| m= null; |
| if (!result.root.hasChildren()) { |
| // no search strings were added; return a specialized "matches nothing" matcher |
| return new MultiStringMatcher() { |
| @Override |
| public List<Match> find(CharSequence text, int offset) { |
| return new LinkedList<>(); |
| } |
| |
| @Override |
| public Match indexOf(CharSequence text, int offset) { |
| return null; |
| } |
| }; |
| } |
| result.buildLinks(); |
| return result; |
| } |
| } |
| |
| /** |
| * Creates an initially empty {@link Builder}. |
| * |
| * @return the {@link Builder} |
| */ |
| public static Builder builder() { |
| return new BuilderImpl(); |
| } |
| |
| private static class MatchResult implements Match { |
| |
| private final String match; |
| |
| private final int offset; |
| |
| public MatchResult(String match, int offset) { |
| this.match= match; |
| this.offset= offset; |
| } |
| |
| @Override |
| public String getText() { |
| return match; |
| } |
| |
| @Override |
| public int getOffset() { |
| return offset; |
| } |
| |
| @Override |
| public int hashCode() { |
| return Objects.hashCode(match) * 31 + Integer.hashCode(offset); |
| } |
| |
| @Override |
| public boolean equals(Object obj) { |
| if (this == obj) { |
| return true; |
| } |
| if (obj == null || getClass() != obj.getClass()) { |
| return false; |
| } |
| MatchResult other= (MatchResult) obj; |
| return offset == other.offset && Objects.equals(match, other.match); |
| } |
| |
| @Override |
| public String toString() { |
| return '[' + match + ", " + offset + ']'; //$NON-NLS-1$ |
| } |
| } |
| |
| /** A node in the trie built from the search strings. */ |
| private static class Node { |
| HashMap<Character, Node> children; |
| |
| String match; |
| |
| Node fail; |
| |
| Node output; |
| |
| Node next(Character c) { |
| return children == null ? null : children.get(c); |
| } |
| |
| Node add(char c) { |
| if (children == null) { |
| children= new HashMap<>(); |
| } |
| return children.computeIfAbsent(Character.valueOf(c), key -> new Node()); |
| } |
| |
| boolean hasChildren() { |
| return children != null; |
| } |
| |
| @Override |
| public String toString() { |
| return "Match: " + (match == null ? "null" : '>' + match + '<') //$NON-NLS-1$ //$NON-NLS-2$ |
| + " Children: " + (children == null ? "<none>" : children.keySet().stream().map(c -> c.toString()).collect(Collectors.joining(", "))); //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ |
| } |
| } |
| |
| /** Root node of the trie. */ |
| private final Node root= new Node() { |
| @Override |
| Node next(Character c) { |
| // Implements the sentinel loop on the root node for all non-matching characters. |
| Node child= super.next(c); |
| return child == null ? this : child; |
| } |
| }; |
| |
| private MultiStringMatcher() { |
| // Always use a Builder or the static helper methods to create a MultiStringMatcher |
| } |
| |
| private void add(String... searchStrings) { |
| if (searchStrings != null) { |
| for (String searchString : searchStrings) { |
| if (searchString == null || searchString.isEmpty()) { |
| continue; |
| } |
| Node node= root; |
| for (char c : searchString.toCharArray()) { |
| node= node.add(c); |
| } |
| node.match= searchString; |
| } |
| } |
| } |
| |
| private void buildLinks() { |
| // Build the fail and output links. See the paper referenced at the top; this |
| // is a one-to-one implementation of the original algorithm. Variable names |
| // s, r, and state are kept as in the paper. |
| List<Node> queue= new LinkedList<>(); |
| for (Node s : root.children.values()) { |
| if (s.children != null) { |
| // No need to queue nodes without children since we don't do anything |
| // with them anyway. |
| queue.add(s); |
| } |
| s.fail= root; |
| } |
| while (!queue.isEmpty()) { |
| Node r= queue.remove(0); |
| for (Map.Entry<Character, Node> entry : r.children.entrySet()) { |
| Character c= entry.getKey(); |
| Node s= entry.getValue(); |
| if (s.children != null) { |
| queue.add(s); |
| } |
| Node state= r.fail; |
| Node f; |
| while ((f= state.next(c)) == null) { |
| state= state.fail; |
| } |
| s.fail= f; |
| if (f.match != null) { |
| s.output= f; |
| } else if (f.output != null) { |
| s.output= f.output; |
| } |
| } |
| } |
| } |
| |
| /** |
| * Find the next occurrence of any of the search strings of the {@link MultiStringMatcher} in |
| * the given {@code text} starting at the given {@code offset}. |
| * <p> |
| * Performs a simultaneous search for all the strings, returning the leftmost match. If multiple |
| * search strings match at the same index, the longest match is returned. |
| * </p> |
| * |
| * @param text to search (not {@code null}) |
| * @param offset to start searching at |
| * @return the leftmost longest match found, or {@code null} if no match was found. |
| */ |
| public Match indexOf(CharSequence text, int offset) { |
| // Main search loop of the Aho-Corasick algorithm, modified to stop after |
| // the leftmost longest match. |
| // |
| // To find a match, we pursue a primary goal (lowest offset) and a secondary goal |
| // (longest match). We differentiate between primary and sub-matches. Matching starts |
| // by walking down one path of the trie. Any match we find on this path is a primary |
| // match and any new primary match is better than the one before. A sub-match is a |
| // matching prefix of a suffix of the text currently scanned along the path. These |
| // sub-matches occur on paths off the one we're currently following, and they are |
| // linked in the trie via the output links. Their offset is always greater than that |
| // of a primary match, and sub-matches further into the 'output' chain are shorter. |
| // Therefore we are interested only in the first such sub-match. While walking down |
| // a path, sub-matches off this path are not found in offset order, so we have to |
| // check whether a new sub-match is better (lower offset, or longer) than a previously |
| // found sub-match. |
| // |
| // When we can't continue matching on the current path, the algorithm uses the fail |
| // links to try to find an alternate path to match (which would match a suffix of |
| // what was traversed so far). Therefore, if we already had a primary match, it is |
| // returned, since any other match must have a higher offset. If there is no alternate |
| // path, we fall off the trie (the algorithm bring us back to root, and would start |
| // again from the top). If we have any match, we may stop and return it. If we _do_ |
| // change to an alternate path but there's a sub-match with a lower offset, we also |
| // may return that. Otherwise we continue normally on the new path. |
| int textEnd= text.length(); |
| Match primaryMatch= null; |
| Match subMatch= null; |
| boolean failover= false; |
| Node node= root; |
| for (int i= offset; i < textEnd; i++) { |
| Character c= Character.valueOf(text.charAt(i)); |
| Node next= node.next(c); |
| if (next == null) { |
| // Fell off the trie. |
| if (primaryMatch != null) { |
| // Return primary match because any other match must have a higher offset. |
| return primaryMatch; |
| } |
| // Search for other trie to change to. |
| do { |
| node= node.fail; |
| } while ((next= node.next(c)) == null); |
| failover= (node != root); |
| if (!failover && subMatch != null) { |
| // We fell of the trie and could not switch to another. Return best sub-match |
| // if possible. |
| return subMatch; |
| } |
| } |
| node= next; |
| if (node.match != null) { |
| int newOffset= i - node.match.length() + 1; |
| // On a failover trie the sub match can be better. |
| // And if it is return it because nothing better will follow. |
| if (failover && subMatch != null && subMatch.getOffset() < newOffset) { |
| return subMatch; |
| } |
| // Any new primary match is better because all have the same offset but any new one |
| // must be longer. |
| primaryMatch= new MatchResult(node.match, newOffset); |
| if (!node.hasChildren()) { |
| // We will fall off the trie on the next character, so we can return right here |
| return primaryMatch; |
| } |
| } |
| // Check for sub matches but only if there is no primary match because only another |
| // primary match can be better. |
| if (primaryMatch == null) { |
| Node out= node.output; |
| if (out != null) { |
| int newOffset= i - out.match.length() + 1; |
| if (subMatch == null |
| || newOffset < subMatch.getOffset() |
| || (newOffset == subMatch.getOffset() && out.match.length() > subMatch.getText().length())) { |
| subMatch= new MatchResult(out.match, newOffset); |
| } |
| } |
| } |
| } |
| return primaryMatch != null ? primaryMatch : subMatch; |
| } |
| |
| /** |
| * Finds all occurrences of any of the search strings of the {@link MultiStringMatcher} in the |
| * given {@code text} starting at the given {@code offset}, including overlapping occurrences. |
| * |
| * @param text to search (not {@code null}) |
| * @param offset to start searching at |
| * @return a possibly empty list of matches |
| */ |
| public List<Match> find(CharSequence text, int offset) { |
| // Main search loop of the standard Aho-Corasick algorithm. |
| int textEnd= text.length(); |
| List<Match> matches= new LinkedList<>(); |
| Node node= root; |
| for (int i= offset; i < textEnd; i++) { |
| Character c= Character.valueOf(text.charAt(i)); |
| Node next; |
| while ((next= node.next(c)) == null) { |
| node= node.fail; |
| } |
| node= next; |
| if (node.match != null) { |
| matches.add(new MatchResult(node.match, i - node.match.length() + 1)); |
| } |
| Node out= node.output; |
| while (out != null) { |
| matches.add(new MatchResult(out.match, i - out.match.length() + 1)); |
| out= out.output; |
| } |
| } |
| return matches; |
| } |
| |
| /** |
| * Finds the leftmost longest occurrence of any of the given {@code searchStrings} in the |
| * {@code text} starting at the given {@code offset}. |
| * <p> |
| * To match the same set of search strings repeatedly against texts it is more efficient to |
| * build and re-use a {@link MultiStringMatcher}. |
| * </p> |
| * |
| * @param text to search (not {@code null}) |
| * @param offset to start searching at |
| * @param searchStrings to look for; non-{@code null} and non-empty strings are ignored |
| * @return a {@link Match} describing the match found, or {@code null} if no match was found or |
| * there are no non-{@code null} non-empty {@code searchStrings} |
| */ |
| public static Match indexOf(CharSequence text, int offset, String... searchStrings) { |
| return create(searchStrings).indexOf(text, offset); |
| } |
| |
| /** |
| * Creates a {@link MultiStringMatcher} for the given {@code searchStrings}. |
| * <p> |
| * If there are no non-{@code null} non-empty {@code searchStrings}, the returned |
| * {@link MultiStringMatcher} will never match anything. |
| * </p> |
| * |
| * @param searchStrings to look for; non-{@code null} and non-empty strings are ignored |
| * @return the {@link MultiStringMatcher} |
| */ |
| public static MultiStringMatcher create(String... searchStrings) { |
| return builder().add(searchStrings).build(); |
| } |
| } |