lucene-commits mailing list archives

From: dsmi...@apache.org
Subject: lucene-solr:master: LUCENE-8332: New ConcatenateGraphFilter (from CompletionTokenStream). * Added a test for FingerprintFilter and clarified FF's end condition.
Date: Tue, 05 Jun 2018 03:07:44 GMT
Repository: lucene-solr
Updated Branches:
  refs/heads/master f27d8a2db -> f9f5e8374


LUCENE-8332: New ConcatenateGraphFilter (from CompletionTokenStream).
* Added a test for FingerprintFilter and clarified FF's end condition.


Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/f9f5e837
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/f9f5e837
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/f9f5e837

Branch: refs/heads/master
Commit: f9f5e837450e082ae7e1a82a0693760af7485a1b
Parents: f27d8a2
Author: David Smiley <dsmiley@apache.org>
Authored: Mon Jun 4 23:07:31 2018 -0400
Committer: David Smiley <dsmiley@apache.org>
Committed: Mon Jun 4 23:07:31 2018 -0400

----------------------------------------------------------------------
 lucene/CHANGES.txt                              |   5 +
 .../miscellaneous/ConcatenateGraphFilter.java   | 375 +++++++++++++++++++
 .../ConcatenateGraphFilterFactory.java          |  70 ++++
 .../miscellaneous/FingerprintFilter.java        |   4 +-
 ...ache.lucene.analysis.util.TokenFilterFactory |   1 +
 .../lucene/analysis/core/TestRandomChains.java  |   7 +-
 .../TestConcatenateGraphFilter.java             | 168 +++++++++
 .../TestConcatenateGraphFilterFactory.java      |  83 ++++
 .../miscellaneous/TestFingerprintFilter.java    |   9 +
 .../suggest/document/CompletionAnalyzer.java    |  21 +-
 .../suggest/document/CompletionQuery.java       |   2 +-
 .../suggest/document/CompletionTokenStream.java | 297 ++-------------
 .../search/suggest/document/ContextQuery.java   |   5 +-
 .../suggest/document/ContextSuggestField.java   |   1 +
 .../suggest/document/FuzzyCompletionQuery.java  |   7 +-
 .../suggest/document/NRTSuggesterBuilder.java   |   3 +-
 .../suggest/document/PrefixCompletionQuery.java |   5 +-
 .../search/suggest/document/SuggestField.java   |   3 +-
 .../document/CompletionTokenStreamTest.java     | 177 ---------
 .../document/TestContextSuggestField.java       |  13 +-
 .../suggest/document/TestSuggestField.java      |  29 +-
 21 files changed, 793 insertions(+), 492 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/f9f5e837/lucene/CHANGES.txt
----------------------------------------------------------------------
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 55e0367..9eecc42 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -210,6 +210,11 @@ New Features
   as such once it's introduced and can't be changed after the fact.
   (Nhat Nguyen via Simon Willnauer)
 
+* LUCENE-8332: New ConcatenateGraphFilter for concatenating all tokens into one (or more
+  in the event of a graph input).  This is useful for fast analyzed exact-match lookup,
+  suggesters, and as a component of a named entity recognition system.  This was excised
+  out of CompletionTokenStream in the NRT doc suggester.  (David Smiley, Jim Ferenczi)
+
 Bug Fixes
 
 * LUCENE-8221: MoreLikeThis.setMaxDocFreqPct can easily int-overflow on larger

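To make the CHANGES entry concrete, here is a minimal usage sketch (hypothetical class name and input values; not part of this commit): a plain whitespace-tokenized value collapses into one output token, with SEP_LABEL between the inputs.

    import java.io.StringReader;

    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.core.WhitespaceTokenizer;
    import org.apache.lucene.analysis.miscellaneous.ConcatenateGraphFilter;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

    public class ConcatenateGraphDemo {
      public static void main(String[] args) throws Exception {
        WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
        tokenizer.setReader(new StringReader("fast exact match"));
        try (TokenStream stream = new ConcatenateGraphFilter(tokenizer)) {
          // add CharTermAttribute before reset() so the filter also fills in the UTF-16 form
          CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
          stream.reset();
          while (stream.incrementToken()) {
            // one token per path through the graph; a linear stream yields exactly one:
            // "fast<SEP_LABEL>exact<SEP_LABEL>match"
            System.out.println(termAtt.toString());
          }
          stream.end();
        }
      }
    }
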
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/f9f5e837/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ConcatenateGraphFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ConcatenateGraphFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ConcatenateGraphFilter.java
new file mode 100644
index 0000000..b6c4f22
--- /dev/null
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ConcatenateGraphFilter.java
@@ -0,0 +1,375 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.miscellaneous;
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.TokenStreamToAutomaton;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
+import org.apache.lucene.util.AttributeImpl;
+import org.apache.lucene.util.AttributeReflector;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.BytesRefBuilder;
+import org.apache.lucene.util.CharsRefBuilder;
+import org.apache.lucene.util.IntsRef;
+import org.apache.lucene.util.automaton.Automaton;
+import org.apache.lucene.util.automaton.LimitedFiniteStringsIterator;
+import org.apache.lucene.util.automaton.Operations;
+import org.apache.lucene.util.automaton.TooComplexToDeterminizeException;
+import org.apache.lucene.util.automaton.Transition;
+import org.apache.lucene.util.fst.Util;
+
+/**
+ * Concatenates/Joins every incoming token with a separator into one output token for every path through the
+ * token stream (which is a graph).  In simple cases this yields one token, but in the presence of any tokens with
+ * a zero positionIncrement (e.g. synonyms) it will be more.  This filter uses the token bytes, position increment,
+ * and position length of the incoming stream.  Other attributes are not used or manipulated.
+ *
+ * @lucene.experimental
+ */
+public final class ConcatenateGraphFilter extends TokenStream {
+
+  /*
+   * Token stream which converts a provided token stream to an automaton.
+   * The accepted strings enumeration from the automaton are available through the
+   * {@link org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute} attribute
+   * The token stream uses a {@link org.apache.lucene.analysis.tokenattributes.PayloadAttribute} to store
+   * a completion's payload (see {@link ConcatenateGraphFilter#setPayload(org.apache.lucene.util.BytesRef)})
+   */
+
+  /**
+   * Represents the separation between tokens, if
+   * <code>preserveSep</code> is <code>true</code>.
+   */
+  public final static int SEP_LABEL = TokenStreamToAutomaton.POS_SEP;
+  public final static int DEFAULT_MAX_GRAPH_EXPANSIONS = Operations.DEFAULT_MAX_DETERMINIZED_STATES;
+  public final static boolean DEFAULT_PRESERVE_SEP = true;
+  public final static boolean DEFAULT_PRESERVE_POSITION_INCREMENTS = true;
+
+  private final BytesRefBuilderTermAttribute bytesAtt = addAttribute(BytesRefBuilderTermAttribute.class);
+  private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
+  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
+
+  private final TokenStream inputTokenStream;
+  private final boolean preserveSep;
+  private final boolean preservePositionIncrements;
+  private final int maxGraphExpansions;
+
+  private LimitedFiniteStringsIterator finiteStrings;
+  private CharTermAttribute charTermAttribute;
+  private boolean wasReset = false;
+  private int endOffset;
+
+  /**
+   * Creates a token stream to convert <code>input</code> to a token stream
+   * of accepted strings by its token stream graph.
+   * <p>
+   * This constructor uses the default settings of the constants in this class.
+   */
+  public ConcatenateGraphFilter(TokenStream inputTokenStream) {
+    this(inputTokenStream, DEFAULT_PRESERVE_SEP, DEFAULT_PRESERVE_POSITION_INCREMENTS, DEFAULT_MAX_GRAPH_EXPANSIONS);
+  }
+
+  /**
+   * Creates a token stream to convert <code>input</code> to a token stream
+   * of accepted strings by its token stream graph.
+   *
+   * @param inputTokenStream The input/incoming TokenStream
+   * @param preserveSep Whether {@link #SEP_LABEL} should separate the input tokens in the concatenated token
+   * @param preservePositionIncrements Whether to add an empty token for missing positions.
+   *                                   The effect is a consecutive {@link #SEP_LABEL}.
+   *                                   When false, it's as if there were no missing positions
+   *                                     (we pretend the surrounding tokens were adjacent).
+   * @param maxGraphExpansions If the tokenStream graph has more than this many possible paths through it, then we'll throw
+   *                           {@link TooComplexToDeterminizeException} to preserve the stability and memory of the
+   *                           machine.
+   * @throws TooComplexToDeterminizeException if the tokenStream graph has more than {@code maxGraphExpansions}
+   *         expansions
+   *
+   */
+  public ConcatenateGraphFilter(TokenStream inputTokenStream, boolean preserveSep, boolean preservePositionIncrements, int maxGraphExpansions) {
+    // Don't call the super(input) ctor - this is a true delegate and has a new attribute source since we consume
+    // the input stream entirely in the first call to incrementToken
+    this.inputTokenStream = inputTokenStream;
+    this.preserveSep = preserveSep;
+    this.preservePositionIncrements = preservePositionIncrements;
+    this.maxGraphExpansions = maxGraphExpansions;
+  }
+
+  @Override
+  public void reset() throws IOException {
+    super.reset();
+    // we only capture this if we really need it to save the UTF-8 to UTF-16 conversion
+    charTermAttribute = getAttribute(CharTermAttribute.class); // may return null
+    wasReset = true;
+  }
+
+  @Override
+  public boolean incrementToken() throws IOException {
+    if (finiteStrings == null) {
+      if (wasReset == false) {
+        throw new IllegalStateException("reset() missing before incrementToken");
+      }
+      // lazy init/consume
+      Automaton automaton = toAutomaton(); // calls reset(), incrementToken() repeatedly, and end() on inputTokenStream
+      finiteStrings = new LimitedFiniteStringsIterator(automaton, maxGraphExpansions);
+      //note: would be nice to know the startOffset but toAutomaton doesn't capture it.  We'll assume 0
+      endOffset = inputTokenStream.getAttribute(OffsetAttribute.class).endOffset();
+    }
+
+    IntsRef string = finiteStrings.next();
+    if (string == null) {
+      return false;
+    }
+
+    clearAttributes();
+
+    if (finiteStrings.size() > 1) { // if number of iterated strings so far is more than one...
+      posIncrAtt.setPositionIncrement(0); // stacked
+    }
+
+    offsetAtt.setOffset(0, endOffset);
+
+    Util.toBytesRef(string, bytesAtt.builder()); // now we have UTF-8
+    if (charTermAttribute != null) {
+      charTermAttribute.setLength(0);
+      charTermAttribute.append(bytesAtt.toUTF16());
+    }
+
+    return true;
+  }
+
+  @Override
+  public void end() throws IOException {
+    super.end();
+    if (finiteStrings == null) { // thus inputTokenStream hasn't yet received end()
+      inputTokenStream.end(); // the input TS may really want to see "end()" called even if incrementToken hasn't.
+    } // else we already eagerly consumed inputTokenStream including end()
+    if (endOffset != -1) {
+      offsetAtt.setOffset(0, endOffset);
+    }
+  }
+
+  @Override
+  public void close() throws IOException {
+    super.close();
+    //delegate lifecycle.  Note toAutomaton does not close the stream
+    inputTokenStream.close();
+    finiteStrings = null;
+    wasReset = false;//reset
+    endOffset = -1;//reset
+  }
+
+  /**
+   * Converts the tokenStream to an automaton, treating the transition labels as utf-8.  Does *not* close it.
+   */
+  public Automaton toAutomaton() throws IOException {
+    return toAutomaton(false);
+  }
+
+  /**
+   * Converts the tokenStream to an automaton.  Does *not* close it.
+   */
+  public Automaton toAutomaton(boolean unicodeAware) throws IOException {
+    // TODO refactor this
+    // maybe we could hook up a modified automaton from TermAutomatonQuery here?
+
+    // Create corresponding automaton: labels are bytes
+    // from each analyzed token, with byte 0 used as
+    // separator between tokens:
+    final TokenStreamToAutomaton tsta;
+    if (preserveSep) {
+      tsta = new EscapingTokenStreamToAutomaton(SEP_LABEL);
+    } else {
+      // When we're not preserving sep, we don't steal 0xff
+      // byte, so we don't need to do any escaping:
+      tsta = new TokenStreamToAutomaton();
+    }
+    tsta.setPreservePositionIncrements(preservePositionIncrements);
+    tsta.setUnicodeArcs(unicodeAware);
+
+    Automaton automaton = tsta.toAutomaton(inputTokenStream);
+
+    // TODO: we can optimize this somewhat by determinizing
+    // while we convert
+    automaton = replaceSep(automaton, preserveSep, SEP_LABEL);
+    // This automaton should not blow up during determinize:
+    return Operations.determinize(automaton, maxGraphExpansions);
+  }
+
+  /**
+   * Just escapes the {@link #SEP_LABEL} byte by doubling it.
+   */
+  private static final class EscapingTokenStreamToAutomaton extends TokenStreamToAutomaton {
+
+    final BytesRefBuilder spare = new BytesRefBuilder();
+    final byte sepLabel;
+
+    public EscapingTokenStreamToAutomaton(int sepLabel) {
+      assert sepLabel <= Byte.MAX_VALUE;
+      this.sepLabel = (byte) sepLabel;
+    }
+
+    @Override
+    protected BytesRef changeToken(BytesRef in) {
+      int upto = 0;
+      for (int i = 0; i < in.length; i++) {
+        byte b = in.bytes[in.offset + i];
+        if (b == sepLabel) {
+          spare.grow(upto + 2);
+          spare.setByteAt(upto++, sepLabel);
+          spare.setByteAt(upto++, b);
+        } else {
+          spare.grow(upto + 1);
+          spare.setByteAt(upto++, b);
+        }
+      }
+      spare.setLength(upto);
+      return spare.get();
+    }
+  }
+
+  // Replaces SEP with epsilon or remaps them if
+  // we were asked to preserve them:
+  private static Automaton replaceSep(Automaton a, boolean preserveSep, int sepLabel) {
+
+    Automaton result = new Automaton();
+
+    // Copy all states over
+    int numStates = a.getNumStates();
+    for (int s = 0; s < numStates; s++) {
+      result.createState();
+      result.setAccept(s, a.isAccept(s));
+    }
+
+    // Go in reverse topo sort so we know we only have to
+    // make one pass:
+    Transition t = new Transition();
+    int[] topoSortStates = Operations.topoSortStates(a);
+    for (int i = 0; i < topoSortStates.length; i++) {
+      int state = topoSortStates[topoSortStates.length - 1 - i];
+      int count = a.initTransition(state, t);
+      for (int j = 0; j < count; j++) {
+        a.getNextTransition(t);
+        if (t.min == TokenStreamToAutomaton.POS_SEP) {
+          assert t.max == TokenStreamToAutomaton.POS_SEP;
+          if (preserveSep) {
+            // Remap to SEP_LABEL:
+            result.addTransition(state, t.dest, sepLabel);
+          } else {
+            result.addEpsilon(state, t.dest);
+          }
+        } else if (t.min == TokenStreamToAutomaton.HOLE) {
+          assert t.max == TokenStreamToAutomaton.HOLE;
+
+          // Just remove the hole: there will then be two
+          // SEP tokens next to each other, which will only
+          // match another hole at search time.  Note that
+          // it will also match an empty-string token ... if
+          // that's somehow a problem we can always map HOLE
+          // to a dedicated byte (and escape it in the
+          // input).
+          result.addEpsilon(state, t.dest);
+        } else {
+          result.addTransition(state, t.dest, t.min, t.max);
+        }
+      }
+    }
+
+    result.finishState();
+
+    return result;
+  }
+
+  /**
+   * Attribute providing access to the term builder and UTF-16 conversion
+   * @lucene.internal
+   */
+  public interface BytesRefBuilderTermAttribute extends TermToBytesRefAttribute {
+    /**
+     * Returns the builder from which the term is derived.
+     */
+    BytesRefBuilder builder();
+
+    /**
+     * Returns the term represented as UTF-16
+     */
+    CharSequence toUTF16();
+  }
+
+  /**
+   * Implementation of {@link BytesRefBuilderTermAttribute}
+   * @lucene.internal
+   */
+  public static final class BytesRefBuilderTermAttributeImpl extends AttributeImpl implements BytesRefBuilderTermAttribute, TermToBytesRefAttribute {
+    private final BytesRefBuilder bytes = new BytesRefBuilder();
+    private transient CharsRefBuilder charsRef;
+
+    /**
+     * Sole constructor
+     * no-op
+     */
+    public BytesRefBuilderTermAttributeImpl() {
+    }
+
+    @Override
+    public BytesRefBuilder builder() {
+      return bytes;
+    }
+
+    @Override
+    public BytesRef getBytesRef() {
+      return bytes.get();
+    }
+
+    @Override
+    public void clear() {
+      bytes.clear();
+    }
+
+    @Override
+    public void copyTo(AttributeImpl target) {
+      BytesRefBuilderTermAttributeImpl other = (BytesRefBuilderTermAttributeImpl) target;
+      other.bytes.copyBytes(bytes);
+    }
+
+    @Override
+    public AttributeImpl clone() {
+      BytesRefBuilderTermAttributeImpl other = new BytesRefBuilderTermAttributeImpl();
+      copyTo(other);
+      return other;
+    }
+
+    @Override
+    public void reflectWith(AttributeReflector reflector) {
+      reflector.reflect(TermToBytesRefAttribute.class, "bytes", getBytesRef());
+    }
+
+    @Override
+    public CharSequence toUTF16() {
+      if (charsRef == null) {
+        charsRef = new CharsRefBuilder();
+      }
+      charsRef.copyUTF8Bytes(getBytesRef());
+      return charsRef.get();
+    }
+  }
+}
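
The graph behavior is easiest to see with a synonym in the stream; a hedged sketch follows (hypothetical class name and synonym data; SynonymGraphFilter stands in for any graph-producing filter). One synonym yields two concatenated paths, the second stacked at position increment 0.

    import java.io.StringReader;

    import org.apache.lucene.analysis.Tokenizer;
    import org.apache.lucene.analysis.core.WhitespaceTokenizer;
    import org.apache.lucene.analysis.miscellaneous.ConcatenateGraphFilter;
    import org.apache.lucene.analysis.synonym.SynonymGraphFilter;
    import org.apache.lucene.analysis.synonym.SynonymMap;
    import org.apache.lucene.util.CharsRef;

    public class ConcatenatePathsDemo {
      public static void main(String[] args) throws Exception {
        SynonymMap.Builder builder = new SynonymMap.Builder(true);
        builder.add(new CharsRef("wifi"), new CharsRef("wireless"), true); // keep the original
        Tokenizer tokenizer = new WhitespaceTokenizer();
        tokenizer.setReader(new StringReader("wifi network"));
        try (ConcatenateGraphFilter stream =
                 new ConcatenateGraphFilter(new SynonymGraphFilter(tokenizer, builder.build(), true))) {
          ConcatenateGraphFilter.BytesRefBuilderTermAttribute bytesAtt =
              stream.addAttribute(ConcatenateGraphFilter.BytesRefBuilderTermAttribute.class);
          stream.reset();
          while (stream.incrementToken()) {
            // two paths: "wifi<SEP_LABEL>network" and "wireless<SEP_LABEL>network"
            System.out.println(bytesAtt.toUTF16());
          }
          stream.end();
        }
      }
    }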

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/f9f5e837/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ConcatenateGraphFilterFactory.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ConcatenateGraphFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ConcatenateGraphFilterFactory.java
new file mode 100644
index 0000000..5d8ccba
--- /dev/null
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ConcatenateGraphFilterFactory.java
@@ -0,0 +1,70 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.miscellaneous;
+
+import java.util.Map;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.util.TokenFilterFactory;
+import org.apache.lucene.util.automaton.TooComplexToDeterminizeException;
+
+/**
+ * Factory for {@link ConcatenateGraphFilter}.
+ *
+ * <ul>
+ *   <li><tt>preserveSep</tt>:
+ *                            Whether {@link ConcatenateGraphFilter#SEP_LABEL}
+ *                            should separate the input tokens in the concatenated token
+ *                            </li>
+ *   <li><tt>preservePositionIncrements</tt>:
+ *                                       Whether to add an empty token for missing positions.
+ *                                       The effect is a consecutive {@link ConcatenateGraphFilter#SEP_LABEL}.
+ *                                       When false, it's as if there were no missing positions
+ *                                         (we pretend the surrounding tokens were adjacent).
+ *                                       </li>
+ *   <li><tt>maxGraphExpansions</tt>:
+ *                            If the tokenStream graph has more than this many possible paths through it, then we'll throw
+ *                            {@link TooComplexToDeterminizeException} to preserve the stability and memory of the
+ *                            machine.
+ *                            </li>
+ * </ul>
+ * @see ConcatenateGraphFilter
+ * @since 7.4.0
+ */
+public class ConcatenateGraphFilterFactory extends TokenFilterFactory {
+
+  private boolean preserveSep;
+  private boolean preservePositionIncrements;
+  private int maxGraphExpansions;
+
+  public ConcatenateGraphFilterFactory(Map<String, String> args) {
+    super(args);
+
+    preserveSep = getBoolean(args, "preserveSep", ConcatenateGraphFilter.DEFAULT_PRESERVE_SEP);
+    preservePositionIncrements = getBoolean(args, "preservePositionIncrements", ConcatenateGraphFilter.DEFAULT_PRESERVE_POSITION_INCREMENTS);
+    maxGraphExpansions = getInt(args, "maxGraphExpansions", ConcatenateGraphFilter.DEFAULT_MAX_GRAPH_EXPANSIONS);
+
+    if (!args.isEmpty()) {
+      throw new IllegalArgumentException("Unknown parameters: " + args);
+    }
+  }
+
+  @Override
+  public TokenStream create(TokenStream input) {
+    return new ConcatenateGraphFilter(input, preserveSep, preservePositionIncrements, maxGraphExpansions);
+  }
+}
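
Outside a Solr schema, the factory can also be wired up through CustomAnalyzer. A hedged sketch (hypothetical class name; it assumes the SPI lookup name derived from the factory class is "concatenateGraph", and lookup is case-insensitive):

    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.custom.CustomAnalyzer;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

    public class ConcatenateGraphFactoryDemo {
      public static void main(String[] args) throws Exception {
        Analyzer analyzer = CustomAnalyzer.builder()
            .withTokenizer("whitespace")
            .addTokenFilter("concatenateGraph",
                "preserveSep", "false",                 // all three params are optional
                "preservePositionIncrements", "true",
                "maxGraphExpansions", "100")
            .build();
        try (TokenStream stream = analyzer.tokenStream("field", "a b c")) {
          CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
          stream.reset();
          while (stream.incrementToken()) {
            System.out.println(termAtt); // expected "abc" since preserveSep=false
          }
          stream.end();
        }
      }
    }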

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/f9f5e837/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/FingerprintFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/FingerprintFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/FingerprintFilter.java
index dfe06c8..71dab42 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/FingerprintFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/FingerprintFilter.java
@@ -81,8 +81,7 @@ public class FingerprintFilter extends TokenFilter {
 
   @Override
   public final boolean incrementToken() throws IOException {
-    if (uniqueTerms != null) {
-      // We have already built the single output token - there's no more 
+    if (inputEnded) {
       return false;
     }
     boolean result = buildSingleOutputToken();
@@ -177,6 +176,7 @@ public class FingerprintFilter extends TokenFilter {
       }
     });
 
+    //TODO let's append directly to termAttribute?
     StringBuilder sb = new StringBuilder();
     for (Object item : items) {
       if (sb.length() >= 1) {

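For context on the end-condition change above: FingerprintFilter consumes its entire input and emits a single sorted, de-duplicated token, so "did the input end" is a clearer test than "did we already build the output". A hedged usage sketch (hypothetical class name and input; the default separator is assumed to be a single space):

    import java.io.StringReader;

    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.core.WhitespaceTokenizer;
    import org.apache.lucene.analysis.miscellaneous.FingerprintFilter;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

    public class FingerprintDemo {
      public static void main(String[] args) throws Exception {
        WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
        tokenizer.setReader(new StringReader("B2 A1 B2 C3"));
        try (TokenStream stream = new FingerprintFilter(tokenizer)) {
          CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
          stream.reset();
          while (stream.incrementToken()) {
            System.out.println(termAtt); // expected "A1 B2 C3": unique terms, sorted
          }
          stream.end();
        }
      }
    }
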
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/f9f5e837/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory b/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory
index 1811920..df868a0 100644
--- a/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory
+++ b/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory
@@ -64,6 +64,7 @@ org.apache.lucene.analysis.minhash.MinHashFilterFactory
 org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilterFactory
 org.apache.lucene.analysis.miscellaneous.CapitalizationFilterFactory
 org.apache.lucene.analysis.miscellaneous.CodepointCountFilterFactory
+org.apache.lucene.analysis.miscellaneous.ConcatenateGraphFilterFactory
 org.apache.lucene.analysis.miscellaneous.DateRecognizerFilterFactory
 org.apache.lucene.analysis.miscellaneous.DelimitedTermFrequencyTokenFilterFactory
 org.apache.lucene.analysis.miscellaneous.FingerprintFilterFactory

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/f9f5e837/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java
index 8cb1591..d94b396 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java
@@ -72,6 +72,7 @@ import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree;
 import org.apache.lucene.analysis.hunspell.Dictionary;
 import org.apache.lucene.analysis.hunspell.TestHunspellStemFilter;
 import org.apache.lucene.analysis.minhash.MinHashFilter;
+import org.apache.lucene.analysis.miscellaneous.ConcatenateGraphFilter;
 import org.apache.lucene.analysis.miscellaneous.ConditionalTokenFilter;
 import org.apache.lucene.analysis.miscellaneous.DelimitedTermFrequencyTokenFilter;
 import org.apache.lucene.analysis.miscellaneous.FingerprintFilter;
@@ -119,10 +120,10 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
 
   private static final Set<Class<?>> avoidConditionals = new HashSet<>();
   static {
-    // Fingerprint filter needs to consume the whole tokenstream, so conditionals don't make sense here
+    // These filters need to consume the whole tokenstream, so conditionals don't make sense here
     avoidConditionals.add(FingerprintFilter.class);
-    // Ditto MinHashFilter
     avoidConditionals.add(MinHashFilter.class);
+    avoidConditionals.add(ConcatenateGraphFilter.class);
   }
 
   private static final Map<Constructor<?>,Predicate<Object[]>> brokenConstructors = new HashMap<>();
@@ -156,7 +157,7 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
               return !((Boolean) args[2]); // args are broken if consumeAllTokens is false
           });
       for (Class<?> c : Arrays.<Class<?>>asList(
-          // doesn't actual reset itself!
+          // doesn't actually reset itself!  TODO: this statement is probably obsolete as of LUCENE-6121?
           CachingTokenFilter.class,
           // LUCENE-8092: doesn't handle graph inputs
           CJKBigramFilter.class,

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/f9f5e837/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestConcatenateGraphFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestConcatenateGraphFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestConcatenateGraphFilter.java
new file mode 100644
index 0000000..453dcbf
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestConcatenateGraphFilter.java
@@ -0,0 +1,168 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.miscellaneous;
+
+import java.io.IOException;
+import java.io.StringReader;
+
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.synonym.SynonymFilter;
+import org.apache.lucene.analysis.synonym.SynonymMap;
+import org.apache.lucene.util.CharsRef;
+import org.apache.lucene.util.CharsRefBuilder;
+import org.junit.Test;
+
+public class TestConcatenateGraphFilter extends BaseTokenStreamTestCase {
+
+  private static final char SEP_LABEL = (char) ConcatenateGraphFilter.SEP_LABEL;
+  
+  @Test
+  public void testBasic() throws Exception {
+    Tokenizer tokenStream = new MockTokenizer(MockTokenizer.WHITESPACE, true);
+    String input = "mykeyword";
+    tokenStream.setReader(new StringReader(input));
+    ConcatenateGraphFilter stream = new ConcatenateGraphFilter(tokenStream);
+    assertTokenStreamContents(stream, new String[] {input}, null, null, new int[] { 1 });
+  }
+
+  @Test
+  public void testWithNoPreserveSep() throws Exception {
+    Tokenizer tokenStream = new MockTokenizer(MockTokenizer.WHITESPACE, true);
+    String input = "mykeyword another keyword";
+    tokenStream.setReader(new StringReader(input));
+    ConcatenateGraphFilter stream = new ConcatenateGraphFilter(tokenStream, false, false, 100);
+    assertTokenStreamContents(stream, new String[] {"mykeywordanotherkeyword"}, null, null, new int[] { 1 });
+  }
+
+  @Test
+  public void testWithMultipleTokens() throws Exception {
+    Tokenizer tokenStream = new MockTokenizer(MockTokenizer.WHITESPACE, true);
+    String input = "mykeyword another keyword";
+    tokenStream.setReader(new StringReader(input));
+    ConcatenateGraphFilter stream = new ConcatenateGraphFilter(tokenStream);
+    CharsRefBuilder builder = new CharsRefBuilder();
+    builder.append("mykeyword");
+    builder.append(SEP_LABEL);
+    builder.append("another");
+    builder.append(SEP_LABEL);
+    builder.append("keyword");
+    assertTokenStreamContents(stream, new String[]{builder.toCharsRef().toString()}, null, null, new int[]{1});
+  }
+
+  @Test
+  public void testWithSynonym() throws Exception {
+    SynonymMap.Builder builder = new SynonymMap.Builder(true);
+    builder.add(new CharsRef("mykeyword"), new CharsRef("mysynonym"), true);
+    Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, true);
+    tokenizer.setReader(new StringReader("mykeyword"));
+    SynonymFilter filter = new SynonymFilter(tokenizer, builder.build(), true);
+    ConcatenateGraphFilter stream = new ConcatenateGraphFilter(filter);
+    assertTokenStreamContents(stream, new String[] {"mykeyword", "mysynonym"}, null, null, new int[] { 1, 0 });
+  }
+
+  @Test
+  public void testWithSynonyms() throws Exception {
+    SynonymMap.Builder builder = new SynonymMap.Builder(true);
+    builder.add(new CharsRef("mykeyword"), new CharsRef("mysynonym"), true);
+    Tokenizer tokenStream = new MockTokenizer(MockTokenizer.WHITESPACE, true);
+    String input = "mykeyword another keyword";
+    tokenStream.setReader(new StringReader(input));
+    SynonymFilter filter = new SynonymFilter(tokenStream, builder.build(), true);
+    ConcatenateGraphFilter stream = new ConcatenateGraphFilter(filter, true, false, 100);
+    String[] expectedOutputs = new String[2];
+    CharsRefBuilder expectedOutput = new CharsRefBuilder();
+    expectedOutput.append("mykeyword");
+    expectedOutput.append(SEP_LABEL);
+    expectedOutput.append("another");
+    expectedOutput.append(SEP_LABEL);
+    expectedOutput.append("keyword");
+    expectedOutputs[0] = expectedOutput.toCharsRef().toString();
+    expectedOutput.clear();
+    expectedOutput.append("mysynonym");
+    expectedOutput.append(SEP_LABEL);
+    expectedOutput.append("another");
+    expectedOutput.append(SEP_LABEL);
+    expectedOutput.append("keyword");
+    expectedOutputs[1] = expectedOutput.toCharsRef().toString();
+    assertTokenStreamContents(stream, expectedOutputs, null, null, new int[]{1, 0});
+  }
+
+  @Test
+  public void testWithStopword() throws Exception {
+    for (boolean preservePosInc : new boolean[]{true, false}) {
+      Tokenizer tokenStream = new MockTokenizer(MockTokenizer.WHITESPACE, true);
+      String input = "a mykeyword a keyword"; //LUCENE-8344 add "a"
+      tokenStream.setReader(new StringReader(input));
+      TokenFilter tokenFilter = new StopFilter(tokenStream, StopFilter.makeStopSet("a"));
+      ConcatenateGraphFilter concatStream = new ConcatenateGraphFilter(tokenFilter, true, preservePosInc, 10);
+      CharsRefBuilder builder = new CharsRefBuilder();
+      if (preservePosInc) {
+        builder.append(SEP_LABEL);
+      }
+      builder.append("mykeyword");
+      builder.append(SEP_LABEL);
+      if (preservePosInc) {
+        builder.append(SEP_LABEL);
+      }
+      builder.append("keyword");
+//      if (preservePosInc) { LUCENE-8344 uncomment
+//        builder.append(SEP_LABEL);
+//      }
+      assertTokenStreamContents(concatStream, new String[]{builder.toCharsRef().toString()});
+    }
+  }
+
+  @Test
+  public void testValidNumberOfExpansions() throws IOException {
+    SynonymMap.Builder builder = new SynonymMap.Builder(true);
+    for (int i = 0; i < 256; i++) {
+      builder.add(new CharsRef("" + (i+1)), new CharsRef("" + (1000 + (i+1))), true);
+    }
+    StringBuilder valueBuilder = new StringBuilder();
+    for (int i = 0 ; i < 8 ; i++) {
+      valueBuilder.append(i+1);
+      valueBuilder.append(" ");
+    }
+    MockTokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, true);
+    tokenizer.setReader(new StringReader(valueBuilder.toString()));
+    SynonymFilter filter = new SynonymFilter(tokenizer, builder.build(), true);
+
+    int count;
+    try (ConcatenateGraphFilter stream = new ConcatenateGraphFilter(filter)) {
+      stream.reset();
+      ConcatenateGraphFilter.BytesRefBuilderTermAttribute attr = stream.addAttribute(ConcatenateGraphFilter.BytesRefBuilderTermAttribute.class);
+      count = 0;
+      while (stream.incrementToken()) {
+        count++;
+        assertNotNull(attr.getBytesRef());
+        assertTrue(attr.getBytesRef().length > 0);
+      }
+    }
+    assertEquals(count, 256);
+  }
+
+  public void testEmpty() throws IOException {
+    Tokenizer tokenizer = whitespaceMockTokenizer("");
+    ConcatenateGraphFilter filter = new ConcatenateGraphFilter(tokenizer);
+    assertTokenStreamContents(filter, new String[0]);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/f9f5e837/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestConcatenateGraphFilterFactory.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestConcatenateGraphFilterFactory.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestConcatenateGraphFilterFactory.java
new file mode 100644
index 0000000..1e149f0
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestConcatenateGraphFilterFactory.java
@@ -0,0 +1,83 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.miscellaneous;
+
+import java.io.Reader;
+import java.io.StringReader;
+
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.util.BaseTokenStreamFactoryTestCase;
+
+public class TestConcatenateGraphFilterFactory extends BaseTokenStreamFactoryTestCase {
+  public void test() throws Exception {
+    for (final boolean consumeAll : new boolean[]{true, false}) {
+      final String input = "A1 B2 A1 D4 C3";
+      Reader reader = new StringReader(input);
+      MockTokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+      tokenizer.setReader(reader);
+      tokenizer.setEnableChecks(consumeAll);
+      TokenStream stream = tokenizer;
+      stream = tokenFilterFactory("ConcatenateGraph").create(stream);
+      assertTokenStreamContents(stream, new String[]{input.replace(' ', (char) ConcatenateGraphFilter.SEP_LABEL)});
+    }
+  }
+
+  public void testPreserveSep() throws Exception {
+    final String input = "A1 B2 A1 D4 C3";
+    final String output = "A1A1D4C3";
+    Reader reader = new StringReader(input);
+    MockTokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+    tokenizer.setReader(reader);
+    TokenStream stream = tokenizer;
+    stream = new StopFilter(stream, StopFilter.makeStopSet("B2"));
+    stream = tokenFilterFactory("ConcatenateGraph",
+        "preserveSep", "false"
+    ).create(stream);
+    assertTokenStreamContents(stream, new String[]{output});
+  }
+
+  public void testPreservePositionIncrements() throws Exception {
+    final String input = "A1 B2 A1 D4 C3";
+    final String output = "A1 A1 D4 C3";
+    Reader reader = new StringReader(input);
+    MockTokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+    tokenizer.setReader(reader);
+    TokenStream stream = tokenizer;
+    stream = new StopFilter(stream, StopFilter.makeStopSet("B2"));
+    stream = tokenFilterFactory("ConcatenateGraph",
+        "preservePositionIncrements", "false"
+        ).create(stream);
+    assertTokenStreamContents(stream, new String[]{output.replace(' ', (char) ConcatenateGraphFilter.SEP_LABEL)});
+  }
+
+  public void testRequired() throws Exception {
+    // no params are required
+    tokenFilterFactory("ConcatenateGraph");
+  }
+
+  /**
+   * Test that bogus arguments result in exception
+   */
+  public void testBogusArguments() throws Exception {
+    IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () ->
+        tokenFilterFactory("ConcatenateGraph", "bogusArg", "bogusValue"));
+    assertTrue(expected.getMessage().contains("Unknown parameters"));
+  }
+}

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/f9f5e837/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestFingerprintFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestFingerprintFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestFingerprintFilter.java
index 450447a..76bd617 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestFingerprintFilter.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestFingerprintFilter.java
@@ -69,4 +69,13 @@ public class TestFingerprintFilter extends BaseTokenStreamTestCase {
     }
   }
 
+  public void testEmpty() throws Exception {
+    for (final boolean consumeAll : new boolean[] { true, false }) {
+      MockTokenizer tokenizer = whitespaceMockTokenizer("");
+      tokenizer.setEnableChecks(consumeAll);
+      TokenStream stream = new FingerprintFilter(tokenizer);
+      assertTokenStreamContents(stream, new String[0]);
+    }
+  }
+
 }

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/f9f5e837/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/CompletionAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/CompletionAnalyzer.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/CompletionAnalyzer.java
index 13bd392..8888382 100644
--- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/CompletionAnalyzer.java
+++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/CompletionAnalyzer.java
@@ -19,7 +19,7 @@ package org.apache.lucene.search.suggest.document;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.AnalyzerWrapper;
 import org.apache.lucene.analysis.TokenStreamToAutomaton;
-import org.apache.lucene.util.automaton.Operations;
+import org.apache.lucene.analysis.miscellaneous.ConcatenateGraphFilter;
 
 /**
  * Wraps an {@link org.apache.lucene.analysis.Analyzer}
@@ -38,23 +38,10 @@ import org.apache.lucene.util.automaton.Operations;
 public final class CompletionAnalyzer extends AnalyzerWrapper {
 
   /**
-   * Represents the separation between tokens, if
-   * <code>preserveSep</code> is <code>true</code>
-   * <p>
-   * Same label is used as a delimiter in the {@link org.apache.lucene.search.suggest.document.CompletionTokenStream}
-   * payload
-   */
-  final static int SEP_LABEL = NRTSuggesterBuilder.PAYLOAD_SEP;
-
-  /**
    * Represent a hole character, inserted by {@link org.apache.lucene.analysis.TokenStreamToAutomaton}
    */
   final static int HOLE_CHARACTER = TokenStreamToAutomaton.HOLE;
 
-  final static int DEFAULT_MAX_GRAPH_EXPANSIONS = Operations.DEFAULT_MAX_DETERMINIZED_STATES;
-  final static boolean DEFAULT_PRESERVE_SEP = true;
-  final static boolean DEFAULT_PRESERVE_POSITION_INCREMENTS = true;
-
   private final Analyzer analyzer;
 
   /**
@@ -101,7 +88,7 @@ public final class CompletionAnalyzer extends AnalyzerWrapper {
    * preserving token separation, position increments and no limit on graph expansions
    */
   public CompletionAnalyzer(Analyzer analyzer) {
-    this(analyzer, DEFAULT_PRESERVE_SEP, DEFAULT_PRESERVE_POSITION_INCREMENTS, DEFAULT_MAX_GRAPH_EXPANSIONS);
+    this(analyzer, ConcatenateGraphFilter.DEFAULT_PRESERVE_SEP, ConcatenateGraphFilter.DEFAULT_PRESERVE_POSITION_INCREMENTS, ConcatenateGraphFilter.DEFAULT_MAX_GRAPH_EXPANSIONS);
   }
 
   /**
@@ -109,7 +96,7 @@ public final class CompletionAnalyzer extends AnalyzerWrapper {
    * with no limit on graph expansions
    */
   public CompletionAnalyzer(Analyzer analyzer, boolean preserveSep, boolean preservePositionIncrements) {
-    this(analyzer, preserveSep, preservePositionIncrements, DEFAULT_MAX_GRAPH_EXPANSIONS);
+    this(analyzer, preserveSep, preservePositionIncrements, ConcatenateGraphFilter.DEFAULT_MAX_GRAPH_EXPANSIONS);
   }
 
   /**
@@ -117,7 +104,7 @@ public final class CompletionAnalyzer extends AnalyzerWrapper {
    * preserving token separation and position increments
    */
   public CompletionAnalyzer(Analyzer analyzer, int maxGraphExpansions) {
-    this(analyzer, DEFAULT_PRESERVE_SEP, DEFAULT_PRESERVE_POSITION_INCREMENTS, maxGraphExpansions);
+    this(analyzer, ConcatenateGraphFilter.DEFAULT_PRESERVE_SEP, ConcatenateGraphFilter.DEFAULT_PRESERVE_POSITION_INCREMENTS, maxGraphExpansions);
   }
 
   /**

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/f9f5e837/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/CompletionQuery.java
----------------------------------------------------------------------
diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/CompletionQuery.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/CompletionQuery.java
index 49fe7d0..6be0c91 100644
--- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/CompletionQuery.java
+++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/CompletionQuery.java
@@ -27,7 +27,7 @@ import org.apache.lucene.search.Query;
 import org.apache.lucene.search.suggest.BitsProducer;
 
 import static org.apache.lucene.search.suggest.document.CompletionAnalyzer.HOLE_CHARACTER;
-import static org.apache.lucene.search.suggest.document.CompletionAnalyzer.SEP_LABEL;
+import static org.apache.lucene.analysis.miscellaneous.ConcatenateGraphFilter.SEP_LABEL;
 
 /**
  * Abstract {@link Query} that match documents containing terms with a specified prefix

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/f9f5e837/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/CompletionTokenStream.java
----------------------------------------------------------------------
diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/CompletionTokenStream.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/CompletionTokenStream.java
index 7308e65..d3bec8e 100644
--- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/CompletionTokenStream.java
+++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/CompletionTokenStream.java
@@ -14,71 +14,43 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+
 package org.apache.lucene.search.suggest.document;
 
 import java.io.IOException;
 
+import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.TokenStreamToAutomaton;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.miscellaneous.ConcatenateGraphFilter;
 import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
-import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
-import org.apache.lucene.util.AttributeImpl;
-import org.apache.lucene.util.AttributeReflector;
 import org.apache.lucene.util.BytesRef;
-import org.apache.lucene.util.BytesRefBuilder;
-import org.apache.lucene.util.CharsRefBuilder;
-import org.apache.lucene.util.IOUtils;
-import org.apache.lucene.util.IntsRef;
 import org.apache.lucene.util.automaton.Automaton;
-import org.apache.lucene.util.automaton.FiniteStringsIterator;
-import org.apache.lucene.util.automaton.LimitedFiniteStringsIterator;
-import org.apache.lucene.util.automaton.Operations;
-import org.apache.lucene.util.automaton.Transition;
-import org.apache.lucene.util.fst.Util;
-
-import static org.apache.lucene.search.suggest.document.CompletionAnalyzer.DEFAULT_MAX_GRAPH_EXPANSIONS;
-import static org.apache.lucene.search.suggest.document.CompletionAnalyzer.DEFAULT_PRESERVE_POSITION_INCREMENTS;
-import static org.apache.lucene.search.suggest.document.CompletionAnalyzer.DEFAULT_PRESERVE_SEP;
-import static org.apache.lucene.search.suggest.document.CompletionAnalyzer.SEP_LABEL;
 
 /**
- * Token stream which converts a provided token stream to an automaton.
- * The accepted strings enumeration from the automaton are available through the
- * {@link org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute} attribute
- * The token stream uses a {@link org.apache.lucene.analysis.tokenattributes.PayloadAttribute} to store
- * a completion's payload (see {@link CompletionTokenStream#setPayload(org.apache.lucene.util.BytesRef)})
- *
+ * A {@link ConcatenateGraphFilter} that additionally lets us set the payload and provides access to its config options.
  * @lucene.experimental
  */
-public final class CompletionTokenStream extends TokenStream {
+public final class CompletionTokenStream extends TokenFilter {
 
   private final PayloadAttribute payloadAttr = addAttribute(PayloadAttribute.class);
-  private final BytesRefBuilderTermAttribute bytesAtt = addAttribute(BytesRefBuilderTermAttribute.class);
 
+  // package accessible on purpose
   final TokenStream inputTokenStream;
   final boolean preserveSep;
   final boolean preservePositionIncrements;
   final int maxGraphExpansions;
 
-  private FiniteStringsIterator finiteStrings;
-  private BytesRef payload;
-  private CharTermAttribute charTermAttribute;
+  private BytesRef payload; // note doesn't participate in TokenStream lifecycle; it's effectively constant
 
-  /**
-   * Creates a token stream to convert <code>input</code> to a token stream
-   * of accepted strings by its automaton.
-   * <p>
-   * The token stream <code>input</code> is converted to an automaton
-   * with the default settings of {@link org.apache.lucene.search.suggest.document.CompletionAnalyzer}
-   */
   CompletionTokenStream(TokenStream inputTokenStream) {
-    this(inputTokenStream, DEFAULT_PRESERVE_SEP, DEFAULT_PRESERVE_POSITION_INCREMENTS, DEFAULT_MAX_GRAPH_EXPANSIONS);
+    this(inputTokenStream,
+        ConcatenateGraphFilter.DEFAULT_PRESERVE_SEP,
+        ConcatenateGraphFilter.DEFAULT_PRESERVE_POSITION_INCREMENTS,
+        ConcatenateGraphFilter.DEFAULT_MAX_GRAPH_EXPANSIONS);
   }
 
   CompletionTokenStream(TokenStream inputTokenStream, boolean preserveSep, boolean preservePositionIncrements, int maxGraphExpansions) {
-    // Don't call the super(input) ctor - this is a true delegate and has a new attribute source since we consume
-    // the input stream entirely in the first call to incrementToken
+    super(new ConcatenateGraphFilter(inputTokenStream, preserveSep, preservePositionIncrements, maxGraphExpansions));
     this.inputTokenStream = inputTokenStream;
     this.preserveSep = preserveSep;
     this.preservePositionIncrements = preservePositionIncrements;
@@ -94,248 +66,23 @@ public final class CompletionTokenStream extends TokenStream {
 
   @Override
   public boolean incrementToken() throws IOException {
-    clearAttributes();
-    if (finiteStrings == null) {
-      Automaton automaton = toAutomaton();
-      finiteStrings = new LimitedFiniteStringsIterator(automaton, maxGraphExpansions);
-    }
-
-    IntsRef string = finiteStrings.next();
-    if (string == null) {
+    if (input.incrementToken()) {
+      payloadAttr.setPayload(payload);
+      return true;
+    } else {
       return false;
     }
-
-    Util.toBytesRef(string, bytesAtt.builder()); // now we have UTF-8
-    if (charTermAttribute != null) {
-      charTermAttribute.setLength(0);
-      charTermAttribute.append(bytesAtt.toUTF16());
-    }
-    if (payload != null) {
-      payloadAttr.setPayload(this.payload);
-    }
-
-    return true;
-  }
-
-  @Override
-  public void end() throws IOException {
-    super.end();
-    if (finiteStrings == null) {
-      inputTokenStream.end();
-    }
-  }
-
-  @Override
-  public void close() throws IOException {
-    if (finiteStrings == null) {
-      inputTokenStream.close();
-    }
   }
 
-  @Override
-  public void reset() throws IOException {
-    super.reset();
-    if (hasAttribute(CharTermAttribute.class)) {
-      // we only create this if we really need it to safe the UTF-8 to UTF-16 conversion
-      charTermAttribute = getAttribute(CharTermAttribute.class);
-    }
-    finiteStrings = null;
-  }
-
-  /**
-   * Converts the token stream to an automaton,
-   * treating the transition labels as utf-8
-   */
+  /** Delegates to...
+   * @see ConcatenateGraphFilter#toAutomaton()  */
   public Automaton toAutomaton() throws IOException {
-    return toAutomaton(false);
+    return ((ConcatenateGraphFilter)input).toAutomaton();
   }
 
-  /**
-   * Converts the tokenStream to an automaton
-   */
+  /** Delegates to...
+   *  @see ConcatenateGraphFilter#toAutomaton(boolean) */
   public Automaton toAutomaton(boolean unicodeAware) throws IOException {
-    // TODO refactor this
-    // maybe we could hook up a modified automaton from TermAutomatonQuery here?
-    Automaton automaton = null;
-    try {
-      // Create corresponding automaton: labels are bytes
-      // from each analyzed token, with byte 0 used as
-      // separator between tokens:
-      final TokenStreamToAutomaton tsta;
-      if (preserveSep) {
-        tsta = new EscapingTokenStreamToAutomaton((char) SEP_LABEL);
-      } else {
-        // When we're not preserving sep, we don't steal 0xff
-        // byte, so we don't need to do any escaping:
-        tsta = new TokenStreamToAutomaton();
-      }
-      tsta.setPreservePositionIncrements(preservePositionIncrements);
-      tsta.setUnicodeArcs(unicodeAware);
-
-      automaton = tsta.toAutomaton(inputTokenStream);
-    } finally {
-      IOUtils.closeWhileHandlingException(inputTokenStream);
-    }
-
-    // TODO: we can optimize this somewhat by determinizing
-    // while we convert
-    automaton = replaceSep(automaton, preserveSep, SEP_LABEL);
-    // This automaton should not blow up during determinize:
-    return Operations.determinize(automaton, maxGraphExpansions);
-  }
-
-  /**
-   * Just escapes the 0xff byte (which we still for SEP).
-   */
-  private static final class EscapingTokenStreamToAutomaton extends TokenStreamToAutomaton {
-
-    final BytesRefBuilder spare = new BytesRefBuilder();
-    private char sepLabel;
-
-    public EscapingTokenStreamToAutomaton(char sepLabel) {
-      this.sepLabel = sepLabel;
-    }
-
-    @Override
-    protected BytesRef changeToken(BytesRef in) {
-      int upto = 0;
-      for (int i = 0; i < in.length; i++) {
-        byte b = in.bytes[in.offset + i];
-        if (b == (byte) sepLabel) {
-          spare.grow(upto + 2);
-          spare.setByteAt(upto++, (byte) sepLabel);
-          spare.setByteAt(upto++, b);
-        } else {
-          spare.grow(upto + 1);
-          spare.setByteAt(upto++, b);
-        }
-      }
-      spare.setLength(upto);
-      return spare.get();
-    }
-  }
-
-  // Replaces SEP with epsilon or remaps them if
-  // we were asked to preserve them:
-  private static Automaton replaceSep(Automaton a, boolean preserveSep, int sepLabel) {
-
-    Automaton result = new Automaton();
-
-    // Copy all states over
-    int numStates = a.getNumStates();
-    for (int s = 0; s < numStates; s++) {
-      result.createState();
-      result.setAccept(s, a.isAccept(s));
-    }
-
-    // Go in reverse topo sort so we know we only have to
-    // make one pass:
-    Transition t = new Transition();
-    int[] topoSortStates = Operations.topoSortStates(a);
-    for (int i = 0; i < topoSortStates.length; i++) {
-      int state = topoSortStates[topoSortStates.length - 1 - i];
-      int count = a.initTransition(state, t);
-      for (int j = 0; j < count; j++) {
-        a.getNextTransition(t);
-        if (t.min == TokenStreamToAutomaton.POS_SEP) {
-          assert t.max == TokenStreamToAutomaton.POS_SEP;
-          if (preserveSep) {
-            // Remap to SEP_LABEL:
-            result.addTransition(state, t.dest, sepLabel);
-          } else {
-            result.addEpsilon(state, t.dest);
-          }
-        } else if (t.min == TokenStreamToAutomaton.HOLE) {
-          assert t.max == TokenStreamToAutomaton.HOLE;
-
-          // Just remove the hole: there will then be two
-          // SEP tokens next to each other, which will only
-          // match another hole at search time.  Note that
-          // it will also match an empty-string token ... if
-          // that's somehow a problem we can always map HOLE
-          // to a dedicated byte (and escape it in the
-          // input).
-          result.addEpsilon(state, t.dest);
-        } else {
-          result.addTransition(state, t.dest, t.min, t.max);
-        }
-      }
-    }
-
-    result.finishState();
-
-    return result;
-  }
-
-  /**
-   * Attribute providing access to the term builder and UTF-16 conversion
-   */
-  public interface BytesRefBuilderTermAttribute extends TermToBytesRefAttribute {
-    /**
-     * Returns the builder from which the term is derived.
-     */
-    BytesRefBuilder builder();
-
-    /**
-     * Returns the term represented as UTF-16
-     */
-    CharSequence toUTF16();
-  }
-
-  /**
-   * Custom attribute implementation for completion token stream
-   */
-  public static final class BytesRefBuilderTermAttributeImpl extends AttributeImpl implements BytesRefBuilderTermAttribute, TermToBytesRefAttribute {
-    private final BytesRefBuilder bytes = new BytesRefBuilder();
-    private transient CharsRefBuilder charsRef;
-
-    /**
-     * Sole constructor
-     * no-op
-     */
-    public BytesRefBuilderTermAttributeImpl() {
-    }
-
-    @Override
-    public BytesRefBuilder builder() {
-      return bytes;
-    }
-
-    @Override
-    public BytesRef getBytesRef() {
-      return bytes.get();
-    }
-
-    @Override
-    public void clear() {
-      bytes.clear();
-    }
-
-    @Override
-    public void copyTo(AttributeImpl target) {
-      BytesRefBuilderTermAttributeImpl other = (BytesRefBuilderTermAttributeImpl) target;
-      other.bytes.copyBytes(bytes);
-    }
-
-    @Override
-    public AttributeImpl clone() {
-      BytesRefBuilderTermAttributeImpl other = new BytesRefBuilderTermAttributeImpl();
-      copyTo(other);
-      return other;
-    }
-
-    @Override
-    public void reflectWith(AttributeReflector reflector) {
-      reflector.reflect(TermToBytesRefAttribute.class, "bytes", getBytesRef());
-    }
-
-    @Override
-    public CharSequence toUTF16() {
-      if (charsRef == null) {
-        charsRef = new CharsRefBuilder();
-      }
-      charsRef.copyUTF8Bytes(getBytesRef());
-      return charsRef.get();
-    }
+    return ((ConcatenateGraphFilter)input).toAutomaton(unicodeAware);
   }
 }

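Before the smaller hunks below, a minimal usage sketch of the extracted filter; this is not code from the patch — the WhitespaceTokenizer input and printed output are illustrative, and consumption goes through the BytesRefBuilderTermAttribute interface shown above.

    import java.io.StringReader;
    import org.apache.lucene.analysis.Tokenizer;
    import org.apache.lucene.analysis.core.WhitespaceTokenizer;
    import org.apache.lucene.analysis.miscellaneous.ConcatenateGraphFilter;

    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader("wi fi network"));
    ConcatenateGraphFilter stream = new ConcatenateGraphFilter(tokenizer);
    ConcatenateGraphFilter.BytesRefBuilderTermAttribute termAtt =
        stream.addAttribute(ConcatenateGraphFilter.BytesRefBuilderTermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
      // with preserveSep on (the default), the whole input comes back as a
      // single token: "wi" + SEP_LABEL + "fi" + SEP_LABEL + "network"
      System.out.println(termAtt.toUTF16());
    }
    stream.end();
    stream.close();
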
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/f9f5e837/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/ContextQuery.java
----------------------------------------------------------------------
diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/ContextQuery.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/ContextQuery.java
index 6217ca3..1a2680c 100644
--- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/ContextQuery.java
+++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/ContextQuery.java
@@ -22,6 +22,7 @@ import java.util.Iterator;
 import java.util.Map;
 import java.util.TreeSet;
 
+import org.apache.lucene.analysis.miscellaneous.ConcatenateGraphFilter;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.ScoreMode;
 import org.apache.lucene.search.Weight;
@@ -178,7 +179,7 @@ public class ContextQuery extends CompletionQuery {
     // if separators are preserved the fst contains a SEP_LABEL
     // behind each gap. To have a matching automaton, we need to
     // include the SEP_LABEL in the query as well
-    Automaton optionalSepLabel = Operations.optional(Automata.makeChar(CompletionAnalyzer.SEP_LABEL));
+    Automaton optionalSepLabel = Operations.optional(Automata.makeChar(ConcatenateGraphFilter.SEP_LABEL));
     Automaton prefixAutomaton = Operations.concatenate(optionalSepLabel, innerAutomaton);
     Automaton contextsAutomaton = Operations.concatenate(toContextAutomaton(contexts, matchAllContexts), prefixAutomaton);
     contextsAutomaton = Operations.determinize(contextsAutomaton, Operations.DEFAULT_MAX_DETERMINIZED_STATES);
@@ -302,7 +303,7 @@ public class ContextQuery extends CompletionQuery {
           }
           ref.offset = ++i;
           assert ref.offset < ref.length : "input should not end with the context separator";
-          if (ref.ints[i] == CompletionAnalyzer.SEP_LABEL) {
+          if (ref.ints[i] == ConcatenateGraphFilter.SEP_LABEL) {
             ref.offset++;
             assert ref.offset < ref.length : "input should not end with a context separator followed by SEP_LABEL";
           }

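To make the separator comment above concrete: with preserved separators an indexed entry has the shape context + CONTEXT_SEPARATOR + SEP_LABEL + suggestion, and without them the SEP_LABEL is absent, so the query automaton must accept both. A sketch of just that construction in isolation (innerAutomaton as in the hunk; everything else illustrative):

    // accepts "ctx<CONTEXT_SEPARATOR>input" and "ctx<CONTEXT_SEPARATOR><SEP_LABEL>input"
    Automaton optionalSep = Operations.optional(Automata.makeChar(ConcatenateGraphFilter.SEP_LABEL));
    Automaton prefix = Operations.concatenate(optionalSep, innerAutomaton);
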
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/f9f5e837/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/ContextSuggestField.java
----------------------------------------------------------------------
diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/ContextSuggestField.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/ContextSuggestField.java
index 4cb91b8..cf462e1 100644
--- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/ContextSuggestField.java
+++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/ContextSuggestField.java
@@ -90,6 +90,7 @@ public class ContextSuggestField extends SuggestField {
     }
     CompletionTokenStream completionTokenStream;
     if (stream instanceof CompletionTokenStream) {
+      //TODO this is awkward; is there a better way that avoids re-creating the chain?
       completionTokenStream = (CompletionTokenStream) stream;
       PrefixTokenFilter prefixTokenFilter = new PrefixTokenFilter(completionTokenStream.inputTokenStream, (char) CONTEXT_SEPARATOR, contexts);
       completionTokenStream = new CompletionTokenStream(prefixTokenFilter,

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/f9f5e837/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/FuzzyCompletionQuery.java
----------------------------------------------------------------------
diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/FuzzyCompletionQuery.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/FuzzyCompletionQuery.java
index b243f4e..14479fe 100644
--- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/FuzzyCompletionQuery.java
+++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/FuzzyCompletionQuery.java
@@ -144,9 +144,12 @@ public class FuzzyCompletionQuery extends PrefixCompletionQuery {
 
   @Override
   public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException {
-    CompletionTokenStream stream = (CompletionTokenStream) analyzer.tokenStream(getField(), getTerm().text());
+    final Automaton originalAutomata;
+    try (CompletionTokenStream stream = (CompletionTokenStream) analyzer.tokenStream(getField(), getTerm().text())) {
+      originalAutomata = stream.toAutomaton(unicodeAware);
+    }
     Set<IntsRef> refs = new HashSet<>();
-    Automaton automaton = toLevenshteinAutomata(stream.toAutomaton(unicodeAware), refs);
+    Automaton automaton = toLevenshteinAutomata(originalAutomata, refs);
     if (unicodeAware) {
       Automaton utf8automaton = new UTF32ToUTF8().convert(automaton);
       utf8automaton = Operations.determinize(utf8automaton, maxDeterminizedStates);

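The change above is the standard close-on-all-paths pattern: TokenStream is Closeable, so try-with-resources releases the analyzer's reused components even if toAutomaton throws. A generic sketch (the field name and text are placeholders):

    try (TokenStream ts = analyzer.tokenStream("body", "some query text")) {
      // consume ts here; close() runs on both the normal and the exception path
    }
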
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/f9f5e837/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/NRTSuggesterBuilder.java
----------------------------------------------------------------------
diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/NRTSuggesterBuilder.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/NRTSuggesterBuilder.java
index 2704631..5ca4993 100644
--- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/NRTSuggesterBuilder.java
+++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/NRTSuggesterBuilder.java
@@ -19,6 +19,7 @@ package org.apache.lucene.search.suggest.document;
 import java.io.IOException;
 import java.util.PriorityQueue;
 
+import org.apache.lucene.analysis.miscellaneous.ConcatenateGraphFilter;
 import org.apache.lucene.store.DataOutput;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.BytesRefBuilder;
@@ -42,7 +43,7 @@ final class NRTSuggesterBuilder {
    * Label used to separate surface form and docID
    * in the output
    */
-  public static final int PAYLOAD_SEP = '\u001F';
+  public static final int PAYLOAD_SEP = ConcatenateGraphFilter.SEP_LABEL;
 
   /**
    * Marks end of the analyzed input and start of dedup

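This swaps a duplicated literal for a single source of truth; both constants resolve to the same code point, as the removed line shows. A sanity-check sketch (NRTSuggesterBuilder is package-private, so this only compiles inside org.apache.lucene.search.suggest.document):

    assert NRTSuggesterBuilder.PAYLOAD_SEP == ConcatenateGraphFilter.SEP_LABEL;
    assert ConcatenateGraphFilter.SEP_LABEL == '\u001F';
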
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/f9f5e837/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/PrefixCompletionQuery.java
----------------------------------------------------------------------
diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/PrefixCompletionQuery.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/PrefixCompletionQuery.java
index 7bb75e9..a8da150 100644
--- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/PrefixCompletionQuery.java
+++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/PrefixCompletionQuery.java
@@ -68,8 +68,9 @@ public class PrefixCompletionQuery extends CompletionQuery {
 
   @Override
   public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException {
-    CompletionTokenStream stream = (CompletionTokenStream) analyzer.tokenStream(getField(), getTerm().text());
-    return new CompletionWeight(this, stream.toAutomaton());
+    try (CompletionTokenStream stream = (CompletionTokenStream) analyzer.tokenStream(getField(), getTerm().text())) {
+      return new CompletionWeight(this, stream.toAutomaton());
+    }
   }
 
   /**

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/f9f5e837/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/SuggestField.java
----------------------------------------------------------------------
diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/SuggestField.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/SuggestField.java
index 7f06328..b2d24c2 100644
--- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/SuggestField.java
+++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/SuggestField.java
@@ -21,6 +21,7 @@ import java.io.IOException;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.miscellaneous.ConcatenateGraphFilter;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.document.FieldType;
 import org.apache.lucene.index.IndexOptions;
@@ -140,7 +141,7 @@ public class SuggestField extends Field {
 
   private boolean isReserved(char c) {
     switch (c) {
-      case CompletionAnalyzer.SEP_LABEL:
+      case ConcatenateGraphFilter.SEP_LABEL:
       case CompletionAnalyzer.HOLE_CHARACTER:
       case NRTSuggesterBuilder.END_BYTE:
         return true;

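Only the home of the separator constant changes here; the guard behaves as before, as testReservedChars in TestSuggestField (further down) exercises. Sketch:

    // a value containing the reserved separator is rejected at construction time
    new SuggestField("suggest_field", "bad\u001Fvalue", 1); // throws IllegalArgumentException
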
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/f9f5e837/lucene/suggest/src/test/org/apache/lucene/search/suggest/document/CompletionTokenStreamTest.java
----------------------------------------------------------------------
diff --git a/lucene/suggest/src/test/org/apache/lucene/search/suggest/document/CompletionTokenStreamTest.java b/lucene/suggest/src/test/org/apache/lucene/search/suggest/document/CompletionTokenStreamTest.java
deleted file mode 100644
index 6f558d1..0000000
--- a/lucene/suggest/src/test/org/apache/lucene/search/suggest/document/CompletionTokenStreamTest.java
+++ /dev/null
@@ -1,177 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.lucene.search.suggest.document;
-
-import java.io.IOException;
-import java.io.StringReader;
-
-import org.apache.lucene.analysis.BaseTokenStreamTestCase;
-import org.apache.lucene.analysis.MockTokenizer;
-import org.apache.lucene.analysis.TokenFilter;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.analysis.synonym.SynonymFilter;
-import org.apache.lucene.analysis.synonym.SynonymMap;
-import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
-import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
-import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
-import org.apache.lucene.util.BytesRef;
-import org.apache.lucene.util.CharsRef;
-import org.apache.lucene.util.CharsRefBuilder;
-import org.junit.Test;
-
-public class CompletionTokenStreamTest extends BaseTokenStreamTestCase {
-
-  @Test
-  public void testBasic() throws Exception {
-    Tokenizer tokenStream = new MockTokenizer(MockTokenizer.WHITESPACE, true);
-    String input = "mykeyword";
-    BytesRef payload = new BytesRef("payload");
-    tokenStream.setReader(new StringReader(input));
-    CompletionTokenStream completionTokenStream = new CompletionTokenStream(tokenStream);
-    completionTokenStream.setPayload(payload);
-    PayloadAttrToTypeAttrFilter stream = new PayloadAttrToTypeAttrFilter(completionTokenStream);
-    assertTokenStreamContents(stream, new String[] {input}, null, null, new String[] {payload.utf8ToString()}, new int[] { 1 }, null, null);
-  }
-
-  @Test
-  public void testWithNoPreserveSep() throws Exception {
-    Tokenizer tokenStream = new MockTokenizer(MockTokenizer.WHITESPACE, true);
-    String input = "mykeyword another keyword";
-    BytesRef payload = new BytesRef("payload");
-    tokenStream.setReader(new StringReader(input));
-    CompletionTokenStream completionTokenStream = new CompletionTokenStream(tokenStream, false, false, 100);
-    completionTokenStream.setPayload(payload);
-    PayloadAttrToTypeAttrFilter stream = new PayloadAttrToTypeAttrFilter(completionTokenStream);
-    assertTokenStreamContents(stream, new String[] {"mykeywordanotherkeyword"}, null, null, new String[] {payload.utf8ToString()}, new int[] { 1 }, null, null);
-  }
-
-  @Test
-  public void testWithMultipleTokens() throws Exception {
-    Tokenizer tokenStream = new MockTokenizer(MockTokenizer.WHITESPACE, true);
-    String input = "mykeyword another keyword";
-    tokenStream.setReader(new StringReader(input));
-    BytesRef payload = new BytesRef("payload");
-    CompletionTokenStream completionTokenStream = new CompletionTokenStream(tokenStream);
-    completionTokenStream.setPayload(payload);
-    PayloadAttrToTypeAttrFilter stream = new PayloadAttrToTypeAttrFilter(completionTokenStream);
-    CharsRefBuilder builder = new CharsRefBuilder();
-    builder.append("mykeyword");
-    builder.append(((char) CompletionAnalyzer.SEP_LABEL));
-    builder.append("another");
-    builder.append(((char) CompletionAnalyzer.SEP_LABEL));
-    builder.append("keyword");
-    assertTokenStreamContents(stream, new String[]{builder.toCharsRef().toString()}, null, null, new String[]{payload.utf8ToString()}, new int[]{1}, null, null);
-  }
-
-  @Test
-  public void testWithSynonym() throws Exception {
-    SynonymMap.Builder builder = new SynonymMap.Builder(true);
-    builder.add(new CharsRef("mykeyword"), new CharsRef("mysynonym"), true);
-    Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, true);
-    tokenizer.setReader(new StringReader("mykeyword"));
-    SynonymFilter filter = new SynonymFilter(tokenizer, builder.build(), true);
-    CompletionTokenStream completionTokenStream = new CompletionTokenStream(filter);
-    BytesRef payload = new BytesRef("payload");
-    completionTokenStream.setPayload(payload);
-    PayloadAttrToTypeAttrFilter stream = new PayloadAttrToTypeAttrFilter(completionTokenStream);
-    assertTokenStreamContents(stream, new String[] {"mykeyword", "mysynonym"}, null, null, new String[] {payload.utf8ToString(), payload.utf8ToString()}, new int[] { 1, 1 }, null, null);
-  }
-
-  @Test
-  public void testWithSynonyms() throws Exception {
-    SynonymMap.Builder builder = new SynonymMap.Builder(true);
-    builder.add(new CharsRef("mykeyword"), new CharsRef("mysynonym"), true);
-    Tokenizer tokenStream = new MockTokenizer(MockTokenizer.WHITESPACE, true);
-    String input = "mykeyword another keyword";
-    tokenStream.setReader(new StringReader(input));
-    SynonymFilter filter = new SynonymFilter(tokenStream, builder.build(), true);
-    BytesRef payload = new BytesRef("payload");
-    CompletionTokenStream completionTokenStream = new CompletionTokenStream(filter, true, false, 100);
-    completionTokenStream.setPayload(payload);
-    PayloadAttrToTypeAttrFilter stream = new PayloadAttrToTypeAttrFilter(completionTokenStream);
-    String[] expectedOutputs = new String[2];
-    CharsRefBuilder expectedOutput = new CharsRefBuilder();
-    expectedOutput.append("mykeyword");
-    expectedOutput.append(((char) CompletionAnalyzer.SEP_LABEL));
-    expectedOutput.append("another");
-    expectedOutput.append(((char) CompletionAnalyzer.SEP_LABEL));
-    expectedOutput.append("keyword");
-    expectedOutputs[0] = expectedOutput.toCharsRef().toString();
-    expectedOutput.clear();
-    expectedOutput.append("mysynonym");
-    expectedOutput.append(((char) CompletionAnalyzer.SEP_LABEL));
-    expectedOutput.append("another");
-    expectedOutput.append(((char) CompletionAnalyzer.SEP_LABEL));
-    expectedOutput.append("keyword");
-    expectedOutputs[1] = expectedOutput.toCharsRef().toString();
-    assertTokenStreamContents(stream, expectedOutputs, null, null, new String[]{payload.utf8ToString(), payload.utf8ToString()}, new int[]{1, 1}, null, null);
-  }
-
-  @Test
-  public void testValidNumberOfExpansions() throws IOException {
-    SynonymMap.Builder builder = new SynonymMap.Builder(true);
-    for (int i = 0; i < 256; i++) {
-      builder.add(new CharsRef("" + (i+1)), new CharsRef("" + (1000 + (i+1))), true);
-    }
-    StringBuilder valueBuilder = new StringBuilder();
-    for (int i = 0 ; i < 8 ; i++) {
-      valueBuilder.append(i+1);
-      valueBuilder.append(" ");
-    }
-    MockTokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, true);
-    tokenizer.setReader(new StringReader(valueBuilder.toString()));
-    SynonymFilter filter = new SynonymFilter(tokenizer, builder.build(), true);
-
-    CompletionTokenStream completionTokenStream = new CompletionTokenStream(filter);
-    completionTokenStream.setPayload(new BytesRef());
-    PayloadAttrToTypeAttrFilter stream = new PayloadAttrToTypeAttrFilter(completionTokenStream);
-    stream.reset();
-    CompletionTokenStream.BytesRefBuilderTermAttribute attr = stream.addAttribute(CompletionTokenStream.BytesRefBuilderTermAttribute.class);
-    PositionIncrementAttribute posAttr = stream.addAttribute(PositionIncrementAttribute.class);
-    int maxPos = 0;
-    int count = 0;
-    while(stream.incrementToken()) {
-      count++;
-      assertNotNull(attr.getBytesRef());
-      assertTrue(attr.getBytesRef().length > 0);
-      maxPos += posAttr.getPositionIncrement();
-    }
-    stream.close();
-    assertEquals(count, 256);
-    assertEquals(count, maxPos);
-  }
-
-  public final static class PayloadAttrToTypeAttrFilter extends TokenFilter {
-    private PayloadAttribute payload = addAttribute(PayloadAttribute.class);
-    private TypeAttribute type = addAttribute(TypeAttribute.class);
-
-    protected PayloadAttrToTypeAttrFilter(TokenStream input) {
-      super(input);
-    }
-
-    @Override
-    public boolean incrementToken() throws IOException {
-      if (input.incrementToken()) {
-        // we move them over so we can assert them more easily in the tests
-        type.setType(payload.getPayload().utf8ToString());
-        return true;
-      }
-      return false;
-    }
-  }
-}

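The deleted coverage is not lost: it moves to TestConcatenateGraphFilter and TestConcatenateGraphFilterFactory in analysis/common (see the file list at the top). A rough sketch of the multi-token case rewritten against the extracted filter, consuming via toUTF16(); the committed test may assert this differently:

    Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, true);
    tokenizer.setReader(new StringReader("mykeyword another keyword"));
    ConcatenateGraphFilter stream = new ConcatenateGraphFilter(tokenizer);
    ConcatenateGraphFilter.BytesRefBuilderTermAttribute termAtt =
        stream.addAttribute(ConcatenateGraphFilter.BytesRefBuilderTermAttribute.class);
    stream.reset();
    assertTrue(stream.incrementToken());
    assertEquals("mykeyword another keyword".replace(' ', (char) ConcatenateGraphFilter.SEP_LABEL),
        termAtt.toUTF16().toString());
    assertFalse(stream.incrementToken());
    stream.end();
    stream.close();
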
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/f9f5e837/lucene/suggest/src/test/org/apache/lucene/search/suggest/document/TestContextSuggestField.java
----------------------------------------------------------------------
diff --git a/lucene/suggest/src/test/org/apache/lucene/search/suggest/document/TestContextSuggestField.java b/lucene/suggest/src/test/org/apache/lucene/search/suggest/document/TestContextSuggestField.java
index 0c3b254..8beea12 100644
--- a/lucene/suggest/src/test/org/apache/lucene/search/suggest/document/TestContextSuggestField.java
+++ b/lucene/suggest/src/test/org/apache/lucene/search/suggest/document/TestContextSuggestField.java
@@ -21,6 +21,7 @@ import java.io.ByteArrayOutputStream;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.MockAnalyzer;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.miscellaneous.ConcatenateGraphFilter;
 import org.apache.lucene.analysis.standard.StandardAnalyzer;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.index.DirectoryReader;
@@ -109,21 +110,21 @@ public class TestContextSuggestField extends LuceneTestCase {
     CharsRefBuilder builder = new CharsRefBuilder();
     builder.append("context1");
     builder.append(((char) ContextSuggestField.CONTEXT_SEPARATOR));
-    builder.append(((char) CompletionAnalyzer.SEP_LABEL));
+    builder.append((char) ConcatenateGraphFilter.SEP_LABEL);
     builder.append("input");
     expectedOutputs[0] = builder.toCharsRef().toString();
     builder.clear();
     builder.append("context2");
     builder.append(((char) ContextSuggestField.CONTEXT_SEPARATOR));
-    builder.append(((char) CompletionAnalyzer.SEP_LABEL));
+    builder.append((char) ConcatenateGraphFilter.SEP_LABEL);
     builder.append("input");
     expectedOutputs[1] = builder.toCharsRef().toString();
-    TokenStream stream = new CompletionTokenStreamTest.PayloadAttrToTypeAttrFilter(field.tokenStream(analyzer, null));
-    assertTokenStreamContents(stream, expectedOutputs, null, null, new String[]{payload.utf8ToString(), payload.utf8ToString()}, new int[]{1, 1}, null, null);
+    TokenStream stream = new TestSuggestField.PayloadAttrToTypeAttrFilter(field.tokenStream(analyzer, null));
+    assertTokenStreamContents(stream, expectedOutputs, null, null, new String[]{payload.utf8ToString(), payload.utf8ToString()}, new int[]{1, 0}, null, null);
 
     CompletionAnalyzer completionAnalyzer = new CompletionAnalyzer(analyzer);
-    stream = new CompletionTokenStreamTest.PayloadAttrToTypeAttrFilter(field.tokenStream(completionAnalyzer, null));
-    assertTokenStreamContents(stream, expectedOutputs, null, null, new String[]{payload.utf8ToString(), payload.utf8ToString()}, new int[]{1, 1}, null, null);
+    stream = new TestSuggestField.PayloadAttrToTypeAttrFilter(field.tokenStream(completionAnalyzer, null));
+    assertTokenStreamContents(stream, expectedOutputs, null, null, new String[]{payload.utf8ToString(), payload.utf8ToString()}, new int[]{1, 0}, null, null);
   }
 
   @Test

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/f9f5e837/lucene/suggest/src/test/org/apache/lucene/search/suggest/document/TestSuggestField.java
----------------------------------------------------------------------
diff --git a/lucene/suggest/src/test/org/apache/lucene/search/suggest/document/TestSuggestField.java b/lucene/suggest/src/test/org/apache/lucene/search/suggest/document/TestSuggestField.java
index a6659e0..e6d7062 100644
--- a/lucene/suggest/src/test/org/apache/lucene/search/suggest/document/TestSuggestField.java
+++ b/lucene/suggest/src/test/org/apache/lucene/search/suggest/document/TestSuggestField.java
@@ -32,7 +32,11 @@ import java.util.concurrent.CyclicBarrier;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.MockAnalyzer;
+import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.miscellaneous.ConcatenateGraphFilter;
+import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
 import org.apache.lucene.codecs.Codec;
 import org.apache.lucene.codecs.PostingsFormat;
 import org.apache.lucene.codecs.lucene70.Lucene70Codec;
@@ -99,7 +103,7 @@ public class TestSuggestField extends LuceneTestCase {
   public void testReservedChars() throws Exception {
     CharsRefBuilder charsRefBuilder = new CharsRefBuilder();
     charsRefBuilder.append("sugg");
-    charsRefBuilder.setCharAt(2, (char) CompletionAnalyzer.SEP_LABEL);
+    charsRefBuilder.setCharAt(2, (char) ConcatenateGraphFilter.SEP_LABEL);
     IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () -> {
       new SuggestField("name", charsRefBuilder.toString(), 1);
     });
@@ -144,11 +148,11 @@ public class TestSuggestField extends LuceneTestCase {
       output.writeByte(SuggestField.TYPE);
     }
     BytesRef payload = new BytesRef(byteArrayOutputStream.toByteArray());
-    TokenStream stream = new CompletionTokenStreamTest.PayloadAttrToTypeAttrFilter(suggestField.tokenStream(analyzer, null));
+    TokenStream stream = new PayloadAttrToTypeAttrFilter(suggestField.tokenStream(analyzer, null));
     assertTokenStreamContents(stream, new String[] {"input"}, null, null, new String[]{payload.utf8ToString()}, new int[]{1}, null, null);
 
     CompletionAnalyzer completionAnalyzer = new CompletionAnalyzer(analyzer);
-    stream = new CompletionTokenStreamTest.PayloadAttrToTypeAttrFilter(suggestField.tokenStream(completionAnalyzer, null));
+    stream = new PayloadAttrToTypeAttrFilter(suggestField.tokenStream(completionAnalyzer, null));
     assertTokenStreamContents(stream, new String[] {"input"}, null, null, new String[]{payload.utf8ToString()}, new int[]{1}, null, null);
   }
 
@@ -894,4 +898,23 @@ public class TestSuggestField extends LuceneTestCase {
     iwc.setCodec(filterCodec);
     return iwc;
   }
+
+  public final static class PayloadAttrToTypeAttrFilter extends TokenFilter {
+    private PayloadAttribute payload = addAttribute(PayloadAttribute.class);
+    private TypeAttribute type = addAttribute(TypeAttribute.class);
+
+    protected PayloadAttrToTypeAttrFilter(TokenStream input) {
+      super(input);
+    }
+
+    @Override
+    public boolean incrementToken() throws IOException {
+      if (input.incrementToken()) {
+        // we move them over so we can assert them more easily in the tests
+        type.setType(payload.getPayload().utf8ToString());
+        return true;
+      }
+      return false;
+    }
+  }
 }

