lucene-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From a.@apache.org
Subject [13/38] lucene-solr:jira/solr-9857: LUCENE-7619: add WordDelimiterGraphFilter (replacing WordDelimiterFilter) to produce a correct token stream graph when splitting words
Date Thu, 19 Jan 2017 15:32:22 GMT
LUCENE-7619: add WordDelimiterGraphFilter (replacing WordDelimiterFilter) to produce a correct token stream graph when splitting words


Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/637915b8
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/637915b8
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/637915b8

Branch: refs/heads/jira/solr-9857
Commit: 637915b890d9f0e5cfaa6887609f221029327a25
Parents: 7d7e5d2
Author: Mike McCandless <mikemccand@apache.org>
Authored: Tue Jan 17 10:38:07 2017 -0500
Committer: Mike McCandless <mikemccand@apache.org>
Committed: Tue Jan 17 10:38:07 2017 -0500

----------------------------------------------------------------------
 lucene/CHANGES.txt                              |   5 +
 .../analysis/core/FlattenGraphFilter.java       | 418 +++++++++
 .../core/FlattenGraphFilterFactory.java         |  44 +
 .../miscellaneous/WordDelimiterFilter.java      |   9 +-
 .../WordDelimiterFilterFactory.java             |   6 +
 .../miscellaneous/WordDelimiterGraphFilter.java | 692 ++++++++++++++
 .../WordDelimiterGraphFilterFactory.java        | 199 ++++
 .../miscellaneous/WordDelimiterIterator.java    |  59 +-
 .../analysis/synonym/FlattenGraphFilter.java    | 417 ---------
 .../synonym/FlattenGraphFilterFactory.java      |  44 -
 .../lucene/analysis/synonym/SynonymFilter.java  |   1 +
 .../analysis/synonym/SynonymFilterFactory.java  |   1 +
 .../analysis/synonym/SynonymGraphFilter.java    |  11 +-
 ...ache.lucene.analysis.util.TokenFilterFactory |   3 +-
 .../analysis/core/TestFlattenGraphFilter.java   | 284 ++++++
 .../miscellaneous/TestWordDelimiterFilter.java  |  69 ++
 .../TestWordDelimiterGraphFilter.java           | 897 +++++++++++++++++++
 .../synonym/TestFlattenGraphFilter.java         | 284 ------
 .../synonym/TestSynonymGraphFilter.java         |  51 +-
 .../lucene/analysis/TokenStreamToAutomaton.java |  39 +-
 .../tokenattributes/OffsetAttributeImpl.java    |   2 +-
 .../PackedTokenAttributeImpl.java               |   2 +-
 .../PositionIncrementAttributeImpl.java         |   3 +-
 .../PositionLengthAttributeImpl.java            |   3 +-
 .../lucene/analysis/TestGraphTokenizers.java    |  53 +-
 .../suggest/analyzing/AnalyzingSuggester.java   |   3 +-
 .../analysis/BaseTokenStreamTestCase.java       | 114 ++-
 .../lucene/analysis/TokenStreamToDot.java       |   5 +-
 28 files changed, 2899 insertions(+), 819 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/637915b8/lucene/CHANGES.txt
----------------------------------------------------------------------
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 2e015a3..4df7a67 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -76,6 +76,11 @@ New Features
 * LUCENE-7623: Add FunctionScoreQuery and FunctionMatchQuery (Alan Woodward,
   Adrien Grand, David Smiley)
 
+* LUCENE-7619: Add WordDelimiterGraphFilter, just like
+  WordDelimiterFilter except it produces correct token graphs so that
+  proximity queries at search time will produce correct results (Mike
+  McCandless)
+
 Bug Fixes
 
 * LUCENE-7630: Fix (Edge)NGramTokenFilter to no longer drop payloads

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/637915b8/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/FlattenGraphFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/FlattenGraphFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/FlattenGraphFilter.java
new file mode 100644
index 0000000..01e1f6f
--- /dev/null
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/FlattenGraphFilter.java
@@ -0,0 +1,418 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.core;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.synonym.SynonymGraphFilter;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
+import org.apache.lucene.util.AttributeSource;
+import org.apache.lucene.util.RollingBuffer;
+
+/**
+ * Converts an incoming graph token stream, such as one from
+ * {@link SynonymGraphFilter}, into a flat form so that
+ * all nodes form a single linear chain with no side paths.  Every
+ * path through the graph touches every node.  This is necessary
+ * when indexing a graph token stream, because the index does not
+ * save {@link PositionLengthAttribute} and so it cannot
+ * preserve the graph structure.  However, at search time,
+ * query parsers can correctly handle the graph and this token
+ * filter should <b>not</b> be used.
+ *
+ * <p>If the graph was not already flat to start, this
+ * is likely a lossy process, i.e. it will often cause the 
+ * graph to accept token sequences it should not, and to
+ * reject token sequences it should not.
+ *
+ * <p>However, when applying synonyms during indexing, this
+ * is necessary because Lucene already does not index a graph 
+ * and so the indexing process is already lossy
+ * (it ignores the {@link PositionLengthAttribute}).
+ *
+ * @lucene.experimental
+ */
+public final class FlattenGraphFilter extends TokenFilter {
+
+  /** Holds all tokens leaving a given input position. */
+  private final static class InputNode implements RollingBuffer.Resettable {
+    private final List<AttributeSource.State> tokens = new ArrayList<>();
+
+    /** Our input node, or -1 if we haven't been assigned yet */
+    int node = -1;
+
+    /** Maximum to input node for all tokens leaving here; we use this
+     *  to know when we can freeze. */
+    int maxToNode = -1;
+
+    /** Where we currently map to; this changes (can only
+     *  increase as we see more input tokens), until we are finished
+     *  with this position. */
+    int outputNode = -1;
+
+    /** Which token (index into {@link #tokens}) we will next output. */
+    int nextOut;
+
+    @Override
+    public void reset() {
+      tokens.clear();
+      node = -1;
+      outputNode = -1;
+      maxToNode = -1;
+      nextOut = 0;
+    }
+  }
+
+  /** Gathers up merged input positions into a single output position,
+   *  only for the current "frontier" of nodes we've seen but can't yet
+   *  output because they are not frozen. */
+  private final static class OutputNode implements RollingBuffer.Resettable {
+    private final List<Integer> inputNodes = new ArrayList<>();
+
+    /** Node ID for this output, or -1 if we haven't been assigned yet. */
+    int node = -1;
+
+    /** Which input node (index into {@link #inputNodes}) we will next output. */
+    int nextOut;
+    
+    /** Start offset of tokens leaving this node. */
+    int startOffset = -1;
+
+    /** End offset of tokens arriving to this node. */
+    int endOffset = -1;
+
+    @Override
+    public void reset() {
+      inputNodes.clear();
+      node = -1;
+      nextOut = 0;
+      startOffset = -1;
+      endOffset = -1;
+    }
+  }
+
+  private final RollingBuffer<InputNode> inputNodes = new RollingBuffer<InputNode>() {
+    @Override
+    protected InputNode newInstance() {
+      return new InputNode();
+    }
+  };
+
+  private final RollingBuffer<OutputNode> outputNodes = new RollingBuffer<OutputNode>() {
+    @Override
+    protected OutputNode newInstance() {
+      return new OutputNode();
+    }
+  };
+
+  private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
+  private final PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class);
+  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
+
+  /** Which input node the last seen token leaves from */
+  private int inputFrom;
+
+  /** We are currently releasing tokens leaving from this output node */
+  private int outputFrom;
+
+  // for debugging:
+  //private int retOutputFrom;
+
+  private boolean done;
+
+  private int lastOutputFrom;
+
+  private int finalOffset;
+
+  private int finalPosInc;
+
+  private int maxLookaheadUsed;
+
+  private int lastStartOffset;
+
+  public FlattenGraphFilter(TokenStream in) {
+    super(in);
+  }
+
+  private boolean releaseBufferedToken() {
+
+    // We only need the while loop (retry) if we have a hole (an output node that has no tokens leaving):
+    while (outputFrom < outputNodes.getMaxPos()) {
+      OutputNode output = outputNodes.get(outputFrom);
+      if (output.inputNodes.isEmpty()) {
+        // No tokens arrived to this node, which happens for the first node
+        // after a hole:
+        //System.out.println("    skip empty outputFrom=" + outputFrom);
+        outputFrom++;
+        continue;
+      }
+
+      int maxToNode = -1;
+      for(int inputNodeID : output.inputNodes) {
+        InputNode inputNode = inputNodes.get(inputNodeID);
+        assert inputNode.outputNode == outputFrom;
+        maxToNode = Math.max(maxToNode, inputNode.maxToNode);
+      }
+      //System.out.println("  release maxToNode=" + maxToNode + " vs inputFrom=" + inputFrom);
+
+      // TODO: we could shrink the frontier here somewhat if we
+      // always output posLen=1 as part of our "sausagizing":
+      if (maxToNode <= inputFrom || done) {
+        //System.out.println("  output node merged these inputs: " + output.inputNodes);
+        // These tokens are now frozen
+        assert output.nextOut < output.inputNodes.size(): "output.nextOut=" + output.nextOut + " vs output.inputNodes.size()=" + output.inputNodes.size();
+        InputNode inputNode = inputNodes.get(output.inputNodes.get(output.nextOut));
+        if (done && inputNode.tokens.size() == 0 && outputFrom >= outputNodes.getMaxPos()) {
+          return false;
+        }
+        if (inputNode.tokens.size() == 0) {
+          assert inputNode.nextOut == 0;
+          assert output.nextOut == 0;
+          // Hole dest nodes should never be merged since 1) we always
+          // assign them to a new output position, and 2) since they never
+          // have arriving tokens they cannot be pushed:
+          assert output.inputNodes.size() == 1: output.inputNodes.size();
+          outputFrom++;
+          inputNodes.freeBefore(output.inputNodes.get(0));
+          outputNodes.freeBefore(outputFrom);
+          continue;
+        }
+
+        assert inputNode.nextOut < inputNode.tokens.size();
+
+        restoreState(inputNode.tokens.get(inputNode.nextOut));
+
+        // Correct posInc
+        assert outputFrom >= lastOutputFrom;
+        posIncAtt.setPositionIncrement(outputFrom - lastOutputFrom);
+        int toInputNodeID = inputNode.node + posLenAtt.getPositionLength();
+        InputNode toInputNode = inputNodes.get(toInputNodeID);
+
+        // Correct posLen
+        assert toInputNode.outputNode > outputFrom;
+        posLenAtt.setPositionLength(toInputNode.outputNode - outputFrom);
+        lastOutputFrom = outputFrom;
+        inputNode.nextOut++;
+        //System.out.println("  ret " + this);
+
+        OutputNode outputEndNode = outputNodes.get(toInputNode.outputNode);
+
+        // Correct offsets
+
+        // This is a bit messy; we must do this so offset don't go backwards,
+        // which would otherwise happen if the replacement has more tokens
+        // than the input:
+        int startOffset = Math.max(lastStartOffset, output.startOffset);
+
+        // We must do this in case the incoming tokens have broken offsets:
+        int endOffset = Math.max(startOffset, outputEndNode.endOffset);
+        
+        offsetAtt.setOffset(startOffset, endOffset);
+        lastStartOffset = startOffset;
+
+        if (inputNode.nextOut == inputNode.tokens.size()) {
+          output.nextOut++;
+          if (output.nextOut == output.inputNodes.size()) {
+            outputFrom++;
+            inputNodes.freeBefore(output.inputNodes.get(0));
+            outputNodes.freeBefore(outputFrom);
+          }
+        }
+
+        return true;
+      } else {
+        return false;
+      }
+    }
+
+    //System.out.println("    break false");
+    return false;
+  }
+
+  @Override
+  public boolean incrementToken() throws IOException {
+    //System.out.println("\nF.increment inputFrom=" + inputFrom + " outputFrom=" + outputFrom);
+
+    while (true) {
+      if (releaseBufferedToken()) {
+        //retOutputFrom += posIncAtt.getPositionIncrement();
+        //System.out.println("    return buffered: " + termAtt + " " + retOutputFrom + "-" + (retOutputFrom + posLenAtt.getPositionLength()));
+        //printStates();
+        return true;
+      } else if (done) {
+        //System.out.println("    done, return false");
+        return false;
+      }
+
+      if (input.incrementToken()) {
+        // Input node this token leaves from:
+        inputFrom += posIncAtt.getPositionIncrement();
+
+        int startOffset = offsetAtt.startOffset();
+        int endOffset = offsetAtt.endOffset();
+
+        // Input node this token goes to:
+        int inputTo = inputFrom + posLenAtt.getPositionLength();
+        //System.out.println("  input.inc " + termAtt + ": " + inputFrom + "-" + inputTo);
+
+        InputNode src = inputNodes.get(inputFrom);
+        if (src.node == -1) {
+          // This means the "from" node of this token was never seen as a "to" node,
+          // which should only happen if we just crossed a hole.  This is a challenging
+          // case for us because we normally rely on the full dependencies expressed
+          // by the arcs to assign outgoing node IDs.  It would be better if tokens
+          // were never dropped but instead just marked deleted with a new
+          // TermDeletedAttribute (boolean valued) ... but until that future, we have
+          // a hack here to forcefully jump the output node ID:
+          assert src.outputNode == -1;
+          src.node = inputFrom;
+
+          src.outputNode = outputNodes.getMaxPos() + 1;
+          //System.out.println("    hole: force to outputNode=" + src.outputNode);
+          OutputNode outSrc = outputNodes.get(src.outputNode);
+
+          // Not assigned yet:
+          assert outSrc.node == -1;
+          outSrc.node = src.outputNode;
+          outSrc.inputNodes.add(inputFrom);
+          outSrc.startOffset = startOffset;
+        } else {
+          OutputNode outSrc = outputNodes.get(src.outputNode);
+          if (outSrc.startOffset == -1 || startOffset > outSrc.startOffset) {
+            // "shrink wrap" the offsets so the original tokens (with most
+            // restrictive offsets) win:
+            outSrc.startOffset = Math.max(startOffset, outSrc.startOffset);
+          }
+        }
+
+        // Buffer this token:
+        src.tokens.add(captureState());
+        src.maxToNode = Math.max(src.maxToNode, inputTo);
+        maxLookaheadUsed = Math.max(maxLookaheadUsed, inputNodes.getBufferSize());
+
+        InputNode dest = inputNodes.get(inputTo);
+        if (dest.node == -1) {
+          // Common case: first time a token is arriving to this input position:
+          dest.node = inputTo;
+        }
+
+        // Always number output nodes sequentially:
+        int outputEndNode = src.outputNode + 1;
+
+        if (outputEndNode > dest.outputNode) {
+          if (dest.outputNode != -1) {
+            boolean removed = outputNodes.get(dest.outputNode).inputNodes.remove(Integer.valueOf(inputTo));
+            assert removed;
+          }
+          //System.out.println("    increase output node: " + dest.outputNode + " vs " + outputEndNode);
+          outputNodes.get(outputEndNode).inputNodes.add(inputTo);
+          dest.outputNode = outputEndNode;
+
+          // Since all we ever do is merge incoming nodes together, and then renumber
+          // the merged nodes sequentially, we should only ever assign smaller node
+          // numbers:
+          assert outputEndNode <= inputTo: "outputEndNode=" + outputEndNode + " vs inputTo=" + inputTo;
+        }
+
+        OutputNode outDest = outputNodes.get(dest.outputNode);
+        // "shrink wrap" the offsets so the original tokens (with most
+        // restrictive offsets) win:
+        if (outDest.endOffset == -1 || endOffset < outDest.endOffset) {
+          outDest.endOffset = endOffset;
+        }
+
+      } else {
+        //System.out.println("  got false from input");
+        input.end();
+        finalPosInc = posIncAtt.getPositionIncrement();
+        finalOffset = offsetAtt.endOffset();
+        done = true;
+        // Don't return false here: we need to force release any buffered tokens now
+      }
+    }
+  }
+
+  // Only for debugging:
+  /*
+  private void printStates() {
+    System.out.println("states:");
+    for(int i=outputFrom;i<outputNodes.getMaxPos();i++) {
+      OutputNode outputNode = outputNodes.get(i);
+      System.out.println("  output " + i + ": inputs " + outputNode.inputNodes);
+      for(int inputNodeID : outputNode.inputNodes) {
+        InputNode inputNode = inputNodes.get(inputNodeID);
+        assert inputNode.outputNode == i;
+      }
+    }
+  }
+  */
+
+  @Override
+  public void end() throws IOException {
+    if (done == false) {
+      super.end();
+    } else {
+      // NOTE, shady: don't call super.end, because we did already from incrementToken
+    }
+
+    clearAttributes();
+    if (done) {
+      // On exc, done is false, and we will not have set these:
+      posIncAtt.setPositionIncrement(finalPosInc);
+      offsetAtt.setOffset(finalOffset, finalOffset);
+    } else {
+      super.end();
+    }
+  }
+  
+  @Override
+  public void reset() throws IOException {
+    //System.out.println("F: reset");
+    super.reset();
+    inputFrom = -1;
+    inputNodes.reset();
+    InputNode in = inputNodes.get(0);
+    in.node = 0;
+    in.outputNode = 0;
+
+    outputNodes.reset();
+    OutputNode out = outputNodes.get(0);
+    out.node = 0;
+    out.inputNodes.add(0);
+    out.startOffset = 0;
+    outputFrom = 0;
+    //retOutputFrom = -1;
+    lastOutputFrom = -1;
+    done = false;
+    finalPosInc = -1;
+    finalOffset = -1;
+    lastStartOffset = 0;
+    maxLookaheadUsed = 0;
+  }
+
+  /** For testing */
+  public int getMaxLookaheadUsed() {
+    return maxLookaheadUsed;
+  }
+}

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/637915b8/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/FlattenGraphFilterFactory.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/FlattenGraphFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/FlattenGraphFilterFactory.java
new file mode 100644
index 0000000..920ab3d
--- /dev/null
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/FlattenGraphFilterFactory.java
@@ -0,0 +1,44 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.core;
+
+import java.util.Map;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.util.TokenFilterFactory;
+
+/** 
+ * Factory for {@link FlattenGraphFilter}. 
+ *
+ * @lucene.experimental
+ */
+public class FlattenGraphFilterFactory extends TokenFilterFactory {
+
+  /** Creates a new FlattenGraphFilterFactory */
+  public FlattenGraphFilterFactory(Map<String,String> args) {
+    super(args);
+    if (!args.isEmpty()) {
+      throw new IllegalArgumentException("Unknown parameters: " + args);
+    }
+  }
+  
+  @Override
+  public TokenStream create(TokenStream input) {
+    return new FlattenGraphFilter(input);
+  }
+}

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/637915b8/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilter.java
index f80ed8a..aef697c 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilter.java
@@ -28,6 +28,7 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+import org.apache.lucene.search.PhraseQuery;
 import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.AttributeSource;
 import org.apache.lucene.util.InPlaceMergeSorter;
@@ -80,7 +81,12 @@ import org.apache.lucene.util.InPlaceMergeSorter;
  * the current {@link StandardTokenizer} immediately removes many intra-word
  * delimiters, it is recommended that this filter be used after a tokenizer that
  * does not do this (such as {@link WhitespaceTokenizer}).
+ *
+ * @deprecated Use {@link WordDelimiterGraphFilter} instead: it produces a correct
+ * token graph so that e.g. {@link PhraseQuery} works correctly when it's used in
+ * the search time analyzer.
  */
+@Deprecated
 public final class WordDelimiterFilter extends TokenFilter {
   
   public static final int LOWER = 0x01;
@@ -116,7 +122,7 @@ public final class WordDelimiterFilter extends TokenFilter {
   /**
    * Causes maximum runs of word parts to be catenated:
    * <p>
-   * "wi-fi" =&gt; "wifi"
+   * "500-42" =&gt; "50042"
    */
   public static final int CATENATE_NUMBERS = 8;
 
@@ -494,7 +500,6 @@ public final class WordDelimiterFilter extends TokenFilter {
   private void generatePart(boolean isSingleWord) {
     clearAttributes();
     termAttribute.copyBuffer(savedBuffer, iterator.current, iterator.end - iterator.current);
-
     int startOffset = savedStartOffset + iterator.current;
     int endOffset = savedStartOffset + iterator.end;
     

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/637915b8/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilterFactory.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilterFactory.java
index 6a15b55..0002d65 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilterFactory.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilterFactory.java
@@ -31,6 +31,7 @@ import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.util.ResourceLoader;
 import org.apache.lucene.analysis.util.ResourceLoaderAware;
 import org.apache.lucene.analysis.util.TokenFilterFactory;
+import org.apache.lucene.search.PhraseQuery;
 
 import static org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter.*;
 
@@ -47,7 +48,12 @@ import static org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter.*;
  *             types="wdfftypes.txt" /&gt;
  *   &lt;/analyzer&gt;
  * &lt;/fieldType&gt;</pre>
+ *
+ * @deprecated Use {@link WordDelimiterGraphFilterFactory} instead: it produces a correct
+ * token graph so that e.g. {@link PhraseQuery} works correctly when it's used in
+ * the search time analyzer.
  */
+@Deprecated
 public class WordDelimiterFilterFactory extends TokenFilterFactory implements ResourceLoaderAware {
   public static final String PROTECTED_TOKENS = "protected";
   public static final String TYPES = "types";

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/637915b8/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterGraphFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterGraphFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterGraphFilter.java
new file mode 100644
index 0000000..ea6f6cd
--- /dev/null
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterGraphFilter.java
@@ -0,0 +1,692 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ 
+package org.apache.lucene.analysis.miscellaneous;
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.core.WhitespaceTokenizer;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+import org.apache.lucene.search.PhraseQuery;
+import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.AttributeSource;
+import org.apache.lucene.util.InPlaceMergeSorter;
+import org.apache.lucene.util.RamUsageEstimator;
+
+/**
+ * Splits words into subwords and performs optional transformations on subword
+ * groups, producing a correct token graph so that e.g. {@link PhraseQuery} can
+ * work correctly when this filter is used in the search-time analyzer.  Unlike
+ * the deprecated {@link WordDelimiterFilter}, this token filter produces a
+ * correct token graph as output.  However, it cannot consume an input token
+ * graph correctly.
+ *
+ * <p>
+ * Words are split into subwords with the following rules:
+ * <ul>
+ * <li>split on intra-word delimiters (by default, all non alpha-numeric
+ * characters): <code>"Wi-Fi"</code> &#8594; <code>"Wi", "Fi"</code></li>
+ * <li>split on case transitions: <code>"PowerShot"</code> &#8594;
+ * <code>"Power", "Shot"</code></li>
+ * <li>split on letter-number transitions: <code>"SD500"</code> &#8594;
+ * <code>"SD", "500"</code></li>
+ * <li>leading and trailing intra-word delimiters on each subword are ignored:
+ * <code>"//hello---there, 'dude'"</code> &#8594;
+ * <code>"hello", "there", "dude"</code></li>
+ * <li>trailing "'s" are removed for each subword: <code>"O'Neil's"</code>
+ * &#8594; <code>"O", "Neil"</code>
+ * <ul>
+ * <li>Note: this step isn't performed in a separate filter because of possible
+ * subword combinations.</li>
+ * </ul>
+ * </li>
+ * </ul>
+ * 
+ * The <b>combinations</b> parameter affects how subwords are combined:
+ * <ul>
+ * <li>combinations="0" causes no subword combinations: <code>"PowerShot"</code>
+ * &#8594; <code>0:"Power", 1:"Shot"</code> (0 and 1 are the token positions)</li>
+ * <li>combinations="1" means that in addition to the subwords, maximum runs of
+ * non-numeric subwords are catenated and produced at the same position of the
+ * last subword in the run:
+ * <ul>
+ * <li><code>"PowerShot"</code> &#8594;
+ * <code>0:"Power", 1:"Shot" 1:"PowerShot"</code></li>
+ * <li><code>"A's+B's&amp;C's"</code> &gt; <code>0:"A", 1:"B", 2:"C", 2:"ABC"</code>
+ * </li>
+ * <li><code>"Super-Duper-XL500-42-AutoCoder!"</code> &#8594;
+ * <code>0:"Super", 1:"Duper", 2:"XL", 2:"SuperDuperXL", 3:"500" 4:"42", 5:"Auto", 6:"Coder", 6:"AutoCoder"</code>
+ * </li>
+ * </ul>
+ * </li>
+ * </ul>
+ * One use for {@link WordDelimiterGraphFilter} is to help match words with different
+ * subword delimiters. For example, if the source text contained "wi-fi" one may
+ * want "wifi" "WiFi" "wi-fi" "wi+fi" queries to all match. One way of doing so
+ * is to specify combinations="1" in the analyzer used for indexing, and
+ * combinations="0" (the default) in the analyzer used for querying. Given that
+ * the current {@link StandardTokenizer} immediately removes many intra-word
+ * delimiters, it is recommended that this filter be used after a tokenizer that
+ * does not do this (such as {@link WhitespaceTokenizer}).
+ */
+
+public final class WordDelimiterGraphFilter extends TokenFilter {
+  
+  /**
+   * Causes parts of words to be generated:
+   * <p>
+   * "PowerShot" =&gt; "Power" "Shot"
+   */
+  public static final int GENERATE_WORD_PARTS = 1;
+
+  /**
+   * Causes number subwords to be generated:
+   * <p>
+   * "500-42" =&gt; "500" "42"
+   */
+  public static final int GENERATE_NUMBER_PARTS = 2;
+
+  /**
+   * Causes maximum runs of word parts to be catenated:
+   * <p>
+   * "wi-fi" =&gt; "wifi"
+   */
+  public static final int CATENATE_WORDS = 4;
+
+  /**
+   * Causes maximum runs of number parts to be catenated:
+   * <p>
+   * "500-42" =&gt; "50042"
+   */
+  public static final int CATENATE_NUMBERS = 8;
+
+  /**
+   * Causes all subword parts to be catenated:
+   * <p>
+   * "wi-fi-4000" =&gt; "wifi4000"
+   */
+  public static final int CATENATE_ALL = 16;
+
+  /**
+   * Causes original words are preserved and added to the subword list (Defaults to false)
+   * <p>
+   * "500-42" =&gt; "500" "42" "500-42"
+   */
+  public static final int PRESERVE_ORIGINAL = 32;
+
+  /**
+   * Causes lowercase -&gt; uppercase transition to start a new subword.
+   */
+  public static final int SPLIT_ON_CASE_CHANGE = 64;
+
+  /**
+   * If not set, causes numeric changes to be ignored (subwords will only be generated
+   * given SUBWORD_DELIM tokens).
+   */
+  public static final int SPLIT_ON_NUMERICS = 128;
+
+  /**
+   * Causes trailing "'s" to be removed for each subword
+   * <p>
+   * "O'Neil's" =&gt; "O", "Neil"
+   */
+  public static final int STEM_ENGLISH_POSSESSIVE = 256;
+  
+  /**
+   * If not null is the set of tokens to protect from being delimited
+   *
+   */
+  final CharArraySet protWords;
+
+  private final int flags;
+
+  // packs start pos, end pos, start part, end part (= slice of the term text) for each buffered part:
+  private int[] bufferedParts = new int[16];
+  private int bufferedLen;
+  private int bufferedPos;
+
+  // holds text for each buffered part, or null if it's a simple slice of the original term
+  private char[][] bufferedTermParts = new char[4][];
+  
+  private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class);
+  private final OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class);
+  private final PositionIncrementAttribute posIncAttribute = addAttribute(PositionIncrementAttribute.class);
+  private final PositionLengthAttribute posLenAttribute = addAttribute(PositionLengthAttribute.class);
+  private final TypeAttribute typeAttribute = addAttribute(TypeAttribute.class);
+
+  // used for iterating word delimiter breaks
+  private final WordDelimiterIterator iterator;
+
+  // used for concatenating runs of similar typed subwords (word,number)
+  private final WordDelimiterConcatenation concat = new WordDelimiterConcatenation();
+
+  // number of subwords last output by concat.
+  private int lastConcatCount;
+
+  // used for catenate all
+  private final WordDelimiterConcatenation concatAll = new WordDelimiterConcatenation();
+
+  // used for accumulating position increment gaps so that we preserve incoming holes:
+  private int accumPosInc;
+
+  private char[] savedTermBuffer = new char[16];
+  private int savedTermLength;
+  private int savedStartOffset;
+  private int savedEndOffset;
+  private AttributeSource.State savedState;
+  
+  // if length by start + end offsets doesn't match the term text then assume
+  // this is a synonym and don't adjust the offsets.
+  private boolean hasIllegalOffsets;
+
+  private int wordPos;
+
+  /**
+   * Creates a new WordDelimiterGraphFilter
+   *
+   * @param in TokenStream to be filtered
+   * @param charTypeTable table containing character types
+   * @param configurationFlags Flags configuring the filter
+   * @param protWords If not null is the set of tokens to protect from being delimited
+   */
+  public WordDelimiterGraphFilter(TokenStream in, byte[] charTypeTable, int configurationFlags, CharArraySet protWords) {
+    super(in);
+    if ((configurationFlags &
+        ~(GENERATE_WORD_PARTS |
+          GENERATE_NUMBER_PARTS |
+          CATENATE_WORDS |
+          CATENATE_NUMBERS |
+          CATENATE_ALL |
+          PRESERVE_ORIGINAL |
+          SPLIT_ON_CASE_CHANGE |
+          SPLIT_ON_NUMERICS |
+          STEM_ENGLISH_POSSESSIVE)) != 0) {
+      throw new IllegalArgumentException("flags contains unrecognized flag: " + configurationFlags);
+    }
+    this.flags = configurationFlags;
+    this.protWords = protWords;
+    this.iterator = new WordDelimiterIterator(
+        charTypeTable, has(SPLIT_ON_CASE_CHANGE), has(SPLIT_ON_NUMERICS), has(STEM_ENGLISH_POSSESSIVE));
+  }
+
+  /**
+   * Creates a new WordDelimiterGraphFilter using {@link WordDelimiterIterator#DEFAULT_WORD_DELIM_TABLE}
+   * as its charTypeTable
+   *
+   * @param in TokenStream to be filtered
+   * @param configurationFlags Flags configuring the filter
+   * @param protWords If not null is the set of tokens to protect from being delimited
+   */
+  public WordDelimiterGraphFilter(TokenStream in, int configurationFlags, CharArraySet protWords) {
+    this(in, WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE, configurationFlags, protWords);
+  }
+
+  /** Iterates all words parts and concatenations, buffering up the term parts we should return. */
+  private void bufferWordParts() throws IOException {
+
+    saveState();
+
+    // if length by start + end offsets doesn't match the term's text then set offsets for all our word parts/concats to the incoming
+    // offsets.  this can happen if WDGF is applied to an injected synonym, or to a stem'd form, etc:
+    hasIllegalOffsets = (savedEndOffset - savedStartOffset != savedTermLength);
+
+    bufferedLen = 0;
+    lastConcatCount = 0;
+    wordPos = 0;
+
+    if (iterator.isSingleWord()) {
+      buffer(wordPos, wordPos+1, iterator.current, iterator.end);
+      wordPos++;
+      iterator.next();
+    } else {
+
+      // iterate all words parts, possibly buffering them, building up concatenations and possibly buffering them too:
+      while (iterator.end != WordDelimiterIterator.DONE) {
+        int wordType = iterator.type();
+      
+        // do we already have queued up incompatible concatenations?
+        if (concat.isNotEmpty() && (concat.type & wordType) == 0) {
+          flushConcatenation(concat);
+        }
+
+        // add subwords depending upon options
+        if (shouldConcatenate(wordType)) {
+          concatenate(concat);
+        }
+      
+        // add all subwords (catenateAll)
+        if (has(CATENATE_ALL)) {
+          concatenate(concatAll);
+        }
+      
+        // if we should output the word or number part
+        if (shouldGenerateParts(wordType)) {
+          buffer(wordPos, wordPos+1, iterator.current, iterator.end);
+          wordPos++;
+        }
+        iterator.next();
+      }
+
+      if (concat.isNotEmpty()) {
+        // flush final concatenation
+        flushConcatenation(concat);
+      }
+        
+      if (concatAll.isNotEmpty()) {
+        // only if we haven't output this same combo above, e.g. PowerShot with CATENATE_WORDS:
+        if (concatAll.subwordCount > lastConcatCount) {
+          if (wordPos == concatAll.startPos) {
+            // we are not generating parts, so we must advance wordPos now
+            wordPos++;
+          }
+          concatAll.write();
+        }
+        concatAll.clear();
+      }
+    }
+
+    if (has(PRESERVE_ORIGINAL)) {
+      if (wordPos == 0) {
+        // can happen w/ strange flag combos and inputs :)
+        wordPos++;
+      }
+      // add the original token now so that we can set the correct end position
+      buffer(0, wordPos, 0, savedTermLength);
+    }
+            
+    sorter.sort(0, bufferedLen);
+    wordPos = 0;
+
+    // set back to 0 for iterating from the buffer
+    bufferedPos = 0;
+  }
+
+  @Override
+  public boolean incrementToken() throws IOException {
+    while (true) {
+      if (savedState == null) {
+
+        // process a new input token
+        if (input.incrementToken() == false) {
+          return false;
+        }
+
+        int termLength = termAttribute.length();
+        char[] termBuffer = termAttribute.buffer();
+
+        accumPosInc += posIncAttribute.getPositionIncrement();
+
+        // iterate & cache all word parts up front:
+        iterator.setText(termBuffer, termLength);
+        iterator.next();
+        
+        // word of no delimiters, or protected word: just return it
+        if ((iterator.current == 0 && iterator.end == termLength) ||
+            (protWords != null && protWords.contains(termBuffer, 0, termLength))) {
+          posIncAttribute.setPositionIncrement(accumPosInc);
+          accumPosInc = 0;
+          return true;
+        }
+        
+        // word of simply delimiters: swallow this token, creating a hole, and move on to next token
+        if (iterator.end == WordDelimiterIterator.DONE) {
+          if (has(PRESERVE_ORIGINAL) == false) {
+            continue;
+          } else {
+            return true;
+          }
+        }
+
+        // otherwise, we have delimiters, process & buffer all parts:
+        bufferWordParts();
+      }
+
+      if (bufferedPos < bufferedLen) {
+        clearAttributes();
+        restoreState(savedState);
+
+        char[] termPart = bufferedTermParts[bufferedPos];
+        int startPos = bufferedParts[4*bufferedPos];
+        int endPos = bufferedParts[4*bufferedPos+1];
+        int startPart = bufferedParts[4*bufferedPos+2];
+        int endPart = bufferedParts[4*bufferedPos+3];
+        bufferedPos++;
+
+        if (hasIllegalOffsets) {
+          offsetAttribute.setOffset(savedStartOffset, savedEndOffset);
+        } else {
+          offsetAttribute.setOffset(savedStartOffset + startPart, savedStartOffset + endPart);
+        }
+
+        if (termPart == null) {
+          termAttribute.copyBuffer(savedTermBuffer, startPart, endPart - startPart);
+        } else {
+          termAttribute.copyBuffer(termPart, 0, termPart.length);
+        }
+
+        posIncAttribute.setPositionIncrement(accumPosInc + startPos - wordPos);
+        accumPosInc = 0;
+        posLenAttribute.setPositionLength(endPos - startPos);
+        wordPos = startPos;
+        return true;
+      }
+        
+      // no saved concatenations, on to the next input word
+      savedState = null;
+    }
+  }
+
+  @Override
+  public void reset() throws IOException {
+    super.reset();
+    accumPosInc = 0;
+    savedState = null;
+    concat.clear();
+    concatAll.clear();
+  }
+
+  // ================================================= Helper Methods ================================================
+
+  private class PositionSorter extends InPlaceMergeSorter {
+    @Override
+    protected int compare(int i, int j) {
+      // sort by smaller start position
+      int iPosStart = bufferedParts[4*i];
+      int jPosStart = bufferedParts[4*j];
+      int cmp = Integer.compare(iPosStart, jPosStart);
+      if (cmp != 0) {
+        return cmp;
+      }
+
+      // tie break by longest pos length:
+      int iPosEnd = bufferedParts[4*i+1];
+      int jPosEnd = bufferedParts[4*j+1];
+      return Integer.compare(jPosEnd, iPosEnd);
+    }
+
+    @Override
+    protected void swap(int i, int j) {
+      int iOffset = 4*i;
+      int jOffset = 4*j;
+      for(int x=0;x<4;x++) {
+        int tmp = bufferedParts[iOffset+x];
+        bufferedParts[iOffset+x] = bufferedParts[jOffset+x];
+        bufferedParts[jOffset+x] = tmp;
+      }
+
+      char[] tmp2 = bufferedTermParts[i];
+      bufferedTermParts[i] = bufferedTermParts[j];
+      bufferedTermParts[j] = tmp2;
+    }
+  }
+  
+  final PositionSorter sorter = new PositionSorter();
+
+  /** 
+   * startPos, endPos -> graph start/end position
+   * startPart, endPart -> slice of the original term for this part
+   */
+
+  void buffer(int startPos, int endPos, int startPart, int endPart) {
+    buffer(null, startPos, endPos, startPart, endPart);
+  }
+
+  /** 
+   * a null termPart means it's a simple slice of the original term
+   */
+  void buffer(char[] termPart, int startPos, int endPos, int startPart, int endPart) {
+    /*
+    System.out.println("buffer: pos=" + startPos + "-" + endPos + " part=" + startPart + "-" + endPart);
+    if (termPart != null) {
+      System.out.println("  termIn=" + new String(termPart));
+    } else {
+      System.out.println("  term=" + new String(savedTermBuffer, startPart, endPart-startPart));
+    }
+    */
+    assert endPos > startPos: "startPos=" + startPos + " endPos=" + endPos;
+    assert endPart > startPart || (endPart == 0 && startPart == 0 && savedTermLength == 0): "startPart=" + startPart + " endPart=" + endPart;
+    if ((bufferedLen+1)*4 > bufferedParts.length) {
+      bufferedParts = ArrayUtil.grow(bufferedParts, (bufferedLen+1)*4);
+    }
+    if (bufferedTermParts.length == bufferedLen) {
+      int newSize = ArrayUtil.oversize(bufferedLen+1, RamUsageEstimator.NUM_BYTES_OBJECT_REF);
+      char[][] newArray = new char[newSize][];
+      System.arraycopy(bufferedTermParts, 0, newArray, 0, bufferedTermParts.length);
+      bufferedTermParts = newArray;
+    }
+    bufferedTermParts[bufferedLen] = termPart;
+    bufferedParts[bufferedLen*4] = startPos;
+    bufferedParts[bufferedLen*4+1] = endPos;
+    bufferedParts[bufferedLen*4+2] = startPart;
+    bufferedParts[bufferedLen*4+3] = endPart;
+    bufferedLen++;
+  }
+  
+  /**
+   * Saves the existing attribute states
+   */
+  private void saveState() {
+    savedTermLength = termAttribute.length();
+    savedStartOffset = offsetAttribute.startOffset();
+    savedEndOffset = offsetAttribute.endOffset();
+    savedState = captureState();
+
+    if (savedTermBuffer.length < savedTermLength) {
+      savedTermBuffer = new char[ArrayUtil.oversize(savedTermLength, Character.BYTES)];
+    }
+
+    System.arraycopy(termAttribute.buffer(), 0, savedTermBuffer, 0, savedTermLength);
+  }
+
+  /**
+   * Flushes the given WordDelimiterConcatenation by either writing its concat and then clearing, or just clearing.
+   *
+   * @param concat WordDelimiterConcatenation that will be flushed
+   */
+  private void flushConcatenation(WordDelimiterConcatenation concat) {
+    if (wordPos == concat.startPos) {
+      // we are not generating parts, so we must advance wordPos now
+      wordPos++;
+    }
+    lastConcatCount = concat.subwordCount;
+    if (concat.subwordCount != 1 || shouldGenerateParts(concat.type) == false) {
+      concat.write();
+    }
+    concat.clear();
+  }
+
+  /**
+   * Determines whether to concatenate a word or number if the current word is the given type
+   *
+   * @param wordType Type of the current word used to determine if it should be concatenated
+   * @return {@code true} if concatenation should occur, {@code false} otherwise
+   */
+  private boolean shouldConcatenate(int wordType) {
+    return (has(CATENATE_WORDS) && WordDelimiterIterator.isAlpha(wordType)) || (has(CATENATE_NUMBERS) && WordDelimiterIterator.isDigit(wordType));
+  }
+
+  /**
+   * Determines whether a word/number part should be generated for a word of the given type
+   *
+   * @param wordType Type of the word used to determine if a word/number part should be generated
+   * @return {@code true} if a word/number part should be generated, {@code false} otherwise
+   */
+  private boolean shouldGenerateParts(int wordType) {
+    return (has(GENERATE_WORD_PARTS) && WordDelimiterIterator.isAlpha(wordType)) || (has(GENERATE_NUMBER_PARTS) && WordDelimiterIterator.isDigit(wordType));
+  }
+
+  /**
+   * Concatenates the saved buffer to the given WordDelimiterConcatenation
+   *
+   * @param concatenation WordDelimiterConcatenation to concatenate the buffer to
+   */
+  private void concatenate(WordDelimiterConcatenation concatenation) {
+    if (concatenation.isEmpty()) {
+      concatenation.type = iterator.type();
+      concatenation.startPart = iterator.current;
+      concatenation.startPos = wordPos;
+    }
+    concatenation.append(savedTermBuffer, iterator.current, iterator.end - iterator.current);
+    concatenation.endPart = iterator.end;
+  }
+
+  /**
+   * Determines whether the given flag is set
+   *
+   * @param flag Flag to see if set
+   * @return {@code true} if flag is set
+   */
+  private boolean has(int flag) {
+    return (flags & flag) != 0;
+  }
+
+  // ================================================= Inner Classes =================================================
+
+  /**
+   * A WDF concatenated 'run'
+   */
+  final class WordDelimiterConcatenation {
+    final StringBuilder buffer = new StringBuilder();
+    int startPart;
+    int endPart;
+    int startPos;
+    int type;
+    int subwordCount;
+
+    /**
+     * Appends the given text of the given length, to the concetenation at the given offset
+     *
+     * @param text Text to append
+     * @param offset Offset in the concetenation to add the text
+     * @param length Length of the text to append
+     */
+    void append(char text[], int offset, int length) {
+      buffer.append(text, offset, length);
+      subwordCount++;
+    }
+
+    /**
+     * Writes the concatenation to part buffer
+     */
+    void write() {
+      char[] termPart = new char[buffer.length()];
+      buffer.getChars(0, buffer.length(), termPart, 0);
+      buffer(termPart, startPos, wordPos, startPart, endPart);
+    }
+
+    /**
+     * Determines if the concatenation is empty
+     *
+     * @return {@code true} if the concatenation is empty, {@code false} otherwise
+     */
+    boolean isEmpty() {
+      return buffer.length() == 0;
+    }
+
+    boolean isNotEmpty() {
+      return isEmpty() == false;
+    }
+
+    /**
+     * Clears the concatenation and resets its state
+     */
+    void clear() {
+      buffer.setLength(0);
+      startPart = endPart = type = subwordCount = 0;
+    }
+  }
+
+  /** Returns string representation of configuration flags */
+  public static String flagsToString(int flags) {
+    StringBuilder b = new StringBuilder();
+    if ((flags & GENERATE_WORD_PARTS) != 0) {
+      b.append("GENERATE_WORD_PARTS");
+    }
+    if ((flags & GENERATE_NUMBER_PARTS) != 0) {
+      if (b.length() > 0) {
+        b.append(" | ");
+      }
+      b.append("GENERATE_NUMBER_PARTS");
+    }
+    if ((flags & CATENATE_WORDS) != 0) {
+      if (b.length() > 0) {
+        b.append(" | ");
+      }
+      b.append("CATENATE_WORDS");
+    }
+    if ((flags & CATENATE_NUMBERS) != 0) {
+      if (b.length() > 0) {
+        b.append(" | ");
+      }
+      b.append("CATENATE_NUMBERS");
+    }
+    if ((flags & CATENATE_ALL) != 0) {
+      if (b.length() > 0) {
+        b.append(" | ");
+      }
+      b.append("CATENATE_ALL");
+    }
+    if ((flags & PRESERVE_ORIGINAL) != 0) {
+      if (b.length() > 0) {
+        b.append(" | ");
+      }
+      b.append("PRESERVE_ORIGINAL");
+    }
+    if ((flags & SPLIT_ON_CASE_CHANGE) != 0) {
+      if (b.length() > 0) {
+        b.append(" | ");
+      }
+      b.append("SPLIT_ON_CASE_CHANGE");
+    }
+    if ((flags & SPLIT_ON_NUMERICS) != 0) {
+      if (b.length() > 0) {
+        b.append(" | ");
+      }
+      b.append("SPLIT_ON_NUMERICS");
+    }
+    if ((flags & STEM_ENGLISH_POSSESSIVE) != 0) {
+      if (b.length() > 0) {
+        b.append(" | ");
+      }
+      b.append("STEM_ENGLISH_POSSESSIVE");
+    }
+
+    return b.toString();
+  }
+
+  @Override
+  public String toString() {
+    StringBuilder b = new StringBuilder();
+    b.append("WordDelimiterGraphFilter(flags=");
+    b.append(flagsToString(flags));
+    b.append(')');
+    return b.toString();
+  }
+  
+  // questions:
+  // negative numbers?  -42 indexed as just 42?
+  // dollar sign?  $42
+  // percent sign?  33%
+  // downsides:  if source text is "powershot" then a query of "PowerShot" won't match!
+}

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/637915b8/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterGraphFilterFactory.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterGraphFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterGraphFilterFactory.java
new file mode 100644
index 0000000..a06cc75
--- /dev/null
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterGraphFilterFactory.java
@@ -0,0 +1,199 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.miscellaneous;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+import java.util.SortedMap;
+import java.util.TreeMap;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.util.ResourceLoader;
+import org.apache.lucene.analysis.util.ResourceLoaderAware;
+import org.apache.lucene.analysis.util.TokenFilterFactory;
+
+import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.*;
+import static org.apache.lucene.analysis.miscellaneous.WordDelimiterIterator.*;
+
+/**
+ * Factory for {@link WordDelimiterGraphFilter}.
+ * <pre class="prettyprint">
+ * &lt;fieldType name="text_wd" class="solr.TextField" positionIncrementGap="100"&gt;
+ *   &lt;analyzer&gt;
+ *     &lt;tokenizer class="solr.WhitespaceTokenizerFactory"/&gt;
+ *     &lt;filter class="solr.WordDelimiterGraphFilterFactory" protected="protectedword.txt"
+ *             preserveOriginal="0" splitOnNumerics="1" splitOnCaseChange="1"
+ *             catenateWords="0" catenateNumbers="0" catenateAll="0"
+ *             generateWordParts="1" generateNumberParts="1" stemEnglishPossessive="1"
+ *             types="wdfftypes.txt" /&gt;
+ *   &lt;/analyzer&gt;
+ * &lt;/fieldType&gt;</pre>
+ */
+public class WordDelimiterGraphFilterFactory extends TokenFilterFactory implements ResourceLoaderAware {
+  public static final String PROTECTED_TOKENS = "protected";
+  public static final String TYPES = "types";
+
+  private final String wordFiles;
+  private final String types;
+  private final int flags;
+  byte[] typeTable = null;
+  private CharArraySet protectedWords = null;
+  
+  /** Creates a new WordDelimiterGraphFilterFactory */
+  public WordDelimiterGraphFilterFactory(Map<String, String> args) {
+    super(args);
+    int flags = 0;
+    if (getInt(args, "generateWordParts", 1) != 0) {
+      flags |= GENERATE_WORD_PARTS;
+    }
+    if (getInt(args, "generateNumberParts", 1) != 0) {
+      flags |= GENERATE_NUMBER_PARTS;
+    }
+    if (getInt(args, "catenateWords", 0) != 0) {
+      flags |= CATENATE_WORDS;
+    }
+    if (getInt(args, "catenateNumbers", 0) != 0) {
+      flags |= CATENATE_NUMBERS;
+    }
+    if (getInt(args, "catenateAll", 0) != 0) {
+      flags |= CATENATE_ALL;
+    }
+    if (getInt(args, "splitOnCaseChange", 1) != 0) {
+      flags |= SPLIT_ON_CASE_CHANGE;
+    }
+    if (getInt(args, "splitOnNumerics", 1) != 0) {
+      flags |= SPLIT_ON_NUMERICS;
+    }
+    if (getInt(args, "preserveOriginal", 0) != 0) {
+      flags |= PRESERVE_ORIGINAL;
+    }
+    if (getInt(args, "stemEnglishPossessive", 1) != 0) {
+      flags |= STEM_ENGLISH_POSSESSIVE;
+    }
+    wordFiles = get(args, PROTECTED_TOKENS);
+    types = get(args, TYPES);
+    this.flags = flags;
+    if (!args.isEmpty()) {
+      throw new IllegalArgumentException("Unknown parameters: " + args);
+    }
+  }
+  
+  @Override
+  public void inform(ResourceLoader loader) throws IOException {
+    if (wordFiles != null) {  
+      protectedWords = getWordSet(loader, wordFiles, false);
+    }
+    if (types != null) {
+      List<String> files = splitFileNames( types );
+      List<String> wlist = new ArrayList<>();
+      for( String file : files ){
+        List<String> lines = getLines(loader, file.trim());
+        wlist.addAll( lines );
+      }
+      typeTable = parseTypes(wlist);
+    }
+  }
+
+  @Override
+  public TokenFilter create(TokenStream input) {
+    return new WordDelimiterGraphFilter(input, typeTable == null ? WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE : typeTable,
+                                        flags, protectedWords);
+  }
+  
+  // source => type
+  private static Pattern typePattern = Pattern.compile( "(.*)\\s*=>\\s*(.*)\\s*$" );
+  
+  // parses a list of MappingCharFilter style rules into a custom byte[] type table
+  private byte[] parseTypes(List<String> rules) {
+    SortedMap<Character,Byte> typeMap = new TreeMap<>();
+    for( String rule : rules ){
+      Matcher m = typePattern.matcher(rule);
+      if( !m.find() )
+        throw new IllegalArgumentException("Invalid Mapping Rule : [" + rule + "]");
+      String lhs = parseString(m.group(1).trim());
+      Byte rhs = parseType(m.group(2).trim());
+      if (lhs.length() != 1)
+        throw new IllegalArgumentException("Invalid Mapping Rule : [" + rule + "]. Only a single character is allowed.");
+      if (rhs == null)
+        throw new IllegalArgumentException("Invalid Mapping Rule : [" + rule + "]. Illegal type.");
+      typeMap.put(lhs.charAt(0), rhs);
+    }
+    
+    // ensure the table is always at least as big as DEFAULT_WORD_DELIM_TABLE for performance
+    byte types[] = new byte[Math.max(typeMap.lastKey()+1, WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE.length)];
+    for (int i = 0; i < types.length; i++)
+      types[i] = WordDelimiterIterator.getType(i);
+    for (Map.Entry<Character,Byte> mapping : typeMap.entrySet())
+      types[mapping.getKey()] = mapping.getValue();
+    return types;
+  }
+  
+  private Byte parseType(String s) {
+    if (s.equals("LOWER"))
+      return LOWER;
+    else if (s.equals("UPPER"))
+      return UPPER;
+    else if (s.equals("ALPHA"))
+      return ALPHA;
+    else if (s.equals("DIGIT"))
+      return DIGIT;
+    else if (s.equals("ALPHANUM"))
+      return ALPHANUM;
+    else if (s.equals("SUBWORD_DELIM"))
+      return SUBWORD_DELIM;
+    else
+      return null;
+  }
+  
+  char[] out = new char[256];
+  
+  private String parseString(String s){
+    int readPos = 0;
+    int len = s.length();
+    int writePos = 0;
+    while( readPos < len ){
+      char c = s.charAt( readPos++ );
+      if( c == '\\' ){
+        if( readPos >= len )
+          throw new IllegalArgumentException("Invalid escaped char in [" + s + "]");
+        c = s.charAt( readPos++ );
+        switch( c ) {
+          case '\\' : c = '\\'; break;
+          case 'n' : c = '\n'; break;
+          case 't' : c = '\t'; break;
+          case 'r' : c = '\r'; break;
+          case 'b' : c = '\b'; break;
+          case 'f' : c = '\f'; break;
+          case 'u' :
+            if( readPos + 3 >= len )
+              throw new IllegalArgumentException("Invalid escaped char in [" + s + "]");
+            c = (char)Integer.parseInt( s.substring( readPos, readPos + 4 ), 16 );
+            readPos += 4;
+            break;
+        }
+      }
+      out[writePos++] = c;
+    }
+    return new String( out, 0, writePos );
+  }
+}

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/637915b8/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterIterator.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterIterator.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterIterator.java
index 0367dab..86b983d 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterIterator.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterIterator.java
@@ -16,15 +16,21 @@
  */
 package org.apache.lucene.analysis.miscellaneous;
 
-
-import static org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter.*;
-
 /**
- * A BreakIterator-like API for iterating over subwords in text, according to WordDelimiterFilter rules.
+ * A BreakIterator-like API for iterating over subwords in text, according to WordDelimiterGraphFilter rules.
  * @lucene.internal
  */
 public final class WordDelimiterIterator {
 
+  static final int LOWER = 0x01;
+  static final int UPPER = 0x02;
+  static final int DIGIT = 0x04;
+  static final int SUBWORD_DELIM = 0x08;
+
+  // combinations: for testing, not for setting bits
+  public static final int ALPHA = 0x03;
+  public static final int ALPHANUM = 0x07;
+
   /** Indicates the end of iteration */
   public static final int DONE = -1;
   
@@ -97,7 +103,7 @@ public final class WordDelimiterIterator {
    * Create a new WordDelimiterIterator operating with the supplied rules.
    * 
    * @param charTypeTable table containing character types
-   * @param splitOnCaseChange if true, causes "PowerShot" to be two tokens; ("Power-Shot" remains two parts regards)
+   * @param splitOnCaseChange if true, causes "PowerShot" to be two tokens; ("Power-Shot" remains two parts regardless)
    * @param splitOnNumerics if true, causes "j2se" to be three tokens; "j" "2" "se"
    * @param stemEnglishPossessive if true, causes trailing "'s" to be removed for each subword: "O'Neil's" =&gt; "O", "Neil"
    */
@@ -323,4 +329,45 @@ public final class WordDelimiterIterator {
       default: return SUBWORD_DELIM;
     }
   }
-}
\ No newline at end of file
+
+  /**
+   * Checks if the given word type includes {@link #ALPHA}
+   *
+   * @param type Word type to check
+   * @return {@code true} if the type contains ALPHA, {@code false} otherwise
+   */
+  static boolean isAlpha(int type) {
+    return (type & ALPHA) != 0;
+  }
+
+  /**
+   * Checks if the given word type includes {@link #DIGIT}
+   *
+   * @param type Word type to check
+   * @return {@code true} if the type contains DIGIT, {@code false} otherwise
+   */
+  static boolean isDigit(int type) {
+    return (type & DIGIT) != 0;
+  }
+
+  /**
+   * Checks if the given word type includes {@link #SUBWORD_DELIM}
+   *
+   * @param type Word type to check
+   * @return {@code true} if the type contains SUBWORD_DELIM, {@code false} otherwise
+   */
+  static boolean isSubwordDelim(int type) {
+    return (type & SUBWORD_DELIM) != 0;
+  }
+
+  /**
+   * Checks if the given word type includes {@link #UPPER}
+   *
+   * @param type Word type to check
+   * @return {@code true} if the type contains UPPER, {@code false} otherwise
+   */
+  static boolean isUpper(int type) {
+    return (type & UPPER) != 0;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/637915b8/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/FlattenGraphFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/FlattenGraphFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/FlattenGraphFilter.java
deleted file mode 100644
index c1fa1f7..0000000
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/FlattenGraphFilter.java
+++ /dev/null
@@ -1,417 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.lucene.analysis.synonym;
-
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.List;
-
-import org.apache.lucene.analysis.TokenFilter;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
-import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
-import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
-import org.apache.lucene.util.AttributeSource;
-import org.apache.lucene.util.RollingBuffer;
-
-/**
- * Converts an incoming graph token stream, such as one from
- * {@link SynonymGraphFilter}, into a flat form so that
- * all nodes form a single linear chain with no side paths.  Every
- * path through the graph touches every node.  This is necessary
- * when indexing a graph token stream, because the index does not
- * save {@link PositionLengthAttribute} and so it cannot
- * preserve the graph structure.  However, at search time,
- * query parsers can correctly handle the graph and this token
- * filter should <b>not</b> be used.
- *
- * <p>If the graph was not already flat to start, this
- * is likely a lossy process, i.e. it will often cause the 
- * graph to accept token sequences it should not, and to
- * reject token sequences it should not.
- *
- * <p>However, when applying synonyms during indexing, this
- * is necessary because Lucene already does not index a graph 
- * and so the indexing process is already lossy
- * (it ignores the {@link PositionLengthAttribute}).
- *
- * @lucene.experimental
- */
-public final class FlattenGraphFilter extends TokenFilter {
-
-  /** Holds all tokens leaving a given input position. */
-  private final static class InputNode implements RollingBuffer.Resettable {
-    private final List<AttributeSource.State> tokens = new ArrayList<>();
-
-    /** Our input node, or -1 if we haven't been assigned yet */
-    int node = -1;
-
-    /** Maximum to input node for all tokens leaving here; we use this
-     *  to know when we can freeze. */
-    int maxToNode = -1;
-
-    /** Where we currently map to; this changes (can only
-     *  increase as we see more input tokens), until we are finished
-     *  with this position. */
-    int outputNode = -1;
-
-    /** Which token (index into {@link #tokens}) we will next output. */
-    int nextOut;
-
-    @Override
-    public void reset() {
-      tokens.clear();
-      node = -1;
-      outputNode = -1;
-      maxToNode = -1;
-      nextOut = 0;
-    }
-  }
-
-  /** Gathers up merged input positions into a single output position,
-   *  only for the current "frontier" of nodes we've seen but can't yet
-   *  output because they are not frozen. */
-  private final static class OutputNode implements RollingBuffer.Resettable {
-    private final List<Integer> inputNodes = new ArrayList<>();
-
-    /** Node ID for this output, or -1 if we haven't been assigned yet. */
-    int node = -1;
-
-    /** Which input node (index into {@link #inputNodes}) we will next output. */
-    int nextOut;
-    
-    /** Start offset of tokens leaving this node. */
-    int startOffset = -1;
-
-    /** End offset of tokens arriving to this node. */
-    int endOffset = -1;
-
-    @Override
-    public void reset() {
-      inputNodes.clear();
-      node = -1;
-      nextOut = 0;
-      startOffset = -1;
-      endOffset = -1;
-    }
-  }
-
-  private final RollingBuffer<InputNode> inputNodes = new RollingBuffer<InputNode>() {
-    @Override
-    protected InputNode newInstance() {
-      return new InputNode();
-    }
-  };
-
-  private final RollingBuffer<OutputNode> outputNodes = new RollingBuffer<OutputNode>() {
-    @Override
-    protected OutputNode newInstance() {
-      return new OutputNode();
-    }
-  };
-
-  private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
-  private final PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class);
-  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
-
-  /** Which input node the last seen token leaves from */
-  private int inputFrom;
-
-  /** We are currently releasing tokens leaving from this output node */
-  private int outputFrom;
-
-  // for debugging:
-  //private int retOutputFrom;
-
-  private boolean done;
-
-  private int lastOutputFrom;
-
-  private int finalOffset;
-
-  private int finalPosInc;
-
-  private int maxLookaheadUsed;
-
-  private int lastStartOffset;
-
-  public FlattenGraphFilter(TokenStream in) {
-    super(in);
-  }
-
-  private boolean releaseBufferedToken() {
-
-    // We only need the while loop (retry) if we have a hole (an output node that has no tokens leaving):
-    while (outputFrom < outputNodes.getMaxPos()) {
-      OutputNode output = outputNodes.get(outputFrom);
-      if (output.inputNodes.isEmpty()) {
-        // No tokens arrived to this node, which happens for the first node
-        // after a hole:
-        //System.out.println("    skip empty outputFrom=" + outputFrom);
-        outputFrom++;
-        continue;
-      }
-
-      int maxToNode = -1;
-      for(int inputNodeID : output.inputNodes) {
-        InputNode inputNode = inputNodes.get(inputNodeID);
-        assert inputNode.outputNode == outputFrom;
-        maxToNode = Math.max(maxToNode, inputNode.maxToNode);
-      }
-      //System.out.println("  release maxToNode=" + maxToNode + " vs inputFrom=" + inputFrom);
-
-      // TODO: we could shrink the frontier here somewhat if we
-      // always output posLen=1 as part of our "sausagizing":
-      if (maxToNode <= inputFrom || done) {
-        //System.out.println("  output node merged these inputs: " + output.inputNodes);
-        // These tokens are now frozen
-        assert output.nextOut < output.inputNodes.size(): "output.nextOut=" + output.nextOut + " vs output.inputNodes.size()=" + output.inputNodes.size();
-        InputNode inputNode = inputNodes.get(output.inputNodes.get(output.nextOut));
-        if (done && inputNode.tokens.size() == 0 && outputFrom >= outputNodes.getMaxPos()) {
-          return false;
-        }
-        if (inputNode.tokens.size() == 0) {
-          assert inputNode.nextOut == 0;
-          assert output.nextOut == 0;
-          // Hole dest nodes should never be merged since 1) we always
-          // assign them to a new output position, and 2) since they never
-          // have arriving tokens they cannot be pushed:
-          assert output.inputNodes.size() == 1: output.inputNodes.size();
-          outputFrom++;
-          inputNodes.freeBefore(output.inputNodes.get(0));
-          outputNodes.freeBefore(outputFrom);
-          continue;
-        }
-
-        assert inputNode.nextOut < inputNode.tokens.size();
-
-        restoreState(inputNode.tokens.get(inputNode.nextOut));
-
-        // Correct posInc
-        assert outputFrom >= lastOutputFrom;
-        posIncAtt.setPositionIncrement(outputFrom - lastOutputFrom);
-        int toInputNodeID = inputNode.node + posLenAtt.getPositionLength();
-        InputNode toInputNode = inputNodes.get(toInputNodeID);
-
-        // Correct posLen
-        assert toInputNode.outputNode > outputFrom;
-        posLenAtt.setPositionLength(toInputNode.outputNode - outputFrom);
-        lastOutputFrom = outputFrom;
-        inputNode.nextOut++;
-        //System.out.println("  ret " + this);
-
-        OutputNode outputEndNode = outputNodes.get(toInputNode.outputNode);
-
-        // Correct offsets
-
-        // This is a bit messy; we must do this so offset don't go backwards,
-        // which would otherwise happen if the replacement has more tokens
-        // than the input:
-        int startOffset = Math.max(lastStartOffset, output.startOffset);
-
-        // We must do this in case the incoming tokens have broken offsets:
-        int endOffset = Math.max(startOffset, outputEndNode.endOffset);
-        
-        offsetAtt.setOffset(startOffset, endOffset);
-        lastStartOffset = startOffset;
-
-        if (inputNode.nextOut == inputNode.tokens.size()) {
-          output.nextOut++;
-          if (output.nextOut == output.inputNodes.size()) {
-            outputFrom++;
-            inputNodes.freeBefore(output.inputNodes.get(0));
-            outputNodes.freeBefore(outputFrom);
-          }
-        }
-
-        return true;
-      } else {
-        return false;
-      }
-    }
-
-    //System.out.println("    break false");
-    return false;
-  }
-
-  @Override
-  public boolean incrementToken() throws IOException {
-    //System.out.println("\nF.increment inputFrom=" + inputFrom + " outputFrom=" + outputFrom);
-
-    while (true) {
-      if (releaseBufferedToken()) {
-        //retOutputFrom += posIncAtt.getPositionIncrement();
-        //System.out.println("    return buffered: " + termAtt + " " + retOutputFrom + "-" + (retOutputFrom + posLenAtt.getPositionLength()));
-        //printStates();
-        return true;
-      } else if (done) {
-        //System.out.println("    done, return false");
-        return false;
-      }
-
-      if (input.incrementToken()) {
-        // Input node this token leaves from:
-        inputFrom += posIncAtt.getPositionIncrement();
-
-        int startOffset = offsetAtt.startOffset();
-        int endOffset = offsetAtt.endOffset();
-
-        // Input node this token goes to:
-        int inputTo = inputFrom + posLenAtt.getPositionLength();
-        //System.out.println("  input.inc " + termAtt + ": " + inputFrom + "-" + inputTo);
-
-        InputNode src = inputNodes.get(inputFrom);
-        if (src.node == -1) {
-          // This means the "from" node of this token was never seen as a "to" node,
-          // which should only happen if we just crossed a hole.  This is a challenging
-          // case for us because we normally rely on the full dependencies expressed
-          // by the arcs to assign outgoing node IDs.  It would be better if tokens
-          // were never dropped but instead just marked deleted with a new
-          // TermDeletedAttribute (boolean valued) ... but until that future, we have
-          // a hack here to forcefully jump the output node ID:
-          assert src.outputNode == -1;
-          src.node = inputFrom;
-
-          src.outputNode = outputNodes.getMaxPos() + 1;
-          //System.out.println("    hole: force to outputNode=" + src.outputNode);
-          OutputNode outSrc = outputNodes.get(src.outputNode);
-
-          // Not assigned yet:
-          assert outSrc.node == -1;
-          outSrc.node = src.outputNode;
-          outSrc.inputNodes.add(inputFrom);
-          outSrc.startOffset = startOffset;
-        } else {
-          OutputNode outSrc = outputNodes.get(src.outputNode);
-          if (outSrc.startOffset == -1 || startOffset > outSrc.startOffset) {
-            // "shrink wrap" the offsets so the original tokens (with most
-            // restrictive offsets) win:
-            outSrc.startOffset = Math.max(startOffset, outSrc.startOffset);
-          }
-        }
-
-        // Buffer this token:
-        src.tokens.add(captureState());
-        src.maxToNode = Math.max(src.maxToNode, inputTo);
-        maxLookaheadUsed = Math.max(maxLookaheadUsed, inputNodes.getBufferSize());
-
-        InputNode dest = inputNodes.get(inputTo);
-        if (dest.node == -1) {
-          // Common case: first time a token is arriving to this input position:
-          dest.node = inputTo;
-        }
-
-        // Always number output nodes sequentially:
-        int outputEndNode = src.outputNode + 1;
-
-        if (outputEndNode > dest.outputNode) {
-          if (dest.outputNode != -1) {
-            boolean removed = outputNodes.get(dest.outputNode).inputNodes.remove(Integer.valueOf(inputTo));
-            assert removed;
-          }
-          //System.out.println("    increase output node: " + dest.outputNode + " vs " + outputEndNode);
-          outputNodes.get(outputEndNode).inputNodes.add(inputTo);
-          dest.outputNode = outputEndNode;
-
-          // Since all we ever do is merge incoming nodes together, and then renumber
-          // the merged nodes sequentially, we should only ever assign smaller node
-          // numbers:
-          assert outputEndNode <= inputTo: "outputEndNode=" + outputEndNode + " vs inputTo=" + inputTo;
-        }
-
-        OutputNode outDest = outputNodes.get(dest.outputNode);
-        // "shrink wrap" the offsets so the original tokens (with most
-        // restrictive offsets) win:
-        if (outDest.endOffset == -1 || endOffset < outDest.endOffset) {
-          outDest.endOffset = endOffset;
-        }
-
-      } else {
-        //System.out.println("  got false from input");
-        input.end();
-        finalPosInc = posIncAtt.getPositionIncrement();
-        finalOffset = offsetAtt.endOffset();
-        done = true;
-        // Don't return false here: we need to force release any buffered tokens now
-      }
-    }
-  }
-
-  // Only for debugging:
-  /*
-  private void printStates() {
-    System.out.println("states:");
-    for(int i=outputFrom;i<outputNodes.getMaxPos();i++) {
-      OutputNode outputNode = outputNodes.get(i);
-      System.out.println("  output " + i + ": inputs " + outputNode.inputNodes);
-      for(int inputNodeID : outputNode.inputNodes) {
-        InputNode inputNode = inputNodes.get(inputNodeID);
-        assert inputNode.outputNode == i;
-      }
-    }
-  }
-  */
-
-  @Override
-  public void end() throws IOException {
-    if (done == false) {
-      super.end();
-    } else {
-      // NOTE, shady: don't call super.end, because we did already from incrementToken
-    }
-
-    clearAttributes();
-    if (done) {
-      // On exc, done is false, and we will not have set these:
-      posIncAtt.setPositionIncrement(finalPosInc);
-      offsetAtt.setOffset(finalOffset, finalOffset);
-    } else {
-      super.end();
-    }
-  }
-  
-  @Override
-  public void reset() throws IOException {
-    //System.out.println("F: reset");
-    super.reset();
-    inputFrom = -1;
-    inputNodes.reset();
-    InputNode in = inputNodes.get(0);
-    in.node = 0;
-    in.outputNode = 0;
-
-    outputNodes.reset();
-    OutputNode out = outputNodes.get(0);
-    out.node = 0;
-    out.inputNodes.add(0);
-    out.startOffset = 0;
-    outputFrom = 0;
-    //retOutputFrom = -1;
-    lastOutputFrom = -1;
-    done = false;
-    finalPosInc = -1;
-    finalOffset = -1;
-    lastStartOffset = 0;
-    maxLookaheadUsed = 0;
-  }
-
-  // for testing
-  int getMaxLookaheadUsed() {
-    return maxLookaheadUsed;
-  }
-}

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/637915b8/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/FlattenGraphFilterFactory.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/FlattenGraphFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/FlattenGraphFilterFactory.java
deleted file mode 100644
index a6cba97..0000000
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/FlattenGraphFilterFactory.java
+++ /dev/null
@@ -1,44 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.lucene.analysis.synonym;
-
-import java.util.Map;
-
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.util.TokenFilterFactory;
-
-/** 
- * Factory for {@link FlattenGraphFilter}. 
- *
- * @lucene.experimental
- */
-public class FlattenGraphFilterFactory extends TokenFilterFactory {
-
-  /** Creates a new FlattenGraphFilterFactory */
-  public FlattenGraphFilterFactory(Map<String,String> args) {
-    super(args);
-    if (!args.isEmpty()) {
-      throw new IllegalArgumentException("Unknown parameters: " + args);
-    }
-  }
-  
-  @Override
-  public TokenStream create(TokenStream input) {
-    return new FlattenGraphFilter(input);
-  }
-}

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/637915b8/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilter.java
index 29f6e1c..ec2676f 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilter.java
@@ -21,6 +21,7 @@ import java.util.Arrays;
 
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.core.FlattenGraphFilter;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/637915b8/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilterFactory.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilterFactory.java
index df10e9b..87ddc08 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilterFactory.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilterFactory.java
@@ -33,6 +33,7 @@ import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.LowerCaseFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.FlattenGraphFilterFactory;
 import org.apache.lucene.analysis.core.WhitespaceTokenizer;
 import org.apache.lucene.analysis.util.ResourceLoader;
 import org.apache.lucene.analysis.util.ResourceLoaderAware;

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/637915b8/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymGraphFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymGraphFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymGraphFilter.java
index 3d50e08..788db0a 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymGraphFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymGraphFilter.java
@@ -17,8 +17,14 @@
 
 package org.apache.lucene.analysis.synonym;
 
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.LinkedList;
+import java.util.List;
+
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.core.FlattenGraphFilter;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
@@ -31,11 +37,6 @@ import org.apache.lucene.util.CharsRefBuilder;
 import org.apache.lucene.util.RollingBuffer;
 import org.apache.lucene.util.fst.FST;
 
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.LinkedList;
-import java.util.List;
-
 // TODO: maybe we should resolve token -> wordID then run
 // FST on wordIDs, for better perf?
  

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/637915b8/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory b/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory
index 5f8894c..4e33006 100644
--- a/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory
+++ b/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory
@@ -78,6 +78,7 @@ org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilterFactory
 org.apache.lucene.analysis.miscellaneous.TrimFilterFactory
 org.apache.lucene.analysis.miscellaneous.TruncateTokenFilterFactory
 org.apache.lucene.analysis.miscellaneous.WordDelimiterFilterFactory
+org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilterFactory
 org.apache.lucene.analysis.miscellaneous.ScandinavianFoldingFilterFactory
 org.apache.lucene.analysis.miscellaneous.ScandinavianNormalizationFilterFactory
 org.apache.lucene.analysis.ngram.EdgeNGramFilterFactory
@@ -103,6 +104,6 @@ org.apache.lucene.analysis.standard.StandardFilterFactory
 org.apache.lucene.analysis.sv.SwedishLightStemFilterFactory
 org.apache.lucene.analysis.synonym.SynonymFilterFactory
 org.apache.lucene.analysis.synonym.SynonymGraphFilterFactory
-org.apache.lucene.analysis.synonym.FlattenGraphFilterFactory
+org.apache.lucene.analysis.core.FlattenGraphFilterFactory
 org.apache.lucene.analysis.tr.TurkishLowerCaseFilterFactory
 org.apache.lucene.analysis.util.ElisionFilterFactory


Mime
View raw message