lucene-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From a.@apache.org
Subject [06/18] lucene-solr:jira/solr-12438: SOLR-12376: New TaggerRequestHandler (SolrTextTagger).
Date Thu, 07 Jun 2018 12:18:24 GMT
SOLR-12376: New TaggerRequestHandler (SolrTextTagger).


Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/cf633921
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/cf633921
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/cf633921

Branch: refs/heads/jira/solr-12438
Commit: cf63392183ffc96428fc4c52f546fec2cdf766d5
Parents: c587598
Author: David Smiley <dsmiley@apache.org>
Authored: Tue Jun 5 14:04:55 2018 -0400
Committer: David Smiley <dsmiley@apache.org>
Committed: Tue Jun 5 14:04:55 2018 -0400

----------------------------------------------------------------------
 solr/CHANGES.txt                                |   3 +
 solr/NOTICE.txt                                 |  14 +
 .../apache/solr/core/SolrResourceLoader.java    |  12 +-
 .../solr/handler/tagger/OffsetCorrector.java    | 178 +++++++++
 .../solr/handler/tagger/TagClusterReducer.java  | 103 +++++
 .../org/apache/solr/handler/tagger/TagLL.java   | 176 ++++++++
 .../org/apache/solr/handler/tagger/Tagger.java  | 230 +++++++++++
 .../handler/tagger/TaggerRequestHandler.java    | 397 +++++++++++++++++++
 .../solr/handler/tagger/TaggingAttribute.java   |  65 +++
 .../handler/tagger/TaggingAttributeImpl.java    |  79 ++++
 .../solr/handler/tagger/TermPrefixCursor.java   | 189 +++++++++
 .../solr/handler/tagger/XmlOffsetCorrector.java | 113 ++++++
 .../solr/handler/tagger/package-info.java       |  27 ++
 .../solr/collection1/conf/schema-tagger.xml     | 187 +++++++++
 .../solr/collection1/conf/solrconfig-tagger.xml |  59 +++
 .../tagger/EmbeddedSolrNoSerializeTest.java     | 153 +++++++
 .../handler/tagger/RandomizedTaggerTest.java    | 150 +++++++
 .../apache/solr/handler/tagger/Tagger2Test.java | 175 ++++++++
 .../apache/solr/handler/tagger/TaggerTest.java  | 296 ++++++++++++++
 .../solr/handler/tagger/TaggerTestCase.java     | 251 ++++++++++++
 .../handler/tagger/TaggingAttributeTest.java    |  73 ++++
 .../handler/tagger/WordLengthTaggingFilter.java | 110 +++++
 .../tagger/WordLengthTaggingFilterFactory.java  |  67 ++++
 .../handler/tagger/XmlInterpolationTest.java    | 224 +++++++++++
 solr/solr-ref-guide/src/searching.adoc          |  33 +-
 solr/solr-ref-guide/src/the-tagger-handler.adoc | 265 +++++++++++++
 26 files changed, 3622 insertions(+), 7 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/cf633921/solr/CHANGES.txt
----------------------------------------------------------------------
diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt
index 66d2026..479406f 100644
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@@ -154,6 +154,9 @@ New Features
 
 * SOLR-12389: support deeply nested json objects in clusterprops.json (noble)
 
+* SOLR-12376: Added the TaggerRequestHandler (AKA SolrTextTagger) for tagging text.  It's used as a component of
+  NER/ERD systems including query-understanding.  See the ref guide for more info.  (David Smiley)
+
 Bug Fixes
 ----------------------
 

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/cf633921/solr/NOTICE.txt
----------------------------------------------------------------------
diff --git a/solr/NOTICE.txt b/solr/NOTICE.txt
index fd954f4..a5b2070 100644
--- a/solr/NOTICE.txt
+++ b/solr/NOTICE.txt
@@ -537,3 +537,17 @@ See http://www.restlet.org/
 Protocol Buffers - Google's data interchange format
 Copyright 2008 Google Inc.
 http://code.google.com/apis/protocolbuffers/
+
+=========================================================================
+==     SolrTextTagger Notice                                           ==
+=========================================================================
+
+The TaggerRequestHandler and related classes in its package came from the
+OpenSextant Solr Text Tagger,
+Copyright 2013 The MITRE Corporation. All Rights Reserved.
+
+  This software was produced for the U. S. Government
+  under Contract No. W15P7T-11-C-F600, and is
+  subject to the Rights in Noncommercial Computer Software
+  and Noncommercial Computer Software Documentation
+  Clause 252.227-7014 (JUN 1995)
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/cf633921/solr/core/src/java/org/apache/solr/core/SolrResourceLoader.java
----------------------------------------------------------------------
diff --git a/solr/core/src/java/org/apache/solr/core/SolrResourceLoader.java b/solr/core/src/java/org/apache/solr/core/SolrResourceLoader.java
index 22753dd..0ff5c7b 100644
--- a/solr/core/src/java/org/apache/solr/core/SolrResourceLoader.java
+++ b/solr/core/src/java/org/apache/solr/core/SolrResourceLoader.java
@@ -16,6 +16,10 @@
  */
 package org.apache.solr.core;
 
+import javax.naming.Context;
+import javax.naming.InitialContext;
+import javax.naming.NamingException;
+import javax.naming.NoInitialContextException;
 import java.io.Closeable;
 import java.io.File;
 import java.io.FileOutputStream;
@@ -47,10 +51,6 @@ import java.util.concurrent.ConcurrentSkipListSet;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 import java.util.stream.Collectors;
-import javax.naming.Context;
-import javax.naming.InitialContext;
-import javax.naming.NamingException;
-import javax.naming.NoInitialContextException;
 
 import org.apache.lucene.analysis.WordlistLoader;
 import org.apache.lucene.analysis.util.CharFilterFactory;
@@ -88,9 +88,9 @@ public class SolrResourceLoader implements ResourceLoader,Closeable
   static final String project = "solr";
   static final String base = "org.apache" + "." + project;
   static final String[] packages = {
-      "", "analysis.", "schema.", "handler.", "search.", "update.", "core.", "response.", "request.",
+      "", "analysis.", "schema.", "handler.", "handler.tagger.", "search.", "update.", "core.", "response.", "request.",
       "update.processor.", "util.", "spelling.", "handler.component.", "handler.dataimport.",
-      "spelling.suggest.", "spelling.suggest.fst.", "rest.schema.analysis.", "security.","handler.admin.",
+      "spelling.suggest.", "spelling.suggest.fst.", "rest.schema.analysis.", "security.", "handler.admin.",
       "cloud.autoscaling."
   };
   private static final java.lang.String SOLR_CORE_NAME = "solr.core.name";

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/cf633921/solr/core/src/java/org/apache/solr/handler/tagger/OffsetCorrector.java
----------------------------------------------------------------------
diff --git a/solr/core/src/java/org/apache/solr/handler/tagger/OffsetCorrector.java b/solr/core/src/java/org/apache/solr/handler/tagger/OffsetCorrector.java
new file mode 100644
index 0000000..1fb4911
--- /dev/null
+++ b/solr/core/src/java/org/apache/solr/handler/tagger/OffsetCorrector.java
@@ -0,0 +1,178 @@
+/*
+ * This software was produced for the U. S. Government
+ * under Contract No. W15P7T-11-C-F600, and is
+ * subject to the Rights in Noncommercial Computer Software
+ * and Noncommercial Computer Software Documentation
+ * Clause 252.227-7014 (JUN 1995)
+ *
+ * Copyright 2013 The MITRE Corporation. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.handler.tagger;
+
+import java.util.Arrays;
+
+import com.carrotsearch.hppc.IntArrayList;
+
+public abstract class OffsetCorrector {
+
+  //TODO support a streaming style of consuming input text so that we need not take a
+  // String. Trickier because we need to keep more information as we parse to know when tags
+  // are adjacent with/without whitespace
+
+  //Data structure requirements:
+  // Given a character offset:
+  //   * determine what tagId is its parent.
+  //   * determine if it is adjacent to the parent open tag, ignoring whitespace
+  //   * determine if it is adjacent to the parent close tag, ignoring whitespace
+  // Given a tagId:
+  //   * What is its parent tagId
+  //   * What's the char offset of the start and end of the open tag
+  //   * What's the char offset of the start and end of the close tag
+
+  /** Document text. */
+  protected final String docText;
+
+  /** Array of tag info comprised of 5 int fields:
+   *    [int parentTag, int openStartOff, int openEndOff, int closeStartOff, int closeEndOff].
+   * Its size indicates how many tags there are. Tags are ID'ed sequentially from 0. */
+  protected final IntArrayList tagInfo;
+
+  /** offsets of parent tag id change (ascending order) */
+  protected final IntArrayList parentChangeOffsets;
+  /** tag id; parallel array to parentChangeOffsets */
+  protected final IntArrayList parentChangeIds;
+
+  protected final int[] offsetPair = new int[] { -1, -1};//non-thread-safe state
+
+  /** Disjoint start and end span offsets (inclusive) of non-taggable sections. Null if none. */
+  protected final IntArrayList nonTaggableOffsets;
+
+  /**
+   * Initialize based on the document text.
+   * @param docText non-null structured content.
+   * @param hasNonTaggable if there may be "non-taggable" tags to track
+   */
+  protected OffsetCorrector(String docText, boolean hasNonTaggable) {
+    this.docText = docText;
+    final int guessNumElements = Math.max(docText.length() / 20, 4);
+
+    tagInfo = new IntArrayList(guessNumElements * 5);
+    parentChangeOffsets = new IntArrayList(guessNumElements * 2);
+    parentChangeIds = new IntArrayList(guessNumElements * 2);
+    nonTaggableOffsets = hasNonTaggable ? new IntArrayList(guessNumElements / 5) : null;
+  }
+
+  /** Corrects the start and end offset pair. It will return null if it can't
+   * due to a failure to keep the offsets balance-able, or if it spans "non-taggable" tags.
+   * The start (left) offset is pulled left as needed over whitespace and opening tags. The end
+   * (right) offset is pulled right as needed over whitespace and closing tags. It's returned as
+   * a 2-element array.
+   * <p>Note that the returned array is internally reused; just use it to examine the response.
+   */
+  public int[] correctPair(int leftOffset, int rightOffset) {
+    rightOffset = correctEndOffsetForCloseElement(rightOffset);
+    if (spansNonTaggable(leftOffset, rightOffset))
+      return null;
+
+    int startTag = lookupTag(leftOffset);
+    //offsetPair[0] = Math.max(offsetPair[0], getOpenStartOff(startTag));
+    int endTag = lookupTag(rightOffset-1);
+    //offsetPair[1] = Math.min(offsetPair[1], getCloseStartOff(endTag));
+
+    // Find the ancestor tag enclosing offsetPair.  And bump out left offset along the way.
+    int iTag = startTag;
+    for (; !tagEnclosesOffset(iTag, rightOffset); iTag = getParentTag(iTag)) {
+      //Ensure there is nothing except whitespace thru OpenEndOff
+      int tagOpenEndOff = getOpenEndOff(iTag);
+      if (hasNonWhitespace(tagOpenEndOff, leftOffset))
+        return null;
+      leftOffset = getOpenStartOff(iTag);
+    }
+    final int ancestorTag = iTag;
+    // Bump out rightOffset until we get to ancestorTag.
+    for (iTag = endTag; iTag != ancestorTag; iTag = getParentTag(iTag)) {
+      //Ensure there is nothing except whitespace thru CloseStartOff
+      int tagCloseStartOff = getCloseStartOff(iTag);
+      if (hasNonWhitespace(rightOffset, tagCloseStartOff))
+        return null;
+      rightOffset = getCloseEndOff(iTag);
+    }
+
+    offsetPair[0] = leftOffset;
+    offsetPair[1] = rightOffset;
+    return offsetPair;
+  }
+
+  /** Correct endOffset for adjacent element at the right side.  E.g. offsetPair might point to:
+   * <pre>
+   *   foo&lt;/tag&gt;
+   * </pre>
+   * and this method pulls the end offset left to the '&lt;'. This is necessary for use with
+   * {@link org.apache.lucene.analysis.charfilter.HTMLStripCharFilter}.
+   *
+   * See https://issues.apache.org/jira/browse/LUCENE-5734 */
+  protected int correctEndOffsetForCloseElement(int endOffset) {
+    if (docText.charAt(endOffset-1) == '>') {
+      final int newEndOffset = docText.lastIndexOf('<', endOffset - 2);
+      if (newEndOffset > offsetPair[0])//just to be sure
+        return newEndOffset;
+    }
+    return endOffset;
+  }
+
+  protected boolean hasNonWhitespace(int start, int end) {
+    for (int i = start; i < end; i++) {
+      if (!Character.isWhitespace(docText.charAt(i)))
+        return true;
+    }
+    return false;
+  }
+
+  protected boolean tagEnclosesOffset(int tag, int off) {
+    return off >= getOpenStartOff(tag) && off < getCloseEndOff(tag);
+  }
+
+  protected int getParentTag(int tag) { return tagInfo.get(tag * 5 + 0); }
+  protected int getOpenStartOff(int tag) { return tagInfo.get(tag * 5 + 1); }
+  protected int getOpenEndOff(int tag) { return tagInfo.get(tag * 5 + 2); }
+  protected int getCloseStartOff(int tag) { return tagInfo.get(tag * 5 + 3); }
+  protected int getCloseEndOff(int tag) { return tagInfo.get(tag * 5 + 4); }
+
+  protected int lookupTag(int off) {
+    int idx = Arrays.binarySearch(parentChangeOffsets.buffer, 0, parentChangeOffsets.size(), off);
+    if (idx < 0)
+      idx = (-idx - 1) - 1;//round down
+    return parentChangeIds.get(idx);
+  }
+
+  protected boolean spansNonTaggable(int startOff, int endOff) {
+    if (nonTaggableOffsets == null)
+      return false;
+    int idx = Arrays.binarySearch(nonTaggableOffsets.buffer, 0, nonTaggableOffsets.size(), startOff);
+    //if tag start coincides with first or last char of non-taggable span then result is true.
+    // (probably never happens since those characters are actual element markup)
+    if (idx >= 0)
+      return true;
+    idx = -idx - 1;//modify for where we would insert
+    //if idx is odd then our span intersects a non-taggable span; return true
+    if ((idx & 1) == 1)
+      return true;
+    //it's non-taggable if the next non-taggable start span is before our endOff
+    if (idx == nonTaggableOffsets.size())
+      return false;
+    return nonTaggableOffsets.get(idx) < endOff;
+  }
+}

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/cf633921/solr/core/src/java/org/apache/solr/handler/tagger/TagClusterReducer.java
----------------------------------------------------------------------
diff --git a/solr/core/src/java/org/apache/solr/handler/tagger/TagClusterReducer.java b/solr/core/src/java/org/apache/solr/handler/tagger/TagClusterReducer.java
new file mode 100644
index 0000000..9310a04
--- /dev/null
+++ b/solr/core/src/java/org/apache/solr/handler/tagger/TagClusterReducer.java
@@ -0,0 +1,103 @@
+/*
+ * This software was produced for the U. S. Government
+ * under Contract No. W15P7T-11-C-F600, and is
+ * subject to the Rights in Noncommercial Computer Software
+ * and Noncommercial Computer Software Documentation
+ * Clause 252.227-7014 (JUN 1995)
+ *
+ * Copyright 2013 The MITRE Corporation. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.handler.tagger;
+
+public interface TagClusterReducer {
+  /**
+   * Reduces the linked-list to only those tags that should be emitted
+   * @param head not null; 1-element array to head which isn't null either
+   */
+  void reduce(TagLL[] head);
+
+  static final TagClusterReducer ALL = new TagClusterReducer() {
+    @Override
+    public void reduce(TagLL[] head) {
+    }
+  };
+
+  static final TagClusterReducer NO_SUB = new TagClusterReducer() {
+    @Override
+    public void reduce(TagLL[] head) {
+      //loop forward over all tags
+      for (TagLL tag = head[0].nextTag; tag != null; tag = tag.nextTag) {
+        //loop backwards over prev tags from this tag
+        for (TagLL tPrev = tag.prevTag; tPrev != null; tPrev = tPrev.prevTag) {
+          assert tPrev.startOffset <= tag.startOffset;
+          //if a previous tag's endOffset is <= this one's, tForward can be removed
+          if (tPrev.endOffset >= tag.endOffset) {
+            tag.removeLL();
+            break;
+          } else if (tPrev.startOffset == tag.startOffset) {
+            tPrev.removeLL();
+            //continue; 'tag' is still valid
+          }
+        }
+      }
+    }
+  };
+
+  static final TagClusterReducer LONGEST_DOMINANT_RIGHT = new TagClusterReducer() {
+    @Override
+    public void reduce(TagLL[] head) {
+
+      //--Optimize for common single-tag case
+      if (head[0].nextTag == null)
+        return;
+
+      while (true) {
+        //--Find longest not already marked
+        TagLL longest = null;
+        for (TagLL t = head[0]; t != null; t = t.nextTag) {
+          if (!t.mark && (longest == null || t.charLen() >= longest.charLen()))
+            longest = t;
+        }
+        if (longest == null)
+          break;
+        //--Mark longest (so we return it eventually)
+        longest.mark = true;
+        //--Remove tags overlapping this longest
+        for (TagLL t = head[0]; t != null; t = t.nextTag) {
+          if (t.mark)
+            continue;
+
+          if (t.overlaps(longest)) {
+            t.removeLL();
+          } else if (t.startOffset >= longest.endOffset) {
+            break;//no subsequent can possibly overlap
+          }
+        }
+      }//loop
+
+      //all-remaining should be marked
+//      for (TagLL t = head; t != null; t = t.nextTag) {
+//        assert t.mark;
+////        if (!t.mark) {
+////          t.removeLL();
+////          if (head == t)
+////            head = t.nextTag;
+////        }
+//      }
+      assert head[0].mark;
+    }
+  };
+}

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/cf633921/solr/core/src/java/org/apache/solr/handler/tagger/TagLL.java
----------------------------------------------------------------------
diff --git a/solr/core/src/java/org/apache/solr/handler/tagger/TagLL.java b/solr/core/src/java/org/apache/solr/handler/tagger/TagLL.java
new file mode 100644
index 0000000..e8bb0a3
--- /dev/null
+++ b/solr/core/src/java/org/apache/solr/handler/tagger/TagLL.java
@@ -0,0 +1,176 @@
+/*
+ * This software was produced for the U. S. Government
+ * under Contract No. W15P7T-11-C-F600, and is
+ * subject to the Rights in Noncommercial Computer Software
+ * and Noncommercial Computer Software Documentation
+ * Clause 252.227-7014 (JUN 1995)
+ *
+ * Copyright 2013 The MITRE Corporation. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.handler.tagger;
+
+import java.io.IOException;
+
+import org.apache.lucene.util.BytesRef;
+
+/**
+ * This is a Tag -- a startOffset, endOffset and value.
+ * <p>
+ * A Tag starts without a value in an
+ * "advancing" state.  {@link #advance(org.apache.lucene.util.BytesRef, int)}
+ * is called with subsequent words and then eventually it won't advance any
+ * more, and value is set (could be null).
+ * <p>
+ * A Tag is also a doubly-linked-list (hence the LL in the name). All tags share
+ * a reference to the head via a 1-element array, which is potentially modified
+ * if any of the linked-list methods are called. Tags in the list should have
+ * equal or increasing start offsets.
+ */
+public class TagLL{
+
+  private final TagLL[] head;//a shared pointer to the head; 1 element
+  TagLL prevTag, nextTag; // linked list
+
+  private TermPrefixCursor cursor;
+
+  final int startOffset;//inclusive
+  int endOffset;//exclusive
+  Object value;//null means unset
+
+  /** optional boolean used by some TagClusterReducer's */
+  boolean mark = false;
+
+  TagLL(TagLL[] head, TermPrefixCursor cursor, int startOffset, int endOffset, Object value) {
+    this.head = head;
+    this.cursor = cursor;
+    this.startOffset = startOffset;
+    this.endOffset = endOffset;
+    this.value = value;
+  }
+
+  /**
+   * Advances this tag with "word" at offset "offset".  If this tag is not in
+   * an advancing state then it does nothing. If it is advancing and prior to
+   * advancing further it sees a value, then a non-advancing tag may be inserted
+   * into the LL as side-effect. If this returns false (it didn't advance) and
+   * if there is no value, then it will also be removed.
+   *
+   *
+   * @param word      The next word or null if at an end
+   * @param offset    The last character in word's offset in the underlying
+   *                  stream. If word is null then it's meaningless.
+   *
+   * @return          Whether it advanced or not.
+   */
+  boolean advance(BytesRef word, int offset) throws IOException {
+    if (!isAdvancing())
+      return false;
+
+    Object iVal = cursor.getDocIds();
+
+    if (word != null && cursor.advance(word)) {
+
+      if (iVal != null) {
+        addBeforeLL(new TagLL(head, null, startOffset, endOffset, iVal));
+      }
+
+      assert offset >= endOffset;
+      endOffset = offset;
+      return true;
+    } else {
+      this.value = iVal;
+      this.cursor = null;
+      if (iVal == null)
+        removeLL();
+      return false;
+    }
+  }
+
+  /** Removes this tag from the chain, connecting prevTag and nextTag. Does not
+   * modify "this" object's pointers, so the caller can refer to nextTag after
+   * removing it. */
+  public void removeLL() {
+    if (head[0] == this)
+      head[0] = nextTag;
+    if (prevTag != null) {
+      prevTag.nextTag = nextTag;
+    }
+    if (nextTag != null) {
+      nextTag.prevTag = prevTag;
+    }
+  }
+
+  void addBeforeLL(TagLL tag) {
+    assert tag.startOffset <= startOffset;
+    if (prevTag != null) {
+      assert prevTag.startOffset <= tag.startOffset;
+      prevTag.nextTag = tag;
+      tag.prevTag = prevTag;
+    } else {
+      assert head[0] == this;
+      head[0] = tag;
+    }
+    prevTag = tag;
+    tag.nextTag = this;
+  }
+
+  void addAfterLL(TagLL tag) {
+    assert tag.startOffset >= startOffset;
+    if (nextTag != null) {
+      assert nextTag.startOffset >= tag.startOffset;
+      nextTag.prevTag = tag;
+      tag.nextTag = nextTag;
+    }
+    nextTag = tag;
+    tag.prevTag = this;
+  }
+
+  public int charLen() {
+    return endOffset - startOffset;
+  }
+
+  public TagLL getNextTag() {
+    return nextTag;
+  }
+
+  public TagLL getPrevTag() {
+    return prevTag;
+  }
+
+  public int getStartOffset() {
+    return startOffset;
+  }
+  public int getEndOffset() {
+    return endOffset;
+  }
+  public boolean overlaps(TagLL other) {
+    //don't use >= or <= because startOffset is inclusive while endOffset is exclusive
+    if (startOffset < other.startOffset)
+      return endOffset > other.startOffset;
+    else
+      return startOffset < other.endOffset;
+  }
+
+  boolean isAdvancing() {
+    return cursor != null;
+  }
+
+  @Override
+  public String toString() {
+    return (prevTag != null ? '*' : '-') + "|" + (nextTag != null ? '*' : '-') +
+        " " + startOffset + " to " + endOffset + (isAdvancing() ? '+' : " #" + value);
+  }
+}

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/cf633921/solr/core/src/java/org/apache/solr/handler/tagger/Tagger.java
----------------------------------------------------------------------
diff --git a/solr/core/src/java/org/apache/solr/handler/tagger/Tagger.java b/solr/core/src/java/org/apache/solr/handler/tagger/Tagger.java
new file mode 100644
index 0000000..12a4cf0
--- /dev/null
+++ b/solr/core/src/java/org/apache/solr/handler/tagger/Tagger.java
@@ -0,0 +1,230 @@
+/*
+ * This software was produced for the U. S. Government
+ * under Contract No. W15P7T-11-C-F600, and is
+ * subject to the Rights in Noncommercial Computer Software
+ * and Noncommercial Computer Software Documentation
+ * Clause 252.227-7014 (JUN 1995)
+ *
+ * Copyright 2013 The MITRE Corporation. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.handler.tagger;
+
+import java.io.IOException;
+import java.lang.invoke.MethodHandles;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.util.Bits;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.IntsRef;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Tags maximum string of words in a corpus.  This is a callback-style API
+ * in which you implement {@link #tagCallback(int, int, Object)}.
+ *
+ * This class should be independently usable outside Solr.
+ */
+public abstract class Tagger {
+  private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
+
+  private final TokenStream tokenStream;
+  private final TermToBytesRefAttribute byteRefAtt;
+  private final PositionIncrementAttribute posIncAtt;
+  private final OffsetAttribute offsetAtt;
+  private final TaggingAttribute taggingAtt;
+
+  private final TagClusterReducer tagClusterReducer;
+  private final Terms terms;
+  private final Bits liveDocs;
+  private final boolean skipAltTokens;
+  private final boolean ignoreStopWords;
+
+  private Map<BytesRef, IntsRef> docIdsCache;
+
+  /** Whether the WARNING about skipped tokens was already logged. */
+  private boolean loggedSkippedAltTokenWarning = false;
+
+  public Tagger(Terms terms, Bits liveDocs, TokenStream tokenStream,
+                TagClusterReducer tagClusterReducer, boolean skipAltTokens,
+                boolean ignoreStopWords) throws IOException {
+    this.terms = terms;
+    this.liveDocs = liveDocs;
+    this.tokenStream = tokenStream;
+    this.skipAltTokens = skipAltTokens;
+    this.ignoreStopWords = ignoreStopWords;
+    byteRefAtt = tokenStream.addAttribute(TermToBytesRefAttribute.class);
+    posIncAtt = tokenStream.addAttribute(PositionIncrementAttribute.class);
+    offsetAtt = tokenStream.addAttribute(OffsetAttribute.class);
+    taggingAtt = tokenStream.addAttribute(TaggingAttribute.class);
+    tokenStream.reset();
+
+    this.tagClusterReducer = tagClusterReducer;
+  }
+
+  public void enableDocIdsCache(int initSize) {
+    if (initSize > 0)
+      docIdsCache = new HashMap<>(initSize);
+  }
+
+  public void process() throws IOException {
+    if (terms == null)
+      return;
+
+    //a shared pointer to the head used by this method and each Tag instance.
+    final TagLL[] head = new TagLL[1];
+
+    TermPrefixCursor cursor = null;//re-used
+
+    //boolean switch used to log warnings in case tokens were skipped during tagging.
+    boolean skippedTokens = false;
+
+    while (tokenStream.incrementToken()) {
+      if (log.isTraceEnabled()) {
+        log.trace("Token: {}, posInc: {},  offset: [{},{}]",
+                byteRefAtt, posIncAtt.getPositionIncrement(),
+                offsetAtt.startOffset(), offsetAtt.endOffset());
+      }
+      //check for posInc < 1 (alternate Tokens, such as expanded Synonyms)
+      if (posIncAtt.getPositionIncrement() < 1) {
+        //(a) Deal with this as a configuration issue and throw an exception
+        if (!skipAltTokens) {
+          //TODO throw UnsupportedTokenException when PhraseBuilder is ported
+          throw new IllegalStateException("Query Analyzer generates alternate "
+              + "Tokens (posInc == 0). Please adapt your Analyzer configuration or "
+              + "enable '" + TaggerRequestHandler.SKIP_ALT_TOKENS + "' to skip such "
+              + "tokens. NOTE: enabling '" + TaggerRequestHandler.SKIP_ALT_TOKENS
+              + "' might result in wrong tagging results if the index time analyzer "
+              + "is not configured accordingly. For detailed information see "
+              + "https://github.com/OpenSextant/SolrTextTagger/pull/11#issuecomment-24936225");
+        } else {
+          //(b) In case the index time analyzer had indexed all variants (users
+          //    need to ensure that) processing of alternate tokens can be skipped
+          //    as anyways all alternatives will be contained in the FST.
+          skippedTokens = true;
+          log.trace("  ... ignored token");
+          continue;
+        }
+      }
+      //-- If PositionIncrement > 1 (stopwords)
+      if (!ignoreStopWords && posIncAtt.getPositionIncrement() > 1) {
+        log.trace("   - posInc > 1 ... mark cluster as done");
+        advanceTagsAndProcessClusterIfDone(head, null);
+      }
+
+      final BytesRef term;
+      //NOTE: we need to lookup tokens if
+      // * the LookupAtt is true OR
+      // * there are still advancing tags (to find the longest possible match)
+      if(taggingAtt.isTaggable() || head[0] != null){
+        //-- Lookup the term id from the next token
+        term = byteRefAtt.getBytesRef();
+        if (term.length == 0) {
+          throw new IllegalArgumentException("term: " + term.utf8ToString() + " analyzed to a zero-length token");
+        }
+      } else { //no current cluster AND lookup == false ...
+        term = null; //skip this token
+      }
+
+      //-- Process tag
+      advanceTagsAndProcessClusterIfDone(head, term);
+
+      //-- only create new Tags for Tokens we need to lookup
+      if (taggingAtt.isTaggable() && term != null) {
+
+        //determine if the terms index has a term starting with the provided term
+        // TODO create a pool of these cursors to reuse them more?  could be trivial impl
+        if (cursor == null)// (else the existing cursor will be re-used)
+          cursor = new TermPrefixCursor(terms.iterator(), liveDocs, docIdsCache);
+        if (cursor.advance(term)) {
+          TagLL newTail = new TagLL(head, cursor, offsetAtt.startOffset(), offsetAtt.endOffset(), null);
+          cursor = null;//because the new tag now "owns" this instance
+          //and add it to the end
+          if (head[0] == null) {
+            head[0] = newTail;
+          } else {
+            for (TagLL t = head[0]; true; t = t.nextTag) {
+              if (t.nextTag == null) {
+                t.addAfterLL(newTail);
+                break;
+              }
+            }
+          }
+        }
+      }//if termId >= 0
+    }//end while(incrementToken())
+
+    //-- Finish all tags
+    advanceTagsAndProcessClusterIfDone(head, null);
+    assert head[0] == null;
+
+    if(!loggedSkippedAltTokenWarning && skippedTokens){
+      loggedSkippedAltTokenWarning = true; //only log once
+      log.warn("The Tagger skipped some alternate tokens (tokens with posInc == 0) "
+          + "while processing text. This may cause problems with some Analyzer "
+          + "configurations (e.g. query time synonym expansion). For details see "
+          + "https://github.com/OpenSextant/SolrTextTagger/pull/11#issuecomment-24936225");
+    }
+
+    tokenStream.end();
+    //tokenStream.close(); caller closes because caller acquired it
+  }
+
+  private void advanceTagsAndProcessClusterIfDone(TagLL[] head, BytesRef term) throws IOException {
+    //-- Advance tags
+    final int endOffset = term != null ? offsetAtt.endOffset() : -1;
+    boolean anyAdvance = false;
+    for (TagLL t = head[0]; t != null; t = t.nextTag) {
+      anyAdvance |= t.advance(term, endOffset);
+    }
+
+    //-- Process cluster if done
+    if (!anyAdvance && head[0] != null) {
+      tagClusterReducer.reduce(head);
+      for (TagLL t = head[0]; t != null; t = t.nextTag) {
+        assert t.value != null;
+        tagCallback(t.startOffset, t.endOffset, t.value);
+      }
+      head[0] = null;
+    }
+  }
+
+  /**
+   * Invoked by {@link #process()} for each tag found.  endOffset is always &gt;= the endOffset
+   * given in the previous call.
+   *
+   * @param startOffset The character offset of the original stream where the tag starts.
+   * @param endOffset One more than the character offset of the original stream where the tag ends.
+   * @param docIdsKey A reference to the matching docIds that can be resolved via {@link #lookupDocIds(Object)}.
+   */
+  protected abstract void tagCallback(int startOffset, int endOffset, Object docIdsKey);
+
+  /**
+   * Returns a sorted array of integer docIds given the corresponding key.
+   * @param docIdsKey The lookup key.
+   * @return Not null
+   */
+  protected IntsRef lookupDocIds(Object docIdsKey) {
+    return (IntsRef) docIdsKey;
+  }
+}
+

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/cf633921/solr/core/src/java/org/apache/solr/handler/tagger/TaggerRequestHandler.java
----------------------------------------------------------------------
diff --git a/solr/core/src/java/org/apache/solr/handler/tagger/TaggerRequestHandler.java b/solr/core/src/java/org/apache/solr/handler/tagger/TaggerRequestHandler.java
new file mode 100644
index 0000000..a972e47
--- /dev/null
+++ b/solr/core/src/java/org/apache/solr/handler/tagger/TaggerRequestHandler.java
@@ -0,0 +1,397 @@
+/*
+ * This software was produced for the U. S. Government
+ * under Contract No. W15P7T-11-C-F600, and is
+ * subject to the Rights in Noncommercial Computer Software
+ * and Noncommercial Computer Software Documentation
+ * Clause 252.227-7014 (JUN 1995)
+ *
+ * Copyright 2013 The MITRE Corporation. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.handler.tagger;
+
+import javax.xml.stream.XMLStreamException;
+import java.io.IOException;
+import java.io.Reader;
+import java.io.StringReader;
+import java.lang.invoke.MethodHandles;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import java.util.concurrent.Callable;
+
+import com.google.common.io.CharStreams;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.core.StopFilterFactory;
+import org.apache.lucene.analysis.util.TokenFilterFactory;
+import org.apache.lucene.index.LeafReaderContext;
+import org.apache.lucene.index.ReaderUtil;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.queries.function.FunctionValues;
+import org.apache.lucene.queries.function.ValueSource;
+import org.apache.lucene.search.DocIdSetIterator;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.util.BitSetIterator;
+import org.apache.lucene.util.Bits;
+import org.apache.lucene.util.FixedBitSet;
+import org.apache.lucene.util.IntsRef;
+import org.apache.solr.analysis.TokenizerChain;
+import org.apache.solr.common.SolrException;
+import org.apache.solr.common.params.CommonParams;
+import org.apache.solr.common.params.SolrParams;
+import org.apache.solr.common.util.ContentStream;
+import org.apache.solr.common.util.NamedList;
+import org.apache.solr.handler.RequestHandlerBase;
+import org.apache.solr.request.SolrQueryRequest;
+import org.apache.solr.response.SolrQueryResponse;
+import org.apache.solr.schema.FieldType;
+import org.apache.solr.schema.SchemaField;
+import org.apache.solr.search.BitDocSet;
+import org.apache.solr.search.DocList;
+import org.apache.solr.search.DocSet;
+import org.apache.solr.search.DocSlice;
+import org.apache.solr.search.QParser;
+import org.apache.solr.search.SolrIndexSearcher;
+import org.apache.solr.search.SolrReturnFields;
+import org.apache.solr.search.SyntaxError;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Scans posted text, looking for matching strings in the Solr index.
+ * The public static final String members are request parameters.
+ * This handler is also called the "SolrTextTagger".
+ *
+ * @since 7.4.0
+ */
+public class TaggerRequestHandler extends RequestHandlerBase {
+
+  /** Request parameter. */
+  public static final String OVERLAPS = "overlaps";
+  /** Request parameter. */
+  public static final String TAGS_LIMIT = "tagsLimit";
+  /** Request parameter. */
+  public static final String MATCH_TEXT = "matchText";
+  /** Request parameter. */
+  public static final String SKIP_ALT_TOKENS = "skipAltTokens";
+  /** Request parameter. */
+  public static final String IGNORE_STOPWORDS = "ignoreStopwords";
+  /** Request parameter. */
+  public static final String XML_OFFSET_ADJUST = "xmlOffsetAdjust";
+
+  private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
+
+  @Override
+  public String getDescription() {
+    return "Processes input text to find matching tokens stored in the index.";
+  }
+
+  @Override
+  public void handleRequestBody(SolrQueryRequest req, SolrQueryResponse rsp) throws Exception {
+
+    //--Read params
+    final String indexedField = req.getParams().get("field");
+    if (indexedField == null)
+      throw new RuntimeException("required param 'field'");
+
+    final TagClusterReducer tagClusterReducer =
+            chooseTagClusterReducer(req.getParams().get(OVERLAPS));
+    final int rows = req.getParams().getInt(CommonParams.ROWS, 10000);
+    final int tagsLimit = req.getParams().getInt(TAGS_LIMIT, 1000);
+    final boolean addMatchText = req.getParams().getBool(MATCH_TEXT, false);
+    final SchemaField idSchemaField = req.getSchema().getUniqueKeyField();
+    if (idSchemaField == null) {
+      throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "The tagger requires a" +
+              "uniqueKey in the schema.");//TODO this could be relaxed
+    }
+    final boolean skipAltTokens = req.getParams().getBool(SKIP_ALT_TOKENS, false);
+    final boolean ignoreStopWords = req.getParams().getBool(IGNORE_STOPWORDS,
+            fieldHasIndexedStopFilter(indexedField, req));
+
+    //--Get posted data
+    Reader inputReader = null;
+    Iterable<ContentStream> streams = req.getContentStreams();
+    if (streams != null) {
+      Iterator<ContentStream> iter = streams.iterator();
+      if (iter.hasNext()) {
+        inputReader = iter.next().getReader();
+      }
+      if (iter.hasNext()) {
+        throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
+            getClass().getSimpleName()+" does not support multiple ContentStreams"); //TODO support bulk tagging?
+      }
+    }
+    if (inputReader == null) {
+      throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
+          getClass().getSimpleName()+" requires text to be POSTed to it");
+    }
+
+    // We may or may not need to read the input into a string
+    final InputStringLazy inputStringFuture = new InputStringLazy(inputReader);
+
+    final OffsetCorrector offsetCorrector = getOffsetCorrector(req.getParams(), inputStringFuture);
+
+    final String inputString;//only populated if needed
+    if (addMatchText || inputStringFuture.inputString != null) {
+      //Read the input fully into a String buffer that we'll need later,
+      // then replace the input with a reader wrapping the buffer.
+      inputString = inputStringFuture.call();
+      inputReader.close();
+      inputReader = new StringReader(inputString);
+    } else {
+      inputString = null;//not used
+    }
+
+    final SolrIndexSearcher searcher = req.getSearcher();
+    final FixedBitSet matchDocIdsBS = new FixedBitSet(searcher.maxDoc());
+    final List tags = new ArrayList(2000);
+
+    try {
+      Analyzer analyzer = req.getSchema().getField(indexedField).getType().getQueryAnalyzer();
+      try (TokenStream tokenStream = analyzer.tokenStream("", inputReader)) {
+        Terms terms = searcher.getSlowAtomicReader().terms(indexedField);
+        if (terms == null)
+          throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
+                  "field " + indexedField + " has no indexed data");
+        Tagger tagger = new Tagger(terms, computeDocCorpus(req), tokenStream, tagClusterReducer,
+                skipAltTokens, ignoreStopWords) {
+          @SuppressWarnings("unchecked")
+          @Override
+          protected void tagCallback(int startOffset, int endOffset, Object docIdsKey) {
+            if (tags.size() >= tagsLimit)
+              return;
+            if (offsetCorrector != null) {
+              int[] offsetPair = offsetCorrector.correctPair(startOffset, endOffset);
+              if (offsetPair == null) {
+                log.debug("Discarded offsets [{}, {}] because couldn't balance XML.",
+                        startOffset, endOffset);
+                return;
+              }
+              startOffset = offsetPair[0];
+              endOffset = offsetPair[1];
+            }
+
+            NamedList tag = new NamedList();
+            tag.add("startOffset", startOffset);
+            tag.add("endOffset", endOffset);
+            if (addMatchText)
+              tag.add("matchText", inputString.substring(startOffset, endOffset));
+            //below caches, and also flags matchDocIdsBS
+            tag.add("ids", lookupSchemaDocIds(docIdsKey));
+            tags.add(tag);
+          }
+
+          Map<Object, List> docIdsListCache = new HashMap<>(2000);
+
+          ValueSourceAccessor uniqueKeyCache = new ValueSourceAccessor(searcher,
+                  idSchemaField.getType().getValueSource(idSchemaField, null));
+
+          @SuppressWarnings("unchecked")
+          private List lookupSchemaDocIds(Object docIdsKey) {
+            List schemaDocIds = docIdsListCache.get(docIdsKey);
+            if (schemaDocIds != null)
+              return schemaDocIds;
+            IntsRef docIds = lookupDocIds(docIdsKey);
+            //translate lucene docIds to schema ids
+            schemaDocIds = new ArrayList(docIds.length);
+            for (int i = docIds.offset; i < docIds.offset + docIds.length; i++) {
+              int docId = docIds.ints[i];
+              assert i == docIds.offset || docIds.ints[i - 1] < docId : "not sorted?";
+              matchDocIdsBS.set(docId);//also, flip docid in bitset
+              try {
+                schemaDocIds.add(uniqueKeyCache.objectVal(docId));//translates here
+              } catch (IOException e) {
+                throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
+              }
+            }
+            assert !schemaDocIds.isEmpty();
+
+            docIdsListCache.put(docIds, schemaDocIds);
+            return schemaDocIds;
+          }
+
+        };
+        tagger.enableDocIdsCache(2000);//TODO configurable
+        tagger.process();
+      }
+    } finally {
+      inputReader.close();
+    }
+    rsp.add("tagsCount",tags.size());
+    rsp.add("tags", tags);
+
+    rsp.setReturnFields(new SolrReturnFields( req ));
+
+    //Solr's standard name for matching docs in response
+    rsp.add("response", getDocList(rows, matchDocIdsBS));
+  }
+
+  private static class InputStringLazy implements Callable<String> {
+    final Reader inputReader;
+    String inputString;
+
+    InputStringLazy(Reader inputReader) {
+      this.inputReader = inputReader;
+    }
+
+    @Override
+    public String call() throws IOException {
+      if (inputString == null) {
+        inputString = CharStreams.toString(inputReader);
+      }
+      return inputString;
+    }
+  }
+
+  protected OffsetCorrector getOffsetCorrector(SolrParams params, Callable<String> inputStringProvider) throws Exception {
+    final boolean xmlOffsetAdjust = params.getBool(XML_OFFSET_ADJUST, false);
+    if (!xmlOffsetAdjust) {
+      return null;
+    }
+    try {
+      return new XmlOffsetCorrector(inputStringProvider.call());
+    } catch (XMLStreamException e) {
+      throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
+          "Expecting XML but wasn't: " + e, e);
+    }
+  }
+
+  private DocList getDocList(int rows, FixedBitSet matchDocIdsBS) throws IOException {
+    //Now we must supply a Solr DocList and add it to the response.
+    //  Typically this is gotten via a SolrIndexSearcher.search(), but in this case we
+    //  know exactly what documents to return, the order doesn't matter nor does
+    //  scoring.
+    //  Ideally an implementation of DocList could be directly implemented off
+    //  of a BitSet, but there are way too many methods to implement for a minor
+    //  payoff.
+    int matchDocs = matchDocIdsBS.cardinality();
+    int[] docIds = new int[ Math.min(rows, matchDocs) ];
+    DocIdSetIterator docIdIter = new BitSetIterator(matchDocIdsBS, 1);
+    for (int i = 0; i < docIds.length; i++) {
+      docIds[i] = docIdIter.nextDoc();
+    }
+    return new DocSlice(0, docIds.length, docIds, null, matchDocs, 1f);
+  }
+
+  private TagClusterReducer chooseTagClusterReducer(String overlaps) {
+    TagClusterReducer tagClusterReducer;
+    if (overlaps == null || overlaps.equals("NO_SUB")) {
+      tagClusterReducer = TagClusterReducer.NO_SUB;
+    } else if (overlaps.equals("ALL")) {
+      tagClusterReducer = TagClusterReducer.ALL;
+    } else if (overlaps.equals("LONGEST_DOMINANT_RIGHT")) {
+      tagClusterReducer = TagClusterReducer.LONGEST_DOMINANT_RIGHT;
+    } else {
+      throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
+          "unknown tag overlap mode: "+overlaps);
+    }
+    return tagClusterReducer;
+  }
+
+  /**
+   * The set of documents matching the provided 'fq' (filter query). Don't include deleted docs
+   * either. If null is returned, then all docs are available.
+   */
+  private Bits computeDocCorpus(SolrQueryRequest req) throws SyntaxError, IOException {
+    final String[] corpusFilterQueries = req.getParams().getParams("fq");
+    final SolrIndexSearcher searcher = req.getSearcher();
+    final Bits docBits;
+    if (corpusFilterQueries != null && corpusFilterQueries.length > 0) {
+      List<Query> filterQueries = new ArrayList<Query>(corpusFilterQueries.length);
+      for (String corpusFilterQuery : corpusFilterQueries) {
+        QParser qParser = QParser.getParser(corpusFilterQuery, null, req);
+        try {
+          filterQueries.add(qParser.parse());
+        } catch (SyntaxError e) {
+          throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, e);
+        }
+      }
+
+      final DocSet docSet = searcher.getDocSet(filterQueries);//hopefully in the cache
+      //note: before Solr 4.7 we could call docSet.getBits() but no longer.
+      if (docSet instanceof BitDocSet) {
+        docBits = ((BitDocSet)docSet).getBits();
+      } else {
+        docBits = new Bits() {
+
+          @Override
+          public boolean get(int index) {
+            return docSet.exists(index);
+          }
+
+          @Override
+          public int length() {
+            return searcher.maxDoc();
+          }
+        };
+      }
+    } else {
+      docBits = searcher.getSlowAtomicReader().getLiveDocs();
+    }
+    return docBits;
+  }
+
+  private boolean fieldHasIndexedStopFilter(String field, SolrQueryRequest req) {
+    FieldType fieldType = req.getSchema().getFieldType(field);
+    Analyzer analyzer = fieldType.getIndexAnalyzer();//index analyzer
+    if (analyzer instanceof TokenizerChain) {
+      TokenizerChain tokenizerChain = (TokenizerChain) analyzer;
+      TokenFilterFactory[] tokenFilterFactories = tokenizerChain.getTokenFilterFactories();
+      for (TokenFilterFactory tokenFilterFactory : tokenFilterFactories) {
+        if (tokenFilterFactory instanceof StopFilterFactory)
+          return true;
+      }
+    }
+    return false;
+  }
+
+  /** See LUCENE-4541 or {@link org.apache.solr.response.transform.ValueSourceAugmenter}. */
+  static class ValueSourceAccessor {
+    private final List<LeafReaderContext> readerContexts;
+    private final ValueSource valueSource;
+    private final Map fContext;
+    private final FunctionValues[] functionValuesPerSeg;
+    private final int[] functionValuesDocIdPerSeg;
+
+    ValueSourceAccessor(IndexSearcher searcher, ValueSource valueSource) {
+      readerContexts = searcher.getIndexReader().leaves();
+      this.valueSource = valueSource;
+      fContext = ValueSource.newContext(searcher);
+      functionValuesPerSeg = new FunctionValues[readerContexts.size()];
+      functionValuesDocIdPerSeg = new int[readerContexts.size()];
+    }
+
+    Object objectVal(int topDocId) throws IOException {
+      // lookup segment level stuff:
+      int segIdx = ReaderUtil.subIndex(topDocId, readerContexts);
+      LeafReaderContext rcontext = readerContexts.get(segIdx);
+      int segDocId = topDocId - rcontext.docBase;
+      // unfortunately Lucene 7.0 requires forward only traversal (with no reset method).
+      //   So we need to track our last docId (per segment) and re-fetch the FunctionValues. :-(
+      FunctionValues functionValues = functionValuesPerSeg[segIdx];
+      if (functionValues == null || segDocId < functionValuesDocIdPerSeg[segIdx]) {
+        functionValues = functionValuesPerSeg[segIdx] = valueSource.getValues(fContext, rcontext);
+      }
+      functionValuesDocIdPerSeg[segIdx] = segDocId;
+
+      // get value:
+      return functionValues.objectVal(segDocId);
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/cf633921/solr/core/src/java/org/apache/solr/handler/tagger/TaggingAttribute.java
----------------------------------------------------------------------
diff --git a/solr/core/src/java/org/apache/solr/handler/tagger/TaggingAttribute.java b/solr/core/src/java/org/apache/solr/handler/tagger/TaggingAttribute.java
new file mode 100644
index 0000000..b7803e4
--- /dev/null
+++ b/solr/core/src/java/org/apache/solr/handler/tagger/TaggingAttribute.java
@@ -0,0 +1,65 @@
+/*
+ * This software was produced for the U. S. Government
+ * under Contract No. W15P7T-11-C-F600, and is
+ * subject to the Rights in Noncommercial Computer Software
+ * and Noncommercial Computer Software Documentation
+ * Clause 252.227-7014 (JUN 1995)
+ *
+ * Copyright 2013 The MITRE Corporation. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.handler.tagger;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.util.Attribute;
+
+/**
+ * Attribute used by the {@link Tagger} to decide if a token can start a
+ * new {@link TagLL tag}.
+ * <p>
+ * By default this Attribute will return <code>true</code>, but it might be
+ * reset by some {@link TokenFilter} added to the {@link TokenStream} used
+ * to analyze the parsed text. Typically this will be done based on NLP
+ * processing results (e.g. to only lookup Named Entities).
+ * <p>
+ * NOTE: that all Tokens are used to advance existing {@link TagLL tags}.
+ */
+public interface TaggingAttribute extends Attribute {
+
+  /**
+   * By default this Attribute will be initialised with <code>true</code>.
+   * This ensures that all tokens are taggable by default (especially if
+   * the {@link TaggingAttribute} is not set by any component in the configured
+   * {@link TokenStream}
+   */
+  // note: interface fields are implicitly public static final; redundant modifiers removed
+  boolean DEFAULT_TAGGABLE = true;
+
+  /**
+   * Getter for the taggable state of the current Token
+   *
+   * @return the state
+   */
+  boolean isTaggable();
+
+  /**
+   * Setter for the taggable state. Typically called by code within
+   * {@link TokenFilter#incrementToken()}.
+   *
+   * @param lookup the state
+   */
+  void setTaggable(boolean lookup);
+
+}

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/cf633921/solr/core/src/java/org/apache/solr/handler/tagger/TaggingAttributeImpl.java
----------------------------------------------------------------------
diff --git a/solr/core/src/java/org/apache/solr/handler/tagger/TaggingAttributeImpl.java b/solr/core/src/java/org/apache/solr/handler/tagger/TaggingAttributeImpl.java
new file mode 100644
index 0000000..55ecfbc6
--- /dev/null
+++ b/solr/core/src/java/org/apache/solr/handler/tagger/TaggingAttributeImpl.java
@@ -0,0 +1,79 @@
+/*
+ * This software was produced for the U. S. Government
+ * under Contract No. W15P7T-11-C-F600, and is
+ * subject to the Rights in Noncommercial Computer Software
+ * and Noncommercial Computer Software Documentation
+ * Clause 252.227-7014 (JUN 1995)
+ *
+ * Copyright 2013 The MITRE Corporation. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.handler.tagger;
+
+import org.apache.lucene.util.AttributeImpl;
+import org.apache.lucene.util.AttributeReflector;
+
+/**
+ * Implementation of the {@link TaggingAttribute}
+ */
+public class TaggingAttributeImpl extends AttributeImpl implements TaggingAttribute {
+
+  /** Current taggable state; starts out as {@link TaggingAttribute#DEFAULT_TAGGABLE}. */
+  private boolean taggable = TaggingAttribute.DEFAULT_TAGGABLE;
+
+  /** {@inheritDoc} */
+  @Override
+  public boolean isTaggable() {
+    return taggable;
+  }
+
+  /** {@inheritDoc} */
+  @Override
+  public void setTaggable(boolean taggable) {
+    this.taggable = taggable;
+  }
+
+  /** Resets the state back to {@link TaggingAttribute#DEFAULT_TAGGABLE}. */
+  @Override
+  public void clear() {
+    this.taggable = DEFAULT_TAGGABLE;
+  }
+
+  /** Copies the taggable state onto {@code target}. */
+  @Override
+  public void copyTo(AttributeImpl target) {
+    ((TaggingAttribute) target).setTaggable(taggable);
+  }
+
+  /** Exposes the taggable state for attribute reflection (debugging/inspection). */
+  @Override
+  public void reflectWith(AttributeReflector reflector) {
+    reflector.reflect(TaggingAttribute.class, "taggable", taggable);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/cf633921/solr/core/src/java/org/apache/solr/handler/tagger/TermPrefixCursor.java
----------------------------------------------------------------------
diff --git a/solr/core/src/java/org/apache/solr/handler/tagger/TermPrefixCursor.java b/solr/core/src/java/org/apache/solr/handler/tagger/TermPrefixCursor.java
new file mode 100644
index 0000000..1e82dbe
--- /dev/null
+++ b/solr/core/src/java/org/apache/solr/handler/tagger/TermPrefixCursor.java
@@ -0,0 +1,189 @@
+/*
+ * This software was produced for the U. S. Government
+ * under Contract No. W15P7T-11-C-F600, and is
+ * subject to the Rights in Noncommercial Computer Software
+ * and Noncommercial Computer Software Documentation
+ * Clause 252.227-7014 (JUN 1995)
+ *
+ * Copyright 2013 The MITRE Corporation. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.handler.tagger;
+
+import java.io.IOException;
+import java.util.Map;
+
+import org.apache.lucene.analysis.miscellaneous.ConcatenateGraphFilter;
+import org.apache.lucene.index.PostingsEnum;
+import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.util.Bits;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.BytesRefBuilder;
+import org.apache.lucene.util.IntsRef;
+
+/**
+ * Cursor into the terms that advances by prefix.
+ */
+class TermPrefixCursor {
+
+  //Note: this could be a lot more efficient if MemoryPostingsFormat supported ordinal lookup.
+  // Maybe that could be added to Lucene.
+
+  // TODO add bloom filter of hashcode of first ~ 6 bytes to avoid lookup into terms dict?
+
+  private static final byte SEPARATOR_CHAR = ConcatenateGraphFilter.SEP_LABEL; // used to be ' '; TODO configurable?
+  private static final IntsRef EMPTY_INTSREF = new IntsRef();
+
+  private final TermsEnum termsEnum;
+  private final Bits liveDocs;
+  // optional cache of prefix -> docIds; null disables caching
+  private final Map<BytesRef, IntsRef> docIdsCache;
+
+  private BytesRef prefixBuf;//we append to this
+  private BytesRefBuilder prefixBufBuilder = new BytesRefBuilder();
+  private boolean prefixBufOnLoan;//if true, PB is loaned; needs to be copied
+  private PostingsEnum postingsEnum;
+  private IntsRef docIds;
+
+  /**
+   * @param termsEnum enumerator over the field's terms; seeked by {@link #advance(BytesRef)}
+   * @param liveDocs live-doc bits used to skip deleted docs, or null for none
+   * @param docIdsCache optional shared cache of prefix to docIds, or null
+   */
+  TermPrefixCursor(TermsEnum termsEnum, Bits liveDocs, Map<BytesRef, IntsRef> docIdsCache) {
+    this.termsEnum = termsEnum;
+    this.liveDocs = liveDocs;
+    this.docIdsCache = docIdsCache;
+  }
+
+  /** Appends the separator char (if not the first) plus the given word to the prefix buffer,
+   * then seeks to it. If the seek fails, false is returned and this cursor
+   * can be re-used as if in a new state.  The {@code word} BytesRef is considered temporary,
+   * and is not saved within this class. */
+  boolean advance(BytesRef word) throws IOException {
+    if (prefixBuf == null) { // first advance
+      //set prefixBuf to word temporary. When advance() completes, we either null out or copy.
+      prefixBuf = word;
+      prefixBufOnLoan = true;
+      if (seekPrefix()) {//... and we have to
+        ensureBufIsACopy();
+        return true;
+      } else {
+        prefixBuf = null;//just to be darned sure 'word' isn't referenced here
+        return false;
+      }
+
+    } else { // subsequent advance
+      //append to existing
+      assert !prefixBufOnLoan;
+
+      prefixBufBuilder.append(SEPARATOR_CHAR);
+      prefixBufBuilder.append(word);
+      prefixBuf = prefixBufBuilder.get();
+      if (seekPrefix()) {
+        return true;
+      } else {
+        prefixBuf = null;
+        return false;
+      }
+    }
+  }
+
+  /** Copies a loaned prefixBuf into our own builder so the caller's BytesRef is not retained. */
+  private void ensureBufIsACopy() {
+    if (!prefixBufOnLoan)
+      return;
+
+    prefixBufBuilder.clear();
+    prefixBufBuilder.copyBytes(prefixBuf);
+    prefixBuf = prefixBufBuilder.get();
+    prefixBufOnLoan = false;
+  }
+
+  /** Seeks to prefixBuf or the next term that is prefixed by prefixBuf plus the separator char.
+   * Sets docIds.
+   * NOTE(review): in the NOT_FOUND case this can return true while docIds stays null
+   * (only a longer prefixed term exists) — callers must tolerate {@link #getDocIds()} == null. **/
+  private boolean seekPrefix() throws IOException {
+    TermsEnum.SeekStatus seekStatus = termsEnum.seekCeil(prefixBuf);
+
+    docIds = null;//invalidate
+    switch (seekStatus) {
+      case END:
+        return false;
+
+      case FOUND:
+        postingsEnum = termsEnum.postings(postingsEnum, PostingsEnum.NONE);
+        docIds = postingsEnumToIntsRef(postingsEnum, liveDocs);
+        if (docIds.length > 0) {
+          return true;
+        }
+
+        //Pretend we didn't find it; go to next term
+        docIds = null;
+        if (termsEnum.next() == null) { // case END
+          return false;
+        }
+        //fall through to NOT_FOUND
+
+      case NOT_FOUND:
+        //termsEnum must start with prefixBuf to continue
+        BytesRef teTerm = termsEnum.term();
+
+        if (teTerm.length > prefixBuf.length) {
+          // byte-wise prefix check, then require the separator right after the prefix
+          for (int i = 0; i < prefixBuf.length; i++) {
+            if (prefixBuf.bytes[prefixBuf.offset + i] != teTerm.bytes[teTerm.offset + i])
+              return false;
+          }
+          if (teTerm.bytes[teTerm.offset + prefixBuf.length] != SEPARATOR_CHAR)
+            return false;
+          return true;
+        }
+        return false;
+    }
+    throw new IllegalStateException(seekStatus.toString());
+  }
+
+  /** Returns an IntsRef either cached or reading postingsEnum. Not null. */
+  private IntsRef postingsEnumToIntsRef(PostingsEnum postingsEnum, Bits liveDocs) throws IOException {
+    // (The cache can have empty IntsRefs)
+
+    //lookup prefixBuf in a cache
+    if (docIdsCache != null) {
+      docIds = docIdsCache.get(prefixBuf);
+      if (docIds != null) {
+        return docIds;
+      }
+    }
+
+    //read postingsEnum
+    docIds = new IntsRef(termsEnum.docFreq());
+    int docId;
+    while ((docId = postingsEnum.nextDoc()) != PostingsEnum.NO_MORE_DOCS) {
+      if (liveDocs != null && !liveDocs.get(postingsEnum.docID())) {
+        continue;
+      }
+      docIds.ints[docIds.length++] = docId;
+    }
+    if (docIds.length == 0)
+      docIds = EMPTY_INTSREF;
+
+    //cache
+    if (docIdsCache != null) {
+      ensureBufIsACopy();
+      //clone is shallow; that's okay as the prefix isn't overwritten; it's just appended to
+      docIdsCache.put(prefixBuf.clone(), docIds);
+    }
+    return docIds;
+  }
+
+  /** The docIds of the last call to advance, if it returned true. It might be null, but
+   * its length won't be 0. Treat as immutable. */
+  IntsRef getDocIds() {
+    assert docIds == null || docIds.length != 0;
+    return docIds;
+  }
+}

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/cf633921/solr/core/src/java/org/apache/solr/handler/tagger/XmlOffsetCorrector.java
----------------------------------------------------------------------
diff --git a/solr/core/src/java/org/apache/solr/handler/tagger/XmlOffsetCorrector.java b/solr/core/src/java/org/apache/solr/handler/tagger/XmlOffsetCorrector.java
new file mode 100644
index 0000000..576328f
--- /dev/null
+++ b/solr/core/src/java/org/apache/solr/handler/tagger/XmlOffsetCorrector.java
@@ -0,0 +1,113 @@
+/*
+ * This software was produced for the U. S. Government
+ * under Contract No. W15P7T-11-C-F600, and is
+ * subject to the Rights in Noncommercial Computer Software
+ * and Noncommercial Computer Software Documentation
+ * Clause 252.227-7014 (JUN 1995)
+ *
+ * Copyright 2013 The MITRE Corporation. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.handler.tagger;
+
+import javax.xml.stream.XMLResolver;
+import javax.xml.stream.XMLStreamException;
+import javax.xml.stream.events.XMLEvent;
+import java.io.InputStream;
+import java.io.StringReader;
+
+import com.ctc.wstx.stax.WstxInputFactory;
+import org.apache.commons.io.input.ClosedInputStream;
+import org.codehaus.stax2.LocationInfo;
+import org.codehaus.stax2.XMLInputFactory2;
+import org.codehaus.stax2.XMLStreamReader2;
+
+/**
+ * Corrects offsets to adjust for XML formatted data. The goal is such that the caller should be
+ * able to insert a start XML tag at the start offset and a corresponding end XML tag at the end
+ * offset of the tagger, and have it be valid XML.  See {@link #correctPair(int, int)}.
+ *
+ * This will not work on invalid XML.
+ *
+ * Not thread-safe.
+ */
+public class XmlOffsetCorrector extends OffsetCorrector {
+
+  //TODO use StAX without hard requirement on woodstox.   xmlStreamReader.getLocation().getCharacterOffset()
+
+  private static final XMLInputFactory2 XML_INPUT_FACTORY;
+  static {
+    // note: similar code in Solr's EmptyEntityResolver
+    XML_INPUT_FACTORY = new WstxInputFactory();
+    // Resolve every external entity to an already-closed stream so its content is
+    // never read (hardens against external-entity (XXE) content injection).
+    XML_INPUT_FACTORY.setXMLResolver(new XMLResolver() {
+      @Override
+      public InputStream resolveEntity(String publicId, String systemId, String baseURI, String namespace) {
+        return ClosedInputStream.CLOSED_INPUT_STREAM;
+      }
+    });
+    // TODO disable DTD?
+    // XML_INPUT_FACTORY.setProperty(XMLInputFactory.IS_VALIDATING, Boolean.FALSE)
+    XML_INPUT_FACTORY.configureForSpeed();
+  }
+
+  /**
+   * Initialize based on the document text.  Parses {@code docText} with StAX,
+   * recording each element's parent and the character offsets of its open and
+   * close tags (see the tagInfo layout comment in the body).
+   * @param docText non-null XML content.
+   * @throws XMLStreamException If there's a problem parsing the XML.
+   */
+  public XmlOffsetCorrector(String docText) throws XMLStreamException {
+    super(docText, false);
+
+    int tagCounter = 0;  // assigns each open tag a sequential id
+    int thisTag = -1;    // id of the innermost currently-open tag; -1 = none (document level)
+
+    //note: we *could* add a virtual outer tag to guarantee all text is in the context of a tag,
+    // but we shouldn't need to because there is no findable text outside the top element.
+
+    final XMLStreamReader2 xmlStreamReader =
+            (XMLStreamReader2) XML_INPUT_FACTORY.createXMLStreamReader(new StringReader(docText));
+
+    while (xmlStreamReader.hasNext()) {
+      int eventType = xmlStreamReader.next();
+      switch (eventType) {
+        case XMLEvent.START_ELEMENT: {
+          // tagInfo holds 5 ints per tag:
+          //   [parentTagId, openTagStartOff, openTagEndOff, closeTagStartOff, closeTagEndOff]
+          tagInfo.ensureCapacity(tagInfo.size() + 5);
+          final int parentTag = thisTag;
+          final LocationInfo info = xmlStreamReader.getLocationInfo();
+          tagInfo.add(parentTag);
+          tagInfo.add((int) info.getStartingCharOffset(), (int) info.getEndingCharOffset());
+          tagInfo.add(-1, -1);//these 2 will be populated when we get to the close tag
+          thisTag = tagCounter++;
+
+          parentChangeOffsets.add((int) info.getStartingCharOffset());
+          parentChangeIds.add(thisTag);
+          break;
+        }
+        case XMLEvent.END_ELEMENT: {
+          final LocationInfo info = xmlStreamReader.getLocationInfo();
+          // fill in the close-tag offsets reserved at START_ELEMENT (slots 3 and 4)
+          tagInfo.set(5 * thisTag + 3, (int) info.getStartingCharOffset());
+          tagInfo.set(5 * thisTag + 4, (int) info.getEndingCharOffset());
+          thisTag = getParentTag(thisTag);
+
+          parentChangeOffsets.add((int) info.getEndingCharOffset());
+          parentChangeIds.add(thisTag);
+          break;
+        }
+        default: //do nothing
+      }
+    }
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/cf633921/solr/core/src/java/org/apache/solr/handler/tagger/package-info.java
----------------------------------------------------------------------
diff --git a/solr/core/src/java/org/apache/solr/handler/tagger/package-info.java b/solr/core/src/java/org/apache/solr/handler/tagger/package-info.java
new file mode 100644
index 0000000..c2055b3
--- /dev/null
+++ b/solr/core/src/java/org/apache/solr/handler/tagger/package-info.java
@@ -0,0 +1,27 @@
+/*
+ * This software was produced for the U. S. Government
+ * under Contract No. W15P7T-11-C-F600, and is
+ * subject to the Rights in Noncommercial Computer Software
+ * and Noncommercial Computer Software Documentation
+ * Clause 252.227-7014 (JUN 1995)
+ *
+ * Copyright 2013 The MITRE Corporation. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * The {@link org.apache.solr.handler.tagger.TaggerRequestHandler} and supporting classes.
+ * This was formerly known as OpenSextant's SolrTextTagger.
+ */
+package org.apache.solr.handler.tagger;
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/cf633921/solr/core/src/test-files/solr/collection1/conf/schema-tagger.xml
----------------------------------------------------------------------
diff --git a/solr/core/src/test-files/solr/collection1/conf/schema-tagger.xml b/solr/core/src/test-files/solr/collection1/conf/schema-tagger.xml
new file mode 100644
index 0000000..051cd10
--- /dev/null
+++ b/solr/core/src/test-files/solr/collection1/conf/schema-tagger.xml
@@ -0,0 +1,187 @@
+<?xml version="1.0" encoding="UTF-8" ?>
+<!--
+  This software was produced for the U. S. Government
+  under Contract No. W15P7T-11-C-F600, and is
+  subject to the Rights in Noncommercial Computer Software
+  and Noncommercial Computer Software Documentation
+  Clause 252.227-7014 (JUN 1995)
+
+  Copyright 2013 The MITRE Corporation. All Rights Reserved.
+
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+  -->
+<schema name="minimal" version="1.6">
+
+  <fields>
+    <field name="id" type="string" docValues="true" required="true"/>
+    <field name="name" type="string"/>
+    <!-- freq, positions, and norms are not needed by the tagger. However if you
+    intend to have this field be used for general search, you should not exclude
+    these stats. -->
+    <field name="name_tag" type="tag" stored="false"/>
+    <field name="name_tagStop" type="tagStop" stored="false"/>
+    <field name="name_tagWDF" type="tagWDF" stored="false"/>
+    <!--<field name="name_tagPartial" type="tagPartial" stored="false"/>-->
+    <field name="name_tagXml" type="tagXml" stored="false"/>
+    <field name="name_tagAttribute" type="tagAttribute" stored="false"/>
+
+    <copyField source="name" dest="name_tag"/>
+    <copyField source="name" dest="name_tagStop"/>
+    <copyField source="name" dest="name_tagWDF"/>
+    <!--<copyField source="name" dest="name_tagPartial"/>-->
+    <copyField source="name" dest="name_tagXml"/>
+    <copyField source="name" dest="name_tagAttribute"/>
+
+    <dynamicField name="*" type="string" indexed="true" stored="true"/>
+  </fields>
+
+  <uniqueKey>id</uniqueKey>
+
+  <types>
+    <fieldType name="string" class="solr.StrField"/>
+
+    <fieldType name="long" class="solr.TrieLongField" precisionStep="0" positionIncrementGap="0"/>
+    <fieldType name="int" class="solr.TrieIntField" precisionStep="0" positionIncrementGap="0"/>
+
+    <fieldType name="tag" class="solr.TextField" positionIncrementGap="100"
+               postingsFormat="FST50" omitTermFreqAndPositions="true" omitNorms="true">
+      <analyzer type="index">
+        <tokenizer class="solr.StandardTokenizerFactory" />
+        <!--Usually good but for our tests, lets not. <filter class="solr.ASCIIFoldingFilterFactory"/>-->
+        <filter class="solr.LowerCaseFilterFactory"/>
+
+        <filter class="solr.ConcatenateGraphFilterFactory" preservePositionIncrements="false" />
+      </analyzer>
+      <analyzer type="query">
+        <tokenizer class="solr.StandardTokenizerFactory" />
+        <!--<filter class="solr.ASCIIFoldingFilterFactory"/>-->
+        <filter class="solr.LowerCaseFilterFactory"/>
+      </analyzer>
+    </fieldType>
+
+    <!-- adds a stop filter -->
+    <fieldType name="tagStop" class="solr.TextField" positionIncrementGap="100"
+               postingsFormat="FST50" omitTermFreqAndPositions="true" omitNorms="true">
+      <analyzer type="index">
+        <tokenizer class="solr.StandardTokenizerFactory" />
+        <filter class="solr.LowerCaseFilterFactory"/>
+        <filter class="solr.StopFilterFactory" /><!-- by default english stopwords -->
+
+        <filter class="solr.ConcatenateGraphFilterFactory" preservePositionIncrements="false" />
+      </analyzer>
+      <analyzer type="query">
+        <tokenizer class="solr.StandardTokenizerFactory" />
+        <filter class="solr.LowerCaseFilterFactory"/>
+        <filter class="solr.StopFilterFactory" /><!-- by default english stopwords -->
+      </analyzer>
+    </fieldType>
+
+    <!-- adds a WordDelimiterFilter producing stacked/synonym tokens -->
+    <fieldType name="tagWDF" class="solr.TextField" positionIncrementGap="100"
+               postingsFormat="FST50" omitTermFreqAndPositions="true" omitNorms="true">
+      <analyzer type="index">
+        <tokenizer class="solr.WhitespaceTokenizerFactory" />
+        <filter class="solr.WordDelimiterGraphFilterFactory"
+                generateWordParts="1" generateNumberParts="1"
+                catenateWords="1" catenateNumbers="1" catenateAll="0"/>
+        <filter class="solr.LowerCaseFilterFactory"/>
+
+        <filter class="solr.ConcatenateGraphFilterFactory" preservePositionIncrements="false" />
+      </analyzer>
+      <analyzer type="query">
+        <tokenizer class="solr.WhitespaceTokenizerFactory" />
+        <filter class="solr.WordDelimiterGraphFilterFactory"
+                generateWordParts="1" generateNumberParts="1"
+                catenateWords="0" catenateNumbers="0" catenateAll="0"/>
+        <filter class="solr.LowerCaseFilterFactory"/>
+      </analyzer>
+    </fieldType>
+
+    <!-- Problem: in XML can't write \u001F
+    <fieldType name="tagPartial" class="solr.TextField" positionIncrementGap="100"
+               postingsFormat="FST50" omitTermFreqAndPositions="true" omitNorms="true">
+      <analyzer type="index">
+        <tokenizer class="solr.StandardTokenizerFactory" />
+        <filter class="solr.LowerCaseFilterFactory"/>
+
+        <filter class="solr.ShingleFilterFactory" tokenSeparator="&#x1F;"
+                outputUnigramsIfNoShingles="true" maxShingleSize="10"  />
+      </analyzer>
+      <analyzer type="query">
+        <tokenizer class="solr.StandardTokenizerFactory" />
+        <filter class="solr.LowerCaseFilterFactory"/>
+      </analyzer>
+    </fieldType>
+    -->
+
+    <fieldType name="tagXml" class="solr.TextField" positionIncrementGap="100"
+               postingsFormat="FST50" omitTermFreqAndPositions="true" omitNorms="true">
+      <analyzer type="index">
+        <tokenizer class="solr.StandardTokenizerFactory" />
+        <filter class="solr.LowerCaseFilterFactory"/>
+
+        <filter class="solr.ConcatenateGraphFilterFactory" preservePositionIncrements="false" />
+      </analyzer>
+      <analyzer type="query">
+        <charFilter class="solr.HTMLStripCharFilterFactory" /><!-- ADDED THIS! -->
+        <tokenizer class="solr.StandardTokenizerFactory" />
+        <filter class="solr.LowerCaseFilterFactory"/>
+      </analyzer>
+    </fieldType>
+
+    <fieldType name="tagAttribute" class="solr.TextField" positionIncrementGap="100" postingsFormat="FST50">
+      <analyzer type="index">
+        <tokenizer class="solr.StandardTokenizerFactory" />
+        <filter class="solr.ASCIIFoldingFilterFactory"/>
+        <filter class="solr.LowerCaseFilterFactory"/>
+
+        <filter class="solr.ConcatenateGraphFilterFactory" preservePositionIncrements="false" />
+      </analyzer>
+      <analyzer type="query">
+        <!-- 32 just for tests, bumps posInc -->
+        <tokenizer class="solr.StandardTokenizerFactory"
+                   maxTokenLength="32"/>
+        <!--
+         NOTE: This uses the WordLengthTaggingFilterFactory to test the
+         TaggingAttribute. The WordLengthTaggingFilter sets the TaggingAttribute
+         for words based on their length. The attribute is ignored at indexing
+         time, but the Tagger will use it to only start tags for words that are
+         equal to or longer than the configured minLength.
+         -->
+        <filter class="org.apache.solr.handler.tagger.WordLengthTaggingFilterFactory" minLength="4"/>
+        <filter class="solr.ASCIIFoldingFilterFactory"/>
+        <filter class="solr.LowerCaseFilterFactory"/>
+      </analyzer>
+    </fieldType>
+
+    <fieldType name="text_general" class="solr.TextField" positionIncrementGap="100">
+      <analyzer type="index">
+        <tokenizer class="solr.StandardTokenizerFactory"/>
+        <!-- <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" /> -->
+        <!-- in this example, we will only use synonyms at query time
+        <filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
+        -->
+        <filter class="solr.LowerCaseFilterFactory"/>
+      </analyzer>
+      <analyzer type="query">
+        <tokenizer class="solr.StandardTokenizerFactory"/>
+        <!-- <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" /> -->
+        <!-- <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/> -->
+        <filter class="solr.LowerCaseFilterFactory"/>
+      </analyzer>
+    </fieldType>
+
+  </types>
+
+
+</schema>

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/cf633921/solr/core/src/test-files/solr/collection1/conf/solrconfig-tagger.xml
----------------------------------------------------------------------
diff --git a/solr/core/src/test-files/solr/collection1/conf/solrconfig-tagger.xml b/solr/core/src/test-files/solr/collection1/conf/solrconfig-tagger.xml
new file mode 100644
index 0000000..e0d3677
--- /dev/null
+++ b/solr/core/src/test-files/solr/collection1/conf/solrconfig-tagger.xml
@@ -0,0 +1,59 @@
+<?xml version="1.0" ?>
+<!--
+  This software was produced for the U. S. Government
+  under Contract No. W15P7T-11-C-F600, and is
+  subject to the Rights in Noncommercial Computer Software
+  and Noncommercial Computer Software Documentation
+  Clause 252.227-7014 (JUN 1995)
+
+  Copyright 2013 The MITRE Corporation. All Rights Reserved.
+
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+  -->
+
+<!-- a basic solrconfig that tests can use when they want simple minimal solrconfig/schema
+     DO NOT ADD THINGS TO THIS CONFIG! -->
+<config>
+  <luceneMatchVersion>${tests.luceneMatchVersion:LUCENE_CURRENT}</luceneMatchVersion>
+  <dataDir>${solr.data.dir:}</dataDir>
+  <directoryFactory name="DirectoryFactory" class="${solr.directoryFactory:solr.RAMDirectoryFactory}"/>
+
+  <!-- for postingsFormat="..." -->
+  <codecFactory name="CodecFactory" class="solr.SchemaCodecFactory" />
+
+  <!-- since Solr 4.8: -->
+  <schemaFactory name="SchemaFactory" class="solr.ClassicIndexSchemaFactory" />
+
+  <query>
+    <!-- illustrate putting in memory for warm-up -->
+    <listener event="firstSearcher" class="solr.QuerySenderListener">
+      <arr name="queries">
+        <lst><str name="q">name_tag:[* TO *]</str></lst>
+      </arr>
+    </listener>
+    <listener event="newSearcher" class="solr.QuerySenderListener">
+      <arr name="queries">
+        <lst><str name="q">name_tag:[* TO *]</str></lst>
+      </arr>
+    </listener>
+  </query>
+
+  <requestHandler name="/select" class="solr.SearchHandler"></requestHandler>
+
+  <requestHandler name="/tag" class="solr.TaggerRequestHandler">
+    <lst name="defaults">
+      <str name="field">name_tag</str>
+    </lst>
+  </requestHandler>
+
+</config>

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/cf633921/solr/core/src/test/org/apache/solr/handler/tagger/EmbeddedSolrNoSerializeTest.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/handler/tagger/EmbeddedSolrNoSerializeTest.java b/solr/core/src/test/org/apache/solr/handler/tagger/EmbeddedSolrNoSerializeTest.java
new file mode 100644
index 0000000..8d31ad0
--- /dev/null
+++ b/solr/core/src/test/org/apache/solr/handler/tagger/EmbeddedSolrNoSerializeTest.java
@@ -0,0 +1,153 @@
+/*
+ * This software was produced for the U. S. Government
+ * under Contract No. W15P7T-11-C-F600, and is
+ * subject to the Rights in Noncommercial Computer Software
+ * and Noncommercial Computer Software Documentation
+ * Clause 252.227-7014 (JUN 1995)
+ *
+ * Copyright 2013 The MITRE Corporation. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.handler.tagger;
+
+import java.io.IOException;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.concurrent.atomic.AtomicReference;
+import java.util.function.BiFunction;
+
+import org.apache.lucene.document.Field;
+import org.apache.solr.SolrTestCaseJ4;
+import org.apache.solr.client.solrj.SolrServerException;
+import org.apache.solr.client.solrj.StreamingResponseCallback;
+import org.apache.solr.client.solrj.embedded.EmbeddedSolrServer;
+import org.apache.solr.client.solrj.request.QueryRequest;
+import org.apache.solr.client.solrj.response.QueryResponse;
+import org.apache.solr.common.SolrDocument;
+import org.apache.solr.common.SolrDocumentList;
+import org.apache.solr.common.params.ModifiableSolrParams;
+import org.apache.solr.common.params.SolrParams;
+import org.apache.solr.common.util.ContentStream;
+import org.apache.solr.common.util.ContentStreamBase;
+import org.junit.Before;
+import org.junit.BeforeClass;
+import org.junit.Ignore;
+import org.junit.Test;
+
+/**
+ * Tests that we can skip serialization of the documents when embedding
+ * Solr.
+ */
+public class EmbeddedSolrNoSerializeTest extends SolrTestCaseJ4 {
+
+  static EmbeddedSolrServer solrServer;
+
+  @BeforeClass
+  public static void init() throws Exception {
+    initCore("solrconfig-tagger.xml", "schema-tagger.xml");
+    solrServer = new EmbeddedSolrServer(h.getCoreContainer(), "collection1");
+    //we don't need to close the EmbeddedSolrServer because SolrTestCaseJ4 closes the core
+  }
+
+  @Override
+  @Before
+  public void setUp() throws Exception {
+    super.setUp();
+    clearIndex();
+    //one doc is enough; every test here tags or searches for "Boston"
+    assertU(adoc("id", "9999", "name", "Boston"));
+    assertU(commit());
+  }
+
+  /** Tags a simple input and verifies both the tags and the matching doc come back. */
+  @Test
+  public void testTag() throws SolrServerException, IOException {
+    ModifiableSolrParams params = params();
+    String input = "foo boston bar";//just one tag;
+    QueryRequest req = new SolrTaggerRequest(params, input);
+    req.setPath("/tag");
+
+    QueryResponse rsp = req.process(solrServer);
+    SolrDocumentList results = (SolrDocumentList) rsp.getResponse().get("response");
+    assertNotNull(rsp.getResponse().get("tags"));
+    assertNotNull(results.get(0));
+  }
+
+  /** A POST request whose body is the raw text to tag. */
+  @SuppressWarnings("serial")
+  public static class SolrTaggerRequest extends QueryRequest {
+
+    private final String input;
+
+    public SolrTaggerRequest(SolrParams p, String input) {
+      super(p, METHOD.POST);
+      this.input = input;
+    }
+
+    // Deprecated in 7.2 but should live on until 8.x
+    @SuppressWarnings("deprecation")
+    @Override
+    public Collection<ContentStream> getContentStreams() {
+      return Collections.singleton(new ContentStreamBase.StringStream(input));
+    }
+
+    // As of 7.2.  But won't work until: https://issues.apache.org/jira/browse/SOLR-12142
+//    @Override
+//    public RequestWriter.ContentWriter getContentWriter(String expectedType) {
+//      return new RequestWriter.StringPayloadContentWriter(input, "text/plain; charset=UTF8");
+//    }
+  }
+
+  @Test
+  public void testSearch() throws Exception {
+    QueryResponse rsp = solrServer.query(params("q", "name:Boston"));
+    assertNotNull(rsp.getResults().get(0));
+  }
+
+  @Test
+  public void testAssertTagStreamingWithSolrTaggerRequest() throws Exception {
+    doTestAssertTagStreaming(SolrTaggerRequest::new);
+  }
+
+  @Test
+  @Ignore("As of Solr 7, stream.body is disabled by default for security ") // DWS: dubious, IMO
+  // and it can't be enabled with EmbeddedSolrServer until SOLR-12126
+  public void testAssertTagStreamingWithStreamBodyParam() throws Exception {
+    doTestAssertTagStreaming((params, input) -> {
+      params.set("stream.body", input);
+      return new QueryRequest(params);
+    });
+  }
+
+  /**
+   * Tags "foo boston bar" and asserts the matched doc is delivered through the
+   * {@link StreamingResponseCallback} (i.e. without document serialization).
+   *
+   * @param newQueryRequest builds the request from (params, input text)
+   */
+  public void doTestAssertTagStreaming(BiFunction<ModifiableSolrParams,String,QueryRequest> newQueryRequest) throws IOException, SolrServerException {
+    ModifiableSolrParams params = params();
+    String input = "foo boston bar";//just one tag;
+    QueryRequest req = newQueryRequest.apply(params, input);
+    req.setPath("/tag");
+
+    final AtomicReference<SolrDocument> refDoc = new AtomicReference<>();
+    req.setStreamingResponseCallback(new StreamingResponseCallback() {
+      @Override
+      public void streamSolrDocument(SolrDocument doc) {
+        refDoc.set(doc);
+      }
+
+      @Override
+      public void streamDocListInfo(long numFound, long start, Float maxScore) {
+        //not needed by this test
+      }
+    });
+    QueryResponse rsp = req.process(solrServer);
+    assertNotNull(rsp.getResponse().get("tags"));
+    assertNotNull(refDoc.get());
+    assertEquals("Boston", ((Field)refDoc.get().getFieldValue("name")).stringValue());
+  }
+}


Mime
View raw message