lucene-commits mailing list archives

From dsmi...@apache.org
Subject [1/2] lucene-solr:branch_7x: SOLR-12376: New TaggerRequestHandler (SolrTextTagger).
Date Tue, 05 Jun 2018 18:28:22 GMT
Repository: lucene-solr
Updated Branches:
  refs/heads/branch_7x 3694bbdaa -> 39bec8659


http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/39bec865/solr/core/src/test/org/apache/solr/handler/tagger/RandomizedTaggerTest.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/handler/tagger/RandomizedTaggerTest.java b/solr/core/src/test/org/apache/solr/handler/tagger/RandomizedTaggerTest.java
new file mode 100644
index 0000000..cb742a8
--- /dev/null
+++ b/solr/core/src/test/org/apache/solr/handler/tagger/RandomizedTaggerTest.java
@@ -0,0 +1,150 @@
+/*
+ * This software was produced for the U. S. Government
+ * under Contract No. W15P7T-11-C-F600, and is
+ * subject to the Rights in Noncommercial Computer Software
+ * and Noncommercial Computer Software Documentation
+ * Clause 252.227-7014 (JUN 1995)
+ *
+ * Copyright 2013 The MITRE Corporation. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.handler.tagger;
+
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Locale;
+import java.util.Random;
+import java.util.Set;
+
+import com.carrotsearch.randomizedtesting.annotations.Repeat;
+import com.carrotsearch.randomizedtesting.generators.RandomNumbers;
+import com.carrotsearch.randomizedtesting.generators.RandomPicks;
+import com.carrotsearch.randomizedtesting.generators.RandomStrings;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+/**
+ * Randomly generates taggable text and verifies the results against a simple brute-force tag algorithm.
+ */
+@Repeat(iterations = 10)
+public class RandomizedTaggerTest extends TaggerTestCase {
+
+  @BeforeClass
+  public static void beforeClass() throws Exception {
+    initCore("solrconfig-tagger.xml", "schema-tagger.xml");
+  }
+
+  @Test
+  public void test() throws Exception {
+    final Random R = random();
+
+    Set<String> names = new HashSet<>();
+    //random list of single-word names
+    final int NUM_SINGLES = 4;//RandomInts.randomIntBetween(R, 1, 5);
+    for (int i = 0; i < NUM_SINGLES; i++) {
+      if (i == 0)//first is a big string (perhaps triggers bugs related to growing buffers)
+        names.add(randomStringOfLength(16, 32));
+      else
+        names.add(randomString());
+    }
+
+    //add random list of multi-word names, partially including existing names
+    final int NUM_MULTI = 10;
+    for (int i = 0; i < NUM_MULTI; i++) {
+      final int numWords = RandomNumbers.randomIntBetween(R, 2, 4);
+      StringBuilder buf = new StringBuilder();
+      for (int j = 0; j < numWords; j++) {
+        if (j != 0)
+          buf.append(' ');
+        if (R.nextBoolean()) {//new likely non-existent word
+          buf.append(randomString());
+        } else {//existing word (possible multi-word from prev iteration)
+          buf.append(RandomPicks.randomFrom(R, names));
+        }
+      }
+      names.add(buf.toString());
+    }
+
+    // BUILD NAMES
+    buildNames(names.toArray(new String[names.size()]));
+
+    // QUERY LOOP
+    for (int tTries = 0; tTries < 10 * RANDOM_MULTIPLIER; tTries++) {
+      // Build up random input, similar to multi-word random names above
+      StringBuilder input = new StringBuilder();
+      final int INPUT_WORD_LEN = 20;
+      input.append(' ');//must start with space based on assertBruteForce logic
+      for (int i = 0; i < INPUT_WORD_LEN; i++) {
+        if (R.nextBoolean()) {//new likely non-existent word
+          input.append(randomString());
+        } else {//existing name (possibly multi-word)
+          input.append(RandomPicks.randomFrom(R, NAMES));
+        }
+        input.append(' ');//must end with a space
+      }
+
+      boolean madeIt = false;
+      try {
+        assertBruteForce(input.toString());
+        madeIt = true;
+      } finally {
+        if (!madeIt) {
+          System.out.println("Reproduce with:");
+          System.out.print(" buildNames(");
+          for (int i = 0; i < NAMES.size(); i++) {
+            if (i != 0)
+              System.out.print(',');
+            System.out.print('"');
+            System.out.print(NAMES.get(i));
+            System.out.print('"');
+          }
+          System.out.println(");");
+          System.out.println(" assertBruteForce(\"" + input+"\");");
+        }
+      }
+    }
+
+  }
+
+  private void assertBruteForce(String input) throws Exception {
+    assert input.matches(" .* ");
+    baseParams.set("overlaps", "ALL");
+
+    //loop through NAMES and find all tag offsets
+    List<TestTag> testTags = new ArrayList<>();
+    for (String name : NAMES) {
+      String spaceName = " "+name+" ";
+      int off = 0;
+      while (true) {
+        int idx = input.indexOf(spaceName, off);
+        if (idx < 0)
+          break;
+        testTags.add(new TestTag(idx + 1, idx + 1 + name.length(), name, name));
+        off = idx + 1;
+      }
+    }
+
+    //assert
+    assertTags(reqDoc(input), testTags.toArray(new TestTag[testTags.size()]));
+  }
+
+  private String randomString() { return randomStringOfLength(1, 1); }
+
+  private String randomStringOfLength(int min, int max) {
+    return RandomStrings.randomAsciiLettersOfLengthBetween(random(), min, max).toLowerCase(Locale.ROOT);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/39bec865/solr/core/src/test/org/apache/solr/handler/tagger/Tagger2Test.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/handler/tagger/Tagger2Test.java b/solr/core/src/test/org/apache/solr/handler/tagger/Tagger2Test.java
new file mode 100644
index 0000000..c7580e1
--- /dev/null
+++ b/solr/core/src/test/org/apache/solr/handler/tagger/Tagger2Test.java
@@ -0,0 +1,175 @@
+/*
+ * This software was produced for the U. S. Government
+ * under Contract No. W15P7T-11-C-F600, and is
+ * subject to the Rights in Noncommercial Computer Software
+ * and Noncommercial Computer Software Documentation
+ * Clause 252.227-7014 (JUN 1995)
+ *
+ * Copyright 2013 The MITRE Corporation. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.handler.tagger;
+
+import java.nio.charset.StandardCharsets;
+
+import org.junit.BeforeClass;
+import org.junit.Ignore;
+import org.junit.Test;
+
+/**
+ * Test the {@link TaggerRequestHandler}.
+ */
+public class Tagger2Test extends TaggerTestCase {
+
+  @BeforeClass
+  public static void beforeClass() throws Exception {
+    initCore("solrconfig-tagger.xml", "schema-tagger.xml");
+  }
+
+  @Override
+  public void setUp() throws Exception {
+    super.setUp();
+    baseParams.set("overlaps", "LONGEST_DOMINANT_RIGHT");
+  }
+
+  /** whole matching, no sub-tags */
+  @Test
+  public void testLongestDominantRight() throws Exception {
+    buildNames("in", "San", "in San", "Francisco", "San Francisco",
+        "San Francisco State College", "College of California",
+        "Clayton", "Clayton North", "North Carolina");
+
+    assertTags("He lived in San Francisco.",
+        "in", "San Francisco");
+
+    assertTags("He enrolled in San Francisco State College of California",
+        "in", "San Francisco State College");
+
+    assertTags("He lived in Clayton North Carolina",
+        "in", "Clayton", "North Carolina");
+
+  }
+
+  // As of Lucene/Solr 4.9, StandardTokenizer never does this anymore (reported to the Lucene
+  // dev-list, Jan 26th 2015).  Honestly it's not particularly important to us, but it renders
+  // this test pointless.
+  /** Orig issue https://github.com/OpenSextant/SolrTextTagger/issues/2  related: #13 */
+  @Test
+  @Ignore
+  public void testVeryLongWord() throws Exception {
+    String SANFRAN = "San Francisco";
+    buildNames(SANFRAN);
+
+    // exceeds the default 255 max token length, which means it in effect becomes a stop word
+    StringBuilder STOP = new StringBuilder(260);//>255
+    for (int i = 0; i < STOP.capacity(); i++) {
+      STOP.append((char) ('0' + (i % 10)));
+    }
+
+    String doc = "San " + STOP + " Francisco";
+    assertTags(doc);//no match due to default stop word handling
+    //and we find it when we ignore stop words
+    assertTags(reqDoc(doc, "ignoreStopwords", "true"), new TestTag(0, doc.length(), doc, lookupByName(SANFRAN)));
+  }
+
+  /** Support for stopwords (posInc &gt; 1);
+   * discussion: https://github.com/OpenSextant/SolrTextTagger/issues/13 */
+  @Test
+  public void testStopWords() throws Exception {
+    baseParams.set("field", "name_tagStop");//stop filter (pos inc enabled) index & query
+
+    String SOUTHOFWALES = "South of Wales";//'of' is stop word index time & query
+    String ACITYA = "A City A";
+
+    buildNames(SOUTHOFWALES, ACITYA);
+
+    //round-trip works
+    assertTags(reqDoc(SOUTHOFWALES), new TestTag(0, SOUTHOFWALES.length(), SOUTHOFWALES,
+            lookupByName(SOUTHOFWALES)));
+    //  but offsets don't include the stop word when it's leading or trailing...
+    assertTags(reqDoc(ACITYA), new TestTag(2, 6, "City",
+            lookupByName(ACITYA)));
+    //break on stop words
+    assertTags(reqDoc(SOUTHOFWALES, "ignoreStopwords", "false"));//match nothing
+  }
+
+  /** Tests WordDelimiterGraphFilter, stacked/synonymous tokens at index time (catenate options) */
+  @Test
+  public void testWDF() throws Exception {
+    baseParams.set("field", "name_tagWDF");
+
+    final String WINSTONSALEM = "City of Winston-Salem";//hyphen
+    final String BOSTONHARBOR = "Boston Harbor";//space
+    buildNames(WINSTONSALEM, BOSTONHARBOR);
+
+    //round-trip works
+    assertTags(reqDoc(WINSTONSALEM), new TestTag(0, WINSTONSALEM.length(), WINSTONSALEM,
+        lookupByName(WINSTONSALEM)));
+
+    // space separated works
+    final String WS_SPACE = WINSTONSALEM.replace('-', ' ');
+    assertTags(reqDoc(WS_SPACE),
+        new TestTag(0, WS_SPACE.length(), WS_SPACE,
+        lookupByName(WINSTONSALEM)));
+
+    //must be full match
+    assertTags(reqDoc("Winston"));//match nothing
+    assertTags(reqDoc("Salem"));//match nothing
+
+    // round-trip works
+    assertTags(reqDoc(BOSTONHARBOR), new TestTag(0, BOSTONHARBOR.length(), BOSTONHARBOR,
+        lookupByName(BOSTONHARBOR)));
+
+    // hyphen separated works
+    final String BH_HYPHEN = BOSTONHARBOR.replace(' ', '-');
+    assertTags(reqDoc(BH_HYPHEN),
+        new TestTag(0, BH_HYPHEN.length(), BH_HYPHEN,
+            lookupByName(BOSTONHARBOR)));
+    //must be full match
+    assertTags(reqDoc("Boston"));//match nothing
+    assertTags(reqDoc("Harbor"));//match nothing
+  }
+
+  /** Ensure character offsets work for multi-byte characters */
+  @Test
+  public void testMultibyteChar() throws Exception {
+    //  https://unicode-table.com/en/2019/
+    //             0         1         2         3         4
+    //             01234567890123456789012345678901234567890
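+    // ("mentionned" [sic] is kept as-is: the length and offset assertions below depend on this exact string)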
+    String TEXT = "He mentionned ’Obama’ in the White House";
+    assertEquals(40, TEXT.length()); // char length (in Java, UTF16)
+
+    String QUOTE = TEXT.substring(14, 15);
+    assertEquals(8217, QUOTE.codePointAt(0));
+
+    //UTF8
+    assertEquals(3, QUOTE.getBytes(StandardCharsets.UTF_8).length);
+    assertEquals(1, "a".getBytes(StandardCharsets.UTF_8).length);
+    assertEquals(40 + 2*2, TEXT.getBytes(StandardCharsets.UTF_8).length);
+
+    //UTF16 big endian    (by specifying big/little endian, there is no "byte order mark")
+    assertEquals(2, QUOTE.getBytes(StandardCharsets.UTF_16BE).length);
+    assertEquals(2, "a".getBytes(StandardCharsets.UTF_16BE).length);
+    assertEquals(40 * 2, TEXT.getBytes(StandardCharsets.UTF_16BE).length);
+
+
+    buildNames("Obama");
+
+    assertTags(TEXT, "Obama");
+
+    // TODO test surrogate pairs (i.e. code points not in the BMP)
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/39bec865/solr/core/src/test/org/apache/solr/handler/tagger/TaggerTest.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/handler/tagger/TaggerTest.java b/solr/core/src/test/org/apache/solr/handler/tagger/TaggerTest.java
new file mode 100644
index 0000000..93b11b5
--- /dev/null
+++ b/solr/core/src/test/org/apache/solr/handler/tagger/TaggerTest.java
@@ -0,0 +1,296 @@
+/*
+ * This software was produced for the U. S. Government
+ * under Contract No. W15P7T-11-C-F600, and is
+ * subject to the Rights in Noncommercial Computer Software
+ * and Noncommercial Computer Software Documentation
+ * Clause 252.227-7014 (JUN 1995)
+ *
+ * Copyright 2013 The MITRE Corporation. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.handler.tagger;
+
+import java.util.Arrays;
+import java.util.stream.Collectors;
+
+import org.apache.solr.common.params.CommonParams;
+import org.apache.solr.common.params.ModifiableSolrParams;
+import org.apache.solr.request.SolrQueryRequest;
+import org.junit.BeforeClass;
+import org.junit.Ignore;
+
+/**
+ * The original test for {@link TaggerRequestHandler}.
+ */
+public class TaggerTest extends TaggerTestCase {
+
+  @BeforeClass
+  public static void beforeClass() throws Exception {
+    initCore("solrconfig-tagger.xml", "schema-tagger.xml");
+  }
+
+  private void indexAndBuild() throws Exception {
+    N[] names = N.values();
+    String[] namesStrs = new String[names.length];
+    for (int i = 0; i < names.length; i++) {
+      namesStrs[i] = names[i].getName();
+    }
+    buildNames(namesStrs);
+  }
+
+  /** Name corpus */
+  enum N {
+    //keep order to retain ord()
+    London, London_Business_School, Boston, City_of_London,
+    of, the//filtered out of the corpus by a custom query
+    ;
+
+    String getName() { return name().replace('_',' '); }
+    static N lookupByName(String name) { return N.valueOf(name.replace(' ', '_')); }
+    int getId() { return ordinal(); }
+  }
+
+  public void testFormat() throws Exception {
+    baseParams.set("overlaps", "NO_SUB");
+    indexAndBuild();
+
+    String rspStr = _testFormatRequest(false);
+    String expected = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n" +
+        "<response>\n" +
+        "\n" +
+        "<int name=\"tagsCount\">1</int>\n" +
+        "<arr name=\"tags\">\n" +
+        "  <lst>\n" +
+        "    <int name=\"startOffset\">0</int>\n" +
+        "    <int name=\"endOffset\">22</int>\n" +
+        "    <arr name=\"ids\">\n" +
+        "      <str>1</str>\n" +
+        "    </arr>\n" +
+        "  </lst>\n" +
+        "</arr>\n" +
+        "<result name=\"response\" numFound=\"1\" start=\"0\">\n" +
+        "  <doc>\n" +
+        "    <str name=\"id\">1</str>\n" +
+        "    <str name=\"name\">London Business School</str></doc>\n" +
+        "</result>\n" +
+        "</response>\n";
+    assertEquals(expected, rspStr);
+  }
+
+  public void testFormatMatchText() throws Exception {
+    baseParams.set("overlaps", "NO_SUB");
+    indexAndBuild();
+
+    String rspStr = _testFormatRequest(true);
+    String expected = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n" +
+        "<response>\n" +
+        "\n" +
+        "<int name=\"tagsCount\">1</int>\n" +
+        "<arr name=\"tags\">\n" +
+        "  <lst>\n" +
+        "    <int name=\"startOffset\">0</int>\n" +
+        "    <int name=\"endOffset\">22</int>\n" +
+        "    <str name=\"matchText\">london business school</str>\n" +
+        "    <arr name=\"ids\">\n" +
+        "      <str>1</str>\n" +
+        "    </arr>\n" +
+        "  </lst>\n" +
+        "</arr>\n" +
+        "<result name=\"response\" numFound=\"1\" start=\"0\">\n" +
+        "  <doc>\n" +
+        "    <str name=\"id\">1</str>\n" +
+        "    <str name=\"name\">London Business School</str></doc>\n" +
+        "</result>\n" +
+        "</response>\n";
+    assertEquals(expected, rspStr);
+  }
+
+  private String _testFormatRequest(boolean matchText) throws Exception {
+    String doc = "london business school";//just one tag
+    SolrQueryRequest req = reqDoc(doc, "indent", "on", "omitHeader", "on", "matchText", ""+matchText);
+    String rspStr = h.query(req);
+    req.close();
+    return rspStr;
+  }
+
+  /** Partial matching, no sub-tags */
+  @Ignore //TODO ConcatenateGraphFilter uses a special separator char that we can't put into XML (invalid char)
+  public void testPartialMatching() throws Exception {
+    baseParams.set("field", "name_tagPartial");
+    baseParams.set("overlaps", "NO_SUB");
+    baseParams.set("fq", "NOT name:(of the)");//test filtering
+    indexAndBuild();
+
+    //these match nothing
+    assertTags(reqDoc("") );
+    assertTags(reqDoc(" ") );
+    assertTags(reqDoc("the") );
+
+    String doc;
+
+    //just London Business School via "school" substring
+    doc = "school";
+    assertTags(reqDoc(doc), tt(doc,"school", 0, N.London_Business_School));
+
+    doc = "a school";
+    assertTags(reqDoc(doc), tt(doc,"school", 0, N.London_Business_School));
+
+    doc = "school a";
+    assertTags(reqDoc(doc), tt(doc,"school", 0, N.London_Business_School));
+
+    //More interesting
+
+    doc = "school City";
+    assertTags(reqDoc(doc),
+        tt(doc, "school", 0, N.London_Business_School),
+        tt(doc, "City", 0, N.City_of_London) );
+
+    doc = "City of London Business School";
+    assertTags(reqDoc(doc),   //no plain London (sub-tag)
+        tt(doc, "City of London", 0, N.City_of_London),
+        tt(doc, "London Business School", 0, N.London_Business_School));
+  }
+
+  /** whole matching, no sub-tags */
+  public void testWholeMatching() throws Exception {
+    baseParams.set("overlaps", "NO_SUB");
+    baseParams.set("fq", "NOT name:(of the)");//test filtering
+    indexAndBuild();
+
+    //these match nothing
+    assertTags(reqDoc(""));
+    assertTags(reqDoc(" ") );
+    assertTags(reqDoc("the") );
+
+    //partial on N.London_Business_School matches nothing
+    assertTags(reqDoc("school") );
+    assertTags(reqDoc("a school") );
+    assertTags(reqDoc("school a") );
+    assertTags(reqDoc("school City") );
+
+    String doc;
+
+    doc = "school business london";//backwards
+    assertTags(reqDoc(doc), tt(doc,"london", 0, N.London));
+
+    doc = "of London Business School";
+    assertTags(reqDoc(doc),   //no plain London (sub-tag)
+        tt(doc, "London Business School", 0, N.London_Business_School));
+
+    //More interesting
+    doc = "City of London Business School";
+    assertTags(reqDoc(doc),   //no plain London (sub-tag)
+        tt(doc, "City of London", 0, N.City_of_London),
+        tt(doc, "London Business School", 0, N.London_Business_School));
+
+    doc = "City of London Business";
+    assertTags(reqDoc(doc),   //no plain London (sub-tag) no Business (partial-match)
+        tt(doc, "City of London", 0, N.City_of_London));
+
+    doc = "London Business magazine";
+    assertTags(reqDoc(doc),  //Just London; L.B.S. fails
+        tt(doc, "London", 0, N.London));
+  }
+
+  /** whole matching, with sub-tags */
+  public void testSubTags() throws Exception {
+    baseParams.set("overlaps", "ALL");
+    baseParams.set("fq", "NOT name:(of the)");//test filtering
+    indexAndBuild();
+
+    //these match nothing
+    assertTags(reqDoc(""));
+    assertTags(reqDoc(" ") );
+    assertTags(reqDoc("the") );
+
+    //partial on N.London_Business_School matches nothing
+    assertTags(reqDoc("school") );
+    assertTags(reqDoc("a school") );
+    assertTags(reqDoc("school a") );
+    assertTags(reqDoc("school City") );
+
+    String doc;
+
+    doc = "school business london";//backwards
+    assertTags(reqDoc(doc), tt(doc,"london", 0, N.London));
+
+    //More interesting
+    doc = "City of London Business School";
+    assertTags(reqDoc(doc),
+        tt(doc, "City of London", 0, N.City_of_London),
+        tt(doc, "London", 0, N.London),
+        tt(doc, "London Business School", 0, N.London_Business_School));
+
+    doc = "City of London Business";
+    assertTags(reqDoc(doc),
+        tt(doc, "City of London", 0, N.City_of_London),
+        tt(doc, "London", 0, N.London));
+  }
+
+  public void testMultipleFilterQueries() throws Exception {
+    baseParams.set("overlaps", "ALL");
+
+    // build up the corpus with some additional fields for filtering purposes
+    deleteByQueryAndGetVersion("*:*", null);
+
+    int i = 0;
+    assertU(adoc("id", ""+i++, "name", N.London.getName(), "type", "city", "country", "UK"));
+    assertU(adoc("id", ""+i++, "name", N.London_Business_School.getName(), "type", "school", "country", "UK"));
+    assertU(adoc("id", ""+i++, "name", N.Boston.getName(), "type", "city", "country", "US"));
+    assertU(adoc("id", ""+i++, "name", N.City_of_London.getName(), "type", "org", "country", "UK"));
+    assertU(commit());
+
+    // not calling buildNames so that we can bring along extra attributes for filtering
+    NAMES = Arrays.stream(N.values()).map(N::getName).collect(Collectors.toList());
+
+    // phrase that matches everything
+    String doc = "City of London Business School in Boston";
+
+    // first do no filtering
+    ModifiableSolrParams p = new ModifiableSolrParams();
+    p.add(CommonParams.Q, "*:*");
+    assertTags(reqDoc(doc, p),
+        tt(doc, "City of London", 0, N.City_of_London),
+        tt(doc, "London", 0, N.London),
+        tt(doc, "London Business School", 0, N.London_Business_School),
+        tt(doc, "Boston", 0, N.Boston));
+
+    // add a single fq
+    p.add(CommonParams.FQ, "type:city");
+    assertTags(reqDoc(doc, p),
+        tt(doc, "London", 0, N.London),
+        tt(doc, "Boston", 0, N.Boston));
+
+    // add another fq
+    p.add(CommonParams.FQ, "country:US");
+    assertTags(reqDoc(doc, p),
+        tt(doc, "Boston", 0, N.Boston));
+  }
+
+  private TestTag tt(String doc, String substring, int substringIndex, N name) {
+    assert substringIndex == 0;
+
+    //little bit of copy-paste code from super.tt()
+    int startOffset = -1, endOffset;
+    int substringIndex1 = 0;
+    for(int i = 0; i <= substringIndex1; i++) {
+      startOffset = doc.indexOf(substring, ++startOffset);
+      assert startOffset >= 0 : "The test itself is broken";
+    }
+    endOffset = startOffset+ substring.length();//1 greater (exclusive)
+    return new TestTag(startOffset, endOffset, substring, lookupByName(name.getName()));
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/39bec865/solr/core/src/test/org/apache/solr/handler/tagger/TaggerTestCase.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/handler/tagger/TaggerTestCase.java b/solr/core/src/test/org/apache/solr/handler/tagger/TaggerTestCase.java
new file mode 100644
index 0000000..e525ce9
--- /dev/null
+++ b/solr/core/src/test/org/apache/solr/handler/tagger/TaggerTestCase.java
@@ -0,0 +1,251 @@
+/*
+ * This software was produced for the U. S. Government
+ * under Contract No. W15P7T-11-C-F600, and is
+ * subject to the Rights in Noncommercial Computer Software
+ * and Noncommercial Computer Software Documentation
+ * Clause 252.227-7014 (JUN 1995)
+ *
+ * Copyright 2013 The MITRE Corporation. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.handler.tagger;
+
+import java.io.IOException;
+import java.lang.invoke.MethodHandles;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.TreeSet;
+
+import org.apache.commons.lang.builder.CompareToBuilder;
+import org.apache.commons.lang.builder.EqualsBuilder;
+import org.apache.lucene.document.Document;
+import org.apache.solr.SolrTestCaseJ4;
+import org.apache.solr.common.params.CommonParams;
+import org.apache.solr.common.params.ModifiableSolrParams;
+import org.apache.solr.common.params.SolrParams;
+import org.apache.solr.common.util.ContentStream;
+import org.apache.solr.common.util.ContentStreamBase;
+import org.apache.solr.common.util.NamedList;
+import org.apache.solr.request.SolrQueryRequest;
+import org.apache.solr.request.SolrQueryRequestBase;
+import org.apache.solr.response.SolrQueryResponse;
+import org.apache.solr.search.DocIterator;
+import org.apache.solr.search.DocList;
+import org.apache.solr.search.SolrIndexSearcher;
+import org.junit.Rule;
+import org.junit.rules.TestWatcher;
+import org.junit.runner.Description;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+public abstract class TaggerTestCase extends SolrTestCaseJ4 {
+
+  private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
+
+  @Rule
+  public TestWatcher watchman = new TestWatcher() {
+    @Override
+    protected void starting(Description description) {
+      log.info("{} being run...", description.getDisplayName());
+    }
+  };
+
+  protected final ModifiableSolrParams baseParams = new ModifiableSolrParams();
+
+  //populated in buildNames; tested in assertTags
+  protected static List<String> NAMES;
+
+  @Override
+  public void setUp() throws Exception {
+    super.setUp();
+    baseParams.clear();
+    baseParams.set(CommonParams.QT, "/tag");
+    baseParams.set(CommonParams.WT, "xml");
+  }
+
+  protected void assertTags(String doc, String... tags) throws Exception {
+    TestTag[] tts = new TestTag[tags.length];
+    for (int i = 0; i < tags.length; i++) {
+      tts[i] = tt(doc, tags[i]);
+    }
+    assertTags(reqDoc(doc), tts);
+  }
+
+  protected static void buildNames(String... names) throws Exception {
+    deleteByQueryAndGetVersion("*:*", null);
+    NAMES = Arrays.asList(names);
+    //Collections.sort(NAMES);
+    int i = 0;
+    for (String n : NAMES) {
+      assertU(adoc("id", ""+(i++), "name", n));
+    }
+    assertU(commit());
+  }
+
+  protected String lookupByName(String name) {
+    for (String n : NAMES) {
+      if (n.equalsIgnoreCase(name))
+        return n;
+    }
+    return null;
+  }
+
+  protected TestTag tt(String doc, String substring) {
+    int startOffset = -1, endOffset;
+    int substringIndex = 0;
+    for(int i = 0; i <= substringIndex; i++) {
+      startOffset = doc.indexOf(substring,++startOffset);
+      assert startOffset >= 0 : "The test itself is broken";
+    }
+    endOffset = startOffset+substring.length();//1 greater (exclusive)
+    return new TestTag(startOffset, endOffset, substring, lookupByName(substring));
+  }
+
+  /** Asserts the tags.  Will call req.close(). */
+  protected void assertTags(SolrQueryRequest req, TestTag... eTags) throws Exception {
+    try {
+      SolrQueryResponse rsp = h.queryAndResponse(req.getParams().get(CommonParams.QT), req);
+      TestTag[] aTags = pullTagsFromResponse(req, rsp);
+
+      String message;
+      if (aTags.length > 10)
+        message = null;
+      else
+        message = Arrays.asList(aTags).toString();
+      Arrays.sort(eTags);
+      assertSortedArrayEquals(message, eTags, aTags);
+
+    } finally {
+      req.close();
+    }
+  }
+
+  @SuppressWarnings("unchecked")
+  protected TestTag[] pullTagsFromResponse(SolrQueryRequest req, SolrQueryResponse rsp ) throws IOException {
+    NamedList rspValues = rsp.getValues();
+    Map<String, String> matchingNames = new HashMap<>();
+    SolrIndexSearcher searcher = req.getSearcher();
+    DocList docList = (DocList) rspValues.get("response");
+    DocIterator iter = docList.iterator();
+    while (iter.hasNext()) {
+      int docId = iter.next();
+      Document doc = searcher.doc(docId);
+      String id = doc.getField("id").stringValue();
+      String name = lookupByName(doc.get("name"));
+      assertEquals("looking for "+name, NAMES.indexOf(name)+"", id);
+      matchingNames.put(id, name);
+    }
+
+    //build TestTag[] aTags from response ('a' is actual)
+    List<NamedList> mTagsList = (List<NamedList>) rspValues.get("tags");
+    List<TestTag> aTags = new ArrayList<>();
+    for (NamedList map : mTagsList) {
+      List<String> foundIds = (List<String>) map.get("ids");
+      for (String id  : foundIds) {
+        aTags.add(new TestTag(
+            ((Number)map.get("startOffset")).intValue(),
+            ((Number)map.get("endOffset")).intValue(),
+            null,
+            matchingNames.get(id)));
+      }
+    }
+    return aTags.toArray(new TestTag[0]);
+  }
+
+  /** REMEMBER to close() the result req object. */
+  protected SolrQueryRequest reqDoc(String doc, String... moreParams) {
+    return reqDoc(doc, params(moreParams));
+  }
+
+  /** REMEMBER to close() the result req object. */
+  protected SolrQueryRequest reqDoc(String doc, SolrParams moreParams) {
+    log.debug("Test doc: "+doc);
+    SolrParams params = SolrParams.wrapDefaults(moreParams, baseParams);
+    SolrQueryRequestBase req = new SolrQueryRequestBase(h.getCore(), params) {};
+    Iterable<ContentStream> stream = Collections.singleton((ContentStream)new ContentStreamBase.StringStream(doc));
+    req.setContentStreams(stream);
+    return req;
+  }
+
+  /** Asserts the sorted arrays are equal, with a helpful error message when not.*/
+  public void assertSortedArrayEquals(String message, Object[] expecteds, Object[] actuals) {
+    AssertionError error = null;
+    try {
+      assertArrayEquals(null, expecteds, actuals);
+    } catch (AssertionError e) {
+      error = e;
+    }
+    if (error == null)
+      return;
+    TreeSet<Object> expectedRemaining = new TreeSet<>(Arrays.asList(expecteds));
+    expectedRemaining.removeAll(Arrays.asList(actuals));
+    if (!expectedRemaining.isEmpty())
+      fail(message+": didn't find expected "+expectedRemaining.first()+" (of "+expectedRemaining.size()+"); "+ error);
+    TreeSet<Object> actualsRemaining = new TreeSet<>(Arrays.asList(actuals));
+    actualsRemaining.removeAll(Arrays.asList(expecteds));
+    fail(message+": didn't expect "+actualsRemaining.first()+" (of "+actualsRemaining.size()+"); "+ error);
+  }
+
+  class TestTag implements Comparable {
+    final int startOffset, endOffset;
+    final String substring;
+    final String docName;
+
+    TestTag(int startOffset, int endOffset, String substring, String docName) {
+      this.startOffset = startOffset;
+      this.endOffset = endOffset;
+      this.substring = substring;
+      this.docName = docName;
+    }
+
+    @Override
+    public String toString() {
+      return "TestTag{" +
+          "[" + startOffset + "-" + endOffset + "]" +
+          " doc=" + NAMES.indexOf(docName) + ":'" + docName + "'" +
+          (docName.equals(substring) || substring == null ? "" : " substr="+substring)+
+          '}';
+    }
+
+    @Override
+    public boolean equals(Object obj) {
+      TestTag that = (TestTag) obj;
+      return new EqualsBuilder()
+          .append(this.startOffset, that.startOffset)
+          .append(this.endOffset, that.endOffset)
+          .append(this.docName, that.docName)
+          .isEquals();
+    }
+
+    @Override
+    public int hashCode() {
+      return startOffset;//cheesy but acceptable
+    }
+
+    @Override
+    public int compareTo(Object o) {
+      TestTag that = (TestTag) o;
+      return new CompareToBuilder()
+          .append(this.startOffset, that.startOffset)
+          .append(this.endOffset, that.endOffset)
+          .append(this.docName,that.docName)
+          .toComparison();
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/39bec865/solr/core/src/test/org/apache/solr/handler/tagger/TaggingAttributeTest.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/handler/tagger/TaggingAttributeTest.java b/solr/core/src/test/org/apache/solr/handler/tagger/TaggingAttributeTest.java
new file mode 100644
index 0000000..39c7828
--- /dev/null
+++ b/solr/core/src/test/org/apache/solr/handler/tagger/TaggingAttributeTest.java
@@ -0,0 +1,73 @@
+/*
+ * This software was produced for the U. S. Government
+ * under Contract No. W15P7T-11-C-F600, and is
+ * subject to the Rights in Noncommercial Computer Software
+ * and Noncommercial Computer Software Documentation
+ * Clause 252.227-7014 (JUN 1995)
+ *
+ * Copyright 2013 The MITRE Corporation. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.handler.tagger;
+
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+/**
+ * Tests the {@link TaggerRequestHandler} with
+ * an Analyzer chain that uses the {@link TaggingAttribute}. See the test
+ * configuration under 'taggingattribute'.
+ */
+public class TaggingAttributeTest extends TaggerTestCase {
+
+  @BeforeClass
+  public static void beforeClass() throws Exception {
+    initCore("solrconfig-tagger.xml", "schema-tagger.xml");
+  }
+
+  /**
+   * Whole matching, no sub-tags. Only words with &gt; 3 letters are taggable;
+   * because of that, "San" is never used to start a tag.
+   */
+  @Test
+  public void testTaggingAttribute() throws Exception {
+    baseParams.set("field", "name_tagAttribute"); // has WordLengthTaggingFilter using the TaggingAttribute
+    // this test is based on the longest dominant right test, so we use the
+    // the same TagClusterReducer setting
+    baseParams.set("overlaps", "LONGEST_DOMINANT_RIGHT");
+
+    buildNames("in", "San", "in San", "Francisco", "San Francisco",
+        "San Francisco State College", "College of California",
+        "Clayton", "Clayton North", "North Carolina");
+
+    assertTags("He lived in San Francisco.",
+        //"in", "San Francisco"); //whis would be expected without taggable
+        "Francisco");// this are the expected results with taggable
+
+    assertTags("He enrolled in San Francisco State College of California",
+        //"in", "San Francisco State College"); //without taggable enabled
+        "Francisco", "College of California");// With taggable
+    //NOTE this also tests that started tags are advanced for non-taggable
+    //     tokens, as otherwise 'College of California' would not be
+    //     suggested.
+
+    assertTags("He lived in Clayton North Carolina",
+        //"in", "Clayton", "North Carolina");
+        "Clayton", "North Carolina");
+
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/39bec865/solr/core/src/test/org/apache/solr/handler/tagger/WordLengthTaggingFilter.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/handler/tagger/WordLengthTaggingFilter.java b/solr/core/src/test/org/apache/solr/handler/tagger/WordLengthTaggingFilter.java
new file mode 100644
index 0000000..237a8b8
--- /dev/null
+++ b/solr/core/src/test/org/apache/solr/handler/tagger/WordLengthTaggingFilter.java
@@ -0,0 +1,110 @@
+/*
+ * This software was produced for the U. S. Government
+ * under Contract No. W15P7T-11-C-F600, and is
+ * subject to the Rights in Noncommercial Computer Software
+ * and Noncommercial Computer Software Documentation
+ * Clause 252.227-7014 (JUN 1995)
+ *
+ * Copyright 2013 The MITRE Corporation. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.handler.tagger;
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+
+/**
+ * Simple TokenFilter that marks only tokens with at least the configured number
+ * of chars as taggable.<p>
+ * <b>NOTE:</b>This implementation is only intended to be used as an example
+ * and for unit testing the {@link TaggingAttribute} feature. Typically
+ * implementations will be based on NLP results (e.g. using POS tags or
+ * detected Named Entities).
+ * <p>
+ * <b>Example Usage:</b><p>
+ * Currently, usage requires modifying the Analyzer defined for the
+ * <code>indexedField</code>. An alternative would be to allow the configuration
+ * of a special FieldType in the schema.xml and use its Analyzer for processing
+ * the text sent to the request.<p>
+ * While the current solution is fine for direct API usage, defining the
+ * Analyzer in the schema.xml would be better suited for using this feature
+ * with the {@link TaggerRequestHandler}.
+ *
+ * <pre class="prettyprint">
+ *     Analyzer analyzer = req.getSchema().getField(indexedField).getType().getAnalyzer();
+ *     //get the TokenStream from the Analyzer
+ *     TokenStream baseStream = analyzer.tokenStream("", reader);
+ *     //append a TokenFilter that sets the TaggingAttribute to the end of the chain
+ *     TokenStream filterStream = new WordLengthTaggingFilter(baseStream);
+ *     //create the Tagger using the modified analyzer chain.
+ *     new Tagger(corpus, filterStream, tagClusterReducer) {
+ *
+ *         protected void tagCallback(int startOffset, int endOffset, long docIdsKey) {
+ *             //implement the callback
+ *         }
+ *
+ *     }.process();
+ * </pre>
+ */
+public class WordLengthTaggingFilter extends TokenFilter {
+
+  /**
+   * The default minimum length is <code>3</code>
+   */
+  public static final int DEFAULT_MIN_LENGTH = 3;
+  private final TaggingAttribute lookupAtt = addAttribute(TaggingAttribute.class);
+  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
+  private int minLength;
+
+  /**
+   * TokenFilter that only marks tokens with {@link #DEFAULT_MIN_LENGTH} or
+   * more characters to be looked up.
+   */
+  public WordLengthTaggingFilter(TokenStream input) {
+    this(input, null);
+  }
+
+  /**
+   * TokenFilter that only marks tokens with at least the parsed minimum
+   * number of characters to be looked up.
+   *
+   * @param input     the TokenStream to consume tokens from
+   * @param minLength the minimum length for a token to be looked up; <code>null</code>
+   *                  or &lt;= 0 to use {@link #DEFAULT_MIN_LENGTH}
+   */
+  public WordLengthTaggingFilter(TokenStream input, Integer minLength) {
+    super(input);
+    if (minLength == null || minLength <= 0) {
+      this.minLength = DEFAULT_MIN_LENGTH;
+    } else {
+      this.minLength = minLength;
+    }
+  }
+
+  @Override
+  public final boolean incrementToken() throws IOException {
+    if (input.incrementToken()) {
+      int size = offsetAtt.endOffset() - offsetAtt.startOffset();
+      lookupAtt.setTaggable(size >= minLength);
+      return true;
+    } else {
+      return false;
+    }
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/39bec865/solr/core/src/test/org/apache/solr/handler/tagger/WordLengthTaggingFilterFactory.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/handler/tagger/WordLengthTaggingFilterFactory.java b/solr/core/src/test/org/apache/solr/handler/tagger/WordLengthTaggingFilterFactory.java
new file mode 100644
index 0000000..dbfc538
--- /dev/null
+++ b/solr/core/src/test/org/apache/solr/handler/tagger/WordLengthTaggingFilterFactory.java
@@ -0,0 +1,67 @@
+/*
+ * This software was produced for the U. S. Government
+ * under Contract No. W15P7T-11-C-F600, and is
+ * subject to the Rights in Noncommercial Computer Software
+ * and Noncommercial Computer Software Documentation
+ * Clause 252.227-7014 (JUN 1995)
+ *
+ * Copyright 2013 The MITRE Corporation. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.handler.tagger;
+
+import java.lang.invoke.MethodHandles;
+import java.util.Map;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.util.TokenFilterFactory;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
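+/**
+ * Factory for {@link WordLengthTaggingFilter}. A hypothetical schema.xml analyzer-chain entry
+ * (the <code>minLength</code> value shown is illustrative):
+ * <pre class="prettyprint">
+ *   &lt;filter class="solr.WordLengthTaggingFilterFactory" minLength="4"/&gt;
+ * </pre>
+ */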
+public class WordLengthTaggingFilterFactory extends TokenFilterFactory {
+
+  private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
+
+  public static final String MIN_LENGTH = "minLength";
+
+  private final Integer minLength;
+
+  public WordLengthTaggingFilterFactory(Map<String, String> args) {
+    super(args);
+    int minLength = -1;
+    Object value = args.get(MIN_LENGTH);
+    if (value != null) {
+      try {
+        minLength = Integer.parseInt(value.toString());
+      } catch (NumberFormatException e) {
+        log.warn("Unable to parse minLength from value 'minLength=\"{}\"'", value);
+
+      }
+    }
+    if (minLength <= 0) {
+      log.info("use default minLength={}", WordLengthTaggingFilter.DEFAULT_MIN_LENGTH);
+      this.minLength = null;
+    } else {
+      log.info("set minLength={}", minLength);
+      this.minLength = minLength;
+    }
+  }
+
+  @Override
+  public TokenStream create(TokenStream input) {
+    return new WordLengthTaggingFilter(input, minLength);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/39bec865/solr/core/src/test/org/apache/solr/handler/tagger/XmlInterpolationTest.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/handler/tagger/XmlInterpolationTest.java b/solr/core/src/test/org/apache/solr/handler/tagger/XmlInterpolationTest.java
new file mode 100644
index 0000000..d7dd5df
--- /dev/null
+++ b/solr/core/src/test/org/apache/solr/handler/tagger/XmlInterpolationTest.java
@@ -0,0 +1,224 @@
+/*
+ * This software was produced for the U. S. Government
+ * under Contract No. W15P7T-11-C-F600, and is
+ * subject to the Rights in Noncommercial Computer Software
+ * and Noncommercial Computer Software Documentation
+ * Clause 252.227-7014 (JUN 1995)
+ *
+ * Copyright 2013 The MITRE Corporation. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.handler.tagger;
+
+import javax.xml.parsers.DocumentBuilder;
+import javax.xml.parsers.DocumentBuilderFactory;
+import java.io.IOException;
+import java.io.Reader;
+import java.io.StringReader;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.Locale;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.lucene.analysis.charfilter.HTMLStripCharFilter;
+import org.apache.lucene.analysis.core.WhitespaceTokenizer;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.solr.common.SolrException;
+import org.apache.solr.request.SolrQueryRequest;
+import org.apache.solr.response.SolrQueryResponse;
+import org.junit.BeforeClass;
+import org.junit.Test;
+import org.xml.sax.InputSource;
+
+public class XmlInterpolationTest extends TaggerTestCase {
+
+  private static DocumentBuilder xmlDocBuilder;
+
+
+  @BeforeClass
+  public static void beforeClass() throws Exception {
+    DocumentBuilderFactory xmlDocBuilderFactory = DocumentBuilderFactory.newInstance();
+    xmlDocBuilderFactory.setValidating(true);
+    xmlDocBuilderFactory.setNamespaceAware(true);
+    xmlDocBuilder = xmlDocBuilderFactory.newDocumentBuilder();
+
+    initCore("solrconfig-tagger.xml", "schema-tagger.xml");
+  }
+
+  @Override
+  public void setUp() throws Exception {
+    super.setUp();
+    baseParams.set("field", "name_tagXml");
+    baseParams.set("overlaps", "LONGEST_DOMINANT_RIGHT");
+    baseParams.set("xmlOffsetAdjust", "true");
+  }
+
+  @Test
+  public void test() throws Exception {
+    buildNames("start end");
+
+    assertXmlTag("<doc>before start <!-- c --> end after</doc>", true);
+    assertXmlTag("<doc>before start <br/> end after</doc>", true);
+    assertXmlTag("<doc>before <em>start</em> <b>end</b> after</doc>", true);
+    assertXmlTag("<doc>before <em>start</em> end after</doc>", true);
+    assertXmlTag("<doc>before start end<em> after</em></doc>", true);
+    assertXmlTag("<doc><em>before </em>start end after</doc>", true);//adjacent tags
+    assertXmlTag("<doc>before <b> <em>start</em> </b> end after</doc>", true);
+    assertXmlTag("<doc>before <b> <em>start</em> </b> <em>  end  </em> after</doc>", true);
+
+    assertXmlTag("<doc><p>before start</p> end after</doc>", false);
+    assertXmlTag("<doc>before start <p>end after</p> </doc>", false);
+
+    assertXmlTag("<doc>before <em a='A' b='B'>start</em> <b a='A' b='B'>end</b> after</doc>", true);
+  }
+
+  @Test(expected = SolrException.class)
+  public void testInvalidXml() throws Exception {
+    assertXmlTag("notXml", false);
+  }
+
+  @Test(expected = Exception.class)
+  public void testValidatingXml() throws Exception {
+    validateXml("foo");
+  }
+
+  protected void assertXmlTag(String docText, boolean expected) throws Exception {
+    final SolrQueryRequest req = reqDoc(docText);
+    try { // 5.4 and beyond we can use try-with-resources
+      final SolrQueryResponse rsp = h.queryAndResponse(req.getParams().get("qt"), req);
+      final TestTag[] testTags = pullTagsFromResponse(req, rsp);
+      if (!expected) {
+        assertEquals(0, testTags.length);
+      } else {
+        assertEquals(1, testTags.length);
+        final TestTag tag = testTags[0];
+        validateXml(insertAnchorAtOffsets(docText, tag.startOffset, tag.endOffset, tag.docName));
+      }
+    } finally {
+      req.close();
+    }
+  }
+
+  protected void validateXml(String xml) throws Exception {
+    // the "parse" method also validates XML, will throw an exception if mis-formatted
+    xmlDocBuilder.parse(new InputSource(new StringReader(xml)));
+  }
+
+
+  @Test
+  public void testLuceneHtmlFilterBehavior() {
+    String docText;
+
+    //Close tag adjacent to start & end results in end offset including the close tag. LUCENE-5734
+    docText = "<doc><a><b>start</b> end</a></doc>";
+    assertArrayEquals(tagExpect(docText, "start", "end</a>"), analyzeTagOne(docText, "start", "end"));
+
+    //Space after "end" means offset doesn't include </a>
+    docText = "<doc><a><b>start</b> end </a></doc>";
+    assertArrayEquals(tagExpect(docText, "start", "end"), analyzeTagOne(docText, "start", "end"));
+
+    //Matches entity at end
+    final String endStr = String.format(Locale.ROOT, "en&#x%02x;", (int) 'd');
+    docText = "<doc>start " + endStr + "</doc>";
+    assertArrayEquals(tagExpect(docText, "start", endStr), analyzeTagOne(docText, "start", "end"));
+    //... and at start
+    final String startStr = String.format(Locale.ROOT, "&#x%02x;tart", (int) 's');
+    docText = "<doc>" + startStr + " end</doc>";
+    assertArrayEquals(tagExpect(docText, startStr, "end"), analyzeTagOne(docText, "start", "end"));
+
+    //Test ignoring proc instructions & comments. Note: doesn't expand the entity to "start".
+    docText = "<!DOCTYPE start [ "
+            + "<!ENTITY start \"start\">"
+            + "]><start><?start start ?><!-- start --><start/>&start;</start>";
+    assertArrayEquals(new int[]{-1, -1}, analyzeTagOne(docText, "start", "start"));
+
+    //Test entity behavior
+    docText =                " &mdash; &ndash; &amp; &foo; &#xA0; a&nbsp;b";
+    assertArrayEquals(new String[]{"—", "–", "&", "&foo;", "\u00A0", "a", "b"},
+            analyzeReturnTokens(docText));
+
+    //Observe offset adjustment of trailing entity to end tag
+    docText = "foo&nbsp;bar";
+    assertArrayEquals(tagExpect(docText, "foo", "foo"), analyzeTagOne(docText, "foo", "foo"));
+  }
+
+  private String insertAnchorAtOffsets(String docText, int startOffset, int endOffset, String id) {
+    String insertStart = "<A id='"+ id +"'>";// (normally we'd escape id)
+    String insertEnd = "</A>";
+    return docText.substring(0, startOffset)
+            + insertStart
+            + docText.substring(startOffset, endOffset)
+            + insertEnd
+            + docText.substring(endOffset);
+  }
+
+  private int[] tagExpect(String docText, String start, String end) {
+    return new int[]{docText.indexOf(start), docText.indexOf(end) + end.length()};
+  }
+
+  private int[] analyzeTagOne(String docText, String start, String end) {
+    int[] result = {-1, -1};
+
+    Reader filter = new HTMLStripCharFilter(new StringReader(docText));
+
+    WhitespaceTokenizer ts = new WhitespaceTokenizer();
+    final CharTermAttribute termAttribute = ts.addAttribute(CharTermAttribute.class);
+    final OffsetAttribute offsetAttribute = ts.addAttribute(OffsetAttribute.class);
+    try {
+      ts.setReader(filter);
+      ts.reset();
+      while (ts.incrementToken()) {
+        final String termString = termAttribute.toString();
+        if (termString.equals(start))
+          result[0] = offsetAttribute.startOffset();
+        if (termString.equals(end)) {
+          result[1] = offsetAttribute.endOffset();
+          return result;
+        }
+      }
+      ts.end();
+    } catch (IOException e) {
+      throw new RuntimeException(e);
+    } finally {
+      IOUtils.closeQuietly(ts);
+    }
+    return result;
+  }
+
+  private String[] analyzeReturnTokens(String docText) {
+    List<String> result = new ArrayList<>();
+
+    Reader filter = new HTMLStripCharFilter(new StringReader(docText),
+            Collections.singleton("unescaped"));
+    WhitespaceTokenizer ts = new WhitespaceTokenizer();
+    final CharTermAttribute termAttribute = ts.addAttribute(CharTermAttribute.class);
+    try {
+      ts.setReader(filter);
+      ts.reset();
+      while (ts.incrementToken()) {
+        result.add(termAttribute.toString());
+      }
+      ts.end();
+    } catch (IOException e) {
+      throw new RuntimeException(e);
+    } finally {
+      IOUtils.closeQuietly(ts);
+    }
+    return result.toArray(new String[result.size()]);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/39bec865/solr/solr-ref-guide/src/searching.adoc
----------------------------------------------------------------------
diff --git a/solr/solr-ref-guide/src/searching.adoc b/solr/solr-ref-guide/src/searching.adoc
index 145c1a4..753c2d8 100644
--- a/solr/solr-ref-guide/src/searching.adoc
+++ b/solr/solr-ref-guide/src/searching.adoc
@@ -1,5 +1,35 @@
 = Searching
-:page-children: overview-of-searching-in-solr, velocity-search-ui, relevance, query-syntax-and-parsing, json-request-api, json-facet-api, faceting, highlighting, spell-checking, query-re-ranking, transforming-result-documents, suggester, morelikethis, pagination-of-results, collapse-and-expand-results, result-grouping, result-clustering, spatial-search, the-terms-component, the-term-vector-component, the-stats-component, the-query-elevation-component, response-writers, near-real-time-searching, realtime-get, exporting-result-sets, streaming-expressions, parallel-sql-interface, analytics
+:page-children: overview-of-searching-in-solr, +
+  velocity-search-ui, +
+  relevance, +
+  query-syntax-and-parsing, +
+  json-request-api, +
+  json-facet-api, +
+  faceting, +
+  highlighting, +
+  spell-checking, +
+  query-re-ranking, +
+  transforming-result-documents, +
+  suggester, +
+  morelikethis, +
+  pagination-of-results, +
+  collapse-and-expand-results, +
+  result-grouping, +
+  result-clustering, +
+  spatial-search, +
+  the-terms-component, +
+  the-term-vector-component, +
+  the-stats-component, +
+  the-query-elevation-component, +
+  the-tagger-handler, +
+  response-writers, +
+  near-real-time-searching, +
+  realtime-get, +
+  exporting-result-sets, +
+  streaming-expressions, +
+  parallel-sql-interface, +
+  analytics
+
 // Licensed to the Apache Software Foundation (ASF) under one
 // or more contributor license agreements.  See the NOTICE file
 // distributed with this work for additional information
@@ -50,6 +80,7 @@ This section describes how Solr works with search requests. It covers the follow
 * <<the-term-vector-component.adoc#the-term-vector-component,The Term Vector Component>>: How to get term information about specific documents.
 * <<the-stats-component.adoc#the-stats-component,The Stats Component>>: How to return information from numeric fields within a document set.
 * <<the-query-elevation-component.adoc#the-query-elevation-component,The Query Elevation Component>>: How to force documents to the top of the results for certain queries.
+* <<the-tagger-handler.adoc#the-tagger-handler,The Tagger Handler>>: The SolrTextTagger, for basic named entity tagging in text.
 * <<response-writers.adoc#response-writers,Response Writers>>: Detailed information about configuring and using Solr's response writers.
 * <<near-real-time-searching.adoc#near-real-time-searching,Near Real Time Searching>>: How to include documents in search results nearly immediately after they are indexed.
 * <<realtime-get.adoc#realtime-get,RealTime Get>>: How to get the latest version of a document without opening a searcher.

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/39bec865/solr/solr-ref-guide/src/the-tagger-handler.adoc
----------------------------------------------------------------------
diff --git a/solr/solr-ref-guide/src/the-tagger-handler.adoc b/solr/solr-ref-guide/src/the-tagger-handler.adoc
new file mode 100644
index 0000000..14ba8ed
--- /dev/null
+++ b/solr/solr-ref-guide/src/the-tagger-handler.adoc
@@ -0,0 +1,265 @@
+[[the-tagger-handler]]
+= The Tagger Handler
+
+The "Tagger" Request Handler, AKA the "SolrTextTagger" is a "text tagger".
+Given a dictionary (a Solr index) with a name-like field,
+  you post text to this request handler and it will return every occurrence of one of those names with offsets and other document metadata desired.
+It's used for named entity recognition (NER).
+It doesn't do any NLP (outside of Lucene text analysis) so it's said to be a "naive tagger",
+  but it's definitely useful as-is and a more complete NER or ERD (entity recognition and disambiguation)
+  system can be built with this as a key component.
+The SolrTextTagger might be used on queries for query-understanding or large documents as well.
+
+To get a sense of how to use it, jump to the tutorial below.
+
+The tagger does not yet support a sharded index.
+Tens, perhaps hundreds of millions of names (documents) are supported, mostly limited by memory.
+
+[[tagger-configuration]]
+== Configuration
+
+The Solr schema needs 2 things:
+
+* A unique key field (see `<uniqueKey>`).
+  Recommended field settings: set `docValues=true`.
+* A tag field, a TextField, with `ConcatenateGraphFilterFactory` at the end of the index chain (not the query chain).
+  Set `preservePositionIncrements=false` on that filter.
+  Recommended field settings: `omitNorms=true`, `omitTermFreqAndPositions=true`, and `postingsFormat=FST50`.
+
+The text field's _index analysis chain_, aside from needing `ConcatenateGraphFilterFactory` at the end,
+  can otherwise have whatever tokenizer and filters suit your matching preferences.
+It can have multi-word synonyms and use `WordDelimiterGraphFilterFactory`, for example.
+However, do _not_ use `FlattenGraphFilterFactory`, as it will interfere with `ConcatenateGraphFilterFactory`.
+Position gaps (e.g. stop words) get ignored; it's not (yet) supported for the gap to be significant.
+
+The text field's _query analysis chain_, on the other hand, is more limited.
+There should not be tokens at the same position, thus no synonym expansion -- do that at index time instead.
+Stop words (or any other filter introducing a position gap) are supported.
+At runtime the tagger can be configured to either treat the gap as a tag break or to ignore it.
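+
+As a minimal sketch (illustrative only, not a definitive configuration), such a field type might look
+like this in managed-schema XML form; the tutorial below builds an equivalent type via the Schema API:
+
+....
+<!-- illustrative sketch of a "tag" field type satisfying the constraints above -->
+<fieldType name="tag" class="solr.TextField" postingsFormat="FST50"
+           omitNorms="true" omitTermFreqAndPositions="true">
+  <analyzer type="index">
+    <tokenizer class="solr.StandardTokenizerFactory"/>
+    <filter class="solr.LowerCaseFilterFactory"/>
+    <!-- must be last in the index chain; note preservePositionIncrements=false -->
+    <filter class="solr.ConcatenateGraphFilterFactory" preservePositionIncrements="false"/>
+  </analyzer>
+  <analyzer type="query">
+    <tokenizer class="solr.StandardTokenizerFactory"/>
+    <filter class="solr.LowerCaseFilterFactory"/>
+  </analyzer>
+</fieldType>
+....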
+
+The Solr config needs the `solr.TaggerRequestHandler` defined, which supports `defaults`, `invariants`, and `appends`
+sections, just like the search handler.
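+
+As a sketch (assuming a dictionary field named `name_tag`, as in the tutorial below), the handler
+definition in `solrconfig.xml` might look like this:
+
+....
+<!-- illustrative; "name_tag" is the dictionary field from the tutorial below -->
+<requestHandler name="/tag" class="solr.TaggerRequestHandler">
+  <lst name="defaults">
+    <str name="field">name_tag</str>
+  </lst>
+</requestHandler>
+....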
+
+[[tagger-parameters]]
+== Tagger Parameters
+
+The tagger's execution is completely configurable with request parameters.  Only `field` is required.
+
+`field`::
+  The tag field that serves as the dictionary.
+  This is required; you'll probably specify it in the request handler.
+
+`fq`::
+  You can specify some number of _filter queries_ to limit the dictionary used for tagging.
+  This parameter is the same as is used by the `solr.SearchHandler`.
+
+`rows`::
+  The maximum number of documents to return; it defaults to 10000 for a tag request.
+  This parameter is the same as is used by the `solr.SearchHandler`.
+
+`fl`::
+  Solr's standard param for listing the fields to return.
+  This parameter is the same as is used by the `solr.SearchHandler`.
+
+`overlaps`::
+  Choose the algorithm to determine which tags in an overlapping set should be retained, versus being pruned away.
+  Options are:
+
+  * `ALL`: Emit all tags.
+  * `NO_SUB`: Don't emit a tag that is completely within another tag (i.e. no subtag).
+  * `LONGEST_DOMINANT_RIGHT`: Given a cluster of overlapping tags, emit the longest one (by character length).
+     If there is a tie, pick the right-most.
+     Remove any tags overlapping with this tag, then repeat the algorithm to potentially find other tags
+     that can be emitted in the cluster.
+
+`matchText`::
+  A boolean indicating whether to return the matched text in the tag response.
+  This will trigger the tagger to fully buffer the input before tagging.
+
+`tagsLimit`::
+  The maximum number of tags to return in the response.
+  Tagging effectively stops after this point.
+  By default this is 1000.
+
+`skipAltTokens`::
+  A boolean flag used to suppress errors that can occur if, for example,
+  you enable synonym expansion at query time in the analyzer, which you normally shouldn't do.
+  Let this default to false unless you know that such tokens can't be avoided.
+
+`ignoreStopwords`::
+  A boolean flag that causes stop words (or any condition causing positions to skip, like words of more than 255 characters)
+  to be ignored as if they were not there.
+  Otherwise, the behavior is to treat them as breaks in tagging, on the presumption that your indexed text-analysis
+  configuration doesn't have a StopWordFilter.
+  By default, the indexed analysis chain is checked for the presence of a StopWordFilter; if one is found,
+  then `ignoreStopwords` defaults to true.
+  You probably shouldn't have a StopWordFilter configured, and probably won't need to set this parameter either.
+
+`xmlOffsetAdjust`::
+  A boolean indicating that the input is XML and, furthermore, that the offsets of returned tags should be adjusted as
+  necessary to allow for the client to insert an opening and closing element at the tag offset pair.
+  If it isn't possible to do so, then the tag will be omitted.
+  You are expected to configure `HTMLStripCharFilterFactory` in the schema when using this option.
+  This will trigger the tagger to fully buffer the input before tagging.
+
+Solr's parameters for controlling the response format are supported as well,
+  such as `echoParams`, `wt`, and `indent`.
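+
+For example, a single request combining several of the above parameters (this assumes the "geonames"
+collection and `/tag` handler built in the tutorial below) might look like this:
+
+....
+# illustrative; assumes the "geonames" collection and /tag handler from the tutorial
+curl -X POST \
+  'http://localhost:8983/solr/geonames/tag?overlaps=LONGEST_DOMINANT_RIGHT&matchText=true&tagsLimit=100&fl=id,name&fq=countrycode:US&wt=json&indent=on' \
+  -H 'Content-Type:text/plain' -d 'Hello New York City'
+....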
+
+[[tagger-tutorial-with-geonames]]
+== Tutorial with Geonames
+
+This is a tutorial that demonstrates how to configure and use the text
+tagger with the popular Geonames data set. It's more than a tutorial;
+it's a how-to with information that wasn't described above.
+
+[[tagger-create-and-configure-a-solr-collection]]
+=== Create and Configure a Solr Collection
+
+Create a Solr collection named "geonames". For the tutorial, we'll
+assume the default "data-driven" configuration. It's good for
+experimentation and getting going fast, but it's not optimal for
+production.
+
+....
+bin/solr create -c geonames
+....
+
+[[tagger-configuring]]
+==== Configuring
+
+We need to configure the schema first. The "data driven" mode we're
+using allows us to keep this step fairly minimal -- we just need to
+declare a field type, 2 fields, and a copy-field. The critical part
+up-front is to define the "tag" field type. There are many, many ways to
+configure text analysis, and we're not going to get into those choices
+here. But an important bit is the `ConcatenateGraphFilterFactory` at the
+end of the index analyzer chain. Another important bit for performance
+is `postingsFormat=FST50`, which results in a compact FST-based in-memory
+data structure that is especially beneficial for the text tagger.
+
+Schema configuration:
+
+....
+curl -X POST -H 'Content-type:application/json'  http://localhost:8983/solr/geonames/schema -d '{
+  "add-field-type":{
+    "name":"tag",
+    "class":"solr.TextField",
+    "postingsFormat":"FST50",
+    "omitNorms":true,
+    "omitTermFreqAndPositions":true,
+    "indexAnalyzer":{
+      "tokenizer":{
+         "class":"solr.StandardTokenizerFactory" },
+      "filters":[
+        {"class":"solr.EnglishPossessiveFilterFactory"},
+        {"class":"solr.ASCIIFoldingFilterFactory"},
+        {"class":"solr.LowerCaseFilterFactory"},
+        {"class":"solr.ConcatenateGraphFilterFactory", "preservePositionIncrements":false }
+      ]},
+    "queryAnalyzer":{
+      "tokenizer":{
+         "class":"solr.StandardTokenizerFactory" },
+      "filters":[
+        {"class":"solr.EnglishPossessiveFilterFactory"},
+        {"class":"solr.ASCIIFoldingFilterFactory"},
+        {"class":"solr.LowerCaseFilterFactory"}
+      ]}
+    },
+
+  "add-field":{ "name":"name",     "type":"text_general"},
+
+  "add-field":{ "name":"name_tag", "type":"tag",          "stored":false },
+
+  "add-copy-field":{ "source":"name", "dest":[ "name_tag" ]}
+}'
+....
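+
+If you'd like to sanity-check this step, the Schema API can read the new field type back (the exact
+response shape may vary by Solr version):
+
+....
+# read back the field type we just created (Schema API)
+curl 'http://localhost:8983/solr/geonames/schema/fieldtypes/tag'
+....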
+
+Configure a custom Solr Request Handler:
+
+....
+curl -X POST -H 'Content-type:application/json' http://localhost:8983/solr/geonames/config -d '{
+  "add-requesthandler" : {
+    "name": "/tag",
+    "class":"solr.TaggerRequestHandler",
+    "defaults":{ "field":"name_tag" }
+  }
+}'
+....
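+
+Optionally, you can read the handler definition back via the Config API:
+
+....
+# read back request handler definitions (Config API)
+curl 'http://localhost:8983/solr/geonames/config/requestHandler'
+....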
+
+[[tagger-load-some-sample-data]]
+=== Load Some Sample Data
+
+We'll go with some Geonames.org data in CSV format. Solr is quite
+flexible in loading data in a variety of formats. This
+http://download.geonames.org/export/dump/cities1000.zip[cities1000.zip]
+is an almost 7MB file that expands to a cities1000.txt file of around
+22.2MB, containing 145k lines, each of which is a city in the world with
+a population of at least 1000.
+
+Using bin/post:
+....
+bin/post -c geonames -type text/csv \
+  -params 'optimize=true&separator=%09&encapsulator=%00&fieldnames=id,name,,alternative_names,latitude,longitude,,,countrycode,,,,,,population,elevation,,timezone,lastupdate' \
+  /tmp/cities1000.txt
+....
+or using curl:
+....
+curl -X POST --data-binary @/path/to/cities1000.txt -H 'Content-type:application/csv' \
+  'http://localhost:8983/solr/geonames/update?commit=true&optimize=true&separator=%09&encapsulator=%00&fieldnames=id,name,,alternative_names,latitude,longitude,,,countrycode,,,,,,population,elevation,,timezone,lastupdate'
+....
+
+That might take around 35 seconds; it depends. It could be a lot faster
+if the schema were tuned to have only what we truly need (e.g., no text
+search fields if they aren't needed).
+
+In that command we set `optimize=true` to put the index in a state that
+will make tagging faster. The `encapsulator=%00` is a bit of a hack to
+disable the default double-quote encapsulator.
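+
+To verify the load, a quick query should report a `numFound` of roughly 145k documents:
+
+....
+# numFound should be roughly 145k if the load succeeded
+curl 'http://localhost:8983/solr/geonames/select?q=*:*&rows=0'
+....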
+
+[[tagger-tag-time]]
+=== Tag Time!
+
+This is a trivial example tagging a small piece of text. For more
+options, see the earlier documentation.
+
+....
+curl -X POST \
+  'http://localhost:8983/solr/geonames/tag?overlaps=NO_SUB&tagsLimit=5000&fl=id,name,countrycode&wt=json&indent=on' \
+  -H 'Content-Type:text/plain' -d 'Hello New York City'
+....
+
+The response should be this (the QTime may vary):
+
+....
+{
+  "responseHeader":{
+    "status":0,
+    "QTime":1},
+  "tagsCount":1,
+  "tags":[[
+      "startOffset",6,
+      "endOffset",19,
+      "ids",["5128581"]]],
+  "response":{"numFound":1,"start":0,"docs":[
+      {
+        "id":"5128581",
+        "name":["New York City"],
+        "countrycode":["US"]}]
+  }}
+....
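+
+To also see the matched text in each tag, add `matchText=true` to the same request; each tag in the
+response should then include the matched substring (here, "New York City"):
+
+....
+# same request as above, plus matchText=true
+curl -X POST \
+  'http://localhost:8983/solr/geonames/tag?overlaps=NO_SUB&tagsLimit=5000&matchText=true&fl=id,name,countrycode&wt=json&indent=on' \
+  -H 'Content-Type:text/plain' -d 'Hello New York City'
+....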
+
+[[tagger-tips]]
+== Tips
+
+Performance Tips:
+
+* Follow the recommended configuration field settings, especially `postingsFormat=FST50`.
+* "Optimize" the index after loading your dictionary, down to 1 Lucene segment, or at least to as few segments as possible.
+* For bulk tagging lots of documents, there are some strategies, not mutually exclusive:
+** Batch them.
+   The tagger doesn't directly support batching, but as a hack you can send a bunch of documents concatenated with
+     a nonsense word that is not in the dictionary, like "ZZYYXXAABBCC", between them; see the sketch after this list.
+     You'll need to keep track of the character offsets of these separators so you can subtract them from the results.
+** To reduce tagging latency even further, consider embedding Solr with `EmbeddedSolrServer`.
+   See `EmbeddedSolrNoSerializeTest`.
+** Use more than one thread -- perhaps as many as there are CPU cores available to Solr.
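+
+As a sketch of the batching hack (the separator word and the documents are purely illustrative), two
+short "documents" can be tagged in one request like so; any tag offsets that fall after the separator
+must then be shifted back by the length of everything preceding that document:
+
+....
+# "ZZYYXXAABBCC" is an arbitrary separator word assumed not to be in the dictionary
+curl -X POST 'http://localhost:8983/solr/geonames/tag?overlaps=NO_SUB&fl=id,name&wt=json' \
+  -H 'Content-Type:text/plain' \
+  -d 'She moved to Paris ZZYYXXAABBCC He flew back to Chicago'
+....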
\ No newline at end of file

