lucene-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From sar...@apache.org
Subject [06/12] lucene-solr:master: LUCENE-2899: Add OpenNLP Analysis capabilities as a module
Date Fri, 15 Dec 2017 16:25:59 GMT
LUCENE-2899: Add OpenNLP Analysis capabilities as a module


Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/3e2f9e62
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/3e2f9e62
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/3e2f9e62

Branch: refs/heads/master
Commit: 3e2f9e62d772218bf1fcae6d58542fad3ec43742
Parents: d02d1f1
Author: Steve Rowe <sarowe@apache.org>
Authored: Fri Dec 15 11:24:18 2017 -0500
Committer: Steve Rowe <sarowe@apache.org>
Committed: Fri Dec 15 11:24:18 2017 -0500

----------------------------------------------------------------------
 dev-tools/idea/.idea/ant.xml                    |    1 +
 dev-tools/idea/.idea/modules.xml                |    1 +
 dev-tools/idea/.idea/workspace.xml              |   83 +-
 .../idea/lucene/analysis/opennlp/opennlp.iml    |   30 +
 .../contrib/analysis-extras/analysis-extras.iml |    1 +
 .../lucene/analysis/opennlp/pom.xml.template    |   78 +
 .../maven/lucene/analysis/pom.xml.template      |    1 +
 lucene/CHANGES.txt                              |    9 +
 lucene/analysis/README.txt                      |    5 +
 lucene/analysis/build.xml                       |    6 +-
 .../miscellaneous/TypeAsSynonymFilter.java      |   80 +
 .../TypeAsSynonymFilterFactory.java             |   55 +
 ...ache.lucene.analysis.util.TokenFilterFactory |    1 +
 .../analysis/minhash/MinHashFilterTest.java     |    6 +-
 .../TestTypeAsSynonymFilterFactory.java         |   50 +
 lucene/analysis/opennlp/build.xml               |  118 +
 lucene/analysis/opennlp/ivy.xml                 |   29 +
 .../analysis/opennlp/OpenNLPChunkerFilter.java  |  108 +
 .../opennlp/OpenNLPChunkerFilterFactory.java    |   81 +
 .../opennlp/OpenNLPLemmatizerFilter.java        |  123 +
 .../opennlp/OpenNLPLemmatizerFilterFactory.java |   89 +
 .../analysis/opennlp/OpenNLPPOSFilter.java      |   96 +
 .../opennlp/OpenNLPPOSFilterFactory.java        |   71 +
 .../opennlp/OpenNLPSentenceBreakIterator.java   |  224 ++
 .../analysis/opennlp/OpenNLPTokenizer.java      |   98 +
 .../opennlp/OpenNLPTokenizerFactory.java        |   79 +
 .../lucene/analysis/opennlp/package-info.java   |   21 +
 .../analysis/opennlp/tools/NLPChunkerOp.java    |   41 +
 .../analysis/opennlp/tools/NLPLemmatizerOp.java |   80 +
 .../analysis/opennlp/tools/NLPNERTaggerOp.java  |   56 +
 .../analysis/opennlp/tools/NLPPOSTaggerOp.java  |   41 +
 .../opennlp/tools/NLPSentenceDetectorOp.java    |   50 +
 .../analysis/opennlp/tools/NLPTokenizerOp.java  |   48 +
 .../opennlp/tools/OpenNLPOpsFactory.java        |  176 +
 .../analysis/opennlp/tools/package-info.java    |   21 +
 lucene/analysis/opennlp/src/java/overview.html  |   61 +
 ...ache.lucene.analysis.util.TokenFilterFactory |   18 +
 ...apache.lucene.analysis.util.TokenizerFactory |   16 +
 .../lucene/analysis/opennlp/en-test-chunker.bin |  Bin 0 -> 89915 bytes
 .../lucene/analysis/opennlp/en-test-lemmas.dict |   12 +
 .../analysis/opennlp/en-test-lemmatizer.bin     |  Bin 0 -> 7370 bytes
 .../analysis/opennlp/en-test-ner-person.bin     |  Bin 0 -> 1700 bytes
 .../analysis/opennlp/en-test-pos-maxent.bin     |  Bin 0 -> 18424 bytes
 .../lucene/analysis/opennlp/en-test-sent.bin    |  Bin 0 -> 1050 bytes
 .../analysis/opennlp/en-test-tokenizer.bin      |  Bin 0 -> 15096 bytes
 .../TestOpenNLPChunkerFilterFactory.java        |   74 +
 .../TestOpenNLPLemmatizerFilterFactory.java     |  169 +
 .../opennlp/TestOpenNLPPOSFilterFactory.java    |   95 +
 .../TestOpenNLPSentenceBreakIterator.java       |  201 +
 .../opennlp/TestOpenNLPTokenizerFactory.java    |   97 +
 .../src/tools/test-model-data/README.txt        |    6 +
 .../src/tools/test-model-data/chunks.txt        | 3566 ++++++++++++++++++
 .../src/tools/test-model-data/lemmas.txt        |  875 +++++
 .../tools/test-model-data/ner_TrainerParams.txt |   21 +
 .../src/tools/test-model-data/ner_flashman.txt  |  143 +
 .../opennlp/src/tools/test-model-data/pos.txt   |   30 +
 .../src/tools/test-model-data/sentences.txt     |  144 +
 .../src/tools/test-model-data/tokenizer.txt     |   69 +
 .../apache/lucene/analysis/TestStopFilter.java  |    9 +-
 lucene/ivy-versions.properties                  |    3 +
 lucene/licenses/opennlp-maxent-3.0.3.jar.sha1   |    1 +
 lucene/licenses/opennlp-maxent-LICENSE-ASL.txt  |  202 +
 lucene/licenses/opennlp-maxent-NOTICE.txt       |    6 +
 lucene/licenses/opennlp-tools-1.8.3.jar.sha1    |    1 +
 lucene/licenses/opennlp-tools-LICENSE-ASL.txt   |  202 +
 lucene/licenses/opennlp-tools-NOTICE.txt        |    6 +
 lucene/module-build.xml                         |   22 +
 .../analysis/BaseTokenStreamTestCase.java       |   32 +-
 solr/CHANGES.txt                                |    7 +
 solr/contrib/analysis-extras/README.txt         |   10 +-
 solr/contrib/analysis-extras/build.xml          |   20 +-
 solr/contrib/analysis-extras/ivy.xml            |    3 +
 ...ractNamedEntitiesUpdateProcessorFactory.java |  571 +++
 .../apache/solr/update/processor/package.html   |   24 +
 .../collection1/conf/en-test-ner-person.bin     |  Bin 0 -> 1700 bytes
 .../solr/collection1/conf/en-test-sent.bin      |  Bin 0 -> 1050 bytes
 .../solr/collection1/conf/en-test-tokenizer.bin |  Bin 0 -> 15096 bytes
 .../collection1/conf/schema-opennlp-extract.xml |   49 +
 .../conf/solrconfig-opennlp-extract.xml         |  206 +
 .../solrconfig.snippet.randomindexconfig.xml    |   48 +
 ...ractNamedEntitiesUpdateProcessorFactory.java |  192 +
 .../processor/UpdateProcessorTestBase.java      |  168 -
 solr/licenses/opennlp-maxent-3.0.3.jar.sha1     |    1 +
 solr/licenses/opennlp-maxent-LICENSE-ASL.txt    |  202 +
 solr/licenses/opennlp-maxent-NOTICE.txt         |    6 +
 solr/licenses/opennlp-tools-1.8.3.jar.sha1      |    1 +
 solr/licenses/opennlp-tools-LICENSE-ASL.txt     |  202 +
 solr/licenses/opennlp-tools-NOTICE.txt          |    6 +
 .../solr-ref-guide/src/filter-descriptions.adoc |   32 +
 solr/solr-ref-guide/src/language-analysis.adoc  |  208 +
 solr/solr-ref-guide/src/tokenizers.adoc         |    4 +
 .../src/update-request-processors.adoc          |    6 +
 .../processor/UpdateProcessorTestBase.java      |  168 +
 93 files changed, 10040 insertions(+), 232 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/3e2f9e62/dev-tools/idea/.idea/ant.xml
----------------------------------------------------------------------
diff --git a/dev-tools/idea/.idea/ant.xml b/dev-tools/idea/.idea/ant.xml
index 8723e63..6c7bc8c 100644
--- a/dev-tools/idea/.idea/ant.xml
+++ b/dev-tools/idea/.idea/ant.xml
@@ -11,6 +11,7 @@
     <buildFile url="file://$PROJECT_DIR$/lucene/analysis/icu/build.xml" />
     <buildFile url="file://$PROJECT_DIR$/lucene/analysis/kuromoji/build.xml" />
     <buildFile url="file://$PROJECT_DIR$/lucene/analysis/morfologik/build.xml" />
+    <buildFile url="file://$PROJECT_DIR$/lucene/analysis/opennlp/build.xml" />
     <buildFile url="file://$PROJECT_DIR$/lucene/analysis/phonetic/build.xml" />
     <buildFile url="file://$PROJECT_DIR$/lucene/analysis/smartcn/build.xml" />
     <buildFile url="file://$PROJECT_DIR$/lucene/analysis/stempel/build.xml" />

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/3e2f9e62/dev-tools/idea/.idea/modules.xml
----------------------------------------------------------------------
diff --git a/dev-tools/idea/.idea/modules.xml b/dev-tools/idea/.idea/modules.xml
index 7ad2a78..4df1000 100644
--- a/dev-tools/idea/.idea/modules.xml
+++ b/dev-tools/idea/.idea/modules.xml
@@ -15,6 +15,7 @@
       <module group="Lucene/Analysis" filepath="$PROJECT_DIR$/lucene/analysis/icu/icu.iml" />
       <module group="Lucene/Analysis" filepath="$PROJECT_DIR$/lucene/analysis/kuromoji/kuromoji.iml" />
       <module group="Lucene/Analysis" filepath="$PROJECT_DIR$/lucene/analysis/morfologik/morfologik.iml" />
+      <module group="Lucene/Analysis" filepath="$PROJECT_DIR$/lucene/analysis/opennlp/opennlp.iml" />
       <module group="Lucene/Analysis" filepath="$PROJECT_DIR$/lucene/analysis/phonetic/phonetic.iml" />
       <module group="Lucene/Analysis" filepath="$PROJECT_DIR$/lucene/analysis/smartcn/smartcn.iml" />
       <module group="Lucene/Analysis" filepath="$PROJECT_DIR$/lucene/analysis/stempel/stempel.iml" />

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/3e2f9e62/dev-tools/idea/.idea/workspace.xml
----------------------------------------------------------------------
diff --git a/dev-tools/idea/.idea/workspace.xml b/dev-tools/idea/.idea/workspace.xml
index e22108f..11794af 100644
--- a/dev-tools/idea/.idea/workspace.xml
+++ b/dev-tools/idea/.idea/workspace.xml
@@ -44,6 +44,14 @@
       <option name="TEST_SEARCH_SCOPE"><value defaultName="singleModule" /></option>
       <patterns><pattern testClass=".*\.Test[^.]*|.*\.[^.]*Test" /></patterns>
     </configuration>
+    <configuration default="false" name="Module analyzers-opennlp" type="JUnit" factoryName="JUnit">
+      <module name="opennlp" />
+      <option name="TEST_OBJECT" value="pattern" />
+      <option name="WORKING_DIRECTORY" value="file://$PROJECT_DIR$/idea-build/lucene/analysis/opennlp" />
+      <option name="VM_PARAMETERS" value="-ea -DtempDir=temp" />
+      <option name="TEST_SEARCH_SCOPE"><value defaultName="singleModule" /></option>
+      <patterns><pattern testClass=".*\.Test[^.]*|.*\.[^.]*Test" /></patterns>
+    </configuration>
     <configuration default="false" name="Module analyzers-phonetic" type="JUnit" factoryName="JUnit">
       <module name="phonetic" />
       <option name="TEST_OBJECT" value="pattern" />
@@ -333,48 +341,49 @@
       <patterns><pattern testClass=".*\.Test[^.]*|.*\.[^.]*Test" /></patterns>
     </configuration>
 
-    <list size="41">
+    <list size="42">
       <item index="0" class="java.lang.String" itemvalue="JUnit.Lucene core" />
       <item index="1" class="java.lang.String" itemvalue="JUnit.Module analyzers-common" />
       <item index="2" class="java.lang.String" itemvalue="JUnit.Module analyzers-icu" />
       <item index="3" class="java.lang.String" itemvalue="JUnit.Module analyzers-kuromoji" />
       <item index="4" class="java.lang.String" itemvalue="JUnit.Module analyzers-morfologik" />
-      <item index="5" class="java.lang.String" itemvalue="JUnit.Module analyzers-phonetic" />
-      <item index="6" class="java.lang.String" itemvalue="JUnit.Module analyzers-smartcn" />
-      <item index="7" class="java.lang.String" itemvalue="JUnit.Module analyzers-stempel" />
-      <item index="8" class="java.lang.String" itemvalue="JUnit.Module analyzers-uima" />
-      <item index="9" class="java.lang.String" itemvalue="JUnit.Module backward-codecs" />
-      <item index="10" class="java.lang.String" itemvalue="JUnit.Module benchmark" />
-      <item index="11" class="java.lang.String" itemvalue="JUnit.Module classification" />
-      <item index="12" class="java.lang.String" itemvalue="JUnit.Module codecs" />
-      <item index="13" class="java.lang.String" itemvalue="JUnit.Module expressions" />
-      <item index="14" class="java.lang.String" itemvalue="JUnit.Module facet" />
-      <item index="15" class="java.lang.String" itemvalue="JUnit.Module grouping" />
-      <item index="16" class="java.lang.String" itemvalue="JUnit.Module highlighter" />
-      <item index="17" class="java.lang.String" itemvalue="JUnit.Module join" />
-      <item index="18" class="java.lang.String" itemvalue="JUnit.Module memory" />
-      <item index="19" class="java.lang.String" itemvalue="JUnit.Module misc" />
-      <item index="20" class="java.lang.String" itemvalue="JUnit.Module queries" />
-      <item index="21" class="java.lang.String" itemvalue="JUnit.Module queryparser" />
-      <item index="22" class="java.lang.String" itemvalue="JUnit.Module replicator" />
-      <item index="23" class="java.lang.String" itemvalue="JUnit.Module sandbox" />
-      <item index="24" class="java.lang.String" itemvalue="JUnit.Module spatial" />
-      <item index="25" class="java.lang.String" itemvalue="JUnit.Module spatial-extras" />
-      <item index="26" class="java.lang.String" itemvalue="JUnit.Module spatial3d" />
-      <item index="27" class="java.lang.String" itemvalue="JUnit.Module suggest" />
-      <item index="28" class="java.lang.String" itemvalue="Application.solrcloud" />
-      <item index="29" class="java.lang.String" itemvalue="JUnit.Solr core" />
-      <item index="30" class="java.lang.String" itemvalue="JUnit.Solrj" />
-      <item index="31" class="java.lang.String" itemvalue="JUnit.Solr analysis-extras contrib" />
-      <item index="32" class="java.lang.String" itemvalue="JUnit.Solr analytics contrib" />
-      <item index="33" class="java.lang.String" itemvalue="JUnit.Solr clustering contrib" />
-      <item index="34" class="java.lang.String" itemvalue="JUnit.Solr dataimporthandler contrib" />
-      <item index="35" class="java.lang.String" itemvalue="JUnit.Solr dataimporthandler-extras contrib" />
-      <item index="36" class="java.lang.String" itemvalue="JUnit.Solr extraction contrib" />
-      <item index="37" class="java.lang.String" itemvalue="JUnit.Solr langid contrib" />
-      <item index="38" class="java.lang.String" itemvalue="JUnit.Solr ltr contrib" />
-      <item index="39" class="java.lang.String" itemvalue="JUnit.Solr uima contrib" />
-      <item index="40" class="java.lang.String" itemvalue="JUnit.Solr velocity contrib" />
+      <item index="5" class="java.lang.String" itemvalue="JUnit.Module analyzers-opennlp" />
+      <item index="6" class="java.lang.String" itemvalue="JUnit.Module analyzers-phonetic" />
+      <item index="7" class="java.lang.String" itemvalue="JUnit.Module analyzers-smartcn" />
+      <item index="8" class="java.lang.String" itemvalue="JUnit.Module analyzers-stempel" />
+      <item index="9" class="java.lang.String" itemvalue="JUnit.Module analyzers-uima" />
+      <item index="10" class="java.lang.String" itemvalue="JUnit.Module backward-codecs" />
+      <item index="11" class="java.lang.String" itemvalue="JUnit.Module benchmark" />
+      <item index="12" class="java.lang.String" itemvalue="JUnit.Module classification" />
+      <item index="13" class="java.lang.String" itemvalue="JUnit.Module codecs" />
+      <item index="14" class="java.lang.String" itemvalue="JUnit.Module expressions" />
+      <item index="15" class="java.lang.String" itemvalue="JUnit.Module facet" />
+      <item index="16" class="java.lang.String" itemvalue="JUnit.Module grouping" />
+      <item index="17" class="java.lang.String" itemvalue="JUnit.Module highlighter" />
+      <item index="18" class="java.lang.String" itemvalue="JUnit.Module join" />
+      <item index="19" class="java.lang.String" itemvalue="JUnit.Module memory" />
+      <item index="20" class="java.lang.String" itemvalue="JUnit.Module misc" />
+      <item index="21" class="java.lang.String" itemvalue="JUnit.Module queries" />
+      <item index="22" class="java.lang.String" itemvalue="JUnit.Module queryparser" />
+      <item index="23" class="java.lang.String" itemvalue="JUnit.Module replicator" />
+      <item index="24" class="java.lang.String" itemvalue="JUnit.Module sandbox" />
+      <item index="25" class="java.lang.String" itemvalue="JUnit.Module spatial" />
+      <item index="26" class="java.lang.String" itemvalue="JUnit.Module spatial-extras" />
+      <item index="27" class="java.lang.String" itemvalue="JUnit.Module spatial3d" />
+      <item index="28" class="java.lang.String" itemvalue="JUnit.Module suggest" />
+      <item index="29" class="java.lang.String" itemvalue="Application.solrcloud" />
+      <item index="30" class="java.lang.String" itemvalue="JUnit.Solr core" />
+      <item index="31" class="java.lang.String" itemvalue="JUnit.Solrj" />
+      <item index="32" class="java.lang.String" itemvalue="JUnit.Solr analysis-extras contrib" />
+      <item index="33" class="java.lang.String" itemvalue="JUnit.Solr analytics contrib" />
+      <item index="34" class="java.lang.String" itemvalue="JUnit.Solr clustering contrib" />
+      <item index="35" class="java.lang.String" itemvalue="JUnit.Solr dataimporthandler contrib" />
+      <item index="36" class="java.lang.String" itemvalue="JUnit.Solr dataimporthandler-extras contrib" />
+      <item index="37" class="java.lang.String" itemvalue="JUnit.Solr extraction contrib" />
+      <item index="38" class="java.lang.String" itemvalue="JUnit.Solr langid contrib" />
+      <item index="39" class="java.lang.String" itemvalue="JUnit.Solr ltr contrib" />
+      <item index="40" class="java.lang.String" itemvalue="JUnit.Solr uima contrib" />
+      <item index="41" class="java.lang.String" itemvalue="JUnit.Solr velocity contrib" />
     </list>
   </component>
 </project>

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/3e2f9e62/dev-tools/idea/lucene/analysis/opennlp/opennlp.iml
----------------------------------------------------------------------
diff --git a/dev-tools/idea/lucene/analysis/opennlp/opennlp.iml b/dev-tools/idea/lucene/analysis/opennlp/opennlp.iml
new file mode 100644
index 0000000..7725065
--- /dev/null
+++ b/dev-tools/idea/lucene/analysis/opennlp/opennlp.iml
@@ -0,0 +1,30 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<module type="JAVA_MODULE" version="4">
+  <component name="NewModuleRootManager" inherit-compiler-output="false">
+    <output url="file://$MODULE_DIR$/../../../idea-build/lucene/analysis/opennlp/classes/java" />
+    <output-test url="file://$MODULE_DIR$/../../../idea-build/lucene/analysis/opennlp/classes/test" />
+    <exclude-output />
+    <content url="file://$MODULE_DIR$">
+      <sourceFolder url="file://$MODULE_DIR$/src/java" isTestSource="false" />
+      <sourceFolder url="file://$MODULE_DIR$/src/test" isTestSource="true" />
+      <sourceFolder url="file://$MODULE_DIR$/src/resources" type="java-resource" />
+      <sourceFolder url="file://$MODULE_DIR$/src/test-files" type="java-test-resource" />
+    </content>
+    <orderEntry type="inheritedJdk" />
+    <orderEntry type="sourceFolder" forTests="false" />
+    <orderEntry type="module-library">
+      <library>
+        <CLASSES>
+          <root url="file://$MODULE_DIR$/lib" />
+        </CLASSES>
+        <JAVADOC />
+        <SOURCES />
+        <jarDirectory url="file://$MODULE_DIR$/lib" recursive="false" />
+      </library>
+    </orderEntry>
+    <orderEntry type="library" scope="TEST" name="JUnit" level="project" />
+    <orderEntry type="module" scope="TEST" module-name="lucene-test-framework" />
+    <orderEntry type="module" module-name="analysis-common" />
+    <orderEntry type="module" module-name="lucene-core" />
+  </component>
+</module>

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/3e2f9e62/dev-tools/idea/solr/contrib/analysis-extras/analysis-extras.iml
----------------------------------------------------------------------
diff --git a/dev-tools/idea/solr/contrib/analysis-extras/analysis-extras.iml b/dev-tools/idea/solr/contrib/analysis-extras/analysis-extras.iml
index 287b46a..7c0c0c1 100644
--- a/dev-tools/idea/solr/contrib/analysis-extras/analysis-extras.iml
+++ b/dev-tools/idea/solr/contrib/analysis-extras/analysis-extras.iml
@@ -37,5 +37,6 @@
     <orderEntry type="module" module-name="lucene-core" />
     <orderEntry type="module" module-name="misc" />
     <orderEntry type="module" module-name="sandbox" />
+    <orderEntry type="module" module-name="opennlp" />
   </component>
 </module>

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/3e2f9e62/dev-tools/maven/lucene/analysis/opennlp/pom.xml.template
----------------------------------------------------------------------
diff --git a/dev-tools/maven/lucene/analysis/opennlp/pom.xml.template b/dev-tools/maven/lucene/analysis/opennlp/pom.xml.template
new file mode 100644
index 0000000..4109a0a
--- /dev/null
+++ b/dev-tools/maven/lucene/analysis/opennlp/pom.xml.template
@@ -0,0 +1,78 @@
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing,
+  software distributed under the License is distributed on an
+  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  KIND, either express or implied.  See the License for the
+  specific language governing permissions and limitations
+  under the License.
+-->
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+  <parent>
+    <groupId>org.apache.lucene</groupId>
+    <artifactId>lucene-parent</artifactId>
+    <version>@version@</version>
+    <relativePath>../../pom.xml</relativePath>
+  </parent>
+  <groupId>org.apache.lucene</groupId>
+  <artifactId>lucene-analyzers-opennlp</artifactId>
+  <packaging>jar</packaging>
+  <name>Lucene OpenNLP integration</name>
+  <description>
+    Lucene OpenNLP integration
+  </description>
+  <properties>
+    <module-directory>lucene/analysis/opennlp</module-directory>
+    <relative-top-level>../../../..</relative-top-level>
+    <module-path>${relative-top-level}/${module-directory}</module-path>
+  </properties>
+  <scm>
+    <connection>scm:git:${vc-anonymous-base-url}</connection>
+    <developerConnection>scm:git:${vc-dev-base-url}</developerConnection>
+    <url>${vc-browse-base-url};f=${module-directory}</url>
+  </scm>
+  <dependencies>
+    <dependency>
+      <!-- lucene-test-framework dependency must be declared before lucene-core -->
+      <groupId>org.apache.lucene</groupId>
+      <artifactId>lucene-test-framework</artifactId>
+      <scope>test</scope>
+    </dependency>
+    @lucene-analyzers-opennlp.internal.dependencies@
+    @lucene-analyzers-opennlp.external.dependencies@
+    @lucene-analyzers-opennlp.internal.test.dependencies@
+    @lucene-analyzers-opennlp.external.test.dependencies@
+  </dependencies>
+  <build>
+    <sourceDirectory>${module-path}/src/java</sourceDirectory>
+    <testSourceDirectory>${module-path}/src/test</testSourceDirectory>
+    <resources>
+      <resource>
+        <directory>${module-path}/src/resources</directory>
+      </resource>
+    </resources>
+    <testResources>
+      <testResource>
+        <directory>${project.build.testSourceDirectory}</directory>
+        <excludes>
+          <exclude>**/*.java</exclude>
+        </excludes>
+      </testResource>
+      <testResource>
+        <directory>${module-path}/src/test-files</directory>
+      </testResource>
+    </testResources>
+  </build>
+</project>

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/3e2f9e62/dev-tools/maven/lucene/analysis/pom.xml.template
----------------------------------------------------------------------
diff --git a/dev-tools/maven/lucene/analysis/pom.xml.template b/dev-tools/maven/lucene/analysis/pom.xml.template
index 9058abf..466ad30 100644
--- a/dev-tools/maven/lucene/analysis/pom.xml.template
+++ b/dev-tools/maven/lucene/analysis/pom.xml.template
@@ -35,6 +35,7 @@
     <module>icu</module>
     <module>kuromoji</module>
     <module>morfologik</module>
+    <module>opennlp</module>
     <module>phonetic</module>
     <module>smartcn</module>
     <module>stempel</module>

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/3e2f9e62/lucene/CHANGES.txt
----------------------------------------------------------------------
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 0fbf446..db8aaab 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -65,6 +65,15 @@ API Changes
 * LUCENE-8051: LevensteinDistance renamed to LevenshteinDistance.
   (Pulak Ghosh via Adrien Grand)
 
+New Features
+
+* LUCENE-2899: Add new module analysis/opennlp, with analysis components
+  to perform tokenization, part-of-speech tagging, lemmatization and phrase
+  chunking by invoking the corresponding OpenNLP tools. Named entity
+  recognition is also provided as a Solr update request processor.
+  (Lance Norskog, Grant Ingersoll, Joern Kottmann, Em, Kai G├╝lzau,
+  Rene Nederhand, Robert Muir, Steven Bower, Steve Rowe)
+
 Improvements
 
 * LUCENE-8081: Allow IndexWriter to opt out of flushing on indexing threads

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/3e2f9e62/lucene/analysis/README.txt
----------------------------------------------------------------------
diff --git a/lucene/analysis/README.txt b/lucene/analysis/README.txt
index 7dc7f53..c68584e 100644
--- a/lucene/analysis/README.txt
+++ b/lucene/analysis/README.txt
@@ -28,6 +28,9 @@ lucene-analyzers-kuromoji-XX.jar
 lucene-analyzers-morfologik-XX.jar
   An analyzer using the Morfologik stemming library.
 
+lucene-analyzers-opennlp-XX.jar
+  An analyzer using the OpenNLP natural-language processing library.
+
 lucene-analyzers-phonetic-XX.jar
   An add-on analysis library that provides phonetic encoders via Apache
   Commons-Codec. Note: this module depends on the commons-codec jar 
@@ -49,6 +52,7 @@ common/src/java
 icu/src/java
 kuromoji/src/java
 morfologik/src/java
+opennlp/src/java
 phonetic/src/java
 smartcn/src/java
 stempel/src/java
@@ -59,6 +63,7 @@ common/src/test
 icu/src/test
 kuromoji/src/test
 morfologik/src/test
+opennlp/src/test
 phonetic/src/test
 smartcn/src/test
 stempel/src/test

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/3e2f9e62/lucene/analysis/build.xml
----------------------------------------------------------------------
diff --git a/lucene/analysis/build.xml b/lucene/analysis/build.xml
index 844f5f3..ed1566c 100644
--- a/lucene/analysis/build.xml
+++ b/lucene/analysis/build.xml
@@ -65,6 +65,10 @@
     <ant dir="morfologik" />
   </target>
 
+  <target name="opennlp">
+    <ant dir="opennlp" />
+  </target>
+
   <target name="phonetic">
     <ant dir="phonetic" />
   </target>
@@ -82,7 +86,7 @@
   </target>
 
   <target name="default" depends="compile"/>
-  <target name="compile" depends="common,icu,kuromoji,morfologik,phonetic,smartcn,stempel,uima" />
+  <target name="compile" depends="common,icu,kuromoji,morfologik,opennlp,phonetic,smartcn,stempel,uima" />
 
   <target name="clean">
     <forall-analyzers target="clean"/>

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/3e2f9e62/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/TypeAsSynonymFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/TypeAsSynonymFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/TypeAsSynonymFilter.java
new file mode 100644
index 0000000..8269d5d
--- /dev/null
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/TypeAsSynonymFilter.java
@@ -0,0 +1,80 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.miscellaneous;
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+import org.apache.lucene.util.AttributeSource;
+
+/**
+ * Adds the {@link TypeAttribute#type()} as a synonym,
+ * i.e. another token at the same position, optionally with a specified prefix prepended.
+ */
+public final class TypeAsSynonymFilter extends TokenFilter {
+  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+  private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
+  private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
+  private final String prefix;
+
+  AttributeSource.State savedToken = null;
+
+
+  public TypeAsSynonymFilter(TokenStream input) {
+    this(input, null);
+  }
+
+  /**
+   * @param input input tokenstream
+   * @param prefix Prepend this string to every token type emitted as token text.
+   *               If null, nothing will be prepended.
+   */
+  public TypeAsSynonymFilter(TokenStream input, String prefix) {
+    super(input);
+    this.prefix = prefix;
+  }
+
+  @Override
+  public boolean incrementToken() throws IOException {
+    if (savedToken != null) {         // Emit last token's type at the same position
+      restoreState(savedToken);
+      savedToken = null;
+      termAtt.setEmpty();
+      if (prefix != null) {
+        termAtt.append(prefix);
+      }
+      termAtt.append(typeAtt.type());
+      posIncrAtt.setPositionIncrement(0);
+      return true;
+    } else if (input.incrementToken()) { // Ho pending token type to emit
+      savedToken = captureState();
+      return true;
+    }
+    return false;
+  }
+
+  @Override
+  public void reset() throws IOException {
+    super.reset();
+    savedToken = null;
+  }
+}

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/3e2f9e62/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/TypeAsSynonymFilterFactory.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/TypeAsSynonymFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/TypeAsSynonymFilterFactory.java
new file mode 100644
index 0000000..69708b7
--- /dev/null
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/TypeAsSynonymFilterFactory.java
@@ -0,0 +1,55 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.miscellaneous;
+
+import java.util.Map;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.util.TokenFilterFactory;
+
+/**
+ * Factory for {@link TypeAsSynonymFilter}.
+ * <pre class="prettyprint">
+ * &lt;fieldType name="text_type_as_synonym" class="solr.TextField" positionIncrementGap="100"&gt;
+ *   &lt;analyzer&gt;
+ *     &lt;tokenizer class="solr.UAX29URLEmailTokenizerFactory"/&gt;
+ *     &lt;filter class="solr.TypeAsSynonymFilterFactory" prefix="_type_" /&gt;
+ *   &lt;/analyzer&gt;
+ * &lt;/fieldType&gt;</pre>
+ *
+ * <p>
+ * If the optional {@code prefix} parameter is used, the specified value will be prepended
+ * to the type, e.g. with prefix="_type_", for a token "example.com" with type "&lt;URL&gt;",
+ * the emitted synonym will have text "_type_&lt;URL&gt;".
+ */
+public class TypeAsSynonymFilterFactory extends TokenFilterFactory {
+  private final String prefix;
+
+  public TypeAsSynonymFilterFactory(Map<String,String> args) {
+    super(args);
+    prefix = get(args, "prefix");  // default value is null
+    if (!args.isEmpty()) {
+      throw new IllegalArgumentException("Unknown parameters: " + args);
+    }
+  }
+
+  @Override
+  public TokenStream create(TokenStream input) {
+    return new TypeAsSynonymFilter(input, prefix);
+  }
+}

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/3e2f9e62/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory b/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory
index d871ad6..6dcc81c 100644
--- a/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory
+++ b/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory
@@ -80,6 +80,7 @@ org.apache.lucene.analysis.miscellaneous.RemoveDuplicatesTokenFilterFactory
 org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilterFactory
 org.apache.lucene.analysis.miscellaneous.TrimFilterFactory
 org.apache.lucene.analysis.miscellaneous.TruncateTokenFilterFactory
+org.apache.lucene.analysis.miscellaneous.TypeAsSynonymFilterFactory
 org.apache.lucene.analysis.miscellaneous.WordDelimiterFilterFactory
 org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilterFactory
 org.apache.lucene.analysis.miscellaneous.ScandinavianFoldingFilterFactory

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/3e2f9e62/lucene/analysis/common/src/test/org/apache/lucene/analysis/minhash/MinHashFilterTest.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/minhash/MinHashFilterTest.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/minhash/MinHashFilterTest.java
index a4080fe..1bc6ed7 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/minhash/MinHashFilterTest.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/minhash/MinHashFilterTest.java
@@ -183,14 +183,14 @@ public class MinHashFilterTest extends BaseTokenStreamTestCase {
     TokenStream ts = createTokenStream(5, "woof woof woof woof woof", 1, 1, 100, false);
     assertTokenStreamContents(ts, hashes, new int[]{0},
         new int[]{24}, new String[]{MinHashFilter.MIN_HASH_TYPE}, new int[]{1}, new int[]{1}, 24, 0, null,
-        true);
+        true, null);
 
     ts = createTokenStream(5, "woof woof woof woof woof", 2, 1, 1, false);
     assertTokenStreamContents(ts, new String[]{new String(new char[]{0, 0, 8449, 54077, 64133, 32857, 8605, 41409}),
             new String(new char[]{0, 1, 16887, 58164, 39536, 14926, 6529, 17276})}, new int[]{0, 0},
         new int[]{24, 24}, new String[]{MinHashFilter.MIN_HASH_TYPE, MinHashFilter.MIN_HASH_TYPE}, new int[]{1, 0},
         new int[]{1, 1}, 24, 0, null,
-        true);
+        true, null);
   }
 
   @Test
@@ -203,7 +203,7 @@ public class MinHashFilterTest extends BaseTokenStreamTestCase {
         false);
     assertTokenStreamContents(ts, hashes, new int[]{0, 0},
         new int[]{49, 49}, new String[]{MinHashFilter.MIN_HASH_TYPE, MinHashFilter.MIN_HASH_TYPE}, new int[]{1, 0},
-        new int[]{1, 1}, 49, 0, null, true);
+        new int[]{1, 1}, 49, 0, null, true, null);
   }
 
   private ArrayList<String> getTokens(TokenStream ts) throws IOException {

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/3e2f9e62/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestTypeAsSynonymFilterFactory.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestTypeAsSynonymFilterFactory.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestTypeAsSynonymFilterFactory.java
new file mode 100644
index 0000000..6beb139
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestTypeAsSynonymFilterFactory.java
@@ -0,0 +1,50 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.miscellaneous;
+
+import org.apache.lucene.analysis.CannedTokenStream;
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.util.BaseTokenStreamFactoryTestCase;
+
+public class TestTypeAsSynonymFilterFactory extends BaseTokenStreamFactoryTestCase {
+
+  private static final Token[] TOKENS =  { token("Visit", "<ALPHANUM>"), token("example.com", "<URL>") };
+
+  public void testBasic() throws Exception {
+    TokenStream stream = new CannedTokenStream(TOKENS);
+    stream = tokenFilterFactory("TypeAsSynonym").create(stream);
+    assertTokenStreamContents(stream, new String[] { "Visit", "<ALPHANUM>", "example.com", "<URL>" },
+        null, null, new String[] { "<ALPHANUM>", "<ALPHANUM>", "<URL>", "<URL>" }, new int[] { 1, 0, 1, 0 });
+  }
+
+  public void testPrefix() throws Exception {
+    TokenStream stream = new CannedTokenStream(TOKENS);
+    stream = tokenFilterFactory("TypeAsSynonym", "prefix", "_type_").create(stream);
+    assertTokenStreamContents(stream, new String[] { "Visit", "_type_<ALPHANUM>", "example.com", "_type_<URL>" },
+        null, null, new String[] { "<ALPHANUM>", "<ALPHANUM>", "<URL>", "<URL>" }, new int[] { 1, 0, 1, 0 });
+  }
+
+  private static Token token(String term, String type) {
+    Token token = new Token();
+    token.setEmpty();
+    token.append(term);
+    token.setType(type);
+    return token;
+  }
+}

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/3e2f9e62/lucene/analysis/opennlp/build.xml
----------------------------------------------------------------------
diff --git a/lucene/analysis/opennlp/build.xml b/lucene/analysis/opennlp/build.xml
new file mode 100644
index 0000000..e2cd20a
--- /dev/null
+++ b/lucene/analysis/opennlp/build.xml
@@ -0,0 +1,118 @@
+<?xml version="1.0"?>
+
+<!--
+    Licensed to the Apache Software Foundation (ASF) under one or more
+    contributor license agreements.  See the NOTICE file distributed with
+    this work for additional information regarding copyright ownership.
+    The ASF licenses this file to You under the Apache License, Version 2.0
+    the "License"); you may not use this file except in compliance with
+    the License.  You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+ -->
+
+<project name="analyzers-opennlp" default="default">
+
+  <description>
+    OpenNLP Library Integration
+  </description>
+
+  <path id="opennlpjars">
+    <fileset dir="lib"/>
+  </path>
+
+  <property name="test.model.data.dir" location="src/tools/test-model-data"/>
+  <property name="tests.userdir" location="src/test-files"/>
+  <property name="test.model.dir" location="${tests.userdir}/org/apache/lucene/analysis/opennlp"/>
+
+  <import file="../analysis-module-build.xml"/>
+
+  <property name="analysis-extras.conf.dir"
+            location="${common.dir}/../solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/collection1/conf"/>
+
+  <path id="classpath">
+    <pathelement path="${analyzers-common.jar}"/>
+    <path refid="opennlpjars"/>
+    <path refid="base.classpath"/>
+  </path>
+
+  <path id="test.classpath">
+    <path refid="test.base.classpath"/>
+    <pathelement path="${tests.userdir}"/>
+  </path>
+
+  <target name="compile-core" depends="jar-analyzers-common, common.compile-core" />
+
+  <!--
+    This does not create real NLP models, just small unencumbered ones for the unit tests.
+    All text taken from reuters corpus.
+    Tags applied with online demos at CCG Urbana-Champaign.
+    -->
+  <target name="train-test-models" description="Train all small test models for unit tests" depends="resolve">
+    <mkdir dir="${test.model.dir}"/>
+    <!-- https://opennlp.apache.org/docs/1.8.3/manual/opennlp.html#tools.sentdetect.training -->
+    <trainModel command="SentenceDetectorTrainer" lang="en" data="sentences.txt" model="en-test-sent.bin"/>
+    <copy file="${test.model.dir}/en-test-sent.bin" todir="${analysis-extras.conf.dir}"/>
+
+    <!-- https://opennlp.apache.org/docs/1.8.3/manual/opennlp.html#tools.tokenizer.training -->
+    <trainModel command="TokenizerTrainer" lang="en" data="tokenizer.txt" model="en-test-tokenizer.bin"/>
+    <copy file="${test.model.dir}/en-test-tokenizer.bin" todir="${analysis-extras.conf.dir}"/>
+
+    <!-- https://opennlp.apache.org/docs/1.8.3/manual/opennlp.html#tools.postagger.training -->
+    <trainModel command="POSTaggerTrainer" lang="en" data="pos.txt" model="en-test-pos-maxent.bin"/>
+
+    <!-- https://opennlp.apache.org/docs/1.8.3/manual/opennlp.html#tools.chunker.training -->
+    <trainModel command="ChunkerTrainerME" lang="en" data="chunks.txt" model="en-test-chunker.bin"/>
+
+    <!-- https://opennlp.apache.org/docs/1.8.3/manual/opennlp.html#tools.namefind.training -->
+    <trainModel command="TokenNameFinderTrainer" lang="en" data="ner_flashman.txt" model="en-test-ner-person.bin">
+      <extra-args>
+        <arg value="-params"/>
+        <arg value="ner_TrainerParams.txt"/>
+      </extra-args>
+    </trainModel>
+    <copy file="${test.model.dir}/en-test-ner-person.bin" todir="${analysis-extras.conf.dir}"/>
+
+    <!-- https://opennlp.apache.org/docs/1.8.3/manual/opennlp.html#tools.lemmatizer.training -->
+    <trainModel command="LemmatizerTrainerME" lang="en" data="lemmas.txt" model="en-test-lemmatizer.bin"/>
+  </target>
+
+  <macrodef name="trainModel">
+    <attribute name="command"/>
+    <attribute name="lang"/>
+    <attribute name="data"/>
+    <attribute name="model"/>
+    <element name="extra-args" optional="true"/>
+    <sequential>
+      <java classname="opennlp.tools.cmdline.CLI"
+            dir="${test.model.data.dir}"
+            fork="true"
+            failonerror="true">
+        <classpath>
+          <path refid="opennlpjars"/>
+        </classpath>
+
+        <arg value="@{command}"/>
+
+        <arg value="-lang"/>
+        <arg value="@{lang}"/>
+
+        <arg value="-data"/>
+        <arg value="@{data}"/>
+
+        <arg value="-model"/>
+        <arg value="${test.model.dir}/@{model}"/>
+
+        <extra-args/>
+      </java>
+    </sequential>
+  </macrodef>
+
+  <target name="regenerate" depends="train-test-models"/>
+</project>

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/3e2f9e62/lucene/analysis/opennlp/ivy.xml
----------------------------------------------------------------------
diff --git a/lucene/analysis/opennlp/ivy.xml b/lucene/analysis/opennlp/ivy.xml
new file mode 100644
index 0000000..c7b885f
--- /dev/null
+++ b/lucene/analysis/opennlp/ivy.xml
@@ -0,0 +1,29 @@
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one
+   or more contributor license agreements.  See the NOTICE file
+   distributed with this work for additional information
+   regarding copyright ownership.  The ASF licenses this file
+   to you under the Apache License, Version 2.0 (the
+   "License"); you may not use this file except in compliance
+   with the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing,
+   software distributed under the License is distributed on an
+   "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+   KIND, either express or implied.  See the License for the
+   specific language governing permissions and limitations
+   under the License.
+-->
+<ivy-module version="2.0">
+  <info organisation="org.apache.lucene" module="analyzers-opennlp" />
+  <configurations defaultconfmapping="compile->master">
+    <conf name="compile" transitive="false"/>
+  </configurations>
+  <dependencies>
+    <dependency org="org.apache.opennlp" name="opennlp-tools" rev="${/org.apache.opennlp/opennlp-tools}" transitive="false" conf="compile" />
+    <dependency org="org.apache.opennlp" name="opennlp-maxent" rev="${/org.apache.opennlp/opennlp-maxent}" transitive="false" conf="compile" />
+    <exclude org="*" ext="*" matcher="regexp" type="${ivy.exclude.types}" />
+  </dependencies>
+</ivy-module>

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/3e2f9e62/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPChunkerFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPChunkerFilter.java b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPChunkerFilter.java
new file mode 100644
index 0000000..cfc47e6
--- /dev/null
+++ b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPChunkerFilter.java
@@ -0,0 +1,108 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.opennlp;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.opennlp.tools.NLPChunkerOp;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+import org.apache.lucene.util.AttributeSource;
+
+/**
+ * Run OpenNLP chunker.  Prerequisite: the OpenNLPTokenizer and OpenNLPPOSFilter must precede this filter.
+ * Tags terms in the TypeAttribute, replacing the POS tags previously put there by OpenNLPPOSFilter.
+ */
+public final class OpenNLPChunkerFilter extends TokenFilter {
+
+  private List<AttributeSource> sentenceTokenAttrs = new ArrayList<>();
+  private int tokenNum = 0;
+  private boolean moreTokensAvailable = true;
+  private String[] sentenceTerms = null;
+  private String[] sentenceTermPOSTags = null;
+
+  private final NLPChunkerOp chunkerOp;
+  private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
+  private final FlagsAttribute flagsAtt = addAttribute(FlagsAttribute.class);
+  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+
+  public OpenNLPChunkerFilter(TokenStream input, NLPChunkerOp chunkerOp) {
+    super(input);
+    this.chunkerOp = chunkerOp;
+  }
+
+  @Override
+  public final boolean incrementToken() throws IOException {
+    if ( ! moreTokensAvailable) {
+      clear();
+      return false;
+    }
+    if (tokenNum == sentenceTokenAttrs.size()) {
+      nextSentence();
+      if (sentenceTerms == null) {
+        clear();
+        return false;
+      }
+      assignTokenTypes(chunkerOp.getChunks(sentenceTerms, sentenceTermPOSTags, null));
+      tokenNum = 0;
+    }
+    clearAttributes();
+    sentenceTokenAttrs.get(tokenNum++).copyTo(this);
+    return true;
+  }
+
+  private void nextSentence() throws IOException {
+    List<String> termList = new ArrayList<>();
+    List<String> posTagList = new ArrayList<>();
+    sentenceTokenAttrs.clear();
+    boolean endOfSentence = false;
+    while ( ! endOfSentence && (moreTokensAvailable = input.incrementToken())) {
+      termList.add(termAtt.toString());
+      posTagList.add(typeAtt.type());
+      endOfSentence = 0 != (flagsAtt.getFlags() & OpenNLPTokenizer.EOS_FLAG_BIT);
+      sentenceTokenAttrs.add(input.cloneAttributes());
+    }
+    sentenceTerms = termList.size() > 0 ? termList.toArray(new String[termList.size()]) : null;
+    sentenceTermPOSTags = posTagList.size() > 0 ? posTagList.toArray(new String[posTagList.size()]) : null;
+  }
+
+  private void assignTokenTypes(String[] tags) {
+    for (int i = 0 ; i < tags.length ; ++i) {
+      sentenceTokenAttrs.get(i).getAttribute(TypeAttribute.class).setType(tags[i]);
+    }
+  }
+
+  @Override
+  public void reset() throws IOException {
+    super.reset();
+    moreTokensAvailable = true;
+    clear();
+  }
+
+  private void clear() {
+    sentenceTokenAttrs.clear();
+    sentenceTerms = null;
+    sentenceTermPOSTags = null;
+    tokenNum = 0;
+  }
+}

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/3e2f9e62/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPChunkerFilterFactory.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPChunkerFilterFactory.java b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPChunkerFilterFactory.java
new file mode 100644
index 0000000..96eb672
--- /dev/null
+++ b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPChunkerFilterFactory.java
@@ -0,0 +1,81 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.opennlp;
+
+import java.io.IOException;
+import java.util.Map;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.opennlp.tools.NLPChunkerOp;
+import org.apache.lucene.analysis.opennlp.tools.OpenNLPOpsFactory;
+import org.apache.lucene.analysis.util.ResourceLoader;
+import org.apache.lucene.analysis.util.ResourceLoaderAware;
+import org.apache.lucene.analysis.util.TokenFilterFactory;
+
+/**
+ * Factory for {@link OpenNLPChunkerFilter}.
+ *
+ * <pre class="prettyprint">
+ * &lt;fieldType name="text_opennlp_chunked" class="solr.TextField" positionIncrementGap="100"&gt;
+ *   &lt;analyzer&gt;
+ *     &lt;tokenizer class="solr.OpenNLPTokenizerFactory" sentenceModel="filename" tokenizerModel="filename"/&gt;
+ *     &lt;filter class="solr.OpenNLPPOSFilterFactory" posTaggerModel="filename"/&gt;
+ *     &lt;filter class="solr.OpenNLPChunkerFilterFactory" chunkerModel="filename"/&gt;
+ *   &lt;/analyzer&gt;
+ * &lt;/fieldType&gt;</pre>
+ * @since 7.3.0
+ */
+public class OpenNLPChunkerFilterFactory extends TokenFilterFactory implements ResourceLoaderAware {
+  public static final String CHUNKER_MODEL = "chunkerModel";
+
+  private final String chunkerModelFile;
+
+  public OpenNLPChunkerFilterFactory(Map<String,String> args) {
+    super(args);
+    chunkerModelFile = get(args, CHUNKER_MODEL);
+    if (!args.isEmpty()) {
+      throw new IllegalArgumentException("Unknown parameters: " + args);
+    }
+  }
+
+  @Override
+  public OpenNLPChunkerFilter create(TokenStream in) {
+    try {
+      NLPChunkerOp chunkerOp = null;
+
+      if (chunkerModelFile != null) {
+        chunkerOp = OpenNLPOpsFactory.getChunker(chunkerModelFile);
+      }
+      return new OpenNLPChunkerFilter(in, chunkerOp);
+    } catch (IOException e) {
+      throw new IllegalArgumentException(e);
+    }
+  }
+
+  @Override
+  public void inform(ResourceLoader loader) {
+    try {
+      // load and register read-only models in cache with file/resource names
+      if (chunkerModelFile != null) {
+        OpenNLPOpsFactory.getChunkerModel(chunkerModelFile, loader);
+      }
+    } catch (IOException e) {
+      throw new IllegalArgumentException(e);
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/3e2f9e62/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPLemmatizerFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPLemmatizerFilter.java b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPLemmatizerFilter.java
new file mode 100644
index 0000000..4c484b9
--- /dev/null
+++ b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPLemmatizerFilter.java
@@ -0,0 +1,123 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.opennlp;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.opennlp.tools.NLPLemmatizerOp;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
+import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+import org.apache.lucene.util.AttributeSource;
+
+/**
+ * <p>Runs OpenNLP dictionary-based and/or MaxEnt lemmatizers.</p>
+ * <p>
+ *   Both a dictionary-based lemmatizer and a MaxEnt lemmatizer are supported,
+ *   via the "dictionary" and "lemmatizerModel" params, respectively.
+ *   If both are configured, the dictionary-based lemmatizer is tried first,
+ *   and then the MaxEnt lemmatizer is consulted for out-of-vocabulary tokens.
+ * </p>
+ * <p>
+ *   The dictionary file must be encoded as UTF-8, with one entry per line,
+ *   in the form <tt>word[tab]lemma[tab]part-of-speech</tt>
+ * </p>
+ */
+public class OpenNLPLemmatizerFilter extends TokenFilter {
+  private final NLPLemmatizerOp lemmatizerOp;
+  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+  private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
+  private final KeywordAttribute keywordAtt = addAttribute(KeywordAttribute.class);
+  private final FlagsAttribute flagsAtt = addAttribute(FlagsAttribute.class);
+  private List<AttributeSource> sentenceTokenAttrs = new ArrayList<>();
+  private Iterator<AttributeSource> sentenceTokenAttrsIter = null;
+  private boolean moreTokensAvailable = true;
+  private String[] sentenceTokens = null;     // non-keyword tokens
+  private String[] sentenceTokenTypes = null; // types for non-keyword tokens
+  private String[] lemmas = null;             // lemmas for non-keyword tokens
+  private int lemmaNum = 0;                   // lemma counter
+
+  public OpenNLPLemmatizerFilter(TokenStream input, NLPLemmatizerOp lemmatizerOp) {
+    super(input);
+    this.lemmatizerOp = lemmatizerOp;
+  }
+
+  @Override
+  public final boolean incrementToken() throws IOException {
+    if ( ! moreTokensAvailable) {
+      clear();
+      return false;
+    }
+    if (sentenceTokenAttrsIter == null || ! sentenceTokenAttrsIter.hasNext()) {
+      nextSentence();
+      if (sentenceTokens == null) { // zero non-keyword tokens
+        clear();
+        return false;
+      }
+      lemmas = lemmatizerOp.lemmatize(sentenceTokens, sentenceTokenTypes);
+      lemmaNum = 0;
+      sentenceTokenAttrsIter = sentenceTokenAttrs.iterator();
+    }
+    clearAttributes();
+    sentenceTokenAttrsIter.next().copyTo(this);
+    if ( ! keywordAtt.isKeyword()) {
+      termAtt.setEmpty().append(lemmas[lemmaNum++]);
+    }
+    return true;
+
+  }
+
+  private void nextSentence() throws IOException {
+    List<String> tokenList = new ArrayList<>();
+    List<String> typeList = new ArrayList<>();
+    sentenceTokenAttrs.clear();
+    boolean endOfSentence = false;
+    while ( ! endOfSentence && (moreTokensAvailable = input.incrementToken())) {
+      if ( ! keywordAtt.isKeyword()) {
+        tokenList.add(termAtt.toString());
+        typeList.add(typeAtt.type());
+      }
+      endOfSentence = 0 != (flagsAtt.getFlags() & OpenNLPTokenizer.EOS_FLAG_BIT);
+      sentenceTokenAttrs.add(input.cloneAttributes());
+    }
+    sentenceTokens = tokenList.size() > 0 ? tokenList.toArray(new String[tokenList.size()]) : null;
+    sentenceTokenTypes = typeList.size() > 0 ? typeList.toArray(new String[typeList.size()]) : null;
+  }
+
+  @Override
+  public void reset() throws IOException {
+    super.reset();
+    moreTokensAvailable = true;
+    clear();
+  }
+
+  private void clear() {
+    sentenceTokenAttrs.clear();
+    sentenceTokenAttrsIter = null;
+    sentenceTokens = null;
+    sentenceTokenTypes = null;
+    lemmas = null;
+    lemmaNum = 0;
+  }
+}

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/3e2f9e62/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPLemmatizerFilterFactory.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPLemmatizerFilterFactory.java b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPLemmatizerFilterFactory.java
new file mode 100644
index 0000000..90a0e43
--- /dev/null
+++ b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPLemmatizerFilterFactory.java
@@ -0,0 +1,89 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.opennlp;
+
+import java.io.IOException;
+import java.util.Map;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.opennlp.tools.NLPLemmatizerOp;
+import org.apache.lucene.analysis.opennlp.tools.OpenNLPOpsFactory;
+import org.apache.lucene.analysis.util.ResourceLoader;
+import org.apache.lucene.analysis.util.ResourceLoaderAware;
+import org.apache.lucene.analysis.util.TokenFilterFactory;
+
+/**
+ * Factory for {@link OpenNLPLemmatizerFilter}.
+ *
+ * <pre class="prettyprint">
+ * &lt;fieldType name="text_opennlp_lemma" class="solr.TextField" positionIncrementGap="100"
+ *   &lt;analyzer&gt;
+ *     &lt;tokenizer class="solr.OpenNLPTokenizerFactory"
+ *                sentenceModel="filename"
+ *                tokenizerModel="filename"/&gt;
+ *     /&gt;
+ *     &lt;filter class="solr.OpenNLPLemmatizerFilterFactory"
+ *             dictionary="filename"
+ *             lemmatizerModel="filename"/&gt;
+ *   &lt;/analyzer&gt;
+ * &lt;/fieldType&gt;</pre>
+ * @since 7.3.0
+ */
+public class OpenNLPLemmatizerFilterFactory extends TokenFilterFactory implements ResourceLoaderAware {
+  public static final String DICTIONARY = "dictionary";
+  public static final String LEMMATIZER_MODEL = "lemmatizerModel";
+
+  private final String dictionaryFile;
+  private final String lemmatizerModelFile;
+
+  public OpenNLPLemmatizerFilterFactory(Map<String,String> args) {
+    super(args);
+    dictionaryFile = get(args, DICTIONARY);
+    lemmatizerModelFile = get(args, LEMMATIZER_MODEL);
+
+    if (dictionaryFile == null && lemmatizerModelFile == null) {
+      throw new IllegalArgumentException("Configuration Error: missing parameter: at least one of '"
+          + DICTIONARY + "' and '" + LEMMATIZER_MODEL + "' must be provided.");
+    }
+
+    if (!args.isEmpty()) {
+      throw new IllegalArgumentException("Unknown parameters: " + args);
+    }
+  }
+
+  @Override
+  public OpenNLPLemmatizerFilter create(TokenStream in) {
+    try {
+      NLPLemmatizerOp lemmatizerOp = OpenNLPOpsFactory.getLemmatizer(dictionaryFile, lemmatizerModelFile);
+      return new OpenNLPLemmatizerFilter(in, lemmatizerOp);
+    } catch (IOException e) {
+      throw new RuntimeException(e);
+    }
+  }
+
+  @Override
+  public void inform(ResourceLoader loader) throws IOException {
+    // register models in cache with file/resource names
+    if (dictionaryFile != null) {
+      OpenNLPOpsFactory.getLemmatizerDictionary(dictionaryFile, loader);
+    }
+    if (lemmatizerModelFile != null) {
+      OpenNLPOpsFactory.getLemmatizerModel(lemmatizerModelFile, loader);
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/3e2f9e62/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPPOSFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPPOSFilter.java b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPPOSFilter.java
new file mode 100644
index 0000000..a5bea28
--- /dev/null
+++ b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPPOSFilter.java
@@ -0,0 +1,96 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.opennlp;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.opennlp.tools.NLPPOSTaggerOp;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+import org.apache.lucene.util.AttributeSource;
+
+/**
+ * Run OpenNLP POS tagger.  Tags all terms in the TypeAttribute.
+ */
+public final class OpenNLPPOSFilter extends TokenFilter {
+
+  private List<AttributeSource> sentenceTokenAttrs = new ArrayList<>();
+  String[] tags = null;
+  private int tokenNum = 0;
+  private boolean moreTokensAvailable = true;
+
+  private final NLPPOSTaggerOp posTaggerOp;
+  private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
+  private final FlagsAttribute flagsAtt = addAttribute(FlagsAttribute.class);
+  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+
+  public OpenNLPPOSFilter(TokenStream input, NLPPOSTaggerOp posTaggerOp) {
+    super(input);
+    this.posTaggerOp = posTaggerOp;
+  }
+
+  @Override
+  public final boolean incrementToken() throws IOException {
+    if ( ! moreTokensAvailable) {
+      clear();
+      return false;
+    }
+    if (tokenNum == sentenceTokenAttrs.size()) { // beginning of stream, or previous sentence exhausted
+      String[] sentenceTokens = nextSentence();
+      if (sentenceTokens == null) {
+        clear();
+        return false;
+      }
+      tags = posTaggerOp.getPOSTags(sentenceTokens);
+      tokenNum = 0;
+    }
+    clearAttributes();
+    sentenceTokenAttrs.get(tokenNum).copyTo(this);
+    typeAtt.setType(tags[tokenNum++]);
+    return true;
+  }
+
+  private String[] nextSentence() throws IOException {
+    List<String> termList = new ArrayList<>();
+    sentenceTokenAttrs.clear();
+    boolean endOfSentence = false;
+    while ( ! endOfSentence && (moreTokensAvailable = input.incrementToken())) {
+      termList.add(termAtt.toString());
+      endOfSentence = 0 != (flagsAtt.getFlags() & OpenNLPTokenizer.EOS_FLAG_BIT);
+      sentenceTokenAttrs.add(input.cloneAttributes());
+    }
+    return termList.size() > 0 ? termList.toArray(new String[termList.size()]) : null;
+  }
+
+  @Override
+  public void reset() throws IOException {
+    super.reset();
+    moreTokensAvailable = true;
+  }
+
+  private void clear() {
+    sentenceTokenAttrs.clear();
+    tags = null;
+    tokenNum = 0;
+  }
+}

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/3e2f9e62/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPPOSFilterFactory.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPPOSFilterFactory.java b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPPOSFilterFactory.java
new file mode 100644
index 0000000..952218f
--- /dev/null
+++ b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPPOSFilterFactory.java
@@ -0,0 +1,71 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.opennlp;
+
+import java.io.IOException;
+import java.util.Map;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.opennlp.tools.OpenNLPOpsFactory;
+import org.apache.lucene.analysis.util.ResourceLoader;
+import org.apache.lucene.analysis.util.ResourceLoaderAware;
+import org.apache.lucene.analysis.util.TokenFilterFactory;
+
+/**
+ * Factory for {@link OpenNLPPOSFilter}.
+ *
+ * <pre class="prettyprint">
+ * &lt;fieldType name="text_opennlp_pos" class="solr.TextField" positionIncrementGap="100"&gt;
+ *   &lt;analyzer&gt;
+ *     &lt;tokenizer class="solr.OpenNLPTokenizerFactory" sentenceModel="filename" tokenizerModel="filename"/&gt;
+ *     &lt;filter class="solr.OpenNLPPOSFilterFactory" posTaggerModel="filename"/&gt;
+ *   &lt;/analyzer&gt;
+ * &lt;/fieldType&gt;</pre>
+ * @since 7.3.0
+ */
+public class OpenNLPPOSFilterFactory extends TokenFilterFactory implements ResourceLoaderAware {
+  public static final String POS_TAGGER_MODEL = "posTaggerModel";
+
+  private final String posTaggerModelFile;
+
+  public OpenNLPPOSFilterFactory(Map<String,String> args) {
+    super(args);
+    posTaggerModelFile = require(args, POS_TAGGER_MODEL);
+    if (!args.isEmpty()) {
+      throw new IllegalArgumentException("Unknown parameters: " + args);
+    }
+  }
+
+  @Override
+  public OpenNLPPOSFilter create(TokenStream in) {
+    try {
+      return new OpenNLPPOSFilter(in, OpenNLPOpsFactory.getPOSTagger(posTaggerModelFile));
+    } catch (IOException e) {
+      throw new IllegalArgumentException(e);
+    }
+  }
+
+  @Override
+  public void inform(ResourceLoader loader) {
+    try { // load and register the read-only model in cache with file/resource name
+      OpenNLPOpsFactory.getPOSTaggerModel(posTaggerModelFile, loader);
+    } catch (IOException e) {
+      throw new IllegalArgumentException(e);
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/3e2f9e62/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPSentenceBreakIterator.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPSentenceBreakIterator.java b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPSentenceBreakIterator.java
new file mode 100644
index 0000000..f69fbc6
--- /dev/null
+++ b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPSentenceBreakIterator.java
@@ -0,0 +1,224 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.opennlp;
+
+import java.text.BreakIterator;
+import java.text.CharacterIterator;
+
+import opennlp.tools.util.Span;
+import org.apache.lucene.analysis.opennlp.tools.NLPSentenceDetectorOp;
+import org.apache.lucene.analysis.util.CharArrayIterator;
+
+/**
+ * A {@link BreakIterator} that splits sentences using an OpenNLP sentence chunking model.
+ */
+public final class OpenNLPSentenceBreakIterator extends BreakIterator {
+
+  private CharacterIterator text;
+  private int currentSentence;
+  private int[] sentenceStarts;
+  private NLPSentenceDetectorOp sentenceOp;
+
+  public OpenNLPSentenceBreakIterator(NLPSentenceDetectorOp sentenceOp) {
+    this.sentenceOp = sentenceOp;
+  }
+
+  @Override
+  public int current() {
+    return text.getIndex();
+  }
+
+  @Override
+  public int first() {
+    currentSentence = 0;
+    text.setIndex(text.getBeginIndex());
+    return current();
+  }
+
+  @Override
+  public int last() {
+    if (sentenceStarts.length > 0) {
+      currentSentence = sentenceStarts.length - 1;
+      text.setIndex(text.getEndIndex());
+    } else { // there are no sentences; both the first and last positions are the begin index
+      currentSentence = 0;
+      text.setIndex(text.getBeginIndex());
+    }
+    return current();
+  }
+
+  @Override
+  public int next() {
+    if (text.getIndex() == text.getEndIndex() || 0 == sentenceStarts.length) {
+      return DONE;
+    } else if (currentSentence < sentenceStarts.length - 1) {
+      text.setIndex(sentenceStarts[++currentSentence]);
+      return current();
+    } else {
+      return last();
+    }
+  }
+
+  @Override
+  public int following(int pos) {
+    if (pos < text.getBeginIndex() || pos > text.getEndIndex()) {
+      throw new IllegalArgumentException("offset out of bounds");
+    } else if (0 == sentenceStarts.length) {
+      text.setIndex(text.getBeginIndex());
+      return DONE;
+    } else if (pos >= sentenceStarts[sentenceStarts.length - 1]) {
+      // this conflicts with the javadocs, but matches actual behavior (Oracle has a bug in something)
+      // https://bugs.openjdk.java.net/browse/JDK-8015110
+      text.setIndex(text.getEndIndex());
+      currentSentence = sentenceStarts.length - 1;
+      return DONE;
+    } else { // there are at least two sentences
+      currentSentence = (sentenceStarts.length - 1) / 2; // start search from the middle
+      moveToSentenceAt(pos, 0, sentenceStarts.length - 2);
+      text.setIndex(sentenceStarts[++currentSentence]);
+      return current();
+    }
+  }
+
+  /** Binary search over sentences */
+  private void moveToSentenceAt(int pos, int minSentence, int maxSentence) {
+    if (minSentence != maxSentence) {
+      if (pos < sentenceStarts[currentSentence]) {
+        int newMaxSentence = currentSentence - 1;
+        currentSentence = minSentence + (currentSentence - minSentence) / 2;
+        moveToSentenceAt(pos, minSentence, newMaxSentence);
+      } else if (pos >= sentenceStarts[currentSentence + 1]) {
+        int newMinSentence = currentSentence + 1;
+        currentSentence = maxSentence - (maxSentence - currentSentence) / 2;
+        moveToSentenceAt(pos, newMinSentence, maxSentence);
+      }
+    } else {
+      assert currentSentence == minSentence;
+      assert pos >= sentenceStarts[currentSentence];
+      assert (currentSentence == sentenceStarts.length - 1 && pos <= text.getEndIndex())
+          || pos < sentenceStarts[currentSentence + 1];
+    }
+    // we have arrived - nothing to do
+  }
+
+  @Override
+  public int previous() {
+    if (text.getIndex() == text.getBeginIndex()) {
+      return DONE;
+    } else {
+      if (0 == sentenceStarts.length) {
+        text.setIndex(text.getBeginIndex());
+        return DONE;
+      }
+      if (text.getIndex() == text.getEndIndex()) {
+        text.setIndex(sentenceStarts[currentSentence]);
+      } else {
+        text.setIndex(sentenceStarts[--currentSentence]);
+      }
+      return current();
+    }
+  }
+
+  @Override
+  public int preceding(int pos) {
+    if (pos < text.getBeginIndex() || pos > text.getEndIndex()) {
+      throw new IllegalArgumentException("offset out of bounds");
+    } else if (0 == sentenceStarts.length) {
+      text.setIndex(text.getBeginIndex());
+      currentSentence = 0;
+      return DONE;
+    } else if (pos < sentenceStarts[0]) {
+      // this conflicts with the javadocs, but matches actual behavior (Oracle has a bug in something)
+      // https://bugs.openjdk.java.net/browse/JDK-8015110
+      text.setIndex(text.getBeginIndex());
+      currentSentence = 0;
+      return DONE;
+    } else {
+      currentSentence = sentenceStarts.length / 2; // start search from the middle
+      moveToSentenceAt(pos, 0, sentenceStarts.length - 1);
+      if (0 == currentSentence) {
+        text.setIndex(text.getBeginIndex());
+        return DONE;
+      } else {
+        text.setIndex(sentenceStarts[--currentSentence]);
+        return current();
+      }
+    }
+  }
+
+  @Override
+  public int next(int n) {
+    currentSentence += n;
+    if (n < 0) {
+      if (text.getIndex() == text.getEndIndex()) {
+        ++currentSentence;
+      }
+      if (currentSentence < 0) {
+        currentSentence = 0;
+        text.setIndex(text.getBeginIndex());
+        return DONE;
+      } else {
+        text.setIndex(sentenceStarts[currentSentence]);
+      }
+    } else if (n > 0) {
+      if (currentSentence >= sentenceStarts.length) {
+        currentSentence = sentenceStarts.length - 1;
+        text.setIndex(text.getEndIndex());
+        return DONE;
+      } else {
+        text.setIndex(sentenceStarts[currentSentence]);
+      }
+    }
+    return current();
+  }
+
+  @Override
+  public CharacterIterator getText() {
+    return text;
+  }
+
+  @Override
+  public void setText(CharacterIterator newText) {
+    text = newText;
+    text.setIndex(text.getBeginIndex());
+    currentSentence = 0;
+    Span[] spans = sentenceOp.splitSentences(characterIteratorToString());
+    sentenceStarts = new int[spans.length];
+    for (int i = 0; i < spans.length; ++i) {
+      // Adjust start positions to match those of the passed-in CharacterIterator
+      sentenceStarts[i] = spans[i].getStart() + text.getBeginIndex();
+    }
+  }
+
+  private String characterIteratorToString() {
+    String fullText;
+    if (text instanceof CharArrayIterator) {
+      CharArrayIterator charArrayIterator = (CharArrayIterator)text;
+      fullText = new String(charArrayIterator.getText(), charArrayIterator.getStart(), charArrayIterator.getLength());
+    } else {
+      // TODO: is there a better way to extract full text from arbitrary CharacterIterators?
+      StringBuilder builder = new StringBuilder();
+      for (char ch = text.first(); ch != CharacterIterator.DONE; ch = text.next()) {
+        builder.append(ch);
+      }
+      fullText = builder.toString();
+      text.setIndex(text.getBeginIndex());
+    }
+    return fullText;
+  }
+}

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/3e2f9e62/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPTokenizer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPTokenizer.java b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPTokenizer.java
new file mode 100644
index 0000000..75a3b81
--- /dev/null
+++ b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPTokenizer.java
@@ -0,0 +1,98 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.opennlp;
+
+import java.io.IOException;
+
+import opennlp.tools.util.Span;
+
+import org.apache.lucene.analysis.opennlp.tools.NLPSentenceDetectorOp;
+import org.apache.lucene.analysis.opennlp.tools.NLPTokenizerOp;
+import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.util.SegmentingTokenizerBase;
+import org.apache.lucene.util.AttributeFactory;
+
+/**
+ * Run OpenNLP SentenceDetector and Tokenizer.
+ * The last token in each sentence is marked by setting the {@link #EOS_FLAG_BIT} in the FlagsAttribute;
+ * following filters can use this information to apply operations to tokens one sentence at a time.
+ */
+public final class OpenNLPTokenizer extends SegmentingTokenizerBase {
+  public static int EOS_FLAG_BIT = 1;
+
+  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+  private final FlagsAttribute flagsAtt = addAttribute(FlagsAttribute.class);
+  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
+
+  private Span[] termSpans = null;
+  private int termNum = 0;
+  private int sentenceStart = 0;
+
+  private NLPSentenceDetectorOp sentenceOp = null;
+  private NLPTokenizerOp tokenizerOp = null;
+
+  public OpenNLPTokenizer(AttributeFactory factory, NLPSentenceDetectorOp sentenceOp, NLPTokenizerOp tokenizerOp) throws IOException {
+    super(factory, new OpenNLPSentenceBreakIterator(sentenceOp));
+    if (sentenceOp == null || tokenizerOp == null) {
+      throw new IllegalArgumentException("OpenNLPTokenizer: both a Sentence Detector and a Tokenizer are required");
+    }
+    this.sentenceOp = sentenceOp;
+    this.tokenizerOp = tokenizerOp;
+  }
+
+  @Override
+  public void close() throws IOException {
+    super.close();
+    termSpans = null;
+    termNum = sentenceStart = 0;
+  };
+
+  @Override
+  protected void setNextSentence(int sentenceStart, int sentenceEnd) {
+    this.sentenceStart = sentenceStart;
+    String sentenceText = new String(buffer, sentenceStart, sentenceEnd - sentenceStart);
+    termSpans = tokenizerOp.getTerms(sentenceText);
+    termNum = 0;
+  }
+
+  @Override
+  protected boolean incrementWord() {
+    if (termSpans == null || termNum == termSpans.length) {
+      return false;
+    }
+    clearAttributes();
+    Span term = termSpans[termNum];
+    termAtt.copyBuffer(buffer, sentenceStart + term.getStart(), term.length());
+    offsetAtt.setOffset(correctOffset(offset + sentenceStart + term.getStart()),
+                        correctOffset(offset + sentenceStart + term.getEnd()));
+    if (termNum == termSpans.length - 1) {
+      flagsAtt.setFlags(flagsAtt.getFlags() | EOS_FLAG_BIT); // mark the last token in the sentence with EOS_FLAG_BIT
+    }
+    ++termNum;
+    return true;
+  }
+
+  @Override
+  public void reset() throws IOException {
+    super.reset();
+    termSpans = null;
+    termNum = sentenceStart = 0;
+  }
+}

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/3e2f9e62/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPTokenizerFactory.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPTokenizerFactory.java b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPTokenizerFactory.java
new file mode 100644
index 0000000..a60f23f
--- /dev/null
+++ b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPTokenizerFactory.java
@@ -0,0 +1,79 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.opennlp;
+
+import java.io.IOException;
+import java.util.Map;
+
+import org.apache.lucene.analysis.opennlp.tools.NLPSentenceDetectorOp;
+import org.apache.lucene.analysis.opennlp.tools.NLPTokenizerOp;
+import org.apache.lucene.analysis.opennlp.tools.OpenNLPOpsFactory;
+import org.apache.lucene.analysis.util.ResourceLoader;
+import org.apache.lucene.analysis.util.ResourceLoaderAware;
+import org.apache.lucene.analysis.util.TokenizerFactory;
+import org.apache.lucene.util.AttributeFactory;
+
+/**
+ * Factory for {@link OpenNLPTokenizer}.
+ *
+ * <pre class="prettyprint">
+ * &lt;fieldType name="text_opennlp" class="solr.TextField" positionIncrementGap="100"
+ *   &lt;analyzer&gt;
+ *     &lt;tokenizer class="solr.OpenNLPTokenizerFactory" sentenceModel="filename" tokenizerModel="filename"/&gt;
+ *   &lt;/analyzer&gt;
+ * &lt;/fieldType&gt;</pre>
+ * @since 7.3.0
+ */
+public class OpenNLPTokenizerFactory extends TokenizerFactory implements ResourceLoaderAware {
+  public static final String SENTENCE_MODEL = "sentenceModel";
+  public static final String TOKENIZER_MODEL = "tokenizerModel";
+
+  private final String sentenceModelFile;
+  private final String tokenizerModelFile;
+
+  public OpenNLPTokenizerFactory(Map<String,String> args) {
+    super(args);
+    sentenceModelFile = require(args, SENTENCE_MODEL);
+    tokenizerModelFile = require(args, TOKENIZER_MODEL);
+    if ( ! args.isEmpty()) {
+      throw new IllegalArgumentException("Unknown parameters: " + args);
+    }
+  }
+
+  @Override
+  public OpenNLPTokenizer create(AttributeFactory factory) {
+    try {
+      NLPSentenceDetectorOp sentenceOp = OpenNLPOpsFactory.getSentenceDetector(sentenceModelFile);
+      NLPTokenizerOp tokenizerOp = OpenNLPOpsFactory.getTokenizer(tokenizerModelFile);
+      return new OpenNLPTokenizer(factory, sentenceOp, tokenizerOp);
+    } catch (IOException e) {
+      throw new RuntimeException(e);
+    }
+  }
+
+  @Override
+  public void inform(ResourceLoader loader) throws IOException {
+    // register models in cache with file/resource names
+    if (sentenceModelFile != null) {
+      OpenNLPOpsFactory.getSentenceModel(sentenceModelFile, loader);
+    }
+    if (tokenizerModelFile != null) {
+      OpenNLPOpsFactory.getTokenizerModel(tokenizerModelFile, loader);
+    }
+  }
+}


Mime
View raw message