opennlp-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From ma...@apache.org
Subject svn commit: r1550134 - in /opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder: ./ impls/
Date Wed, 11 Dec 2013 13:37:36 GMT
Author: markg
Date: Wed Dec 11 13:37:36 2013
New Revision: 1550134

URL: http://svn.apache.org/r1550134
Log:
OPENNLP-607
Fixed many issues. Added default file-based impls for all interfaces, and created a util class
wrapper to allow for easy use of the default implementations.

Added:
    opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/DefaultModelBuilderUtil.java
    opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/KnownEntityProvider.java
      - copied, changed from r1544512, opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/KnownEntityProvider.java
    opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/ModelGenerationValidator.java
      - copied, changed from r1544512, opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/ModelGenerationValidator.java
    opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/ModelParameter.java
      - copied, changed from r1544512, opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/ModelParameter.java
    opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/Modelable.java
      - copied, changed from r1544512, opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/Modelable.java
    opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/SemiSupervisedModelGenerator.java
      - copied, changed from r1544512, opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/SemiSupervisedModelGenerator.java
    opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/SentenceProvider.java
      - copied, changed from r1544512, opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/SentenceProvider.java
    opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/impls/
    opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/impls/BaseModelBuilderParams.java
    opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/impls/FileKnownEntityProvider.java
      - copied, changed from r1544512, opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/impls/FileKnownEntityProvider.java
    opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/impls/FileModelValidatorImpl.java
      - copied, changed from r1544512, opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/impls/FileModelValidatorImpl.java
    opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/impls/FileSentenceProvider.java
      - copied, changed from r1544512, opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/impls/FileSentenceProvider.java
    opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/impls/GenericModelGenerator.java
      - copied, changed from r1544512, opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/GenericModelGenerator.java
    opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/impls/GenericModelableImpl.java
      - copied, changed from r1544512, opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/impls/ModelableImpl.java

Added: opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/DefaultModelBuilderUtil.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/DefaultModelBuilderUtil.java?rev=1550134&view=auto
==============================================================================
--- opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/DefaultModelBuilderUtil.java
(added)
+++ opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/DefaultModelBuilderUtil.java
Wed Dec 11 13:37:36 2013
@@ -0,0 +1,117 @@
+/*
+ * Copyright 2013 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.modelbuilder;
+
+import java.io.File;
+import opennlp.modelbuilder.impls.BaseModelBuilderParams;
+import opennlp.modelbuilder.impls.FileKnownEntityProvider;
+import opennlp.modelbuilder.impls.FileModelValidatorImpl;
+import opennlp.modelbuilder.impls.FileSentenceProvider;
+import opennlp.modelbuilder.impls.GenericModelGenerator;
+import opennlp.modelbuilder.impls.GenericModelableImpl;
+
+/**
+ *
+ * Utilizes the file-based implementations to produce an NER model from
+ * user-defined data.
+ * The basic processing is as follows:
+ * read in the list of known entities
+ * annotate the sentences based on the list of known entities
+ * create a model from the annotations
+ * perform NER with the model on the sentences
+ * add the NER results to the annotations
+ * rebuild the model
+ * loop
+ */
+public class DefaultModelBuilderUtil {
+
+  /**
+   *
+   * @param sentences                a file that contains one sentence per line.
+   *                                 There should be at least 15K sentences
+   *                                 consisting of a representative sample from
+   *                                 user data
+   * @param knownEntities            a file consisting of a simple list of
+   *                                 unambiguous entities, one entry per line.
+   *                                 For instance, if one was trying to build a
+   *                                 person NER model then this file would be a
+   *                                 list of person names that are unambiguous
+   *                                 and are known to exist in the sentences
+   *                                 file
+   * @param knownEntitiesBlacklist   This file contains a list of known bad hits
+   *                                 that the NER phase of this processing might
+   *                                 catch early on, before the model iterates
+   *                                 to maturity
+   * @param modelOutFile             the location where the model will be
+   *                                 written to
+   * @param annotatedSentenceOutFile where the annotated sentences produced by
+   *                                 this process will be written to
+   * @param namedEntityType          the type of entity... for example, person,
+   *                                 location, organization...
+   * @param iterations               how many times to repeat the iterative loop
+   *                                 of annotation, model generation, and NER
+   */
+  public static void generateModel(File sentences, File knownEntities, File knownEntitiesBlacklist,
+          File modelOutFile, File annotatedSentenceOutFile, String namedEntityType, int iterations) {
+    SemiSupervisedModelGenerator modelGenerator = new GenericModelGenerator();
+    BaseModelBuilderParams params = new BaseModelBuilderParams();
+    params.setAnnotatedTrainingDataFile(annotatedSentenceOutFile);
+    params.setSentenceFile(sentences);
+    params.setEntityType(namedEntityType);
+    params.setKnownEntitiesFile(knownEntities);
+    params.setModelFile(modelOutFile);
+    params.setKnownEntityBlacklist(knownEntitiesBlacklist);
+    /**
+     * sentence providers feed this process with user data derived sentences
+     * this impl just reads line by line through a file
+     */
+    SentenceProvider sentenceProvider = new FileSentenceProvider();
+    sentenceProvider.setParameters(params);
+    /**
+     * KnownEntityProviders provide a seed list of known entities... such as
+     * Barack Obama for person, or Germany for location. Obviously these would
+     * want to be prolific, unambiguous names
+     */
+    KnownEntityProvider knownEntityProvider = new FileKnownEntityProvider();
+    knownEntityProvider.setParameters(params);
+    /**
+     * ModelGenerationValidators try to weed out bad hits by the iterations of
+     * the name finder. Since this is a recursive process, with each iteration
+     * the namefinder will get more and more greedy if bad entities are allowed
+     * in. This provides a mechanism for throwing out obviously bad hits. A good
+     * impl may be to make sure a location is actually within a noun phrase
+     * etc... Users can make this as specific as they need for their data and
+     * their use case
+     */
+    ModelGenerationValidator validator = new FileModelValidatorImpl();
+    validator.setParameters(params);
+    /**
+     * Modelables write and read the annotated sentences, as well as create and
+     * write the NER models
+     */
+    Modelable modelable = new GenericModelableImpl();
+    modelable.setParameters(params);
+
+    /**
+     * the modelGenerator actually runs the process with a set number of
+     * iterations... could be better by actually calculating the diff between
+     * runs and stopping based on a threshold, but for extremely large sentence
+     * sets this may be too much.
+     */
+    modelGenerator.build(sentenceProvider, knownEntityProvider, validator, modelable, iterations);
+
+  }
+}

Copied: opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/KnownEntityProvider.java
(from r1544512, opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/KnownEntityProvider.java)
URL: http://svn.apache.org/viewvc/opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/KnownEntityProvider.java?p2=opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/KnownEntityProvider.java&p1=opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/KnownEntityProvider.java&r1=1544512&r2=1550134&rev=1550134&view=diff
==============================================================================
--- opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/KnownEntityProvider.java
(original)
+++ opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/KnownEntityProvider.java
Wed Dec 11 13:37:36 2013
@@ -13,7 +13,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-package opennlp.modelbuilder.v2;
+package opennlp.modelbuilder;
 
 import java.util.Set;
 

Copied: opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/ModelGenerationValidator.java
(from r1544512, opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/ModelGenerationValidator.java)
URL: http://svn.apache.org/viewvc/opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/ModelGenerationValidator.java?p2=opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/ModelGenerationValidator.java&p1=opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/ModelGenerationValidator.java&r1=1544512&r2=1550134&rev=1550134&view=diff
==============================================================================
--- opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/ModelGenerationValidator.java
(original)
+++ opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/ModelGenerationValidator.java
Wed Dec 11 13:37:36 2013
@@ -13,7 +13,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-package opennlp.modelbuilder.v2;
+package opennlp.modelbuilder;
 
 import java.util.Collection;
 

Copied: opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/ModelParameter.java
(from r1544512, opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/ModelParameter.java)
URL: http://svn.apache.org/viewvc/opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/ModelParameter.java?p2=opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/ModelParameter.java&p1=opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/ModelParameter.java&r1=1544512&r2=1550134&rev=1550134&view=diff
==============================================================================
--- opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/ModelParameter.java
(original)
+++ opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/ModelParameter.java
Wed Dec 11 13:37:36 2013
@@ -13,14 +13,16 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-package opennlp.modelbuilder.v2;
+package opennlp.modelbuilder;
 
-import java.util.Map;
+import opennlp.modelbuilder.impls.BaseModelBuilderParams;
 
 /**
  *
  */
-public interface ModelParameter {
+public interface ModelParameter<T extends  BaseModelBuilderParams>{
    
-  void setParameters(Map<String, String> params);
+  void setParameters(T params);
+  
+
 }

Copied: opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/Modelable.java
(from r1544512, opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/Modelable.java)
URL: http://svn.apache.org/viewvc/opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/Modelable.java?p2=opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/Modelable.java&p1=opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/Modelable.java&r1=1544512&r2=1550134&rev=1550134&view=diff
==============================================================================
--- opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/Modelable.java
(original)
+++ opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/Modelable.java
Wed Dec 11 13:37:36 2013
@@ -13,7 +13,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-package opennlp.modelbuilder.v2;
+package opennlp.modelbuilder;
 
 import java.util.Set;
 import opennlp.tools.namefind.TokenNameFinderModel;

Copied: opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/SemiSupervisedModelGenerator.java
(from r1544512, opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/SemiSupervisedModelGenerator.java)
URL: http://svn.apache.org/viewvc/opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/SemiSupervisedModelGenerator.java?p2=opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/SemiSupervisedModelGenerator.java&p1=opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/SemiSupervisedModelGenerator.java&r1=1544512&r2=1550134&rev=1550134&view=diff
==============================================================================
--- opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/SemiSupervisedModelGenerator.java
(original)
+++ opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/SemiSupervisedModelGenerator.java
Wed Dec 11 13:37:36 2013
@@ -13,13 +13,15 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-package opennlp.modelbuilder.v2;
+package opennlp.modelbuilder;
+
+import opennlp.modelbuilder.impls.BaseModelBuilderParams;
 
 /**
  *
 
  */
-public interface SemiSupervisedModelGenerator extends ModelParameter {
+public interface SemiSupervisedModelGenerator extends ModelParameter<BaseModelBuilderParams> {
 
   void build(SentenceProvider sentenceProvider, KnownEntityProvider knownEntityProvider,

           ModelGenerationValidator validator, Modelable modelable, int iterations);

Copied: opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/SentenceProvider.java
(from r1544512, opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/SentenceProvider.java)
URL: http://svn.apache.org/viewvc/opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/SentenceProvider.java?p2=opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/SentenceProvider.java&p1=opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/SentenceProvider.java&r1=1544512&r2=1550134&rev=1550134&view=diff
==============================================================================
--- opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/SentenceProvider.java
(original)
+++ opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/SentenceProvider.java
Wed Dec 11 13:37:36 2013
@@ -13,14 +13,15 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-package opennlp.modelbuilder.v2;
+package opennlp.modelbuilder;
 
 import java.util.Set;
+import opennlp.modelbuilder.impls.BaseModelBuilderParams;
 
 /**
  *
  */
-public interface SentenceProvider extends ModelParameter {
+public interface SentenceProvider extends ModelParameter<BaseModelBuilderParams> {
 
   Set<String> getSentences();
 }

Added: opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/impls/BaseModelBuilderParams.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/impls/BaseModelBuilderParams.java?rev=1550134&view=auto
==============================================================================
--- opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/impls/BaseModelBuilderParams.java
(added)
+++ opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/impls/BaseModelBuilderParams.java
Wed Dec 11 13:37:36 2013
@@ -0,0 +1,90 @@
+/*
+ * Copyright 2013 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.modelbuilder.impls;
+
+import java.io.File;
+import java.util.Map;
+
+/**
+ *
+ * Used to pass params through the processing
+ */
+public class BaseModelBuilderParams {
+
+  private File modelFile;
+  private File sentenceFile;
+  private File knownEntitiesFile;
+  private File knownEntityBlacklist;
+  private File annotatedTrainingDataFile;
+  private String entityType;
+  private Map<String, String> additionalParams;
+
+  public File getModelFile() {
+    return modelFile;
+  }
+
+  public void setModelFile(File modelFile) {
+    this.modelFile = modelFile;
+  }
+
+  public File getSentenceFile() {
+    return sentenceFile;
+  }
+
+  public void setSentenceFile(File sentenceFile) {
+    this.sentenceFile = sentenceFile;
+  }
+
+  public File getKnownEntitiesFile() {
+    return knownEntitiesFile;
+  }
+
+  public void setKnownEntitiesFile(File knownEntitiesFile) {
+    this.knownEntitiesFile = knownEntitiesFile;
+  }
+
+  public File getKnownEntityBlacklist() {
+    return knownEntityBlacklist;
+  }
+
+  public void setKnownEntityBlacklist(File knownEntityBlacklist) {
+    this.knownEntityBlacklist = knownEntityBlacklist;
+  }
+
+  public Map<String, String> getAdditionalParams() {
+    return additionalParams;
+  }
+
+  public void setAdditionalParams(Map<String, String> additionalParams) {
+    this.additionalParams = additionalParams;
+  }
+
+  public String getEntityType() {
+    return entityType;
+  }
+
+  public void setEntityType(String entityType) {
+    this.entityType = entityType;
+  }
+
+  public File getAnnotatedTrainingDataFile() {
+    return annotatedTrainingDataFile;
+  }
+
+  public void setAnnotatedTrainingDataFile(File annotatedTrainingDataFile) {
+    this.annotatedTrainingDataFile = annotatedTrainingDataFile;
+  }
+}
\ No newline at end of file

Copied: opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/impls/FileKnownEntityProvider.java
(from r1544512, opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/impls/FileKnownEntityProvider.java)
URL: http://svn.apache.org/viewvc/opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/impls/FileKnownEntityProvider.java?p2=opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/impls/FileKnownEntityProvider.java&p1=opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/impls/FileKnownEntityProvider.java&r1=1544512&r2=1550134&rev=1550134&view=diff
==============================================================================
--- opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/impls/FileKnownEntityProvider.java
(original)
+++ opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/impls/FileKnownEntityProvider.java
Wed Dec 11 13:37:36 2013
@@ -13,7 +13,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-package opennlp.modelbuilder.v2.impls;
+package opennlp.modelbuilder.impls;
 
 import java.io.BufferedReader;
 import java.io.FileInputStream;
@@ -22,21 +22,19 @@ import java.io.IOException;
 import java.io.InputStream;
 import java.io.InputStreamReader;
 import java.nio.charset.Charset;
-import java.util.HashMap;
 import java.util.HashSet;
-import java.util.Map;
 import java.util.Set;
 import java.util.logging.Level;
 import java.util.logging.Logger;
-import opennlp.modelbuilder.v2.KnownEntityProvider;
+import opennlp.modelbuilder.KnownEntityProvider;
 
 /**
  *
  */
 public class FileKnownEntityProvider implements KnownEntityProvider {
-  private Map<String, String> params = new HashMap<String, String>();
+ 
   Set<String> knownEntities = new HashSet<String>();
-
+  BaseModelBuilderParams params;
   @Override
   public Set<String> getKnownEntities() {
     if (knownEntities.isEmpty()) {
@@ -45,10 +43,10 @@ public class FileKnownEntityProvider imp
         BufferedReader br;
         String line;
 
-        fis = new FileInputStream(params.get("knownentityfile"));
+        fis = new FileInputStream(params.getKnownEntitiesFile());
         br = new BufferedReader(new InputStreamReader(fis, Charset.forName("UTF-8")));
         while ((line = br.readLine()) != null) {
-          knownEntities.add(line.split("\t")[2]);
+          knownEntities.add(line);
         }
 
         // Done with the file
@@ -72,13 +70,13 @@ public class FileKnownEntityProvider imp
   @Override
   public String getKnownEntitiesType() {
  
-    return params.get("knownentitytype");
+    return params.getEntityType();
   }
 
 
 
   @Override
-  public void setParameters(Map<String, String> params) {
+ public void setParameters(BaseModelBuilderParams params) {
     this.params = params;
   }
 }

Copied: opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/impls/FileModelValidatorImpl.java
(from r1544512, opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/impls/FileModelValidatorImpl.java)
URL: http://svn.apache.org/viewvc/opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/impls/FileModelValidatorImpl.java?p2=opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/impls/FileModelValidatorImpl.java&p1=opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/impls/FileModelValidatorImpl.java&r1=1544512&r2=1550134&rev=1550134&view=diff
==============================================================================
--- opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/impls/FileModelValidatorImpl.java
(original)
+++ opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/impls/FileModelValidatorImpl.java
Wed Dec 11 13:37:36 2013
@@ -13,7 +13,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-package opennlp.modelbuilder.v2.impls;
+package opennlp.modelbuilder.impls;
 
 import java.io.BufferedReader;
 import java.io.FileInputStream;
@@ -23,27 +23,22 @@ import java.io.InputStream;
 import java.io.InputStreamReader;
 import java.nio.charset.Charset;
 import java.util.Collection;
-import java.util.HashMap;
 import java.util.HashSet;
-import java.util.Map;
 import java.util.Set;
 import java.util.logging.Level;
 import java.util.logging.Logger;
-import java.util.regex.Pattern;
-import opennlp.modelbuilder.v2.ModelGenerationValidator;
+import opennlp.modelbuilder.ModelGenerationValidator;
 
 /**
- *
+ *Validates NER results input before inclusion into the model
  */
 public class FileModelValidatorImpl implements ModelGenerationValidator {
 
   private Set<String> badentities = new HashSet<String>();
-  private final double MIN_SCORE_FOR_TRAINING = 0.95d;
-  private Object validationData;
-  private Map<String, String> params = new HashMap<String, String>();
+  BaseModelBuilderParams params;
 
   @Override
-  public void setParameters(Map<String, String> params) {
+  public void setParameters(BaseModelBuilderParams params) {
     this.params = params;
   }
 
@@ -59,11 +54,11 @@ public class FileModelValidatorImpl impl
     if (badentities.isEmpty()) {
       getBlackList();
     }
-
-    Pattern p = Pattern.compile("[0-9]", Pattern.CASE_INSENSITIVE | Pattern.MULTILINE);
-    if (p.matcher(namedEntity).find()) {
-      return false;
-    }
+//
+//    Pattern p = Pattern.compile("[0-9]", Pattern.CASE_INSENSITIVE | Pattern.MULTILINE);
+//    if (p.matcher(namedEntity).find()) {
+//      return false;
+//    }
     Boolean b = true;
     if (badentities.contains(namedEntity.toLowerCase())) {
       b = false;
@@ -73,17 +68,20 @@ public class FileModelValidatorImpl impl
 
   @Override
   public Collection<String> getBlackList() {
+    if (params.getKnownEntityBlacklist() == null) {
+      return badentities;
+    }
     if (!badentities.isEmpty()) {
       try {
         InputStream fis;
         BufferedReader br;
         String line;
 
-        fis = new FileInputStream(params.get("blacklistfile"));
+        fis = new FileInputStream(params.getKnownEntityBlacklist());
         br = new BufferedReader(new InputStreamReader(fis, Charset.forName("UTF-8")));
         while ((line = br.readLine()) != null) {
           badentities.add(line);
-        }        
+        }
         br.close();
         br = null;
         fis = null;

Copied: opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/impls/FileSentenceProvider.java
(from r1544512, opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/impls/FileSentenceProvider.java)
URL: http://svn.apache.org/viewvc/opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/impls/FileSentenceProvider.java?p2=opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/impls/FileSentenceProvider.java&p1=opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/impls/FileSentenceProvider.java&r1=1544512&r2=1550134&rev=1550134&view=diff
==============================================================================
--- opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/impls/FileSentenceProvider.java
(original)
+++ opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/impls/FileSentenceProvider.java
Wed Dec 11 13:37:36 2013
@@ -13,7 +13,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-package opennlp.modelbuilder.v2.impls;
+package opennlp.modelbuilder.impls;
 
 import java.io.BufferedReader;
 import java.io.FileInputStream;
@@ -22,20 +22,18 @@ import java.io.IOException;
 import java.io.InputStream;
 import java.io.InputStreamReader;
 import java.nio.charset.Charset;
-import java.util.HashMap;
 import java.util.HashSet;
-import java.util.Map;
 import java.util.Set;
 import java.util.logging.Level;
 import java.util.logging.Logger;
-import opennlp.modelbuilder.v2.SentenceProvider;
+import opennlp.modelbuilder.SentenceProvider;
 
 /**
- *
+ * Provides user sentences via a simple text file
  */
 public class FileSentenceProvider implements SentenceProvider {
 
-  private Map<String, String> params = new HashMap<String, String>();
+  BaseModelBuilderParams params ;
   Set<String> sentences = new HashSet<String>();
 
   public Set<String> getSentences() {
@@ -45,7 +43,7 @@ public class FileSentenceProvider implem
         BufferedReader br;
         String line;
 
-        fis = new FileInputStream(params.get("sentencesfile"));
+        fis = new FileInputStream(params.getSentenceFile());
         br = new BufferedReader(new InputStreamReader(fis, Charset.forName("UTF-8")));
         int i=0;
         while ((line = br.readLine()) != null) {
@@ -66,7 +64,7 @@ public class FileSentenceProvider implem
     return sentences;
   }
 
-  public void setParameters(Map<String, String> params) {
+ public void setParameters(BaseModelBuilderParams params) {
     this.params = params;
   }
 }

Copied: opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/impls/GenericModelGenerator.java
(from r1544512, opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/GenericModelGenerator.java)
URL: http://svn.apache.org/viewvc/opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/impls/GenericModelGenerator.java?p2=opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/impls/GenericModelGenerator.java&p1=opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/GenericModelGenerator.java&r1=1544512&r2=1550134&rev=1550134&view=diff
==============================================================================
--- opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/GenericModelGenerator.java
(original)
+++ opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/impls/GenericModelGenerator.java
Wed Dec 11 13:37:36 2013
@@ -13,24 +13,31 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-package opennlp.modelbuilder.v2;
+package opennlp.modelbuilder.impls;
 
 import java.util.HashMap;
 import java.util.Map;
+import opennlp.modelbuilder.KnownEntityProvider;
+import opennlp.modelbuilder.ModelGenerationValidator;
+import opennlp.modelbuilder.Modelable;
+import opennlp.modelbuilder.SemiSupervisedModelGenerator;
+import opennlp.modelbuilder.SentenceProvider;
 import opennlp.tools.namefind.NameFinderME;
 import opennlp.tools.util.Span;
 
 /**
  *
- *Generic impl
+ * Generic impl that handles all processing using the default file implementations
  */
-public class GenericModelGenerator implements SemiSupervisedModelGenerator{
- private Map<String, String> params = new HashMap<String, String>();
+public class GenericModelGenerator implements SemiSupervisedModelGenerator {
+
+  private Map<String, String> params = new HashMap<String, String>();
 
   @Override
-  public void setParameters(Map<String, String> params) {
-    this.params = params;
+  public void setParameters(BaseModelBuilderParams params) {
+    this.params = params.getAdditionalParams();
   }
+
   @Override
   public void build(SentenceProvider sentenceProvider, KnownEntityProvider knownEntityProvider,
           ModelGenerationValidator validator, Modelable modelable, int iterations) {
@@ -47,12 +54,23 @@ public class GenericModelGenerator imple
           }
         }
       }
+      if (sentenceProvider.getSentences().isEmpty()) {
+        System.out.println("No sentences in file");
+        return;
+      }
+      if (knownEntityProvider.getKnownEntities().isEmpty()) {
+        System.out.println("No known entities in file");
+        return;
+      }
       System.out.println("\t\twriting annotated sentences....: ");
       modelable.writeAnnotatedSentences();
+          System.out.println("\t\tbuilding model.... ");
       modelable.buildModel(knownEntityProvider.getKnownEntitiesType());
+      System.out.println("\t\tmodel building complete.... ");
       NameFinderME nf = new NameFinderME(modelable.getModel());
       System.out.println("\t\tannotated sentences: " + modelable.getAnnotatedSentences().size());
-      System.out.println("\tPerforming NER");
+      System.out.println("\tPerforming NER with new model");
+      System.out.println("\t\tPrinting NER Results. Add undesired results to the blacklist file and start over");
       for (String sentence : sentenceProvider.getSentences()) {
         if (!validator.validSentence(sentence)) {
           continue;
@@ -65,10 +83,14 @@ public class GenericModelGenerator imple
         String[] namedEntities = Span.spansToStrings(find, tokens);
 
         for (String namedEntity : namedEntities) {
+          System.out.println("\t\t" + namedEntity);
           if (validator.validNamedEntity(namedEntity)) {
+
             knownEntityProvider.addKnownEntity(namedEntity);
             modelable.addAnnotatedSentence(modelable.annotate(sentence, namedEntity, knownEntityProvider.getKnownEntitiesType()));
 
+          } else {
+            System.out.println("\t\t" + namedEntity + "...already blacklisted");
           }
         }
       }

Copied: opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/impls/GenericModelableImpl.java (from r1544512, opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/impls/ModelableImpl.java)
URL: http://svn.apache.org/viewvc/opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/impls/GenericModelableImpl.java?p2=opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/impls/GenericModelableImpl.java&p1=opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/impls/ModelableImpl.java&r1=1544512&r2=1550134&rev=1550134&view=diff
==============================================================================
--- opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/v2/impls/ModelableImpl.java (original)
+++ opennlp/sandbox/modelbuilder-prototype/src/main/java/opennlp/modelbuilder/impls/GenericModelableImpl.java Wed Dec 11 13:37:36 2013
@@ -13,57 +13,43 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-package opennlp.modelbuilder.v2.impls;
+package opennlp.modelbuilder.impls;
 
 import java.io.BufferedOutputStream;
-import java.io.File;
 import java.io.FileInputStream;
 import java.io.FileOutputStream;
 import java.io.FileWriter;
 import java.io.IOException;
 import java.io.OutputStream;
 import java.nio.charset.Charset;
-import java.util.HashMap;
 import java.util.HashSet;
-import java.util.Map;
 import java.util.Set;
 import java.util.logging.Level;
 import java.util.logging.Logger;
-import opennlp.modelbuilder.v2.Modelable;
+import opennlp.modelbuilder.Modelable;
 import opennlp.tools.namefind.NameFinderME;
 import opennlp.tools.namefind.NameSample;
 import opennlp.tools.namefind.NameSampleDataStream;
 import opennlp.tools.namefind.TokenNameFinderModel;
-import opennlp.tools.tokenize.TokenizerME;
-import opennlp.tools.tokenize.TokenizerModel;
 import opennlp.tools.util.ObjectStream;
 import opennlp.tools.util.PlainTextByLineStream;
 
 /**
- *
+ * Creates annotations, writes annotations to file, and creates a model and writes to a file
  */
-public class ModelableImpl implements Modelable {
+public class GenericModelableImpl implements Modelable {
 
-  private TokenizerModel tm;
-  private TokenizerME wordBreaker;
-  private String path = "c:\\temp\\opennlpmodels\\";
-  private String trainingDataPath = "";
-  private String modelOutPath = "";
   private Set<String> annotatedSentences = new HashSet<String>();
-  private Map<String, String> params = new HashMap<String, String>();
+  BaseModelBuilderParams params;
 
   @Override
-  public void setParameters(Map<String, String> params) {
+  public void setParameters(BaseModelBuilderParams params) {
     this.params = params;
-    path = params.get("modelablepath");
-    trainingDataPath = path + "\\" + params.get("knownentitytype") + ".train";
-    modelOutPath = path + "\\" + params.get("knownentitytype")+".model";
   }
 
   @Override
   public String annotate(String sentence, String namedEntity, String entityType) {
     String annotation = sentence.replace(namedEntity, " <START:" + entityType + "> " + namedEntity + " <END> ");
-
     return annotation;
   }
 
@@ -71,7 +57,7 @@ public class ModelableImpl implements Mo
   public void writeAnnotatedSentences() {
     try {
 
-      FileWriter writer = new FileWriter(trainingDataPath, false);
+      FileWriter writer = new FileWriter(params.getAnnotatedTrainingDataFile(), false);
 
       for (String s : annotatedSentences) {
         writer.write(s.replace("\n", " ").trim() + "\n");
@@ -104,13 +90,13 @@ public class ModelableImpl implements Mo
       System.out.println("\t\treading training data...");
       Charset charset = Charset.forName("UTF-8");
       ObjectStream<String> lineStream =
-              new PlainTextByLineStream(new FileInputStream(trainingDataPath), charset);
+              new PlainTextByLineStream(new FileInputStream(params.getAnnotatedTrainingDataFile()), charset);
       ObjectStream<NameSample> sampleStream = new NameSampleDataStream(lineStream);
 
       TokenNameFinderModel model;
       model = NameFinderME.train("en", entityType, sampleStream, null);
       sampleStream.close();
-      OutputStream modelOut = new BufferedOutputStream(new FileOutputStream(new File(modelOutPath)));
+      OutputStream modelOut = new BufferedOutputStream(new FileOutputStream(params.getModelFile()));
       model.serialize(modelOut);
       if (modelOut != null) {
         modelOut.close();
@@ -126,9 +112,9 @@ public class ModelableImpl implements Mo
 
     TokenNameFinderModel nerModel = null;
     try {
-      nerModel = new TokenNameFinderModel(new FileInputStream(new File(modelOutPath)));
+      nerModel = new TokenNameFinderModel(new FileInputStream(params.getModelFile()));
     } catch (IOException ex) {
-      Logger.getLogger(ModelableImpl.class.getName()).log(Level.SEVERE, null, ex);
+      Logger.getLogger(GenericModelableImpl.class.getName()).log(Level.SEVERE, null, ex);
     }
     return nerModel;
   }



Mime
View raw message