opennlp-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From jo...@apache.org
Subject svn commit: r1185047 - /incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/BagOfWordsFeatureGenerator.java
Date Mon, 17 Oct 2011 09:08:10 GMT
Author: joern
Date: Mon Oct 17 09:08:10 2011
New Revision: 1185047

URL: http://svn.apache.org/viewvc?rev=1185047&view=rev
Log:
OPENNLP-327 Added option to only use all-letter tokens.

Modified:
    incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/BagOfWordsFeatureGenerator.java

Modified: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/BagOfWordsFeatureGenerator.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/BagOfWordsFeatureGenerator.java?rev=1185047&r1=1185046&r2=1185047&view=diff
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/BagOfWordsFeatureGenerator.java (original)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/BagOfWordsFeatureGenerator.java Mon Oct 17 09:08:10 2011
@@ -21,17 +21,37 @@ package opennlp.tools.doccat;
 import java.util.ArrayList;
 import java.util.Collection;
 
+import opennlp.tools.util.featuregen.StringPattern;
+
 /**
  * Generates a feature for each word in a document.
  */
 public class BagOfWordsFeatureGenerator implements FeatureGenerator {
 
+  private boolean useOnlyAllLetterTokens = false;
+  
+  public BagOfWordsFeatureGenerator() {
+  }
+  
+  BagOfWordsFeatureGenerator(boolean useOnlyAllLetterTokens) {
+    this.useOnlyAllLetterTokens = useOnlyAllLetterTokens;
+  }
+  
   public Collection<String> extractFeatures(String[] text) {
 
     Collection<String> bagOfWords = new ArrayList<String>(text.length);
 
     for (int i = 0; i < text.length; i++) {
-      bagOfWords.add("bow=" + text[i]);
+      
+      if (useOnlyAllLetterTokens) {
+        StringPattern pattern = StringPattern.recognize(text[i]);
+        
+        if (pattern.isAllLetter())
+          bagOfWords.add("bow=" + text[i]);
+      }
+      else {
+        bagOfWords.add("bow=" + text[i]);
+      }
     }
 
     return bagOfWords;



Mime
View raw message