lucene-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From tomm...@apache.org
Subject svn commit: r1401338 - in /lucene/dev/trunk/lucene/classification: ./ src/java/org/apache/lucene/classification/ src/test/org/apache/lucene/classification/
Date Tue, 23 Oct 2012 16:18:42 GMT
Author: tommaso
Date: Tue Oct 23 16:18:42 2012
New Revision: 1401338

URL: http://svn.apache.org/viewvc?rev=1401338&view=rev
Log:
[LUCENE-4345] - adding k-nearestneighbor classifier (based on mlt), improving testing by abstracting
a basetest class, added ngram test for simplenaivebayes, changed build.xml to incorporate
queries and analyzer-common deps

Added:
    lucene/dev/trunk/lucene/classification/src/java/org/apache/lucene/classification/ClassificationResult.java
  (with props)
    lucene/dev/trunk/lucene/classification/src/java/org/apache/lucene/classification/KNearestNeighborClassifier.java
  (with props)
    lucene/dev/trunk/lucene/classification/src/test/org/apache/lucene/classification/ClassificationTestBase.java
  (with props)
    lucene/dev/trunk/lucene/classification/src/test/org/apache/lucene/classification/KNearestNeighborClassifierTest.java
  (with props)
Modified:
    lucene/dev/trunk/lucene/classification/build.xml
    lucene/dev/trunk/lucene/classification/src/java/org/apache/lucene/classification/Classifier.java
    lucene/dev/trunk/lucene/classification/src/java/org/apache/lucene/classification/SimpleNaiveBayesClassifier.java
    lucene/dev/trunk/lucene/classification/src/java/org/apache/lucene/classification/package.html
    lucene/dev/trunk/lucene/classification/src/test/org/apache/lucene/classification/SimpleNaiveBayesClassifierTest.java

Modified: lucene/dev/trunk/lucene/classification/build.xml
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/classification/build.xml?rev=1401338&r1=1401337&r2=1401338&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/classification/build.xml (original)
+++ lucene/dev/trunk/lucene/classification/build.xml Tue Oct 23 16:18:42 2012
@@ -23,4 +23,22 @@
   </description>
 
   <import file="../module-build.xml"/>
+
+  <path id="base.classpath">
+    <pathelement location="${common.dir}/build/core/classes/java"/>
+    <pathelement path="${queries.jar}"/>
+    <pathelement path="${project.classpath}"/>
+  </path>
+
+  <path id="test.classpath">
+    <pathelement path="${analyzers-common.jar}"/>
+    <pathelement location="${common.dir}/build/test-framework/classes/java"/>
+    <pathelement location="${common.dir}/build/codecs/classes/java"/>
+    <path refid="classpath"/>
+    <path refid="junit-path"/>
+    <pathelement location="${build.dir}/classes/java"/>
+  </path>
+
+  <target name="compile-core" depends="jar-queries,jar-analyzers-common,common.compile-core" />
+
 </project>

Added: lucene/dev/trunk/lucene/classification/src/java/org/apache/lucene/classification/ClassificationResult.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/classification/src/java/org/apache/lucene/classification/ClassificationResult.java?rev=1401338&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/classification/src/java/org/apache/lucene/classification/ClassificationResult.java (added)
+++ lucene/dev/trunk/lucene/classification/src/java/org/apache/lucene/classification/ClassificationResult.java Tue Oct 23 16:18:42 2012
@@ -0,0 +1,40 @@
+package org.apache.lucene.classification;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * The result of a call to {@link Classifier#assignClass(String)} holding an assigned class and a score.
+ */
+public class ClassificationResult {
+
+  private String assignedClass;
+  private double score;
+
+  public ClassificationResult(String assignedClass, double score) {
+    this.assignedClass = assignedClass;
+    this.score = score;
+  }
+
+  public String getAssignedClass() {
+    return assignedClass;
+  }
+
+  public double getScore() {
+    return score;
+  }
+}

Modified: lucene/dev/trunk/lucene/classification/src/java/org/apache/lucene/classification/Classifier.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/classification/src/java/org/apache/lucene/classification/Classifier.java?rev=1401338&r1=1401337&r2=1401338&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/classification/src/java/org/apache/lucene/classification/Classifier.java (original)
+++ lucene/dev/trunk/lucene/classification/src/java/org/apache/lucene/classification/Classifier.java Tue Oct 23 16:18:42 2012
@@ -29,12 +29,12 @@ import java.io.IOException;
 public interface Classifier {
 
   /**
-   * Assign a class to the given text String
+   * Assign a class (with score) to the given text String
    * @param text a String containing text to be classified
-   * @return a String representing a class
+   * @return a {@link ClassificationResult} holding assigned class and score
    * @throws IOException If there is a low-level I/O error.
    */
-  public String assignClass(String text) throws IOException;
+  public ClassificationResult assignClass(String text) throws IOException;
 
   /**
    * Train the classifier using the underlying Lucene index

Added: lucene/dev/trunk/lucene/classification/src/java/org/apache/lucene/classification/KNearestNeighborClassifier.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/classification/src/java/org/apache/lucene/classification/KNearestNeighborClassifier.java?rev=1401338&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/classification/src/java/org/apache/lucene/classification/KNearestNeighborClassifier.java (added)
+++ lucene/dev/trunk/lucene/classification/src/java/org/apache/lucene/classification/KNearestNeighborClassifier.java Tue Oct 23 16:18:42 2012
@@ -0,0 +1,88 @@
+package org.apache.lucene.classification;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.index.AtomicReader;
+import org.apache.lucene.queries.mlt.MoreLikeThis;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.ScoreDoc;
+import org.apache.lucene.search.TopDocs;
+
+import java.io.IOException;
+import java.io.StringReader;
+import java.util.HashMap;
+import java.util.Map;
+
+/**
+ * A k-Nearest Neighbor classifier (see <code>http://en.wikipedia.org/wiki/K-nearest_neighbors</code>) based
+ * on {@link MoreLikeThis}
+ */
+public class KNearestNeighborClassifier implements Classifier {
+
+  private MoreLikeThis mlt;
+  private String textFieldName;
+  private String classFieldName;
+  private IndexSearcher indexSearcher;
+  private int k;
+
+  public KNearestNeighborClassifier(int k) {
+    this.k = k;
+  }
+
+  @Override
+  public ClassificationResult assignClass(String text) throws IOException {
+    Query q = mlt.like(new StringReader(text), textFieldName);
+    TopDocs docs = indexSearcher.search(q, k);
+
+    // TODO : improve the nearest neighbor selection
+    Map<String, Integer> classCounts = new HashMap<String, Integer>();
+    for (ScoreDoc scoreDoc : docs.scoreDocs) {
+      String cl = indexSearcher.doc(scoreDoc.doc).getField(classFieldName).stringValue();
+      Integer count = classCounts.get(cl);
+      if (count != null) {
+        classCounts.put(cl, count + 1);
+      }
+      else {
+        classCounts.put(cl, 1);
+      }
+    }
+    int max = 0;
+    String assignedClass = null;
+    for (String cl : classCounts.keySet()) {
+      Integer count = classCounts.get(cl);
+      if (count > max) {
+        max = count;
+        assignedClass = cl;
+      }
+    }
+    double score = 1; // TODO : derive score from query
+    return new ClassificationResult(assignedClass, score);
+  }
+
+  @Override
+  public void train(AtomicReader atomicReader, String textFieldName, String classFieldName, Analyzer analyzer) throws IOException {
+    this.textFieldName = textFieldName;
+    this.classFieldName = classFieldName;
+    mlt = new MoreLikeThis(atomicReader);
+    mlt.setAnalyzer(analyzer);
+    mlt.setFieldNames(new String[]{textFieldName});
+    indexSearcher = new IndexSearcher(atomicReader);
+  }
+}

Modified: lucene/dev/trunk/lucene/classification/src/java/org/apache/lucene/classification/SimpleNaiveBayesClassifier.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/classification/src/java/org/apache/lucene/classification/SimpleNaiveBayesClassifier.java?rev=1401338&r1=1401337&r2=1401338&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/classification/src/java/org/apache/lucene/classification/SimpleNaiveBayesClassifier.java (original)
+++ lucene/dev/trunk/lucene/classification/src/java/org/apache/lucene/classification/SimpleNaiveBayesClassifier.java Tue Oct 23 16:18:42 2012
@@ -80,7 +80,7 @@ public class SimpleNaiveBayesClassifier 
     return result.toArray(new String[result.size()]);
   }
 
-  public String assignClass(String inputDocument) throws IOException {
+  public ClassificationResult assignClass(String inputDocument) throws IOException {
     if (atomicReader == null) {
       throw new RuntimeException("need to train the classifier first");
     }
@@ -98,7 +98,7 @@ public class SimpleNaiveBayesClassifier 
         foundClass = next.utf8ToString();
       }
     }
-    return foundClass;
+    return new ClassificationResult(foundClass, max);
   }
 
 

Modified: lucene/dev/trunk/lucene/classification/src/java/org/apache/lucene/classification/package.html
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/classification/src/java/org/apache/lucene/classification/package.html?rev=1401338&r1=1401337&r2=1401338&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/classification/src/java/org/apache/lucene/classification/package.html (original)
+++ lucene/dev/trunk/lucene/classification/src/java/org/apache/lucene/classification/package.html Tue Oct 23 16:18:42 2012
@@ -18,6 +18,6 @@
 <body>
 Uses already seen data (the indexed documents) to classify new documents.
 Currently only contains a (simplistic) Lucene based Naive Bayes classifier 
-but more implementations will be added in the future.
+and a k-Nearest Neighbor classifier
 </body>
 </html>

Added: lucene/dev/trunk/lucene/classification/src/test/org/apache/lucene/classification/ClassificationTestBase.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/classification/src/test/org/apache/lucene/classification/ClassificationTestBase.java?rev=1401338&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/classification/src/test/org/apache/lucene/classification/ClassificationTestBase.java (added)
+++ lucene/dev/trunk/lucene/classification/src/test/org/apache/lucene/classification/ClassificationTestBase.java Tue Oct 23 16:18:42 2012
@@ -0,0 +1,125 @@
+package org.apache.lucene.classification;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.TextField;
+import org.apache.lucene.index.RandomIndexWriter;
+import org.apache.lucene.index.SlowCompositeReaderWrapper;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.LuceneTestCase;
+import org.junit.After;
+import org.junit.Before;
+
+/**
+ * Base class for testing {@link Classifier}s
+ */
+public class ClassificationTestBase extends LuceneTestCase {
+
+  private RandomIndexWriter indexWriter;
+  private String textFieldName;
+  private String classFieldName;
+  private Directory dir;
+
+  @Before
+  public void setUp() throws Exception {
+    super.setUp();
+    dir = newDirectory();
+    indexWriter = new RandomIndexWriter(random(), dir);
+    textFieldName = "text";
+    classFieldName = "cat";
+  }
+
+  @After
+  public void tearDown() throws Exception {
+    super.tearDown();
+    indexWriter.close();
+    dir.close();
+  }
+
+  protected void checkCorrectClassification(Classifier classifier, Analyzer analyzer) throws Exception {
+    SlowCompositeReaderWrapper compositeReaderWrapper = null;
+    try {
+      populateIndex(analyzer);
+      compositeReaderWrapper = new SlowCompositeReaderWrapper(indexWriter.getReader());
+      classifier.train(compositeReaderWrapper, textFieldName, classFieldName, analyzer);
+      String newText = "Much is made of what the likes of Facebook, Google and Apple know about users. Truth is, Amazon may know more.";
+      ClassificationResult classificationResult = classifier.assignClass(newText);
+      assertEquals("technology", classificationResult.getAssignedClass());
+      assertTrue(classificationResult.getScore() > 0);
+    } finally {
+      if (compositeReaderWrapper != null)
+        compositeReaderWrapper.close();
+    }
+  }
+
+  private void populateIndex(Analyzer analyzer) throws Exception {
+
+    Document doc = new Document();
+    doc.add(new TextField(textFieldName, "The traveling press secretary for Mitt Romney lost his cool and cursed at reporters " +
+        "who attempted to ask questions of the Republican presidential candidate in a public plaza near the Tomb of " +
+        "the Unknown Soldier in Warsaw Tuesday.", Field.Store.YES));
+    doc.add(new TextField(classFieldName, "politics", Field.Store.YES));
+
+    indexWriter.addDocument(doc, analyzer);
+
+    doc = new Document();
+    doc.add(new TextField(textFieldName, "Mitt Romney seeks to assure Israel and Iran, as well as Jewish voters in the United" +
+        " States, that he will be tougher against Iran's nuclear ambitions than President Barack Obama.", Field.Store.YES));
+    doc.add(new TextField(classFieldName, "politics", Field.Store.YES));
+    indexWriter.addDocument(doc, analyzer);
+
+    doc = new Document();
+    doc.add(new TextField(textFieldName, "And there's a threshold question that he has to answer for the American people and " +
+        "that's whether he is prepared to be commander-in-chief,\" she continued. \"As we look to the past events, we " +
+        "know that this raises some questions about his preparedness and we'll see how the rest of his trip goes.\"", Field.Store.YES));
+    doc.add(new TextField(classFieldName, "politics", Field.Store.YES));
+    indexWriter.addDocument(doc, analyzer);
+
+    doc = new Document();
+    doc.add(new TextField(textFieldName, "Still, when it comes to gun policy, many congressional Democrats have \"decided to " +
+        "keep quiet and not go there,\" said Alan Lizotte, dean and professor at the State University of New York at " +
+        "Albany's School of Criminal Justice.", Field.Store.YES));
+    doc.add(new TextField(classFieldName, "politics", Field.Store.YES));
+    indexWriter.addDocument(doc, analyzer);
+
+    doc = new Document();
+    doc.add(new TextField(textFieldName, "Standing amongst the thousands of people at the state Capitol, Jorstad, director of " +
+        "technology at the University of Wisconsin-La Crosse, documented the historic moment and shared it with the " +
+        "world through the Internet.", Field.Store.YES));
+    doc.add(new TextField(classFieldName, "technology", Field.Store.YES));
+    indexWriter.addDocument(doc, analyzer);
+
+    doc = new Document();
+    doc.add(new TextField(textFieldName, "So, about all those experts and analysts who've spent the past year or so saying " +
+        "Facebook was going to make a phone. A new expert has stepped forward to say it's not going to happen.", Field.Store.YES));
+    doc.add(new TextField(classFieldName, "technology", Field.Store.YES));
+    indexWriter.addDocument(doc, analyzer);
+
+    doc = new Document();
+    doc.add(new TextField(textFieldName, "More than 400 million people trust Google with their e-mail, and 50 million store files" +
+        " in the cloud using the Dropbox service. People manage their bank accounts, pay bills, trade stocks and " +
+        "generally transfer or store huge volumes of personal data online.", Field.Store.YES));
+    doc.add(new TextField(classFieldName, "technology", Field.Store.YES));
+    indexWriter.addDocument(doc, analyzer);
+
+    indexWriter.commit();
+  }
+}

Added: lucene/dev/trunk/lucene/classification/src/test/org/apache/lucene/classification/KNearestNeighborClassifierTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/classification/src/test/org/apache/lucene/classification/KNearestNeighborClassifierTest.java?rev=1401338&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/classification/src/test/org/apache/lucene/classification/KNearestNeighborClassifierTest.java (added)
+++ lucene/dev/trunk/lucene/classification/src/test/org/apache/lucene/classification/KNearestNeighborClassifierTest.java Tue Oct 23 16:18:42 2012
@@ -0,0 +1,33 @@
+package org.apache.lucene.classification;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.MockAnalyzer;
+import org.junit.Test;
+
+/**
+ * Testcase for {@link KNearestNeighborClassifier}
+ */
+public class KNearestNeighborClassifierTest extends ClassificationTestBase {
+
+  @Test
+  public void testBasicUsage() throws Exception {
+     checkCorrectClassification(new KNearestNeighborClassifier(1), new MockAnalyzer(random()));
+  }
+
+}

Modified: lucene/dev/trunk/lucene/classification/src/test/org/apache/lucene/classification/SimpleNaiveBayesClassifierTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/classification/src/test/org/apache/lucene/classification/SimpleNaiveBayesClassifierTest.java?rev=1401338&r1=1401337&r2=1401338&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/classification/src/test/org/apache/lucene/classification/SimpleNaiveBayesClassifierTest.java (original)
+++ lucene/dev/trunk/lucene/classification/src/test/org/apache/lucene/classification/SimpleNaiveBayesClassifierTest.java Tue Oct 23 16:18:42 2012
@@ -19,112 +19,32 @@ package org.apache.lucene.classification
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.MockAnalyzer;
-import org.apache.lucene.document.Document;
-import org.apache.lucene.document.Field;
-import org.apache.lucene.document.TextField;
-import org.apache.lucene.index.RandomIndexWriter;
-import org.apache.lucene.index.SlowCompositeReaderWrapper;
-import org.apache.lucene.store.Directory;
-import org.apache.lucene.util.LuceneTestCase;
-import org.junit.After;
-import org.junit.Before;
+import org.apache.lucene.analysis.ngram.EdgeNGramTokenizer;
 import org.junit.Test;
 
+import java.io.Reader;
+
 /**
  * Testcase for {@link SimpleNaiveBayesClassifier}
  */
-public class SimpleNaiveBayesClassifierTest extends LuceneTestCase {
-
-  private RandomIndexWriter indexWriter;
-  private String textFieldName;
-  private String classFieldName;
-  private Analyzer analyzer;
-  private Directory dir;
-
-  @Before
-  public void setUp() throws Exception {
-    super.setUp();
-    analyzer = new MockAnalyzer(random());
-    dir = newDirectory();
-    indexWriter = new RandomIndexWriter(random(), dir);
-    textFieldName = "text";
-    classFieldName = "cat";
-  }
-
-  @After
-  public void tearDown() throws Exception {
-    super.tearDown();
-    indexWriter.close();
-    dir.close();
-  }
+public class SimpleNaiveBayesClassifierTest extends ClassificationTestBase {
 
   @Test
   public void testBasicUsage() throws Exception {
-    SlowCompositeReaderWrapper compositeReaderWrapper = null;
-    try {
-      populateIndex();
-      SimpleNaiveBayesClassifier simpleNaiveBayesClassifier = new SimpleNaiveBayesClassifier();
-      compositeReaderWrapper = new SlowCompositeReaderWrapper(indexWriter.getReader());
-      simpleNaiveBayesClassifier.train(compositeReaderWrapper, textFieldName, classFieldName, analyzer);
-      String newText = "Much is made of what the likes of Facebook, Google and Apple know about users. Truth is, Amazon may know more. ";
-      assertEquals("technology", simpleNaiveBayesClassifier.assignClass(newText));
-    } finally {
-      if (compositeReaderWrapper != null)
-        compositeReaderWrapper.close();
-    }
+    checkCorrectClassification(new SimpleNaiveBayesClassifier(), new MockAnalyzer(random()));
   }
 
-  private void populateIndex() throws Exception {
-
-    Document doc = new Document();
-    doc.add(new TextField(textFieldName, "The traveling press secretary for Mitt Romney lost his cool and cursed at reporters " +
-        "who attempted to ask questions of the Republican presidential candidate in a public plaza near the Tomb of " +
-        "the Unknown Soldier in Warsaw Tuesday.", Field.Store.YES));
-    doc.add(new TextField(classFieldName, "politics", Field.Store.YES));
-
-    indexWriter.addDocument(doc, analyzer);
-
-    doc = new Document();
-    doc.add(new TextField(textFieldName, "Mitt Romney seeks to assure Israel and Iran, as well as Jewish voters in the United" +
-        " States, that he will be tougher against Iran's nuclear ambitions than President Barack Obama.", Field.Store.YES));
-    doc.add(new TextField(classFieldName, "politics", Field.Store.YES));
-    indexWriter.addDocument(doc, analyzer);
-
-    doc = new Document();
-    doc.add(new TextField(textFieldName, "And there's a threshold question that he has to answer for the American people and " +
-        "that's whether he is prepared to be commander-in-chief,\" she continued. \"As we look to the past events, we " +
-        "know that this raises some questions about his preparedness and we'll see how the rest of his trip goes.\"", Field.Store.YES));
-    doc.add(new TextField(classFieldName, "politics", Field.Store.YES));
-    indexWriter.addDocument(doc, analyzer);
-
-    doc = new Document();
-    doc.add(new TextField(textFieldName, "Still, when it comes to gun policy, many congressional Democrats have \"decided to " +
-        "keep quiet and not go there,\" said Alan Lizotte, dean and professor at the State University of New York at " +
-        "Albany's School of Criminal Justice.", Field.Store.YES));
-    doc.add(new TextField(classFieldName, "politics", Field.Store.YES));
-    indexWriter.addDocument(doc, analyzer);
-
-    doc = new Document();
-    doc.add(new TextField(textFieldName, "Standing amongst the thousands of people at the state Capitol, Jorstad, director of " +
-        "technology at the University of Wisconsin-La Crosse, documented the historic moment and shared it with the " +
-        "world through the Internet.", Field.Store.YES));
-    doc.add(new TextField(classFieldName, "technology", Field.Store.YES));
-    indexWriter.addDocument(doc, analyzer);
-
-    doc = new Document();
-    doc.add(new TextField(textFieldName, "So, about all those experts and analysts who've spent the past year or so saying " +
-        "Facebook was going to make a phone. A new expert has stepped forward to say it's not going to happen.", Field.Store.YES));
-    doc.add(new TextField(classFieldName, "technology", Field.Store.YES));
-    indexWriter.addDocument(doc, analyzer);
-
-    doc = new Document();
-    doc.add(new TextField(textFieldName, "More than 400 million people trust Google with their e-mail, and 50 million store files" +
-        " in the cloud using the Dropbox service. People manage their bank accounts, pay bills, trade stocks and " +
-        "generally transfer or store huge volumes of personal data online.", Field.Store.YES));
-    doc.add(new TextField(classFieldName, "technology", Field.Store.YES));
-    indexWriter.addDocument(doc, analyzer);
+  @Test
+  public void testNGramUsage() throws Exception {
+    checkCorrectClassification(new SimpleNaiveBayesClassifier(), new NGramAnalyzer());
+  }
 
-    indexWriter.commit();
+  private class NGramAnalyzer extends Analyzer {
+    @Override
+    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+      return new TokenStreamComponents(new EdgeNGramTokenizer(reader, EdgeNGramTokenizer.Side.BACK,
+          10, 20));
+    }
   }
 
 }



Mime
View raw message