lucene-java-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From o...@apache.org
Subject svn commit: r513876 - in /lucene/java/trunk: CHANGES.txt contrib/analyzers/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java contrib/analyzers/src/test/org/apache/lucene/analysis/ngram/NGramTokenFilterTest.java
Date Fri, 02 Mar 2007 18:19:54 GMT
Author: otis
Date: Fri Mar  2 10:19:53 2007
New Revision: 513876

URL: http://svn.apache.org/viewvc?view=rev&rev=513876
Log:
- LUCENE-759: Two n-gram producting TokenFilters (using them for the spellchecker in SOLR-81)

Added:
    lucene/java/trunk/contrib/analyzers/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java
    lucene/java/trunk/contrib/analyzers/src/test/org/apache/lucene/analysis/ngram/NGramTokenFilterTest.java
Modified:
    lucene/java/trunk/CHANGES.txt

Modified: lucene/java/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/CHANGES.txt?view=diff&rev=513876&r1=513875&r2=513876
==============================================================================
--- lucene/java/trunk/CHANGES.txt (original)
+++ lucene/java/trunk/CHANGES.txt Fri Mar  2 10:19:53 2007
@@ -34,6 +34,11 @@
     implementations to be specified via the System property
     org.apache.lucene.store.FSDirectoryLockFactoryClass.  (Mike McCandless)
 
+New features
+
+ 1. LUCENE-759: Added two n-gram-producing TokenFilters.
+    (Otis Gospodnetic)
+
 Optimizations
 
 ======================= Release 2.1.0 2007-02-14 =======================

Added: lucene/java/trunk/contrib/analyzers/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java?view=auto&rev=513876
==============================================================================
--- lucene/java/trunk/contrib/analyzers/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java
(added)
+++ lucene/java/trunk/contrib/analyzers/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java
Fri Mar  2 10:19:53 2007
@@ -0,0 +1,119 @@
+package org.apache.lucene.analysis.ngram;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.WhitespaceTokenizer;
+
+import java.io.StringReader;
+
+import junit.framework.TestCase;
+
+/**
+ * Tests {@link EdgeNGramTokenFilter} for correctness.
+ * @author Otis Gospodnetic
+ */
+public class EdgeNGramTokenFilterTest extends TestCase {
+  private TokenStream input;
+
+  public void setUp() {
+    input = new WhitespaceTokenizer(new StringReader("abcde"));
+  }
+
+  public void testInvalidInput() throws Exception {
+    boolean gotException = false;
+    try {        
+      new EdgeNGramTokenFilter(input, EdgeNGramTokenFilter.Side.FRONT, 0, 0);
+    } catch (IllegalArgumentException e) {
+      gotException = true;
+    }
+    assertTrue(gotException);
+  }
+
+  public void testInvalidInput2() throws Exception {
+    boolean gotException = false;
+    try {        
+      new EdgeNGramTokenFilter(input, EdgeNGramTokenFilter.Side.FRONT, 2, 1);
+    } catch (IllegalArgumentException e) {
+      gotException = true;
+    }
+    assertTrue(gotException);
+  }
+
+  public void testInvalidInput3() throws Exception {
+    boolean gotException = false;
+    try {        
+      new EdgeNGramTokenFilter(input, EdgeNGramTokenFilter.Side.FRONT, -1, 2);
+    } catch (IllegalArgumentException e) {
+      gotException = true;
+    }
+    assertTrue(gotException);
+  }
+
+  public void testFrontUnigram() throws Exception {
+    EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, EdgeNGramTokenFilter.Side.FRONT,
1, 1);
+    Token token = null;
+    token = tokenizer.next();
+    assertEquals("(a,0,1)", token.toString());
+    token = tokenizer.next();
+    assertNull(token);
+  }
+
+  public void testBackUnigram() throws Exception {
+    EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, EdgeNGramTokenFilter.Side.BACK,
1, 1);
+    Token token = null;
+    token = tokenizer.next();
+    assertEquals("(e,4,5)", token.toString());
+    token = tokenizer.next();
+    assertNull(token);
+  }
+
+  public void testOversizedNgrams() throws Exception {
+    EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, EdgeNGramTokenFilter.Side.FRONT,
6, 6);
+    Token token = null;
+    token = tokenizer.next();
+    assertNull(token);
+  }
+
+  public void testFrontRangeOfNgrams() throws Exception {
+    EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, EdgeNGramTokenFilter.Side.FRONT,
1, 3);
+    Token token = null;
+    token = tokenizer.next();
+    assertEquals("(a,0,1)", token.toString());
+    token = tokenizer.next();
+    assertEquals("(ab,0,2)", token.toString());
+    token = tokenizer.next();
+    assertEquals("(abc,0,3)", token.toString());
+    token = tokenizer.next();
+    assertNull(token);
+  }
+
+  public void testBackRangeOfNgrams() throws Exception {
+    EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, EdgeNGramTokenFilter.Side.BACK,
1, 3);
+    Token token = null;
+    token = tokenizer.next();
+    assertEquals("(e,4,5)", token.toString());
+    token = tokenizer.next();
+    assertEquals("(de,3,5)", token.toString());
+    token = tokenizer.next();
+    assertEquals("(cde,2,5)", token.toString());
+    token = tokenizer.next();
+    assertNull(token);
+  }
+}

Added: lucene/java/trunk/contrib/analyzers/src/test/org/apache/lucene/analysis/ngram/NGramTokenFilterTest.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/src/test/org/apache/lucene/analysis/ngram/NGramTokenFilterTest.java?view=auto&rev=513876
==============================================================================
--- lucene/java/trunk/contrib/analyzers/src/test/org/apache/lucene/analysis/ngram/NGramTokenFilterTest.java
(added)
+++ lucene/java/trunk/contrib/analyzers/src/test/org/apache/lucene/analysis/ngram/NGramTokenFilterTest.java
Fri Mar  2 10:19:53 2007
@@ -0,0 +1,139 @@
+package org.apache.lucene.analysis.ngram;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.WhitespaceTokenizer;
+
+import java.io.StringReader;
+import java.util.ArrayList;
+
+import junit.framework.TestCase;
+
+/**
+ * Tests {@link NGramTokenFilter} for correctness.
+ * @author Otis Gospodnetic
+ */
+public class NGramTokenFilterTest extends TestCase {
+    private TokenStream input;
+    private ArrayList tokens = new ArrayList();
+    
+    public void setUp() {
+        input = new WhitespaceTokenizer(new StringReader("abcde"));
+    }
+
+    public void testInvalidInput() throws Exception {
+        boolean gotException = false;
+        try {        
+            new NGramTokenFilter(input, 2, 1);
+        } catch (IllegalArgumentException e) {
+            gotException = true;
+        }
+        assertTrue(gotException);
+    }
+
+    public void testInvalidInput2() throws Exception {
+        boolean gotException = false;
+        try {        
+            new NGramTokenFilter(input, 0, 1);
+        } catch (IllegalArgumentException e) {
+            gotException = true;
+        }
+        assertTrue(gotException);
+    }
+
+    public void testUnigrams() throws Exception {
+      NGramTokenFilter filter = new NGramTokenFilter(input, 1, 1);
+        
+        Token token = null;
+        do { 
+            token = filter.next();
+            if (token != null) {
+                tokens.add(token.toString());
+//                System.out.println(token.termText());
+//                System.out.println(token);
+//                Thread.sleep(1000);
+            }
+        } while (token != null);
+
+        assertEquals(5, tokens.size());
+        ArrayList exp = new ArrayList();
+        exp.add("(a,0,1)"); exp.add("(b,1,2)"); exp.add("(c,2,3)"); exp.add("(d,3,4)"); exp.add("(e,4,5)");
+        assertEquals(exp, tokens);
+    }
+
+    public void testBigrams() throws Exception {
+      NGramTokenFilter filter = new NGramTokenFilter(input, 2, 2);
+        
+        Token token = null;
+        do { 
+            token = filter.next();
+            if (token != null) {
+                tokens.add(token.toString());
+//                System.out.println(token.termText());
+//                System.out.println(token);
+//                Thread.sleep(1000);
+            }
+        } while (token != null);
+
+        assertEquals(4, tokens.size());
+        ArrayList exp = new ArrayList();
+        exp.add("(ab,0,2)"); exp.add("(bc,1,3)"); exp.add("(cd,2,4)"); exp.add("(de,3,5)");
+        assertEquals(exp, tokens);
+    }
+
+    public void testNgrams() throws Exception {
+      NGramTokenFilter filter = new NGramTokenFilter(input, 1, 3);
+        
+        Token token = null;
+        do { 
+            token = filter.next();
+            if (token != null) {
+                tokens.add(token.toString());
+//                System.out.println(token.termText());
+//                System.out.println(token);
+//                Thread.sleep(1000);
+            }
+        } while (token != null);
+
+        assertEquals(12, tokens.size());
+        ArrayList exp = new ArrayList();
+        exp.add("(a,0,1)"); exp.add("(b,1,2)"); exp.add("(c,2,3)"); exp.add("(d,3,4)"); exp.add("(e,4,5)");
+        exp.add("(ab,0,2)"); exp.add("(bc,1,3)"); exp.add("(cd,2,4)"); exp.add("(de,3,5)");
+        exp.add("(abc,0,3)"); exp.add("(bcd,1,4)"); exp.add("(cde,2,5)");
+        assertEquals(exp, tokens);
+    }
+
+    public void testOversizedNgrams() throws Exception {
+      NGramTokenFilter filter = new NGramTokenFilter(input, 6, 7);
+        
+        Token token = null;
+        do { 
+            token = filter.next();
+            if (token != null) {
+                tokens.add(token.toString());
+//                System.out.println(token.termText());
+//                System.out.println(token);
+//                Thread.sleep(1000);
+            }
+        } while (token != null);
+
+        assertTrue(tokens.isEmpty());
+    }
+}



Mime
View raw message