lucene-java-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From o...@apache.org
Subject svn commit: r794034 - in /lucene/java/trunk/contrib: ./ analyzers/src/java/org/apache/lucene/analysis/ngram/ analyzers/src/test/org/apache/lucene/analysis/ngram/
Date Tue, 14 Jul 2009 19:44:52 GMT
Author: otis
Date: Tue Jul 14 19:44:52 2009
New Revision: 794034

URL: http://svn.apache.org/viewvc?rev=794034&view=rev
Log:
LUCENE-1491 - EdgeNGramTokenFilter no longer stops on tokens shorter than minimum n-gram size.
- line, and those below, will be ignored--

M    CHANGES.txt
M    analyzers/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java
M    analyzers/src/test/org/apache/lucene/analysis/ngram/NGramTokenFilterTest.java
M    analyzers/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java
M    analyzers/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java

Modified:
    lucene/java/trunk/contrib/CHANGES.txt
    lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java
    lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java
    lucene/java/trunk/contrib/analyzers/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java
    lucene/java/trunk/contrib/analyzers/src/test/org/apache/lucene/analysis/ngram/NGramTokenFilterTest.java

Modified: lucene/java/trunk/contrib/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/CHANGES.txt?rev=794034&r1=794033&r2=794034&view=diff
==============================================================================
--- lucene/java/trunk/contrib/CHANGES.txt (original)
+++ lucene/java/trunk/contrib/CHANGES.txt Tue Jul 14 19:44:52 2009
@@ -36,6 +36,9 @@
     StandardTokenizer so that stop words with mixed case are filtered
     out.  (Rafael Cunha de Almeida, Douglas Campos via Mike McCandless)
 
+ 8. LUCENE-1491: EdgeNGramTokenFilter no longer stops on tokens shorter than minimum n-gram size.
+    (Todd Teak via Otis Gospodnetic)
+
 New features
 
  1. LUCENE-1531: Added support for BoostingTermQuery to XML query parser. (Karl Wettin)

Modified: lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java?rev=794034&r1=794033&r2=794034&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java (original)
+++ lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java Tue Jul 14 19:44:52 2009
@@ -117,19 +117,25 @@
   /** Returns the next token in the stream, or null at EOS. */
   public final Token next(final Token reusableToken) throws IOException {
     assert reusableToken != null;
-    if (ngrams.size() > 0) {
-      return (Token) ngrams.removeFirst();
+    if (!ngrams.isEmpty()) {
+        return (Token)ngrams.removeFirst();
     }
 
-    Token nextToken = input.next(reusableToken);
-    if (nextToken == null)
-      return null;
-
-    ngram(nextToken);
-    if (ngrams.size() > 0)
-      return (Token) ngrams.removeFirst();
-    else
-      return null;
+    Token token = null;
+
+    while (ngrams.isEmpty() && (token = input.next()) != null) {
+        ngram(token);
+    }
+
+    if (token == null) {
+        return null;
+    }
+
+    if (!ngrams.isEmpty()) {
+        return (Token)ngrams.removeFirst();
+    } else {
+        return null;
+    }
   }
 
   private void ngram(final Token token) {

Modified: lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java?rev=794034&r1=794033&r2=794034&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java (original)
+++ lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java Tue Jul 14 19:44:52 2009
@@ -64,19 +64,25 @@
   /** Returns the next token in the stream, or null at EOS. */
   public final Token next(final Token reusableToken) throws IOException {
     assert reusableToken != null;
-    if (ngrams.size() > 0) {
-      return (Token) ngrams.removeFirst();
+    if (!ngrams.isEmpty()) {
+        return (Token)ngrams.removeFirst();
     }
 
-    Token nextToken = input.next(reusableToken);
-    if (nextToken == null)
-      return null;
+    Token token = null;
 
-    ngram(nextToken);
-    if (ngrams.size() > 0)
-      return (Token) ngrams.removeFirst();
-    else
-      return null;
+    while (ngrams.isEmpty() && (token = input.next()) != null) {
+        ngram(token);
+    }
+
+    if (token == null) {
+        return null;
+    }
+
+    if (!ngrams.isEmpty()) {
+        return (Token)ngrams.removeFirst();
+    } else {
+        return null;
+    }
   }
 
   private void ngram(Token token) { 

Modified: lucene/java/trunk/contrib/analyzers/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java?rev=794034&r1=794033&r2=794034&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java (original)
+++ lucene/java/trunk/contrib/analyzers/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java Tue Jul 14 19:44:52 2009
@@ -109,4 +109,16 @@
     assertEquals("(cde,2,5)", nextToken.toString());
     assertNull(tokenizer.next(reusableToken));
   }
+  
+  public void testSmallTokenInStream() throws Exception {
+    input = new WhitespaceTokenizer(new StringReader("abc de fgh"));
+    EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, EdgeNGramTokenFilter.Side.FRONT, 3, 3);
+    final Token reusableToken = new Token();
+    Token nextToken = tokenizer.next(reusableToken);
+    assertEquals("(abc,0,3)", nextToken.toString());
+    nextToken = tokenizer.next(reusableToken);
+    assertNotNull(nextToken);
+    assertEquals("(fgh,0,3)", nextToken.toString());
+    assertNull(tokenizer.next(reusableToken));
+  }
 }

Modified: lucene/java/trunk/contrib/analyzers/src/test/org/apache/lucene/analysis/ngram/NGramTokenFilterTest.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/src/test/org/apache/lucene/analysis/ngram/NGramTokenFilterTest.java?rev=794034&r1=794033&r2=794034&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/src/test/org/apache/lucene/analysis/ngram/NGramTokenFilterTest.java (original)
+++ lucene/java/trunk/contrib/analyzers/src/test/org/apache/lucene/analysis/ngram/NGramTokenFilterTest.java Tue Jul 14 19:44:52 2009
@@ -120,4 +120,16 @@
 
         assertTrue(tokens.isEmpty());
     }
+    
+    public void testSmallTokenInStream() throws Exception {
+      input = new WhitespaceTokenizer(new StringReader("abc de fgh"));
+      NGramTokenFilter filter = new NGramTokenFilter(input, 3, 3);
+      final Token reusableToken = new Token();
+      Token nextToken = filter.next(reusableToken);
+      assertEquals("(abc,0,3)", nextToken.toString());
+      nextToken = filter.next(reusableToken);
+      assertNotNull(nextToken);
+      assertEquals("(fgh,0,3)", nextToken.toString());
+      assertNull(filter.next(reusableToken));
+    }
 }



Mime
View raw message