lucene-java-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From uschind...@apache.org
Subject svn commit: r812779 - in /lucene/java/trunk/contrib: CHANGES.txt analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java analyzers/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java
Date Wed, 09 Sep 2009 06:02:54 GMT
Author: uschindler
Date: Wed Sep  9 06:02:54 2009
New Revision: 812779

URL: http://svn.apache.org/viewvc?rev=812779&view=rev
Log:
LUCENE-1903: Fix incorrect ShingleFilter behavior when outputUnigrams == false

Modified:
    lucene/java/trunk/contrib/CHANGES.txt
    lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java
    lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java

Modified: lucene/java/trunk/contrib/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/CHANGES.txt?rev=812779&r1=812778&r2=812779&view=diff
==============================================================================
--- lucene/java/trunk/contrib/CHANGES.txt (original)
+++ lucene/java/trunk/contrib/CHANGES.txt Wed Sep  9 06:02:54 2009
@@ -28,8 +28,10 @@
  * LUCENE-1460: Change contrib TokenStreams/Filters to use the new
     TokenStream API. (Robert Muir, Michael Busch)
 
- * LUCENE-1775: Change remaining TokenFilters (shingle, prefix-suffix) to
-    use the new TokenStream API. (Robert Muir, Michael Busch)
+ * LUCENE-1775, LUCENE-1903: Change remaining TokenFilters (shingle, prefix-suffix)
+    to use the new TokenStream API. ShingleFilter is much more efficient now,
+    it clones much less often and computes the tokens mostly on the fly now.
+    Also added more tests. (Robert Muir, Michael Busch)
     
  * LUCENE-1685: The position aware SpanScorer has become the default scorer
     for Highlighting. The SpanScorer implementation has replaced QueryScorer

Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java?rev=812779&r1=812778&r2=812779&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java Wed Sep  9 06:02:54 2009
@@ -88,7 +88,6 @@
     this.offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
     this.posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
     this.typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class);
-    
   }
 
   /**
@@ -174,11 +173,15 @@
       
       nextToken = (AttributeSource.State) shingleBuf.getFirst();
       
-      if (shingleBufferPosition == 0 && (! shingleBuf.isEmpty()) && outputUnigrams) {
-        restoreState(nextToken);
-        posIncrAtt.setPositionIncrement(1);
+      if (outputUnigrams) {
+        if (shingleBufferPosition == 0) {
+          restoreState(nextToken);
+          posIncrAtt.setPositionIncrement(1);
+          shingleBufferPosition++;
+          return true;
+        }
+      } else {
         shingleBufferPosition++;
-        return true;
       }
   
       if (shingleBufferPosition < shingleBuf.size()) {
@@ -277,7 +280,7 @@
         shingleBuf.add(captureState());
         if (shingleBuf.size() > maxShingleSize)
         {
-          shingleBuf.remove(0);
+          shingleBuf.removeFirst();
         }
         addedToken = true;
       } else {
@@ -294,7 +297,7 @@
      * the end of the input stream and have to discard the least recent token.
      */
     if (! addedToken) {
-      shingleBuf.remove(0);
+      shingleBuf.removeFirst();
     }
     
     if (shingleBuf.isEmpty()) {

Modified: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java?rev=812779&r1=812778&r2=812779&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java Wed Sep  9 06:02:54 2009
@@ -115,6 +115,60 @@
     1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
   };
 
+  public static final Token[] BI_GRAM_TOKENS_WITHOUT_UNIGRAMS = new Token[] {
+    createToken("please divide", 0, 13),
+    createToken("divide this", 7, 18),
+    createToken("this sentence", 14, 27),
+    createToken("sentence into", 19, 32),
+    createToken("into shingles", 28, 39),
+  };
+
+  public static final int[] BI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS = new int[] {
+    1, 1, 1, 1, 1
+  };
+
+  public static final String[] BI_GRAM_TYPES_WITHOUT_UNIGRAMS = new String[] {
+    "shingle", "shingle", "shingle", "shingle", "shingle"
+  };
+
+  public static final Token[] BI_GRAM_TOKENS_WITH_HOLES_WITHOUT_UNIGRAMS = new Token[] {
+    createToken("please divide", 0, 13),
+    createToken("divide _", 7, 19),
+    createToken("_ sentence", 19, 27),
+    createToken("sentence _", 19, 33),
+    createToken("_ shingles", 33, 39),
+  };
+
+  public static final int[] BI_GRAM_POSITION_INCREMENTS_WITH_HOLES_WITHOUT_UNIGRAMS = new int[] {
+    1, 1, 1, 1, 1, 1
+  };
+
+
+  public static final Token[] TEST_SINGLE_TOKEN = new Token[] {
+    createToken("please", 0, 6)
+  };
+
+  public static final Token[] SINGLE_TOKEN = new Token[] {
+    createToken("please", 0, 6)
+  };
+
+  public static final int[] SINGLE_TOKEN_INCREMENTS = new int[] {
+    1
+  };
+
+  public static final String[] SINGLE_TOKEN_TYPES = new String[] {
+    "word"
+  };
+
+  public static final Token[] EMPTY_TOKEN_ARRAY = new Token[] {
+  };
+
+  public static final int[] EMPTY_TOKEN_INCREMENTS_ARRAY = new int[] {
+  };
+
+  public static final String[] EMPTY_TOKEN_TYPES_ARRAY = new String[] {
+  };
+
   public static final Token[] TRI_GRAM_TOKENS = new Token[] {
     createToken("please", 0, 6),
     createToken("please divide", 0, 13),
@@ -165,18 +219,59 @@
    */
   public void testBiGramFilter() throws IOException {
     this.shingleFilterTest(2, TEST_TOKEN, BI_GRAM_TOKENS,
-                           BI_GRAM_POSITION_INCREMENTS, BI_GRAM_TYPES);
+                           BI_GRAM_POSITION_INCREMENTS, BI_GRAM_TYPES,
+                           true);
   }
 
   public void testBiGramFilterWithHoles() throws IOException {
     this.shingleFilterTest(2, testTokenWithHoles, BI_GRAM_TOKENS_WITH_HOLES,
-                           BI_GRAM_POSITION_INCREMENTS, BI_GRAM_TYPES);
+                           BI_GRAM_POSITION_INCREMENTS, BI_GRAM_TYPES,
+                           true);
+  }
+
+  public void testBiGramFilterWithoutUnigrams() throws IOException {
+    this.shingleFilterTest(2, TEST_TOKEN, BI_GRAM_TOKENS_WITHOUT_UNIGRAMS,
+                           BI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS, BI_GRAM_TYPES_WITHOUT_UNIGRAMS,
+                           false);
+  }
+
+  public void testBiGramFilterWithHolesWithoutUnigrams() throws IOException {
+    this.shingleFilterTest(2, testTokenWithHoles, BI_GRAM_TOKENS_WITH_HOLES_WITHOUT_UNIGRAMS,
+                           BI_GRAM_POSITION_INCREMENTS_WITH_HOLES_WITHOUT_UNIGRAMS, BI_GRAM_TYPES_WITHOUT_UNIGRAMS,
+                           false);
+  }
+
+  public void testBiGramFilterWithSingleToken() throws IOException {
+    this.shingleFilterTest(2, TEST_SINGLE_TOKEN, SINGLE_TOKEN,
+                           SINGLE_TOKEN_INCREMENTS, SINGLE_TOKEN_TYPES,
+                           true);
+  }
+
+  public void testBiGramFilterWithSingleTokenWithoutUnigrams() throws IOException {
+    this.shingleFilterTest(2, TEST_SINGLE_TOKEN, EMPTY_TOKEN_ARRAY,
+                           EMPTY_TOKEN_INCREMENTS_ARRAY, EMPTY_TOKEN_TYPES_ARRAY,
+                           false);
+  }
+
+  public void testBiGramFilterWithEmptyTokenStream() throws IOException {
+    this.shingleFilterTest(2, EMPTY_TOKEN_ARRAY, EMPTY_TOKEN_ARRAY,
+                           EMPTY_TOKEN_INCREMENTS_ARRAY, EMPTY_TOKEN_TYPES_ARRAY,
+                           true);
+  }
+
+  public void testBiGramFilterWithEmptyTokenStreamWithoutUnigrams() throws IOException {
+    this.shingleFilterTest(2, EMPTY_TOKEN_ARRAY, EMPTY_TOKEN_ARRAY,
+                           EMPTY_TOKEN_INCREMENTS_ARRAY, EMPTY_TOKEN_TYPES_ARRAY,
+                           false);
   }
 
   public void testTriGramFilter() throws IOException {
     this.shingleFilterTest(3, TEST_TOKEN, TRI_GRAM_TOKENS,
-                           TRI_GRAM_POSITION_INCREMENTS, TRI_GRAM_TYPES);
+                           TRI_GRAM_POSITION_INCREMENTS, TRI_GRAM_TYPES,
+                           true);
   }
+
+
   
   public void testReset() throws Exception {
    Tokenizer wsTokenizer = new WhitespaceTokenizer(new StringReader("please divide this sentence"));
@@ -197,10 +292,13 @@
   }
   
   protected void shingleFilterTest(int maxSize, Token[] tokensToShingle, Token[] tokensToCompare,
-                                   int[] positionIncrements, String[] types)
+                                   int[] positionIncrements, String[] types,
+                                   boolean outputUnigrams)
     throws IOException {
 
-    TokenStream filter = new ShingleFilter(new TestTokenStream(tokensToShingle), maxSize);
+    ShingleFilter filter = new ShingleFilter(new TestTokenStream(tokensToShingle), maxSize);
+    filter.setOutputUnigrams(outputUnigrams);
+
     TermAttribute termAtt = (TermAttribute) filter.addAttribute(TermAttribute.class);
     OffsetAttribute offsetAtt = (OffsetAttribute) filter.addAttribute(OffsetAttribute.class);
     PositionIncrementAttribute posIncrAtt = (PositionIncrementAttribute) filter.addAttribute(PositionIncrementAttribute.class);
@@ -208,6 +306,7 @@
 
     int i = 0;
     while (filter.incrementToken()) {
+      assertTrue("ShingleFilter outputted more tokens than expected", i < tokensToCompare.length);
       String termText = termAtt.term();
       String goldText = tokensToCompare[i].term();
       assertEquals("Wrong termText", goldText, termText);
@@ -220,6 +319,8 @@
       assertEquals("Wrong type for token \"" + termText + "\"", types[i], typeAtt.type());
       i++;
     }
+    assertEquals("ShingleFilter outputted wrong # of tokens. (# output = " + i + "; # expected =" + tokensToCompare.length + ")",
+                 tokensToCompare.length, i);
   }
 
   private static Token createToken(String term, int start, int offset)



Mime
View raw message