lucene-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From jpou...@apache.org
Subject svn commit: r1484078 - in /lucene/dev/branches/branch_4x: ./ lucene/ lucene/analysis/ lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/ lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/
Date Sat, 18 May 2013 09:31:31 GMT
Author: jpountz
Date: Sat May 18 09:31:31 2013
New Revision: 1484078

URL: http://svn.apache.org/r1484078
Log:
Fix EdgeNGramTokenFilter to correctly handle graph inputs.

Modified:
    lucene/dev/branches/branch_4x/   (props changed)
    lucene/dev/branches/branch_4x/lucene/   (props changed)
    lucene/dev/branches/branch_4x/lucene/analysis/   (props changed)
    lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java
    lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java

Modified: lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java?rev=1484078&r1=1484077&r2=1484078&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java (original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java Sat May 18 09:31:31 2013
@@ -23,6 +23,7 @@ import org.apache.lucene.analysis.revers
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
 import org.apache.lucene.util.Version;
 
 import java.io.IOException;
@@ -81,11 +82,12 @@ public final class EdgeNGramTokenFilter 
   private int tokEnd; // only used if the length changed before this filter
   private boolean updateOffsets; // never if the length changed before this filter
   private int savePosIncr;
-  private boolean isFirstToken = true;
+  private int savePosLen;
   
   private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
   private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
   private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
+  private final PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class);
 
   /**
    * Creates EdgeNGramTokenFilter that can generate n-grams in the sizes of the given range
@@ -172,7 +174,8 @@ public final class EdgeNGramTokenFilter 
             // this is a synonym and don't adjust the offsets.
             updateOffsets = (tokStart + curTermLength) == tokEnd;
           }
-          savePosIncr = posIncrAtt.getPositionIncrement();
+          savePosIncr += posIncrAtt.getPositionIncrement();
+          savePosLen = posLenAtt.getPositionLength();
         }
       }
      if (curGramSize <= maxGram) {         // if we have hit the end of our n-gram size range, quit
@@ -188,16 +191,14 @@ public final class EdgeNGramTokenFilter 
           }
           // first ngram gets increment, others don't
           if (curGramSize == minGram) {
-            //  Leave the first token position increment at the cleared-attribute value of 1
-            if ( ! isFirstToken) {
-              posIncrAtt.setPositionIncrement(savePosIncr);
-            }
+            posIncrAtt.setPositionIncrement(savePosIncr);
+            savePosIncr = 0;
           } else {
             posIncrAtt.setPositionIncrement(0);
           }
+          posLenAtt.setPositionLength(savePosLen);
           termAtt.copyBuffer(curTermBuffer, start, curGramSize);
           curGramSize++;
-          isFirstToken = false;
           return true;
         }
       }
@@ -209,6 +210,6 @@ public final class EdgeNGramTokenFilter 
   public void reset() throws IOException {
     super.reset();
     curTermBuffer = null;
-    isFirstToken = true;
+    savePosIncr = 0;
   }
 }

Modified: lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java?rev=1484078&r1=1484077&r2=1484078&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java (original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java Sat May 18 09:31:31 2013
@@ -29,8 +29,10 @@ import org.apache.lucene.analysis.TokenF
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.core.KeywordTokenizer;
+import org.apache.lucene.analysis.core.LetterTokenizer;
 import org.apache.lucene.analysis.core.WhitespaceTokenizer;
 import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;
+import org.apache.lucene.analysis.shingle.ShingleFilter;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.lucene.util.Version;
 
@@ -247,4 +249,19 @@ public class EdgeNGramTokenFilterTest ex
     };
     checkAnalysisConsistency(random, b, random.nextBoolean(), "");
   }
+
+  public void testGraphs() throws IOException {
+    TokenStream tk = new LetterTokenizer(Version.LUCENE_44, new StringReader("abc d efgh ij klmno p q"));
+    tk = new ShingleFilter(tk);
+    tk = new EdgeNGramTokenFilter(Version.LUCENE_44, tk, 7, 10);
+    tk.reset();
+    assertTokenStreamContents(tk,
+        new String[] { "efgh ij", "ij klmn", "ij klmno", "klmno p" },
+        new int[]    { 6,11,11,14 },
+        new int[]    { 13,19,19,21 },
+        new int[]    { 3,1,0,1 },
+        new int[]    { 2,2,2,2 },
+        23
+    );
+  }
 }



Mime
View raw message