lucene-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From sar...@apache.org
Subject svn commit: r1470497 - in /lucene/dev/branches/branch_4x: ./ lucene/ lucene/analysis/ lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/ lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/ lucene/analysis/morfologik/src/tes...
Date Mon, 22 Apr 2013 13:28:35 GMT
Author: sarowe
Date: Mon Apr 22 13:28:35 2013
New Revision: 1470497

URL: http://svn.apache.org/r1470497
Log:
LUCENE-4810: position increment of the first output token from EdgeNGramTokenFilter must be > 0 (merged trunk r1470496)

Modified:
    lucene/dev/branches/branch_4x/   (props changed)
    lucene/dev/branches/branch_4x/lucene/   (props changed)
    lucene/dev/branches/branch_4x/lucene/analysis/   (props changed)
    lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java
    lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java
    lucene/dev/branches/branch_4x/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikAnalyzer.java

Modified: lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java?rev=1470497&r1=1470496&r2=1470497&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java	(original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java	Mon Apr 22 13:28:35 2013
@@ -75,6 +75,7 @@ public final class EdgeNGramTokenFilter 
   private int tokEnd; // only used if the length changed before this filter
   private boolean hasIllegalOffsets; // only if the length changed before this filter
   private int savePosIncr;
+  private boolean isFirstToken = true;
   
   private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
   private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
@@ -138,9 +139,8 @@ public final class EdgeNGramTokenFilter 
           savePosIncr = posIncrAtt.getPositionIncrement();
         }
       }
-      if (curGramSize <= maxGram) {
-        if (! (curGramSize > curTermLength         // if the remaining input is too short, we can't generate any n-grams
-            || curGramSize > maxGram)) {       // if we have hit the end of our n-gram size range, quit
+      if (curGramSize <= maxGram) {         // if we have hit the end of our n-gram size range, quit
+        if (curGramSize <= curTermLength) { // if the remaining input is too short, we can't generate any n-grams
           // grab gramSize chars from front or back
           int start = side == Side.FRONT ? 0 : curTermLength - curGramSize;
           int end = start + curGramSize;
@@ -152,12 +152,16 @@ public final class EdgeNGramTokenFilter 
           }
           // first ngram gets increment, others don't
           if (curGramSize == minGram) {
-            posIncrAtt.setPositionIncrement(savePosIncr);
+            //  Leave the first token position increment at the cleared-attribute value of 1
+            if ( ! isFirstToken) {
+              posIncrAtt.setPositionIncrement(savePosIncr);
+            }
           } else {
             posIncrAtt.setPositionIncrement(0);
           }
           termAtt.copyBuffer(curTermBuffer, start, curGramSize);
           curGramSize++;
+          isFirstToken = false;
           return true;
         }
       }
@@ -169,5 +173,6 @@ public final class EdgeNGramTokenFilter 
   public void reset() throws IOException {
     super.reset();
     curTermBuffer = null;
+    isFirstToken = true;
   }
 }

Modified: lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java?rev=1470497&r1=1470496&r2=1470497&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java	(original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java	Mon Apr 22 13:28:35 2013
@@ -26,6 +26,7 @@ import org.apache.lucene.analysis.Tokeni
 import org.apache.lucene.analysis.core.KeywordTokenizer;
 import org.apache.lucene.analysis.core.WhitespaceTokenizer;
 import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;
+import org.apache.lucene.analysis.position.PositionFilter;
 
 import java.io.Reader;
 import java.io.StringReader;
@@ -120,6 +121,21 @@ public class EdgeNGramTokenFilterTest ex
                               false);
   }
 
+  public void testFirstTokenPositionIncrement() throws Exception {
+    TokenStream ts = new MockTokenizer(new StringReader("a abc"), MockTokenizer.WHITESPACE, false);
+    ts = new PositionFilter(ts, 0); // All but first token will get 0 position increment
+    EdgeNGramTokenFilter filter = new EdgeNGramTokenFilter(ts, EdgeNGramTokenFilter.Side.FRONT, 2, 3);
+    // The first token "a" will not be output, since it's smaller than the mingram size of 2.
+    // The second token on input to EdgeNGramTokenFilter will have position increment of 0,
+    // which should be increased to 1, since this is the first output token in the stream.
+    assertTokenStreamContents(filter,
+        new String[] { "ab", "abc" },
+        new int[]    {    2,     2 },
+        new int[]    {    4,     5 },
+        new int[]    {    1,     0 }
+    );
+  }
+
   public void testTokenizerPositions() throws Exception {
     EdgeNGramTokenizer tokenizer = new EdgeNGramTokenizer(new StringReader("abcde"), EdgeNGramTokenizer.Side.FRONT, 1, 3);
     assertTokenStreamContents(tokenizer,

Modified: lucene/dev/branches/branch_4x/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikAnalyzer.java?rev=1470497&r1=1470496&r2=1470497&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikAnalyzer.java	(original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikAnalyzer.java	Mon Apr 22 13:28:35 2013
@@ -52,6 +52,14 @@ public class TestMorfologikAnalyzer exte
       new int[] { 0, 0, 0, 0, 7, 7, 7, 7 },
       new int[] { 6, 6, 6, 6, 13, 13, 13, 13 },
       new int[] { 1, 0, 0, 0, 1, 0, 0, 0 });
+
+    assertAnalyzesToReuse(
+        a,
+        "T. Gl\u00FCcksberg",
+        new String[] { "to", "tom", "tona", "Gl\u00FCcksberg" },
+        new int[] { 0, 0, 0, 3  },
+        new int[] { 1, 1, 1, 13 },
+        new int[] { 1, 0, 0, 1  });
   }
 
   /** Test reuse of MorfologikFilter with leftover stems. */



Mime
View raw message