lucene-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From rm...@apache.org
Subject svn commit: r1579141 - in /lucene/dev/branches/branch_4x/lucene/analysis/common/src: java/org/apache/lucene/analysis/miscellaneous/ test/org/apache/lucene/analysis/core/
Date Wed, 19 Mar 2014 05:26:05 GMT
Author: rmuir
Date: Wed Mar 19 05:26:05 2014
New Revision: 1579141

URL: http://svn.apache.org/r1579141
Log:
don't output an initial posinc=0 token in WDF if the first word is all delimiters

Modified:
    lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilter.java
    lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestBugInSomething.java
    lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java

Modified: lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilter.java?rev=1579141&r1=1579140&r2=1579141&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilter.java
(original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilter.java
Wed Mar 19 05:26:05 2014
@@ -258,7 +258,8 @@ public final class WordDelimiterFilter e
         // word of simply delimiters
         if (iterator.end == WordDelimiterIterator.DONE && !has(PRESERVE_ORIGINAL))
{
           // if the posInc is 1, simply ignore it in the accumulation
-          if (posIncAttribute.getPositionIncrement() == 1) {
+          // TODO: proper hole adjustment (FilteringTokenFilter-like) instead of this previous
logic!
+          if (posIncAttribute.getPositionIncrement() == 1 && !first) {
             accumPosInc--;
           }
           continue;

Modified: lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestBugInSomething.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestBugInSomething.java?rev=1579141&r1=1579140&r2=1579141&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestBugInSomething.java
(original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestBugInSomething.java
Wed Mar 19 05:26:05 2014
@@ -4,6 +4,8 @@ import java.io.IOException;
 import java.io.Reader;
 import java.io.StringReader;
 import java.nio.CharBuffer;
+import java.util.Arrays;
+import java.util.HashSet;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
@@ -17,10 +19,12 @@ import org.apache.lucene.analysis.Tokeni
 import org.apache.lucene.analysis.charfilter.MappingCharFilter;
 import org.apache.lucene.analysis.charfilter.NormalizeCharMap;
 import org.apache.lucene.analysis.commongrams.CommonGramsFilter;
+import org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter;
 import org.apache.lucene.analysis.ngram.EdgeNGramTokenizer;
 import org.apache.lucene.analysis.ngram.NGramTokenFilter;
 import org.apache.lucene.analysis.shingle.ShingleFilter;
 import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.lucene.analysis.wikipedia.WikipediaTokenizer;
 import org.apache.lucene.util.LuceneTestCase.SuppressCodecs;
 
 /*
@@ -256,4 +260,28 @@ public class TestBugInSomething extends 
     };
     checkRandomData(random(), analyzer, 2000);
   }
+  
+  public void testCuriousWikipediaString() throws Exception {
+    final CharArraySet protWords = new CharArraySet(TEST_VERSION_CURRENT, new HashSet<>(
+        Arrays.asList("rrdpafa", "pupmmlu", "xlq", "dyy", "zqrxrrck", "o", "hsrlfvcha")),
false);
+    final byte table[] = new byte[] { 
+        -57, 26, 1, 48, 63, -23, 55, -84, 18, 120, -97, 103, 58, 13, 84, 89, 57, -13, -63,

+        5, 28, 97, -54, -94, 102, -108, -5, 5, 46, 40, 43, 78, 43, -72, 36, 29, 124, -106,

+        -22, -51, 65, 5, 31, -42, 6, -99, 97, 14, 81, -128, 74, 100, 54, -55, -25, 53, -71,

+        -98, 44, 33, 86, 106, -42, 47, 115, -89, -18, -26, 22, -95, -43, 83, -125, 105, -104,
+        -24, 106, -16, 126, 115, -105, 97, 65, -33, 57, 44, -1, 123, -68, 100, 13, -41, -64,

+        -119, 0, 92, 94, -36, 53, -9, -102, -18, 90, 94, -26, 31, 71, -20
+    };
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new WikipediaTokenizer(reader);
+        TokenStream stream = new SopTokenFilter(tokenizer);
+        stream = new WordDelimiterFilter(TEST_VERSION_CURRENT, stream, table, -50, protWords);
+        stream = new SopTokenFilter(stream);
+        return new TokenStreamComponents(tokenizer, stream);
+      }  
+    };
+    checkAnalysisConsistency(random(), a, false, "B\u28c3\ue0f8[ \ud800\udfc2 </p>
jb");
+  }
 }

Modified: lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java?rev=1579141&r1=1579140&r2=1579141&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java
(original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java
Wed Mar 19 05:26:05 2014
@@ -707,7 +707,7 @@ public class TestRandomChains extends Ba
         */
         descr.append("\n  ");
         descr.append(ctor.getDeclaringClass().getName());
-        String params = Arrays.toString(args);
+        String params = Arrays.deepToString(args);
         params = params.substring(1, params.length()-1);
         descr.append("(").append(params).append(")");
         return instance;



Mime
View raw message