lucene-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From rm...@apache.org
Subject svn commit: r1154939 [3/3] - in /lucene/dev/branches/branch_3x: ./ lucene/ lucene/backwards/ lucene/backwards/src/test-framework/ lucene/backwards/src/test/ lucene/src/java/org/apache/lucene/analysis/standard/ lucene/src/java/org/apache/lucene/analysis...
Date Mon, 08 Aug 2011 12:14:23 GMT
Modified: lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/analysis/standard/std31/StandardTokenizerImpl31.jflex
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/analysis/standard/std31/StandardTokenizerImpl31.jflex?rev=1154939&r1=1154938&r2=1154939&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/analysis/standard/std31/StandardTokenizerImpl31.jflex (original)
+++ lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/analysis/standard/std31/StandardTokenizerImpl31.jflex Mon Aug  8 12:14:22 2011
@@ -39,7 +39,7 @@ import org.apache.lucene.analysis.tokena
 %function getNextToken
 %char
 
-%include src/java/org/apache/lucene/analysis/standard/SUPPLEMENTARY.jflex-macro
+%include src/java/org/apache/lucene/analysis/standard/std31/SUPPLEMENTARY.jflex-macro
 ALetter = ([\p{WB:ALetter}] | {ALetterSupp})
 Format =  ([\p{WB:Format}] | {FormatSupp})
 Numeric = ([\p{WB:Numeric}] | {NumericSupp})

Modified: lucene/dev/branches/branch_3x/lucene/src/test/org/apache/lucene/analysis/TestUAX29URLEmailTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/src/test/org/apache/lucene/analysis/TestUAX29URLEmailTokenizer.java?rev=1154939&r1=1154938&r2=1154939&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/src/test/org/apache/lucene/analysis/TestUAX29URLEmailTokenizer.java (original)
+++ lucene/dev/branches/branch_3x/lucene/src/test/org/apache/lucene/analysis/TestUAX29URLEmailTokenizer.java Mon Aug  8 12:14:22 2011
@@ -1,7 +1,9 @@
 package org.apache.lucene.analysis;
 
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
 import org.apache.lucene.analysis.standard.UAX29URLEmailTokenizer;
 import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+import org.apache.lucene.util.Version;
 
 import java.io.BufferedReader;
 import java.io.IOException;
@@ -38,7 +40,7 @@ public class TestUAX29URLEmailTokenizer 
     sb.append(whitespace);
     sb.append("testing 1234");
     String input = sb.toString();
-    UAX29URLEmailTokenizer tokenizer = new UAX29URLEmailTokenizer(new StringReader(input));
+    UAX29URLEmailTokenizer tokenizer = new UAX29URLEmailTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
     BaseTokenStreamTestCase.assertTokenStreamContents(tokenizer, new String[] { "testing", "1234" });
   }
 
@@ -47,7 +49,7 @@ public class TestUAX29URLEmailTokenizer 
     protected TokenStreamComponents createComponents
       (String fieldName, Reader reader) {
 
-      Tokenizer tokenizer = new UAX29URLEmailTokenizer(reader);
+      Tokenizer tokenizer = new UAX29URLEmailTokenizer(TEST_VERSION_CURRENT, reader);
       return new TokenStreamComponents(tokenizer);
     }
   };
@@ -63,7 +65,7 @@ public class TestUAX29URLEmailTokenizer 
     public final boolean incrementToken() throws java.io.IOException {
       boolean isTokenAvailable = false;
       while (input.incrementToken()) {
-        if (typeAtt.type() == UAX29URLEmailTokenizer.URL_TYPE) {
+        if (typeAtt.type() == UAX29URLEmailTokenizer.TOKEN_TYPES[UAX29URLEmailTokenizer.URL]) {
           isTokenAvailable = true;
           break;
         }
@@ -82,7 +84,7 @@ public class TestUAX29URLEmailTokenizer 
     public final boolean incrementToken() throws java.io.IOException {
       boolean isTokenAvailable = false;
       while (input.incrementToken()) {
-        if (typeAtt.type() == UAX29URLEmailTokenizer.EMAIL_TYPE) {
+        if (typeAtt.type() == UAX29URLEmailTokenizer.TOKEN_TYPES[UAX29URLEmailTokenizer.EMAIL]) {
           isTokenAvailable = true;
           break;
         }
@@ -94,7 +96,7 @@ public class TestUAX29URLEmailTokenizer 
   private Analyzer urlAnalyzer = new ReusableAnalyzerBase() {
     @Override
     protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
-      UAX29URLEmailTokenizer tokenizer = new UAX29URLEmailTokenizer(reader);
+      UAX29URLEmailTokenizer tokenizer = new UAX29URLEmailTokenizer(TEST_VERSION_CURRENT, reader);
       tokenizer.setMaxTokenLength(Integer.MAX_VALUE);  // Tokenize arbitrary length URLs
       TokenFilter filter = new URLFilter(tokenizer);
       return new TokenStreamComponents(tokenizer, filter);
@@ -104,7 +106,7 @@ public class TestUAX29URLEmailTokenizer 
   private Analyzer emailAnalyzer = new ReusableAnalyzerBase() {
     @Override
     protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
-      UAX29URLEmailTokenizer tokenizer = new UAX29URLEmailTokenizer(reader);
+      UAX29URLEmailTokenizer tokenizer = new UAX29URLEmailTokenizer(TEST_VERSION_CURRENT, reader);
       TokenFilter filter = new EmailFilter(tokenizer);
       return new TokenStreamComponents(tokenizer, filter);
     }
@@ -412,7 +414,32 @@ public class TestUAX29URLEmailTokenizer 
         new String[] { "仮", "名", "遣", "い", "カタカナ" },
         new String[] { "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<HIRAGANA>", "<KATAKANA>" });
   }
-  
+
+  public void testCombiningMarks() throws Exception {
+    checkOneTerm(a, "ざ", "ざ"); // hiragana
+    checkOneTerm(a, "ザ", "ザ"); // katakana
+    checkOneTerm(a, "壹゙", "壹゙"); // ideographic
+    checkOneTerm(a, "아゙",  "아゙"); // hangul
+  }
+
+  /** @deprecated remove this and sophisticated backwards layer in 5.0 */
+  @Deprecated
+  public void testCombiningMarksBackwards() throws Exception {
+    Analyzer a = new ReusableAnalyzerBase() {
+      @Override
+      protected TokenStreamComponents createComponents
+        (String fieldName, Reader reader) {
+
+        Tokenizer tokenizer = new UAX29URLEmailTokenizer(reader);
+        return new TokenStreamComponents(tokenizer);
+      }
+    };
+    checkOneTerm(a, "ざ", "さ"); // hiragana Bug
+    checkOneTerm(a, "ザ", "ザ"); // katakana Works
+    checkOneTerm(a, "壹゙", "壹"); // ideographic Bug
+    checkOneTerm(a, "아゙",  "아゙"); // hangul Works
+  }
+
   /** blast some random strings through the analyzer */
   public void testRandomStrings() throws Exception {
     checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);

Modified: lucene/dev/branches/branch_3x/solr/core/src/java/org/apache/solr/analysis/UAX29URLEmailTokenizerFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/core/src/java/org/apache/solr/analysis/UAX29URLEmailTokenizerFactory.java?rev=1154939&r1=1154938&r2=1154939&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/core/src/java/org/apache/solr/analysis/UAX29URLEmailTokenizerFactory.java (original)
+++ lucene/dev/branches/branch_3x/solr/core/src/java/org/apache/solr/analysis/UAX29URLEmailTokenizerFactory.java Mon Aug  8 12:14:22 2011
@@ -51,7 +51,7 @@ public class UAX29URLEmailTokenizerFacto
   }
 
   public UAX29URLEmailTokenizer create(Reader input) {
-    UAX29URLEmailTokenizer tokenizer = new UAX29URLEmailTokenizer(input); 
+    UAX29URLEmailTokenizer tokenizer = new UAX29URLEmailTokenizer(luceneMatchVersion, input);
     tokenizer.setMaxTokenLength(maxTokenLength);
     return tokenizer;
   }

Modified: lucene/dev/branches/branch_3x/solr/core/src/test/org/apache/solr/analysis/TestUAX29URLEmailTokenizerFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/core/src/test/org/apache/solr/analysis/TestUAX29URLEmailTokenizerFactory.java?rev=1154939&r1=1154938&r2=1154939&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/core/src/test/org/apache/solr/analysis/TestUAX29URLEmailTokenizerFactory.java (original)
+++ lucene/dev/branches/branch_3x/solr/core/src/test/org/apache/solr/analysis/TestUAX29URLEmailTokenizerFactory.java Mon Aug  8 12:14:22 2011
@@ -19,6 +19,7 @@ package org.apache.solr.analysis;
 
 import java.io.Reader;
 import java.io.StringReader;
+import java.util.Collections;
 import java.util.HashMap;
 import java.util.Map;
 
@@ -173,4 +174,22 @@ public class TestUAX29URLEmailTokenizerF
     assertTokenStreamContents(stream, 
         new String[] {"one", "two", "three", longWord, "four", "five", "six" });
   }
+  
+  /** @deprecated nuke this test in lucene 5.0 */
+  @Deprecated
+  public void testMatchVersion() throws Exception {
+    Reader reader = new StringReader("ざ");
+    UAX29URLEmailTokenizerFactory factory = new UAX29URLEmailTokenizerFactory();
+    factory.init(DEFAULT_VERSION_PARAM);
+    Tokenizer stream = factory.create(reader);
+    assertTokenStreamContents(stream, 
+        new String[] {"ざ"});
+    
+    reader = new StringReader("ざ");
+    factory = new UAX29URLEmailTokenizerFactory();
+    factory.init(Collections.singletonMap("luceneMatchVersion", "3.1"));
+    stream = factory.create(reader);
+    assertTokenStreamContents(stream, 
+        new String[] {"さ"}); // old broken behavior
+  }
 }



Mime
View raw message