Modified: lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/analysis/standard/std31/StandardTokenizerImpl31.jflex
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/analysis/standard/std31/StandardTokenizerImpl31.jflex?rev=1154939&r1=1154938&r2=1154939&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/analysis/standard/std31/StandardTokenizerImpl31.jflex
(original)
+++ lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/analysis/standard/std31/StandardTokenizerImpl31.jflex
Mon Aug 8 12:14:22 2011
@@ -39,7 +39,7 @@ import org.apache.lucene.analysis.tokena
%function getNextToken
%char
-%include src/java/org/apache/lucene/analysis/standard/SUPPLEMENTARY.jflex-macro
+%include src/java/org/apache/lucene/analysis/standard/std31/SUPPLEMENTARY.jflex-macro
ALetter = ([\p{WB:ALetter}] | {ALetterSupp})
Format = ([\p{WB:Format}] | {FormatSupp})
Numeric = ([\p{WB:Numeric}] | {NumericSupp})
Modified: lucene/dev/branches/branch_3x/lucene/src/test/org/apache/lucene/analysis/TestUAX29URLEmailTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/src/test/org/apache/lucene/analysis/TestUAX29URLEmailTokenizer.java?rev=1154939&r1=1154938&r2=1154939&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/src/test/org/apache/lucene/analysis/TestUAX29URLEmailTokenizer.java
(original)
+++ lucene/dev/branches/branch_3x/lucene/src/test/org/apache/lucene/analysis/TestUAX29URLEmailTokenizer.java
Mon Aug 8 12:14:22 2011
@@ -1,7 +1,9 @@
package org.apache.lucene.analysis;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.standard.UAX29URLEmailTokenizer;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+import org.apache.lucene.util.Version;
import java.io.BufferedReader;
import java.io.IOException;
@@ -38,7 +40,7 @@ public class TestUAX29URLEmailTokenizer
sb.append(whitespace);
sb.append("testing 1234");
String input = sb.toString();
- UAX29URLEmailTokenizer tokenizer = new UAX29URLEmailTokenizer(new StringReader(input));
+ UAX29URLEmailTokenizer tokenizer = new UAX29URLEmailTokenizer(TEST_VERSION_CURRENT, new
StringReader(input));
BaseTokenStreamTestCase.assertTokenStreamContents(tokenizer, new String[] { "testing",
"1234" });
}
@@ -47,7 +49,7 @@ public class TestUAX29URLEmailTokenizer
protected TokenStreamComponents createComponents
(String fieldName, Reader reader) {
- Tokenizer tokenizer = new UAX29URLEmailTokenizer(reader);
+ Tokenizer tokenizer = new UAX29URLEmailTokenizer(TEST_VERSION_CURRENT, reader);
return new TokenStreamComponents(tokenizer);
}
};
@@ -63,7 +65,7 @@ public class TestUAX29URLEmailTokenizer
public final boolean incrementToken() throws java.io.IOException {
boolean isTokenAvailable = false;
while (input.incrementToken()) {
- if (typeAtt.type() == UAX29URLEmailTokenizer.URL_TYPE) {
+ if (typeAtt.type() == UAX29URLEmailTokenizer.TOKEN_TYPES[UAX29URLEmailTokenizer.URL])
{
isTokenAvailable = true;
break;
}
@@ -82,7 +84,7 @@ public class TestUAX29URLEmailTokenizer
public final boolean incrementToken() throws java.io.IOException {
boolean isTokenAvailable = false;
while (input.incrementToken()) {
- if (typeAtt.type() == UAX29URLEmailTokenizer.EMAIL_TYPE) {
+ if (typeAtt.type() == UAX29URLEmailTokenizer.TOKEN_TYPES[UAX29URLEmailTokenizer.EMAIL])
{
isTokenAvailable = true;
break;
}
@@ -94,7 +96,7 @@ public class TestUAX29URLEmailTokenizer
private Analyzer urlAnalyzer = new ReusableAnalyzerBase() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
- UAX29URLEmailTokenizer tokenizer = new UAX29URLEmailTokenizer(reader);
+ UAX29URLEmailTokenizer tokenizer = new UAX29URLEmailTokenizer(TEST_VERSION_CURRENT,
reader);
tokenizer.setMaxTokenLength(Integer.MAX_VALUE); // Tokenize arbitrary length URLs
TokenFilter filter = new URLFilter(tokenizer);
return new TokenStreamComponents(tokenizer, filter);
@@ -104,7 +106,7 @@ public class TestUAX29URLEmailTokenizer
private Analyzer emailAnalyzer = new ReusableAnalyzerBase() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
- UAX29URLEmailTokenizer tokenizer = new UAX29URLEmailTokenizer(reader);
+ UAX29URLEmailTokenizer tokenizer = new UAX29URLEmailTokenizer(TEST_VERSION_CURRENT,
reader);
TokenFilter filter = new EmailFilter(tokenizer);
return new TokenStreamComponents(tokenizer, filter);
}
@@ -412,7 +414,32 @@ public class TestUAX29URLEmailTokenizer
new String[] { "仮", "名", "遣", "い", "カタカナ"
},
new String[] { "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>",
"<HIRAGANA>", "<KATAKANA>" });
}
-
+
+ public void testCombiningMarks() throws Exception {
+ checkOneTerm(a, "さ\u3099", "さ\u3099"); // hiragana
+ checkOneTerm(a, "サ\u3099", "サ\u3099"); // katakana
+ checkOneTerm(a, "壹\u3099", "壹\u3099"); // ideographic
+ checkOneTerm(a, "아\u3099", "아\u3099"); // hangul
+ }
+
+ /** @deprecated remove this and sophisticated backwards layer in 5.0 */
+ @Deprecated
+ public void testCombiningMarksBackwards() throws Exception {
+ Analyzer a = new ReusableAnalyzerBase() {
+ @Override
+ protected TokenStreamComponents createComponents
+ (String fieldName, Reader reader) {
+
+ Tokenizer tokenizer = new UAX29URLEmailTokenizer(reader);
+ return new TokenStreamComponents(tokenizer);
+ }
+ };
+ checkOneTerm(a, "さ\u3099", "さ"); // hiragana Bug
+ checkOneTerm(a, "サ\u3099", "サ\u3099"); // katakana Works
+ checkOneTerm(a, "壹\u3099", "壹"); // ideographic Bug
+ checkOneTerm(a, "아\u3099", "아\u3099"); // hangul Works
+ }
+
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
Modified: lucene/dev/branches/branch_3x/solr/core/src/java/org/apache/solr/analysis/UAX29URLEmailTokenizerFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/core/src/java/org/apache/solr/analysis/UAX29URLEmailTokenizerFactory.java?rev=1154939&r1=1154938&r2=1154939&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/core/src/java/org/apache/solr/analysis/UAX29URLEmailTokenizerFactory.java
(original)
+++ lucene/dev/branches/branch_3x/solr/core/src/java/org/apache/solr/analysis/UAX29URLEmailTokenizerFactory.java
Mon Aug 8 12:14:22 2011
@@ -51,7 +51,7 @@ public class UAX29URLEmailTokenizerFacto
}
public UAX29URLEmailTokenizer create(Reader input) {
- UAX29URLEmailTokenizer tokenizer = new UAX29URLEmailTokenizer(input);
+ UAX29URLEmailTokenizer tokenizer = new UAX29URLEmailTokenizer(luceneMatchVersion, input);
tokenizer.setMaxTokenLength(maxTokenLength);
return tokenizer;
}
Modified: lucene/dev/branches/branch_3x/solr/core/src/test/org/apache/solr/analysis/TestUAX29URLEmailTokenizerFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/core/src/test/org/apache/solr/analysis/TestUAX29URLEmailTokenizerFactory.java?rev=1154939&r1=1154938&r2=1154939&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/core/src/test/org/apache/solr/analysis/TestUAX29URLEmailTokenizerFactory.java
(original)
+++ lucene/dev/branches/branch_3x/solr/core/src/test/org/apache/solr/analysis/TestUAX29URLEmailTokenizerFactory.java
Mon Aug 8 12:14:22 2011
@@ -19,6 +19,7 @@ package org.apache.solr.analysis;
import java.io.Reader;
import java.io.StringReader;
+import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
@@ -173,4 +174,22 @@ public class TestUAX29URLEmailTokenizerF
assertTokenStreamContents(stream,
new String[] {"one", "two", "three", longWord, "four", "five", "six" });
}
+
+ /** @deprecated nuke this test in lucene 5.0 */
+ @Deprecated
+ public void testMatchVersion() throws Exception {
+ Reader reader = new StringReader("さ\u3099");
+ UAX29URLEmailTokenizerFactory factory = new UAX29URLEmailTokenizerFactory();
+ factory.init(DEFAULT_VERSION_PARAM);
+ Tokenizer stream = factory.create(reader);
+ assertTokenStreamContents(stream,
+ new String[] {"さ\u3099"});
+
+ reader = new StringReader("さ\u3099");
+ factory = new UAX29URLEmailTokenizerFactory();
+ factory.init(Collections.singletonMap("luceneMatchVersion", "3.1"));
+ stream = factory.create(reader);
+ assertTokenStreamContents(stream,
+ new String[] {"さ"}); // old broken behavior
+ }
}
|