lucene-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From sar...@apache.org
Subject svn commit: r1043071 [5/5] - in /lucene/dev/trunk: modules/analysis/ modules/analysis/common/ modules/analysis/common/src/java/org/apache/lucene/analysis/standard/ modules/analysis/common/src/test/org/apache/lucene/analysis/core/ modules/analysis/commo...
Date Tue, 07 Dec 2010 14:53:14 GMT
Copied: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.jflex
(from r1042243, lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29Tokenizer.jflex)
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.jflex?p2=lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.jflex&p1=lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29Tokenizer.jflex&r1=1042243&r2=1043071&rev=1043071&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29Tokenizer.jflex
(original)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.jflex
Tue Dec  7 14:53:13 2010
@@ -32,11 +32,14 @@ import org.apache.lucene.util.AttributeS
  * This class implements Word Break rules from the Unicode Text Segmentation 
  * algorithm, as specified in 
  * <a href="http://unicode.org/reports/tr29/">Unicode Standard Annex #29</a>

+ * URLs and email addresses are also tokenized according to the relevant RFCs.
  * <p/>
  * Tokens produced are of the following types:
  * <ul>
  *   <li>&lt;ALPHANUM&gt;: A sequence of alphabetic and numeric characters</li>
  *   <li>&lt;NUM&gt;: A number</li>
+ *   <li>&lt;URL&gt;: A URL</li>
+ *   <li>&lt;EMAIL&gt;: An email address</li>
  *   <li>&lt;SOUTHEAST_ASIAN&gt;: A sequence of characters from South and Southeast
  *       Asian languages, including Thai, Lao, Myanmar, and Khmer</li>
  *   <li>&lt;IDEOGRAPHIC&gt;: A single CJKV ideographic character</li>
@@ -57,7 +60,7 @@ import org.apache.lucene.util.AttributeS
 %final
 %public
 %apiprivate
-%class UAX29Tokenizer
+%class UAX29URLEmailTokenizer
 %extends Tokenizer
 %type boolean
 %function getNextToken
@@ -67,7 +70,7 @@ import org.apache.lucene.util.AttributeS
   super(in);
 %init}
 
-// WB4. X (Extend | Format)* --> X
+// UAX#29 WB4. X (Extend | Format)* --> X
 //
 ALetterEx      = \p{WB:ALetter}                     [\p{WB:Format}\p{WB:Extend}]*
 // TODO: Convert hard-coded full-width numeric range to property intersection (something
like [\p{Full-Width}&&\p{Numeric}]) once JFlex supports it
@@ -77,6 +80,85 @@ MidLetterEx    = [\p{WB:MidLetter}\p{WB:
 MidNumericEx   = [\p{WB:MidNum}\p{WB:MidNumLet}]    [\p{WB:Format}\p{WB:Extend}]*
 ExtendNumLetEx = \p{WB:ExtendNumLet}                [\p{WB:Format}\p{WB:Extend}]*
 
+
+// URL and E-mail syntax specifications:
+//
+//     RFC-952:  DOD INTERNET HOST TABLE SPECIFICATION
+//     RFC-1035: DOMAIN NAMES - IMPLEMENTATION AND SPECIFICATION
+//     RFC-1123: Requirements for Internet Hosts - Application and Support
+//     RFC-1738: Uniform Resource Locators (URL)
+//     RFC-3986: Uniform Resource Identifier (URI): Generic Syntax
+//     RFC-5234: Augmented BNF for Syntax Specifications: ABNF
+//     RFC-5321: Simple Mail Transfer Protocol
+//     RFC-5322: Internet Message Format
+
+%include src/java/org/apache/lucene/analysis/standard/ASCIITLD.jflex-macro
+
+DomainLabel = [A-Za-z0-9] ([-A-Za-z0-9]* [A-Za-z0-9])?
+DomainNameStrict = {DomainLabel} ("." {DomainLabel})* {ASCIITLD}
+DomainNameLoose  = {DomainLabel} ("." {DomainLabel})*
+
+IPv4DecimalOctet = "0"{0,2} [0-9] | "0"? [1-9][0-9] | "1" [0-9][0-9] | "2" ([0-4][0-9] |
"5" [0-5])
+IPv4Address  = {IPv4DecimalOctet} ("." {IPv4DecimalOctet}){3} 
+IPv6Hex16Bit = [0-9A-Fa-f]{1,4}
+IPv6LeastSignificant32Bits = {IPv4Address} | ({IPv6Hex16Bit} ":" {IPv6Hex16Bit})
+IPv6Address =                                                  ({IPv6Hex16Bit} ":"){6} {IPv6LeastSignificant32Bits}
+            |                                             "::" ({IPv6Hex16Bit} ":"){5} {IPv6LeastSignificant32Bits}
+            |                            {IPv6Hex16Bit}?  "::" ({IPv6Hex16Bit} ":"){4} {IPv6LeastSignificant32Bits}
+            | (({IPv6Hex16Bit} ":"){0,1} {IPv6Hex16Bit})? "::" ({IPv6Hex16Bit} ":"){3} {IPv6LeastSignificant32Bits}
+            | (({IPv6Hex16Bit} ":"){0,2} {IPv6Hex16Bit})? "::" ({IPv6Hex16Bit} ":"){2} {IPv6LeastSignificant32Bits}
+            | (({IPv6Hex16Bit} ":"){0,3} {IPv6Hex16Bit})? "::"  {IPv6Hex16Bit} ":"     {IPv6LeastSignificant32Bits}
+            | (({IPv6Hex16Bit} ":"){0,4} {IPv6Hex16Bit})? "::"                         {IPv6LeastSignificant32Bits}
+            | (({IPv6Hex16Bit} ":"){0,5} {IPv6Hex16Bit})? "::"                         {IPv6Hex16Bit}
+            | (({IPv6Hex16Bit} ":"){0,6} {IPv6Hex16Bit})? "::"
+
+URIunreserved = [-._~A-Za-z0-9]
+URIpercentEncoded = "%" [0-9A-Fa-f]{2}
+URIsubDelims = [!$&'()*+,;=]
+URIloginSegment = ({URIunreserved} | {URIpercentEncoded} | {URIsubDelims})*
+URIlogin = {URIloginSegment} (":" {URIloginSegment})? "@"
+URIquery    = "?" ({URIunreserved} | {URIpercentEncoded} | {URIsubDelims} | [:@/?])*
+URIfragment = "#" ({URIunreserved} | {URIpercentEncoded} | {URIsubDelims} | [:@/?])*
+URIport = ":" [0-9]{1,5}
+URIhostStrict = ("[" {IPv6Address} "]") | {IPv4Address} | {DomainNameStrict}  
+URIhostLoose  = ("[" {IPv6Address} "]") | {IPv4Address} | {DomainNameLoose} 
+
+URIauthorityStrict =             {URIhostStrict} {URIport}?
+URIauthorityLoose  = {URIlogin}? {URIhostLoose}  {URIport}?
+
+HTTPsegment = ({URIunreserved} | {URIpercentEncoded} | [;:@&=])*
+HTTPpath = ("/" {HTTPsegment})*
+HTTPscheme = [hH][tT][tT][pP][sS]? "://"
+HTTPurlFull = {HTTPscheme} {URIauthorityLoose}  {HTTPpath}? {URIquery}? {URIfragment}?
+// {HTTPurlNoScheme} excludes {URIlogin}, because it could otherwise accept e-mail addresses
+HTTPurlNoScheme =          {URIauthorityStrict} {HTTPpath}? {URIquery}? {URIfragment}?
+HTTPurl = {HTTPurlFull} | {HTTPurlNoScheme}
+
+FTPorFILEsegment = ({URIunreserved} | {URIpercentEncoded} | [?:@&=])*
+FTPorFILEpath = "/" {FTPorFILEsegment} ("/" {FTPorFILEsegment})*
+FTPtype = ";" [tT][yY][pP][eE] "=" [aAiIdD]
+FTPscheme = [fF][tT][pP] "://"
+FTPurl = {FTPscheme} {URIauthorityLoose} {FTPorFILEpath} {FTPtype}? {URIfragment}?
+
+FILEscheme = [fF][iI][lL][eE] "://"
+FILEurl = {FILEscheme} {URIhostLoose}? {FTPorFILEpath} {URIfragment}?
+
+URL = {HTTPurl} | {FTPurl} | {FILEurl}
+
+EMAILquotedString = [\"] ([\u0001-\u0008\u000B\u000C\u000E-\u0021\u0023-\u005B\u005D-\u007E]
| [\\] [\u0000-\u007F])* [\"]
+EMAILatomText = [A-Za-z0-9!#$%&'*+\-/=?\^_`{|}~]
+EMAILlabel = {EMAILatomText}+ | {EMAILquotedString}
+EMAILlocalPart = {EMAILlabel} ("." {EMAILlabel})*
+EMAILdomainLiteralText = [\u0001-\u0008\u000B\u000C\u000E-\u005A\u005E-\u007F] | [\\] [\u0000-\u007F]
+// DFA minimization allows {IPv6Address} and {IPv4Address} to be included 
+// in the {EMAILbracketedHost} definition without incurring any size penalties, 
+// since {EMAILdomainLiteralText} recognizes all valid IP addresses.
+// The IP address regexes are included in {EMAILbracketedHost} simply as a 
+// reminder that they are acceptable bracketed host forms.
+EMAILbracketedHost = "[" ({EMAILdomainLiteralText}* | {IPv4Address} | [iI][pP][vV] "6:" {IPv6Address})
"]"
+EMAIL = {EMAILlocalPart} "@" ({DomainNameStrict} | {EMAILbracketedHost})
+
+
 %{
   /** Alphanumeric sequences */
   public static final String WORD_TYPE = "<ALPHANUM>";
@@ -84,6 +166,12 @@ ExtendNumLetEx = \p{WB:ExtendNumLet}    
   /** Numbers */
   public static final String NUMERIC_TYPE = "<NUM>";
   
+  /** URLs with scheme: HTTP(S), FTP, or FILE; no-scheme URLs match HTTP syntax */
+  public static final String URL_TYPE = "<URL>";
+  
+  /** E-mail addresses */
+  public static final String EMAIL_TYPE = "<EMAIL>";
+  
   /**
    * Chars in class \p{Line_Break = Complex_Context} are from South East Asian
    * scripts (Thai, Lao, Myanmar, Khmer, etc.).  Sequences of these are kept 
@@ -112,7 +200,7 @@ ExtendNumLetEx = \p{WB:ExtendNumLet}    
    * @param source The AttributeSource to use
    * @param input The input reader
    */
-  public UAX29Tokenizer(AttributeSource source, Reader input) {
+  public UAX29URLEmailTokenizer(AttributeSource source, Reader input) {
     super(source, input);
     zzReader = input;
   }
@@ -121,7 +209,7 @@ ExtendNumLetEx = \p{WB:ExtendNumLet}    
    * @param factory The AttributeFactory to use
    * @param input The input reader
    */
-  public UAX29Tokenizer(AttributeFactory factory, Reader input) {
+  public UAX29URLEmailTokenizer(AttributeFactory factory, Reader input) {
     super(factory, input); 
     zzReader = input;
   }
@@ -201,17 +289,19 @@ ExtendNumLetEx = \p{WB:ExtendNumLet}    
 
 %%
 
-// WB1. 	sot 	÷ 	
-// WB2. 		÷ 	eot
+// UAX#29 WB1. 	sot 	÷ 	
+//        WB2. 		÷ 	eot
 //
 <<EOF>> { return false; }
 
+{URL}   { if (populateAttributes(URL_TYPE)) return true; }
+{EMAIL} { if (populateAttributes(EMAIL_TYPE)) return true; }
 
-// WB8.   Numeric × Numeric
-// WB11.  Numeric (MidNum | MidNumLet) × Numeric
-// WB12.  Numeric × (MidNum | MidNumLet) Numeric
-// WB13a. (ALetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
-// WB13b. ExtendNumLet × (ALetter | Numeric | Katakana)
+// UAX#29 WB8.   Numeric × Numeric
+//        WB11.  Numeric (MidNum | MidNumLet) × Numeric
+//        WB12.  Numeric × (MidNum | MidNumLet) Numeric
+//        WB13a. (ALetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
+//        WB13b. ExtendNumLet × (ALetter | Numeric | Katakana)
 //
 {ExtendNumLetEx}* {NumericEx} ({ExtendNumLetEx}+ {NumericEx} 
                               | {MidNumericEx} {NumericEx} 
@@ -220,14 +310,14 @@ ExtendNumLetEx = \p{WB:ExtendNumLet}    
   { if (populateAttributes(NUMERIC_TYPE)) return true; }
 
 
-// WB5.   ALetter × ALetter
-// WB6.   ALetter × (MidLetter | MidNumLet) ALetter
-// WB7.   ALetter (MidLetter | MidNumLet) × ALetter
-// WB9.   ALetter × Numeric
-// WB10.  Numeric × ALetter
-// WB13.  Katakana × Katakana
-// WB13a. (ALetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
-// WB13b. ExtendNumLet × (ALetter | Numeric | Katakana)
+// UAX#29 WB5.   ALetter × ALetter
+//        WB6.   ALetter × (MidLetter | MidNumLet) ALetter
+//        WB7.   ALetter (MidLetter | MidNumLet) × ALetter
+//        WB9.   ALetter × Numeric
+//        WB10.  Numeric × ALetter
+//        WB13.  Katakana × Katakana
+//        WB13a. (ALetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
+//        WB13b. ExtendNumLet × (ALetter | Numeric | Katakana)
 //
 {ExtendNumLetEx}*  ( {KatakanaEx} ({ExtendNumLetEx}* {KatakanaEx})* 
                    | ( {NumericEx}  ({ExtendNumLetEx}+ {NumericEx} | {MidNumericEx} {NumericEx}
| {NumericEx})*
@@ -260,15 +350,15 @@ ExtendNumLetEx = \p{WB:ExtendNumLet}    
 //
 \p{LB:Complex_Context}+ { if (populateAttributes(SOUTH_EAST_ASIAN_TYPE)) return true; }
 
-// WB14.  Any ÷ Any
+// UAX#29 WB14.  Any ÷ Any
 //
 \p{Script:Han} { if (populateAttributes(IDEOGRAPHIC_TYPE)) return true; }
 \p{Script:Hiragana} { if (populateAttributes(HIRAGANA_TYPE)) return true; }
 
 
-// WB3.   CR × LF
-// WB3a.  (Newline | CR | LF) ÷
-// WB3b.  ÷ (Newline | CR | LF)
-// WB14.  Any ÷ Any
+// UAX#29 WB3.   CR × LF
+//        WB3a.  (Newline | CR | LF) ÷
+//        WB3b.  ÷ (Newline | CR | LF)
+//        WB14.  Any ÷ Any
 //
 [^] { /* Not numeric, word, ideographic, hiragana, or SE Asian -- ignore it. */ }

Modified: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/package.html
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/package.html?rev=1043071&r1=1043070&r2=1043071&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/package.html
(original)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/package.html
Tue Dec  7 14:53:13 2010
@@ -27,7 +27,10 @@
         as of Lucene 3.1, implements the Word Break rules from the Unicode Text 
         Segmentation algorithm, as specified in 
         <a href="http://unicode.org/reports/tr29/">Unicode Standard Annex #29</a>.
-        URLs and email addresses are also tokenized according to the relevant RFCs.
+        Unlike <code>UAX29URLEmailTokenizer</code>, URLs and email addresses
are
+        <b>not</b> tokenized as single tokens, but are instead split up into

+        tokens according to the UAX#29 word break rules.
+        <br/>
         <code><a href="StandardAnalyzer">StandardAnalyzer</a></code>
includes
         <code>StandardTokenizer</code>, 
         <code><a href="StandardFilter">StandardFilter</a></code>,

@@ -46,13 +49,11 @@
         <code><a href="../../../../../../all/org/apache/lucene/analysis/LowerCaseFilter.html">LowerCaseFilter</a></code>
         and <code><a href="../../../../../../all/org/apache/lucene/analysis/StopFilter.html">StopFilter</a></code>.
     </li>
-    <li><code><a href="UAX29Tokenizer.html">UAX29Tokenizer</a></code>:

-        implements the Word Break rules from the Unicode Text Segmentation 
-        algorithm, as specified in
+    <li><code><a href="UAX29URLEmailTokenizer.html">UAX29URLEmailTokenizer</a></code>:

+        implements the Word Break rules from the Unicode Text Segmentation
+        algorithm, as specified in 
         <a href="http://unicode.org/reports/tr29/">Unicode Standard Annex #29</a>.
-        Unlike <code>StandardTokenizer</code>, URLs and email addresses are
-        <b>not</b> tokenized as single tokens, but are instead split up into

-        tokens according to the UAX#29 word break rules.
+        URLs and email addresses are also tokenized according to the relevant RFCs.
     </li>
 </ul>
 </body>

Modified: lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestStandardAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestStandardAnalyzer.java?rev=1043071&r1=1043070&r2=1043071&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestStandardAnalyzer.java
(original)
+++ lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestStandardAnalyzer.java
Tue Dec  7 14:53:13 2010
@@ -2,21 +2,14 @@ package org.apache.lucene.analysis.core;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
-import org.apache.lucene.analysis.TokenFilter;
-import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.standard.StandardTokenizer;
-import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
 import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
 
-import java.io.BufferedReader;
 import java.io.IOException;
-import java.io.InputStreamReader;
 import java.io.Reader;
 import java.io.StringReader;
-import java.util.ArrayList;
 import java.util.Arrays;
-import java.util.List;
 
 /**
  * Licensed to the Apache Software Foundation (ASF) under one or more
@@ -58,63 +51,6 @@ public class TestStandardAnalyzer extend
     }
   };
 
-  /** Passes through tokens with type "<URL>" and blocks all other types. */
-  private class URLFilter extends TokenFilter {
-    private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
-    public URLFilter(TokenStream in) {
-      super(in);
-    }
-    @Override
-    public final boolean incrementToken() throws java.io.IOException {
-      boolean isTokenAvailable = false;
-      while (input.incrementToken()) {
-        if (typeAtt.type() == StandardTokenizer.TOKEN_TYPES[StandardTokenizer.URL]) {
-          isTokenAvailable = true;
-          break;
-        }
-      }
-      return isTokenAvailable;
-    }
-  }
-  
-  /** Passes through tokens with type "<EMAIL>" and blocks all other types. */
-  private class EmailFilter extends TokenFilter {
-    private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
-    public EmailFilter(TokenStream in) {
-      super(in);
-    }
-    @Override
-    public final boolean incrementToken() throws java.io.IOException {
-      boolean isTokenAvailable = false;
-      while (input.incrementToken()) {
-        if (typeAtt.type() == StandardTokenizer.TOKEN_TYPES[StandardTokenizer.EMAIL]) {
-          isTokenAvailable = true;
-          break;
-        }
-      }
-      return isTokenAvailable;
-    }
-  }
-
-  private Analyzer urlAnalyzer = new ReusableAnalyzerBase() {
-    @Override
-    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
-      StandardTokenizer tokenizer = new StandardTokenizer(TEST_VERSION_CURRENT, reader);
-      tokenizer.setMaxTokenLength(Integer.MAX_VALUE);  // Tokenize arbitrary length URLs
-      TokenFilter filter = new URLFilter(tokenizer);
-      return new TokenStreamComponents(tokenizer, filter);
-    }
-  };
-
-  private Analyzer emailAnalyzer = new ReusableAnalyzerBase() {
-    @Override
-    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
-      Tokenizer tokenizer = new StandardTokenizer(TEST_VERSION_CURRENT, reader);
-      TokenFilter filter = new EmailFilter(tokenizer);
-      return new TokenStreamComponents(tokenizer, filter);
-    }
-  };
-
   public void testArmenian() throws Exception {
     BaseTokenStreamTestCase.assertAnalyzesTo(a, "Վիքիպեդիայի
13 միլիոն հոդվածները (4,600` հայերեն
վիքիպեդիայում) գրվել են կամավորների
կողմից ու համարյա բոլոր հոդվածները
կարող է խմբագրել ցանկաց մարդ
ով կարող է բացել Վիքիպեդիայի
կայքը։",
         new String[] { "Վիքիպեդիայի", "13", "միլիոն",
"հոդվածները", "4,600", "հայերեն", "վիքիպեդիայում",
"գրվել", "են", "կամավորների", "կողմից",

@@ -261,138 +197,6 @@ public class TestStandardAnalyzer extend
         new String[] { "<ALPHANUM>", "<ALPHANUM>", "<NUM>", "<ALPHANUM>"
});
   }
   
-  public void testWikiURLs() throws Exception {
-    Reader reader = null;
-    String luceneResourcesWikiPage;
-    try {
-      reader = new InputStreamReader
-        (getClass().getResourceAsStream("LuceneResourcesWikiPage.html"), "UTF-8");
-      StringBuilder builder = new StringBuilder();
-      char[] buffer = new char[1024];
-      int numCharsRead;
-      while (-1 != (numCharsRead = reader.read(buffer))) {
-        builder.append(buffer, 0, numCharsRead);
-      }
-      luceneResourcesWikiPage = builder.toString(); 
-    } finally {
-      if (null != reader) {
-        reader.close();
-      }
-    }
-    assertTrue(null != luceneResourcesWikiPage 
-               && luceneResourcesWikiPage.length() > 0);
-    BufferedReader bufferedReader = null;
-    String[] urls;
-    try {
-      List<String> urlList = new ArrayList<String>();
-      bufferedReader = new BufferedReader(new InputStreamReader
-        (getClass().getResourceAsStream("LuceneResourcesWikiPageURLs.txt"), "UTF-8"));
-      String line;
-      while (null != (line = bufferedReader.readLine())) {
-        line = line.trim();
-        if (line.length() > 0) {
-          urlList.add(line);
-        }
-      }
-      urls = urlList.toArray(new String[urlList.size()]);
-    } finally {
-      if (null != bufferedReader) {
-        bufferedReader.close();
-      }
-    }
-    assertTrue(null != urls && urls.length > 0);
-    BaseTokenStreamTestCase.assertAnalyzesTo
-      (urlAnalyzer, luceneResourcesWikiPage, urls);
-  }
-  
-  public void testEmails() throws Exception {
-    Reader reader = null;
-    String randomTextWithEmails;
-    try {
-      reader = new InputStreamReader
-        (getClass().getResourceAsStream("random.text.with.email.addresses.txt"), "UTF-8");
-      StringBuilder builder = new StringBuilder();
-      char[] buffer = new char[1024];
-      int numCharsRead;
-      while (-1 != (numCharsRead = reader.read(buffer))) {
-        builder.append(buffer, 0, numCharsRead);
-      }
-      randomTextWithEmails = builder.toString(); 
-    } finally {
-      if (null != reader) {
-        reader.close();
-      }
-    }
-    assertTrue(null != randomTextWithEmails 
-               && randomTextWithEmails.length() > 0);
-    BufferedReader bufferedReader = null;
-    String[] emails;
-    try {
-      List<String> emailList = new ArrayList<String>();
-      bufferedReader = new BufferedReader(new InputStreamReader
-        (getClass().getResourceAsStream("email.addresses.from.random.text.with.email.addresses.txt"),
"UTF-8"));
-      String line;
-      while (null != (line = bufferedReader.readLine())) {
-        line = line.trim();
-        if (line.length() > 0) {
-          emailList.add(line);
-        }
-      }
-      emails = emailList.toArray(new String[emailList.size()]);
-    } finally {
-      if (null != bufferedReader) {
-        bufferedReader.close();
-      }
-    }
-    assertTrue(null != emails && emails.length > 0);
-    BaseTokenStreamTestCase.assertAnalyzesTo
-      (emailAnalyzer, randomTextWithEmails, emails);
-  }
-
-  public void testURLs() throws Exception {
-    Reader reader = null;
-    String randomTextWithURLs;
-    try {
-      reader = new InputStreamReader
-        (getClass().getResourceAsStream("random.text.with.urls.txt"), "UTF-8");
-      StringBuilder builder = new StringBuilder();
-      char[] buffer = new char[1024];
-      int numCharsRead;
-      while (-1 != (numCharsRead = reader.read(buffer))) {
-        builder.append(buffer, 0, numCharsRead);
-      }
-      randomTextWithURLs = builder.toString(); 
-    } finally {
-      if (null != reader) {
-        reader.close();
-      }
-    }
-    assertTrue(null != randomTextWithURLs 
-               && randomTextWithURLs.length() > 0);
-    BufferedReader bufferedReader = null;
-    String[] urls;
-    try {
-      List<String> urlList = new ArrayList<String>();
-      bufferedReader = new BufferedReader(new InputStreamReader
-        (getClass().getResourceAsStream("urls.from.random.text.with.urls.txt"), "UTF-8"));
-      String line;
-      while (null != (line = bufferedReader.readLine())) {
-        line = line.trim();
-        if (line.length() > 0) {
-          urlList.add(line);
-        }
-      }
-      urls = urlList.toArray(new String[urlList.size()]);
-    } finally {
-      if (null != bufferedReader) {
-        bufferedReader.close();
-      }
-    }
-    assertTrue(null != urls && urls.length > 0);
-    BaseTokenStreamTestCase.assertAnalyzesTo
-      (urlAnalyzer, randomTextWithURLs, urls);
-  }
-
   public void testUnicodeWordBreaks() throws Exception {
     WordBreakTestUnicode_6_0_0 wordBreakTest = new WordBreakTestUnicode_6_0_0();
     wordBreakTest.test(a);

Copied: lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestUAX29URLEmailTokenizer.java
(from r1042261, lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestUAX29Tokenizer.java)
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestUAX29URLEmailTokenizer.java?p2=lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestUAX29URLEmailTokenizer.java&p1=lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestUAX29Tokenizer.java&r1=1042261&r2=1043071&rev=1043071&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestUAX29Tokenizer.java
(original)
+++ lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestUAX29URLEmailTokenizer.java
Tue Dec  7 14:53:13 2010
@@ -2,14 +2,21 @@ package org.apache.lucene.analysis.core;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.analysis.standard.UAX29Tokenizer;
+import org.apache.lucene.analysis.standard.UAX29URLEmailTokenizer;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
 import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
 
+import java.io.BufferedReader;
 import java.io.IOException;
+import java.io.InputStreamReader;
 import java.io.Reader;
 import java.io.StringReader;
+import java.util.ArrayList;
 import java.util.Arrays;
+import java.util.List;
 
 /**
  * Licensed to the Apache Software Foundation (ASF) under one or more
@@ -28,7 +35,7 @@ import java.util.Arrays;
  * limitations under the License.
  */
 
-public class TestUAX29Tokenizer extends BaseTokenStreamTestCase {
+public class TestUAX29URLEmailTokenizer extends BaseTokenStreamTestCase {
   
   public void testHugeDoc() throws IOException {
     StringBuilder sb = new StringBuilder();
@@ -37,7 +44,7 @@ public class TestUAX29Tokenizer extends 
     sb.append(whitespace);
     sb.append("testing 1234");
     String input = sb.toString();
-    UAX29Tokenizer tokenizer = new UAX29Tokenizer(new StringReader(input));
+    UAX29URLEmailTokenizer tokenizer = new UAX29URLEmailTokenizer(new StringReader(input));
     BaseTokenStreamTestCase.assertTokenStreamContents(tokenizer, new String[] { "testing",
"1234" });
   }
 
@@ -46,11 +53,70 @@ public class TestUAX29Tokenizer extends 
     protected TokenStreamComponents createComponents
       (String fieldName, Reader reader) {
 
-      Tokenizer tokenizer = new UAX29Tokenizer(reader);
+      Tokenizer tokenizer = new UAX29URLEmailTokenizer(reader);
       return new TokenStreamComponents(tokenizer);
     }
   };
 
+
+  /** Passes through tokens with type "<URL>" and blocks all other types. */
+  private class URLFilter extends TokenFilter {
+    private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
+    public URLFilter(TokenStream in) {
+      super(in);
+    }
+    @Override
+    public final boolean incrementToken() throws java.io.IOException {
+      boolean isTokenAvailable = false;
+      while (input.incrementToken()) {
+        if (typeAtt.type() == UAX29URLEmailTokenizer.URL_TYPE) {
+          isTokenAvailable = true;
+          break;
+        }
+      }
+      return isTokenAvailable;
+    }
+  }
+  
+  /** Passes through tokens with type "<EMAIL>" and blocks all other types. */
+  private class EmailFilter extends TokenFilter {
+    private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
+    public EmailFilter(TokenStream in) {
+      super(in);
+    }
+    @Override
+    public final boolean incrementToken() throws java.io.IOException {
+      boolean isTokenAvailable = false;
+      while (input.incrementToken()) {
+        if (typeAtt.type() == UAX29URLEmailTokenizer.EMAIL_TYPE) {
+          isTokenAvailable = true;
+          break;
+        }
+      }
+      return isTokenAvailable;
+    }
+  }
+
+  private Analyzer urlAnalyzer = new ReusableAnalyzerBase() {
+    @Override
+    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+      UAX29URLEmailTokenizer tokenizer = new UAX29URLEmailTokenizer(reader);
+      tokenizer.setMaxTokenLength(Integer.MAX_VALUE);  // Tokenize arbitrary length URLs
+      TokenFilter filter = new URLFilter(tokenizer);
+      return new TokenStreamComponents(tokenizer, filter);
+    }
+  };
+
+  private Analyzer emailAnalyzer = new ReusableAnalyzerBase() {
+    @Override
+    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+      UAX29URLEmailTokenizer tokenizer = new UAX29URLEmailTokenizer(reader);
+      TokenFilter filter = new EmailFilter(tokenizer);
+      return new TokenStreamComponents(tokenizer, filter);
+    }
+  };
+  
+  
   public void testArmenian() throws Exception {
     BaseTokenStreamTestCase.assertAnalyzesTo(a, "Վիքիպեդիայի
13 միլիոն հոդվածները (4,600` հայերեն
վիքիպեդիայում) գրվել են կամավորների
կողմից ու համարյա բոլոր հոդվածները
կարող է խմբագրել ցանկաց մարդ
ով կարող է բացել Վիքիպեդիայի
կայքը։",
         new String[] { "Վիքիպեդիայի", "13", "միլիոն",
"հոդվածները", "4,600", "հայերեն", "վիքիպեդիայում",
"գրվել", "են", "կամավորների", "կողմից",

@@ -163,7 +229,6 @@ public class TestUAX29Tokenizer extends 
     BaseTokenStreamTestCase.assertAnalyzesTo(a, "21.35", new String[]{"21.35"});
     BaseTokenStreamTestCase.assertAnalyzesTo(a, "R2D2 C3PO", new String[]{"R2D2", "C3PO"});
     BaseTokenStreamTestCase.assertAnalyzesTo(a, "216.239.63.104", new String[]{"216.239.63.104"});
-    BaseTokenStreamTestCase.assertAnalyzesTo(a, "216.239.63.104", new String[]{"216.239.63.104"});
   }
 
   public void testTextWithNumbersSA() throws Exception {
@@ -197,6 +262,140 @@ public class TestUAX29Tokenizer extends 
         new String[] { "<ALPHANUM>", "<ALPHANUM>", "<NUM>", "<ALPHANUM>"
});
   }
   
+  public void testWikiURLs() throws Exception {
+    Reader reader = null;
+    String luceneResourcesWikiPage;
+    try {
+      reader = new InputStreamReader(getClass().getResourceAsStream
+        ("LuceneResourcesWikiPage.html"), "UTF-8");
+      StringBuilder builder = new StringBuilder();
+      char[] buffer = new char[1024];
+      int numCharsRead;
+      while (-1 != (numCharsRead = reader.read(buffer))) {
+        builder.append(buffer, 0, numCharsRead);
+      }
+      luceneResourcesWikiPage = builder.toString(); 
+    } finally {
+      if (null != reader) {
+        reader.close();
+      }
+    }
+    assertTrue(null != luceneResourcesWikiPage 
+               && luceneResourcesWikiPage.length() > 0);
+    BufferedReader bufferedReader = null;
+    String[] urls;
+    try {
+      List<String> urlList = new ArrayList<String>();
+      bufferedReader = new BufferedReader(new InputStreamReader
+        (getClass().getResourceAsStream("LuceneResourcesWikiPageURLs.txt"), "UTF-8"));
+      String line;
+      while (null != (line = bufferedReader.readLine())) {
+        line = line.trim();
+        if (line.length() > 0) {
+          urlList.add(line);
+        }
+      }
+      urls = urlList.toArray(new String[urlList.size()]);
+    } finally {
+      if (null != bufferedReader) {
+        bufferedReader.close();
+      }
+    }
+    assertTrue(null != urls && urls.length > 0);
+    BaseTokenStreamTestCase.assertAnalyzesTo
+      (urlAnalyzer, luceneResourcesWikiPage, urls);
+  }
+  
+  public void testEmails() throws Exception {
+    Reader reader = null;
+    String randomTextWithEmails;
+    try {
+      reader = new InputStreamReader(getClass().getResourceAsStream
+        ("random.text.with.email.addresses.txt"), "UTF-8");
+      StringBuilder builder = new StringBuilder();
+      char[] buffer = new char[1024];
+      int numCharsRead;
+      while (-1 != (numCharsRead = reader.read(buffer))) {
+        builder.append(buffer, 0, numCharsRead);
+      }
+      randomTextWithEmails = builder.toString(); 
+    } finally {
+      if (null != reader) {
+        reader.close();
+      }
+    }
+    assertTrue(null != randomTextWithEmails 
+               && randomTextWithEmails.length() > 0);
+    BufferedReader bufferedReader = null;
+    String[] emails;
+    try {
+      List<String> emailList = new ArrayList<String>();
+      bufferedReader = new BufferedReader(new InputStreamReader
+        (getClass().getResourceAsStream
+          ("email.addresses.from.random.text.with.email.addresses.txt"), "UTF-8"));
+      String line;
+      while (null != (line = bufferedReader.readLine())) {
+        line = line.trim();
+        if (line.length() > 0) {
+          emailList.add(line);
+        }
+      }
+      emails = emailList.toArray(new String[emailList.size()]);
+    } finally {
+      if (null != bufferedReader) {
+        bufferedReader.close();
+      }
+    }
+    assertTrue(null != emails && emails.length > 0);
+    BaseTokenStreamTestCase.assertAnalyzesTo
+      (emailAnalyzer, randomTextWithEmails, emails);
+  }
+
+  public void testURLs() throws Exception {
+    Reader reader = null;
+    String randomTextWithURLs;
+    try {
+      reader = new InputStreamReader(getClass().getResourceAsStream
+        ("random.text.with.urls.txt"), "UTF-8");
+      StringBuilder builder = new StringBuilder();
+      char[] buffer = new char[1024];
+      int numCharsRead;
+      while (-1 != (numCharsRead = reader.read(buffer))) {
+        builder.append(buffer, 0, numCharsRead);
+      }
+      randomTextWithURLs = builder.toString(); 
+    } finally {
+      if (null != reader) {
+        reader.close();
+      }
+    }
+    assertTrue(null != randomTextWithURLs 
+               && randomTextWithURLs.length() > 0);
+    BufferedReader bufferedReader = null;
+    String[] urls;
+    try {
+      List<String> urlList = new ArrayList<String>();
+      bufferedReader = new BufferedReader(new InputStreamReader
+        (getClass().getResourceAsStream
+          ("urls.from.random.text.with.urls.txt"), "UTF-8"));
+      String line;
+      while (null != (line = bufferedReader.readLine())) {
+        line = line.trim();
+        if (line.length() > 0) {
+          urlList.add(line);
+        }
+      }
+      urls = urlList.toArray(new String[urlList.size()]);
+    } finally {
+      if (null != bufferedReader) {
+        bufferedReader.close();
+      }
+    }
+    assertTrue(null != urls && urls.length > 0);
+    BaseTokenStreamTestCase.assertAnalyzesTo
+      (urlAnalyzer, randomTextWithURLs, urls);
+  }
+
   public void testUnicodeWordBreaks() throws Exception {
     WordBreakTestUnicode_6_0_0 wordBreakTest = new WordBreakTestUnicode_6_0_0();
     wordBreakTest.test(a);

Modified: lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/th/TestThaiAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/th/TestThaiAnalyzer.java?rev=1043071&r1=1043070&r2=1043071&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/th/TestThaiAnalyzer.java
(original)
+++ lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/th/TestThaiAnalyzer.java
Tue Dec  7 14:53:13 2010
@@ -123,7 +123,7 @@ public class TestThaiAnalyzer extends Ba
       assertAnalyzesToReuse(
           analyzer,
           "บริษัทชื่อ XY&Z - คุยกับ
xyz@demo.com",
-          new String[] { "บริษัท", "ชื่อ",
"xy", "z", "คุย", "กับ", "xyz@demo.com" });
+          new String[] { "บริษัท", "ชื่อ",
"xy", "z", "คุย", "กับ", "xyz", "demo.com" });
 	}
 	
 	/** @deprecated (3.1) for version back compat */

Modified: lucene/dev/trunk/solr/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/CHANGES.txt?rev=1043071&r1=1043070&r2=1043071&view=diff
==============================================================================
--- lucene/dev/trunk/solr/CHANGES.txt (original)
+++ lucene/dev/trunk/solr/CHANGES.txt Tue Dec  7 14:53:13 2010
@@ -302,8 +302,10 @@ New Features
 * SOLR-1336: Add SmartChinese (word segmentation for Simplified Chinese) 
   tokenizer and filters to contrib/analysis-extras (rmuir)
 
-* SOLR-2211: Added UAX29TokenizerFactory, which implements UAX#29, a unicode algorithm 
-  with good results for most languages.  (Tom Burton-West via rmuir)
+* SOLR-2211,LUCENE-2763: Added UAX29URLEmailTokenizerFactory, which implements
+  UAX#29, a unicode algorithm with good results for most languages, as well as
+  URL and E-mail tokenization according to the relevant RFCs.
+  (Tom Burton-West via rmuir)
 
 * SOLR-2237: Added StempelPolishStemFilterFactory to contrib/analysis-extras (rmuir)
 

Copied: lucene/dev/trunk/solr/src/java/org/apache/solr/analysis/UAX29URLEmailTokenizerFactory.java
(from r1042243, lucene/dev/trunk/solr/src/java/org/apache/solr/analysis/UAX29TokenizerFactory.java)
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/src/java/org/apache/solr/analysis/UAX29URLEmailTokenizerFactory.java?p2=lucene/dev/trunk/solr/src/java/org/apache/solr/analysis/UAX29URLEmailTokenizerFactory.java&p1=lucene/dev/trunk/solr/src/java/org/apache/solr/analysis/UAX29TokenizerFactory.java&r1=1042243&r2=1043071&rev=1043071&view=diff
==============================================================================
--- lucene/dev/trunk/solr/src/java/org/apache/solr/analysis/UAX29TokenizerFactory.java (original)
+++ lucene/dev/trunk/solr/src/java/org/apache/solr/analysis/UAX29URLEmailTokenizerFactory.java
Tue Dec  7 14:53:13 2010
@@ -20,7 +20,7 @@ package org.apache.solr.analysis;
 
 
 
-import org.apache.lucene.analysis.standard.UAX29Tokenizer;
+import org.apache.lucene.analysis.standard.UAX29URLEmailTokenizer;
 
 import java.io.Reader;
 import java.util.Map;
@@ -30,14 +30,14 @@ import java.util.Map;
  * 
  */
 
-public class UAX29TokenizerFactory extends BaseTokenizerFactory {
+public class UAX29URLEmailTokenizerFactory extends BaseTokenizerFactory {
   @Override
   public void init(Map<String,String> args) {
     super.init(args);
     assureMatchVersion();
   }
 
-  public UAX29Tokenizer create(Reader input) {
-    return new UAX29Tokenizer(input);
+  public UAX29URLEmailTokenizer create(Reader input) {
+    return new UAX29URLEmailTokenizer(input);
   }
 }

Copied: lucene/dev/trunk/solr/src/test/org/apache/solr/analysis/TestUAX29URLEmailTokenizerFactory.java
(from r1042243, lucene/dev/trunk/solr/src/test/org/apache/solr/analysis/TestUAX29TokenizerFactory.java)
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/src/test/org/apache/solr/analysis/TestUAX29URLEmailTokenizerFactory.java?p2=lucene/dev/trunk/solr/src/test/org/apache/solr/analysis/TestUAX29URLEmailTokenizerFactory.java&p1=lucene/dev/trunk/solr/src/test/org/apache/solr/analysis/TestUAX29TokenizerFactory.java&r1=1042243&r2=1043071&rev=1043071&view=diff
==============================================================================
--- lucene/dev/trunk/solr/src/test/org/apache/solr/analysis/TestUAX29TokenizerFactory.java
(original)
+++ lucene/dev/trunk/solr/src/test/org/apache/solr/analysis/TestUAX29URLEmailTokenizerFactory.java
Tue Dec  7 14:53:13 2010
@@ -22,16 +22,14 @@ import java.io.StringReader;
 import org.apache.lucene.analysis.Tokenizer;
 
 /**
- * A few tests based on  org.apache.lucene.analysis.TestUAX29Tokenizer;
+ * A few tests based on org.apache.lucene.analysis.TestUAX29URLEmailTokenizer
  */
 
-public class TestUAX29TokenizerFactory extends BaseTokenTestCase {
-  /**
-   * Test UAX29TokenizerFactory
-   */
-  public void testUAX29Tokenizer() throws Exception {
+public class TestUAX29URLEmailTokenizerFactory extends BaseTokenTestCase {
+
+  public void testUAX29URLEmailTokenizer() throws Exception {
     Reader reader = new StringReader("Wha\u0301t's this thing do?");
-    UAX29TokenizerFactory factory = new UAX29TokenizerFactory();
+    UAX29URLEmailTokenizerFactory factory = new UAX29URLEmailTokenizerFactory();
     factory.init(DEFAULT_VERSION_PARAM);
     Tokenizer stream = factory.create(reader);
     assertTokenStreamContents(stream, 
@@ -40,7 +38,7 @@ public class TestUAX29TokenizerFactory e
   
   public void testArabic() throws Exception {
     Reader reader = new StringReader("الفيلم الوثائقي
الأول عن ويكيبيديا يسمى \"الحقيقة
بالأرقام: قصة ويكيبيديا\" (بالإنجليزية:
Truth in Numbers: The Wikipedia Story)، سيتم إطلاقه في
2008.");
-    UAX29TokenizerFactory factory = new UAX29TokenizerFactory();
+    UAX29URLEmailTokenizerFactory factory = new UAX29URLEmailTokenizerFactory();
     factory.init(DEFAULT_VERSION_PARAM);
     Tokenizer stream = factory.create(reader);
     assertTokenStreamContents(stream, 
@@ -50,15 +48,16 @@ public class TestUAX29TokenizerFactory e
   
   public void testChinese() throws Exception {
     Reader reader = new StringReader("我是中国人。 1234
Tests ");
-    UAX29TokenizerFactory factory = new UAX29TokenizerFactory();
+    UAX29URLEmailTokenizerFactory factory = new UAX29URLEmailTokenizerFactory();
     factory.init(DEFAULT_VERSION_PARAM);
     Tokenizer stream = factory.create(reader);
     assertTokenStreamContents(stream, 
         new String[] {"我", "是", "中", "国", "人", "1234",
"Tests"});
   }
+
   public void testKorean() throws Exception {
     Reader reader = new StringReader("안녕하세요 한글입니다");
-    UAX29TokenizerFactory factory = new UAX29TokenizerFactory();
+    UAX29URLEmailTokenizerFactory factory = new UAX29URLEmailTokenizerFactory();
     factory.init(DEFAULT_VERSION_PARAM);
     Tokenizer stream = factory.create(reader);
     assertTokenStreamContents(stream, 
@@ -67,15 +66,90 @@ public class TestUAX29TokenizerFactory e
     
   public void testHyphen() throws Exception {
     Reader reader = new StringReader("some-dashed-phrase");
-    UAX29TokenizerFactory factory = new UAX29TokenizerFactory();
+    UAX29URLEmailTokenizerFactory factory = new UAX29URLEmailTokenizerFactory();
     factory.init(DEFAULT_VERSION_PARAM);
     Tokenizer stream = factory.create(reader);
     assertTokenStreamContents(stream, 
         new String[] {"some", "dashed", "phrase"});
   }
 
+  // Test with some URLs from TestUAX29URLEmailTokenizer's 
+  // urls.from.random.text.with.urls.txt
+  public void testURLs() throws Exception {
+    String textWithURLs 
+      = "http://johno.jsmf.net/knowhow/ngrams/index.php?table=en-dickens-word-2gram&paragraphs=50&length=200&no-ads=on\n"
+        + " some extra\nWords thrown in here. "
+        + "http://c5-3486.bisynxu.FR/aI.YnNms/"
+        + " samba Halta gamba "
+        + "ftp://119.220.152.185/JgJgdZ/31aW5c/viWlfQSTs5/1c8U5T/ih5rXx/YfUJ/xBW1uHrQo6.R\n"
+        + "M19nq.0URV4A.Me.CC/mj0kgt6hue/dRXv8YVLOw9v/CIOqb\n"
+        + "Https://yu7v33rbt.vC6U3.XN--JXALPDLP/y%4fMSzkGFlm/wbDF4m"
+        + " inter Locutio "
+        + "[c2d4::]/%471j5l/j3KFN%AAAn/Fip-NisKH/\n"
+        + "file:///aXvSZS34is/eIgM8s~U5dU4Ifd%c7"
+        + " blah Sirrah woof "
+        + "http://[a42:a7b6::]/qSmxSUU4z/%52qVl4\n";
+    Reader reader = new StringReader(textWithURLs);
+    UAX29URLEmailTokenizerFactory factory = new UAX29URLEmailTokenizerFactory();
+    factory.init(DEFAULT_VERSION_PARAM);
+    Tokenizer stream = factory.create(reader);
+    assertTokenStreamContents(stream, 
+        new String[] { 
+          "http://johno.jsmf.net/knowhow/ngrams/index.php?table=en-dickens-word-2gram&paragraphs=50&length=200&no-ads=on",
+          "some", "extra", "Words", "thrown", "in", "here",
+          "http://c5-3486.bisynxu.FR/aI.YnNms/",
+          "samba", "Halta", "gamba",
+          "ftp://119.220.152.185/JgJgdZ/31aW5c/viWlfQSTs5/1c8U5T/ih5rXx/YfUJ/xBW1uHrQo6.R",
+          "M19nq.0URV4A.Me.CC/mj0kgt6hue/dRXv8YVLOw9v/CIOqb",
+          "Https://yu7v33rbt.vC6U3.XN--JXALPDLP/y%4fMSzkGFlm/wbDF4m",
+          "inter", "Locutio",
+          "[c2d4::]/%471j5l/j3KFN%AAAn/Fip-NisKH/",
+          "file:///aXvSZS34is/eIgM8s~U5dU4Ifd%c7",
+          "blah", "Sirrah", "woof",
+          "http://[a42:a7b6::]/qSmxSUU4z/%52qVl4"
+        }
+    );
+  }
+
+  // Test with some emails from TestUAX29URLEmailTokenizer's 
+  // email.addresses.from.random.text.with.email.addresses.txt
+  public void testEmails() throws Exception {
+    String textWithEmails 
+      =  " some extra\nWords thrown in here. "
+         + "dJ8ngFi@avz13m.CC\n"
+         + "kU-l6DS@[082.015.228.189]\n"
+         + "\"%U\u0012@?\\B\"@Fl2d.md"
+         + " samba Halta gamba "
+         + "Bvd#@tupjv.sn\n"
+         + "SBMm0Nm.oyk70.rMNdd8k.#ru3LI.gMMLBI.0dZRD4d.RVK2nY@au58t.B13albgy4u.mt\n"
+         + "~+Kdz@3mousnl.SE\n"
+         + " inter Locutio "
+         + "C'ts`@Vh4zk.uoafcft-dr753x4odt04q.UY\n"
+         + "}0tzWYDBuy@cSRQAABB9B.7c8xawf75-cyo.PM"
+         + " blah Sirrah woof "
+         + "lMahAA.j/5.RqUjS745.DtkcYdi@d2-4gb-l6.ae\n"
+         + "lv'p@tqk.vj5s0tgl.0dlu7su3iyiaz.dqso.494.3hb76.XN--MGBAAM7A8H\n";
+    Reader reader = new StringReader(textWithEmails);
+    UAX29URLEmailTokenizerFactory factory = new UAX29URLEmailTokenizerFactory();
+    factory.init(DEFAULT_VERSION_PARAM);
+    Tokenizer stream = factory.create(reader);
+    assertTokenStreamContents(stream, 
+        new String[] { 
+          "some", "extra", "Words", "thrown", "in", "here",
+          "dJ8ngFi@avz13m.CC",
+          "kU-l6DS@[082.015.228.189]",
+          "\"%U\u0012@?\\B\"@Fl2d.md",
+          "samba", "Halta", "gamba",
+          "Bvd#@tupjv.sn",
+          "SBMm0Nm.oyk70.rMNdd8k.#ru3LI.gMMLBI.0dZRD4d.RVK2nY@au58t.B13albgy4u.mt",
+          "~+Kdz@3mousnl.SE",
+          "inter", "Locutio",
+          "C'ts`@Vh4zk.uoafcft-dr753x4odt04q.UY",
+          "}0tzWYDBuy@cSRQAABB9B.7c8xawf75-cyo.PM",
+          "blah", "Sirrah", "woof",
+          "lMahAA.j/5.RqUjS745.DtkcYdi@d2-4gb-l6.ae",
+          "lv'p@tqk.vj5s0tgl.0dlu7su3iyiaz.dqso.494.3hb76.XN--MGBAAM7A8H"
+        }
+    );
+  }
 }
-    
-  
-  
-  



Mime
View raw message