lucene-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From rm...@apache.org
Subject svn commit: r1154936 [4/6] - in /lucene/dev/trunk: lucene/ modules/analysis/common/ modules/analysis/common/src/java/org/apache/lucene/analysis/standard/ modules/analysis/common/src/java/org/apache/lucene/analysis/standard/std31/ modules/analysis/commo...
Date Mon, 08 Aug 2011 11:58:00 GMT
Copied: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizerImpl.jflex
(from r1154038, lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.jflex)
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizerImpl.jflex?p2=lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizerImpl.jflex&p1=lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.jflex&r1=1154038&r2=1154936&rev=1154936&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.jflex (original)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizerImpl.jflex Mon Aug  8 11:57:59 2011
@@ -17,16 +17,7 @@ package org.apache.lucene.analysis.stand
  * limitations under the License.
  */
 
-import java.io.IOException;
-import java.io.Reader;
-
-import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
-import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
-import org.apache.lucene.util.AttributeSource;
-
 
 /**
  * This class implements Word Break rules from the Unicode Text Segmentation 
@@ -49,20 +40,14 @@ import org.apache.lucene.util.AttributeS
 %%
 
 %unicode 6.0
+%integer
 %final
 %public
-%apiprivate
-%class UAX29URLEmailTokenizer
-%extends Tokenizer
-%type boolean
+%class UAX29URLEmailTokenizerImpl
+%implements StandardTokenizerInterface
 %function getNextToken
 %char
 
-%init{
-  super(in);
-%init}
-
-
 %include src/java/org/apache/lucene/analysis/standard/SUPPLEMENTARY.jflex-macro
 ALetter = ([\p{WB:ALetter}] | {ALetterSupp})
 Format =  ([\p{WB:Format}] | {FormatSupp})
@@ -89,6 +74,8 @@ MidLetterEx    = ({MidLetter} | {MidNumL
 MidNumericEx   = ({MidNum} | {MidNumLet})      ({Format} | {Extend})*
 ExtendNumLetEx = {ExtendNumLet}                ({Format} | {Extend})*
 
+HanEx = {Han} ({Format} | {Extend})*
+HiraganaEx = {Hiragana} ({Format} | {Extend})*
 
 // URL and E-mail syntax specifications:
 //
@@ -170,16 +157,10 @@ EMAIL = {EMAILlocalPart} "@" ({DomainNam
 
 %{
   /** Alphanumeric sequences */
-  public static final String WORD_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.ALPHANUM];
+  public static final int WORD_TYPE = UAX29URLEmailTokenizer.ALPHANUM;
   
   /** Numbers */
-  public static final String NUMERIC_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.NUM];
-  
-  /** URLs with scheme: HTTP(S), FTP, or FILE; no-scheme URLs match HTTP syntax */
-  public static final String URL_TYPE = "<URL>";
-  
-  /** E-mail addresses */
-  public static final String EMAIL_TYPE = "<EMAIL>";
+  public static final int NUMERIC_TYPE = UAX29URLEmailTokenizer.NUM;
   
   /**
    * Chars in class \p{Line_Break = Complex_Context} are from South East Asian
@@ -189,114 +170,30 @@ EMAIL = {EMAILlocalPart} "@" ({DomainNam
    * <p>
    * See Unicode Line Breaking Algorithm: http://www.unicode.org/reports/tr14/#SA
    */
-  public static final String SOUTH_EAST_ASIAN_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.SOUTHEAST_ASIAN];
+  public static final int SOUTH_EAST_ASIAN_TYPE = UAX29URLEmailTokenizer.SOUTHEAST_ASIAN;
   
-  public static final String IDEOGRAPHIC_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.IDEOGRAPHIC];
+  public static final int IDEOGRAPHIC_TYPE = UAX29URLEmailTokenizer.IDEOGRAPHIC;
   
-  public static final String HIRAGANA_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.HIRAGANA];
+  public static final int HIRAGANA_TYPE = UAX29URLEmailTokenizer.HIRAGANA;
   
-  public static final String KATAKANA_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.KATAKANA];
-
-  public static final String HANGUL_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.HANGUL];
-
-  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
-  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
-  private final PositionIncrementAttribute posIncrAtt 
-    = addAttribute(PositionIncrementAttribute.class);
-  private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
-  
-  private int maxTokenLength = StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH;
-  private int posIncr;
-
+  public static final int KATAKANA_TYPE = UAX29URLEmailTokenizer.KATAKANA;
   
-  /**
-   * @param source The AttributeSource to use
-   * @param input The input reader
-   */
-  public UAX29URLEmailTokenizer(AttributeSource source, Reader input) {
-    super(source, input);
-    zzReader = input;
-  }
+  public static final int HANGUL_TYPE = UAX29URLEmailTokenizer.HANGUL;
   
-  /**
-   * @param factory The AttributeFactory to use
-   * @param input The input reader
-   */
-  public UAX29URLEmailTokenizer(AttributeFactory factory, Reader input) {
-    super(factory, input); 
-    zzReader = input;
-  }
+  public static final int EMAIL_TYPE = UAX29URLEmailTokenizer.EMAIL;
   
-  /** 
-   * Set the max allowed token length.  Any token longer than this is skipped.
-   * @param length the new max allowed token length
-   */
-  public void setMaxTokenLength(int length) {
-    this.maxTokenLength = length;
-  }
-
-  /**
-   * Returns the max allowed token length.  Any token longer than this is 
-   * skipped.
-   * @return the max allowed token length 
-   */
-  public int getMaxTokenLength() {
-    return maxTokenLength;
-  }
-
-  @Override
-  public final void end() {
-    // set final offset
-    int finalOffset = correctOffset(yychar + yylength());
-    offsetAtt.setOffset(finalOffset, finalOffset);
-  }
-
-  @Override
-  public void reset(Reader reader) throws IOException {
-    super.reset(reader);
-    yyreset(reader);
-  }
+  public static final int URL_TYPE = UAX29URLEmailTokenizer.URL;
 
-  @Override
-  public final boolean incrementToken() throws IOException {
-    // This method is required because of two JFlex limitations:
-    // 1. No way to insert code at the beginning of the generated scanning
-    //    get-next-token method; and
-    // 2. No way to declare @Override on the generated scanning method.
-    clearAttributes();
-    posIncr = 1;
-    return getNextToken();
+  public final int yychar()
+  {
+    return yychar;
   }
 
   /**
-   * Populates this TokenStream's CharTermAttribute and OffsetAttribute from
-   * the current match, the TypeAttribute from the passed-in tokenType, and
-   * the PositionIncrementAttribute to one, unless the immediately previous
-   * token(s) was/were skipped because maxTokenLength was exceeded, in which
-   * case the PositionIncrementAttribute is set to one plus the number of
-   * skipped overly long tokens. 
-   * <p/> 
-   * If maxTokenLength is exceeded, the CharTermAttribute is set back to empty
-   * and false is returned.
-   * 
-   * @param tokenType The type of the matching token
-   * @return true there is a token available (not too long); false otherwise 
+   * Fills CharTermAttribute with the current token text.
    */
-  private boolean populateAttributes(String tokenType) {
-    boolean isTokenAvailable = false;
-    if (yylength() > maxTokenLength) {
-      // When we skip a too-long token, we treat it like a stopword, introducing
-      // a position increment gap
-      ++posIncr;
-    } else {
-      termAtt.copyBuffer(zzBuffer, zzStartRead, yylength());
-      posIncrAtt.setPositionIncrement(posIncr);
-      offsetAtt.setOffset(correctOffset(yychar),
-                          correctOffset(yychar + yylength()));
-      typeAtt.setType(tokenType);
-      isTokenAvailable = true;
-    }
-    return isTokenAvailable;
+  public final void getText(CharTermAttribute t) {
+    t.copyBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
   }
 %}
 
@@ -305,10 +202,10 @@ EMAIL = {EMAILlocalPart} "@" ({DomainNam
 // UAX#29 WB1. 	sot 	÷ 	
 //        WB2. 		÷ 	eot
 //
-<<EOF>> { return false; }
+<<EOF>> { return StandardTokenizerInterface.YYEOF; }
 
-{URL}   { if (populateAttributes(URL_TYPE)) return true; }
-{EMAIL} {if (populateAttributes(EMAIL_TYPE)) return true; }
+{URL}   { return URL_TYPE; }
+{EMAIL} { return EMAIL_TYPE; }
 
 // UAX#29 WB8.   Numeric × Numeric
 //        WB11.  Numeric (MidNum | MidNumLet) × Numeric
@@ -320,14 +217,14 @@ EMAIL = {EMAILlocalPart} "@" ({DomainNam
                               | {MidNumericEx} {NumericEx} 
                               | {NumericEx})*
 {ExtendNumLetEx}* 
-  { if (populateAttributes(NUMERIC_TYPE)) return true; }
+  { return NUMERIC_TYPE; }
 
 // subset of the below for typing purposes only!
 {HangulEx}+
-  { if (populateAttributes(HANGUL_TYPE)) return true; }
+  { return HANGUL_TYPE; }
 
 {KatakanaEx}+
-  { if (populateAttributes(KATAKANA_TYPE)) return true; }
+  { return KATAKANA_TYPE; }
 
 // UAX#29 WB5.   ALetter × ALetter
 //        WB6.   ALetter × (MidLetter | MidNumLet) ALetter
@@ -345,7 +242,7 @@ EMAIL = {EMAILlocalPart} "@" ({DomainNam
                    | ( {NumericEx}  ({ExtendNumLetEx}+ {NumericEx} | {MidNumericEx} {NumericEx} | {NumericEx})*
                      | {ALetterEx}  ({ExtendNumLetEx}+ {ALetterEx} | {MidLetterEx}  {ALetterEx} | {ALetterEx})* )+ ) )*
 {ExtendNumLetEx}*  
-  { if (populateAttributes(WORD_TYPE)) return true; }
+  { return WORD_TYPE; }
 
 
 // From UAX #29:
@@ -367,12 +264,12 @@ EMAIL = {EMAILlocalPart} "@" ({DomainNam
 //
 //    http://www.unicode.org/reports/tr14/#SA
 //
-{ComplexContext}+ { if (populateAttributes(SOUTH_EAST_ASIAN_TYPE)) return true; }
+{ComplexContext}+ { return SOUTH_EAST_ASIAN_TYPE; }
 
 // UAX#29 WB14.  Any ÷ Any
 //
-{Han} { if (populateAttributes(IDEOGRAPHIC_TYPE)) return true; }
-{Hiragana} { if (populateAttributes(HIRAGANA_TYPE)) return true; }
+{HanEx} { return IDEOGRAPHIC_TYPE; }
+{HiraganaEx} { return HIRAGANA_TYPE; }
 
 
 // UAX#29 WB3.   CR × LF

Added: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/std31/ASCIITLD.jflex-macro
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/std31/ASCIITLD.jflex-macro?rev=1154936&view=auto
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/std31/ASCIITLD.jflex-macro (added)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/std31/ASCIITLD.jflex-macro Mon Aug  8 11:57:59 2011
@@ -0,0 +1,330 @@
+/*
+ * Copyright 2001-2005 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Generated from IANA Root Zone Database <http://www.internic.net/zones/root.zone>
+// file version from Wednesday, February 9, 2011 12:34:10 PM UTC
+// generated on Wednesday, February 9, 2011 4:45:18 PM UTC
+// by org.apache.lucene.analysis.standard.GenerateJflexTLDMacros
+
+ASCIITLD = "." (
+	  [aA][cC]
+	| [aA][dD]
+	| [aA][eE]
+	| [aA][eE][rR][oO]
+	| [aA][fF]
+	| [aA][gG]
+	| [aA][iI]
+	| [aA][lL]
+	| [aA][mM]
+	| [aA][nN]
+	| [aA][oO]
+	| [aA][qQ]
+	| [aA][rR]
+	| [aA][rR][pP][aA]
+	| [aA][sS]
+	| [aA][sS][iI][aA]
+	| [aA][tT]
+	| [aA][uU]
+	| [aA][wW]
+	| [aA][xX]
+	| [aA][zZ]
+	| [bB][aA]
+	| [bB][bB]
+	| [bB][dD]
+	| [bB][eE]
+	| [bB][fF]
+	| [bB][gG]
+	| [bB][hH]
+	| [bB][iI]
+	| [bB][iI][zZ]
+	| [bB][jJ]
+	| [bB][mM]
+	| [bB][nN]
+	| [bB][oO]
+	| [bB][rR]
+	| [bB][sS]
+	| [bB][tT]
+	| [bB][vV]
+	| [bB][wW]
+	| [bB][yY]
+	| [bB][zZ]
+	| [cC][aA]
+	| [cC][aA][tT]
+	| [cC][cC]
+	| [cC][dD]
+	| [cC][fF]
+	| [cC][gG]
+	| [cC][hH]
+	| [cC][iI]
+	| [cC][kK]
+	| [cC][lL]
+	| [cC][mM]
+	| [cC][nN]
+	| [cC][oO]
+	| [cC][oO][mM]
+	| [cC][oO][oO][pP]
+	| [cC][rR]
+	| [cC][uU]
+	| [cC][vV]
+	| [cC][xX]
+	| [cC][yY]
+	| [cC][zZ]
+	| [dD][eE]
+	| [dD][jJ]
+	| [dD][kK]
+	| [dD][mM]
+	| [dD][oO]
+	| [dD][zZ]
+	| [eE][cC]
+	| [eE][dD][uU]
+	| [eE][eE]
+	| [eE][gG]
+	| [eE][rR]
+	| [eE][sS]
+	| [eE][tT]
+	| [eE][uU]
+	| [fF][iI]
+	| [fF][jJ]
+	| [fF][kK]
+	| [fF][mM]
+	| [fF][oO]
+	| [fF][rR]
+	| [gG][aA]
+	| [gG][bB]
+	| [gG][dD]
+	| [gG][eE]
+	| [gG][fF]
+	| [gG][gG]
+	| [gG][hH]
+	| [gG][iI]
+	| [gG][lL]
+	| [gG][mM]
+	| [gG][nN]
+	| [gG][oO][vV]
+	| [gG][pP]
+	| [gG][qQ]
+	| [gG][rR]
+	| [gG][sS]
+	| [gG][tT]
+	| [gG][uU]
+	| [gG][wW]
+	| [gG][yY]
+	| [hH][kK]
+	| [hH][mM]
+	| [hH][nN]
+	| [hH][rR]
+	| [hH][tT]
+	| [hH][uU]
+	| [iI][dD]
+	| [iI][eE]
+	| [iI][lL]
+	| [iI][mM]
+	| [iI][nN]
+	| [iI][nN][fF][oO]
+	| [iI][nN][tT]
+	| [iI][oO]
+	| [iI][qQ]
+	| [iI][rR]
+	| [iI][sS]
+	| [iI][tT]
+	| [jJ][eE]
+	| [jJ][mM]
+	| [jJ][oO]
+	| [jJ][oO][bB][sS]
+	| [jJ][pP]
+	| [kK][eE]
+	| [kK][gG]
+	| [kK][hH]
+	| [kK][iI]
+	| [kK][mM]
+	| [kK][nN]
+	| [kK][pP]
+	| [kK][rR]
+	| [kK][wW]
+	| [kK][yY]
+	| [kK][zZ]
+	| [lL][aA]
+	| [lL][bB]
+	| [lL][cC]
+	| [lL][iI]
+	| [lL][kK]
+	| [lL][rR]
+	| [lL][sS]
+	| [lL][tT]
+	| [lL][uU]
+	| [lL][vV]
+	| [lL][yY]
+	| [mM][aA]
+	| [mM][cC]
+	| [mM][dD]
+	| [mM][eE]
+	| [mM][gG]
+	| [mM][hH]
+	| [mM][iI][lL]
+	| [mM][kK]
+	| [mM][lL]
+	| [mM][mM]
+	| [mM][nN]
+	| [mM][oO]
+	| [mM][oO][bB][iI]
+	| [mM][pP]
+	| [mM][qQ]
+	| [mM][rR]
+	| [mM][sS]
+	| [mM][tT]
+	| [mM][uU]
+	| [mM][uU][sS][eE][uU][mM]
+	| [mM][vV]
+	| [mM][wW]
+	| [mM][xX]
+	| [mM][yY]
+	| [mM][zZ]
+	| [nN][aA]
+	| [nN][aA][mM][eE]
+	| [nN][cC]
+	| [nN][eE]
+	| [nN][eE][tT]
+	| [nN][fF]
+	| [nN][gG]
+	| [nN][iI]
+	| [nN][lL]
+	| [nN][oO]
+	| [nN][pP]
+	| [nN][rR]
+	| [nN][uU]
+	| [nN][zZ]
+	| [oO][mM]
+	| [oO][rR][gG]
+	| [pP][aA]
+	| [pP][eE]
+	| [pP][fF]
+	| [pP][gG]
+	| [pP][hH]
+	| [pP][kK]
+	| [pP][lL]
+	| [pP][mM]
+	| [pP][nN]
+	| [pP][rR]
+	| [pP][rR][oO]
+	| [pP][sS]
+	| [pP][tT]
+	| [pP][wW]
+	| [pP][yY]
+	| [qQ][aA]
+	| [rR][eE]
+	| [rR][oO]
+	| [rR][sS]
+	| [rR][uU]
+	| [rR][wW]
+	| [sS][aA]
+	| [sS][bB]
+	| [sS][cC]
+	| [sS][dD]
+	| [sS][eE]
+	| [sS][gG]
+	| [sS][hH]
+	| [sS][iI]
+	| [sS][jJ]
+	| [sS][kK]
+	| [sS][lL]
+	| [sS][mM]
+	| [sS][nN]
+	| [sS][oO]
+	| [sS][rR]
+	| [sS][tT]
+	| [sS][uU]
+	| [sS][vV]
+	| [sS][yY]
+	| [sS][zZ]
+	| [tT][cC]
+	| [tT][dD]
+	| [tT][eE][lL]
+	| [tT][fF]
+	| [tT][gG]
+	| [tT][hH]
+	| [tT][jJ]
+	| [tT][kK]
+	| [tT][lL]
+	| [tT][mM]
+	| [tT][nN]
+	| [tT][oO]
+	| [tT][pP]
+	| [tT][rR]
+	| [tT][rR][aA][vV][eE][lL]
+	| [tT][tT]
+	| [tT][vV]
+	| [tT][wW]
+	| [tT][zZ]
+	| [uU][aA]
+	| [uU][gG]
+	| [uU][kK]
+	| [uU][sS]
+	| [uU][yY]
+	| [uU][zZ]
+	| [vV][aA]
+	| [vV][cC]
+	| [vV][eE]
+	| [vV][gG]
+	| [vV][iI]
+	| [vV][nN]
+	| [vV][uU]
+	| [wW][fF]
+	| [wW][sS]
+	| [xX][nN]--0[zZ][wW][mM]56[dD]
+	| [xX][nN]--11[bB]5[bB][sS]3[aA]9[aA][jJ]6[gG]
+	| [xX][nN]--3[eE]0[bB]707[eE]
+	| [xX][nN]--45[bB][rR][jJ]9[cC]
+	| [xX][nN]--80[aA][kK][hH][bB][yY][kK][nN][jJ]4[fF]
+	| [xX][nN]--9[tT]4[bB]11[yY][iI]5[aA]
+	| [xX][nN]--[cC][lL][cC][hH][cC]0[eE][aA]0[bB]2[gG]2[aA]9[gG][cC][dD]
+	| [xX][nN]--[dD][eE][bB][aA]0[aA][dD]
+	| [xX][nN]--[fF][iI][qQ][sS]8[sS]
+	| [xX][nN]--[fF][iI][qQ][zZ]9[sS]
+	| [xX][nN]--[fF][pP][cC][rR][jJ]9[cC]3[dD]
+	| [xX][nN]--[fF][zZ][cC]2[cC]9[eE]2[cC]
+	| [xX][nN]--[gG]6[wW]251[dD]
+	| [xX][nN]--[gG][eE][cC][rR][jJ]9[cC]
+	| [xX][nN]--[hH]2[bB][rR][jJ]9[cC]
+	| [xX][nN]--[hH][gG][bB][kK]6[aA][jJ]7[fF]53[bB][bB][aA]
+	| [xX][nN]--[hH][lL][cC][jJ]6[aA][yY][aA]9[eE][sS][cC]7[aA]
+	| [xX][nN]--[jJ]6[wW]193[gG]
+	| [xX][nN]--[jJ][xX][aA][lL][pP][dD][lL][pP]
+	| [xX][nN]--[kK][gG][bB][eE][cC][hH][tT][vV]
+	| [xX][nN]--[kK][pP][rR][wW]13[dD]
+	| [xX][nN]--[kK][pP][rR][yY]57[dD]
+	| [xX][nN]--[mM][gG][bB][aA][aA][mM]7[aA]8[hH]
+	| [xX][nN]--[mM][gG][bB][aA][yY][hH]7[gG][pP][aA]
+	| [xX][nN]--[mM][gG][bB][bB][hH]1[aA]71[eE]
+	| [xX][nN]--[mM][gG][bB][eE][rR][pP]4[aA]5[dD]4[aA][rR]
+	| [xX][nN]--[oO]3[cC][wW]4[hH]
+	| [xX][nN]--[oO][gG][bB][pP][fF]8[fF][lL]
+	| [xX][nN]--[pP]1[aA][iI]
+	| [xX][nN]--[pP][gG][bB][sS]0[dD][hH]
+	| [xX][nN]--[sS]9[bB][rR][jJ]9[cC]
+	| [xX][nN]--[wW][gG][bB][hH]1[cC]
+	| [xX][nN]--[wW][gG][bB][lL]6[aA]
+	| [xX][nN]--[xX][kK][cC]2[aA][lL]3[hH][yY][eE]2[aA]
+	| [xX][nN]--[xX][kK][cC]2[dD][lL]3[aA]5[eE][eE]0[hH]
+	| [xX][nN]--[yY][fF][rR][oO]4[iI]67[oO]
+	| [xX][nN]--[yY][gG][bB][iI]2[aA][mM][mM][xX]
+	| [xX][nN]--[zZ][cC][kK][zZ][aA][hH]
+	| [yY][eE]
+	| [yY][tT]
+	| [zZ][aA]
+	| [zZ][mM]
+	| [zZ][wW]
+	) "."?   // Accept trailing root (empty) domain
+

Added: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/std31/SUPPLEMENTARY.jflex-macro
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/std31/SUPPLEMENTARY.jflex-macro?rev=1154936&view=auto
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/std31/SUPPLEMENTARY.jflex-macro (added)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/std31/SUPPLEMENTARY.jflex-macro Mon Aug  8 11:57:59 2011
@@ -0,0 +1,125 @@
+/*
+ * Copyright 2010 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Generated using ICU4J 4.6.0.0 on Wednesday, February 9, 2011 4:45:11 PM UTC
+// by org.apache.lucene.analysis.icu.GenerateJFlexSupplementaryMacros
+
+
+ALetterSupp = (
+	  ([\ud80d][\uDC00-\uDC2E])
+	| ([\ud80c][\uDC00-\uDFFF])
+	| ([\ud809][\uDC00-\uDC62])
+	| ([\ud808][\uDC00-\uDF6E])
+	| ([\ud81a][\uDC00-\uDE38])
+	| ([\ud804][\uDC03-\uDC37\uDC83-\uDCAF])
+	| ([\ud835][\uDC00-\uDC54\uDC56-\uDC9C\uDC9E\uDC9F\uDCA2\uDCA5\uDCA6\uDCA9-\uDCAC\uDCAE-\uDCB9\uDCBB\uDCBD-\uDCC3\uDCC5-\uDD05\uDD07-\uDD0A\uDD0D-\uDD14\uDD16-\uDD1C\uDD1E-\uDD39\uDD3B-\uDD3E\uDD40-\uDD44\uDD46\uDD4A-\uDD50\uDD52-\uDEA5\uDEA8-\uDEC0\uDEC2-\uDEDA\uDEDC-\uDEFA\uDEFC-\uDF14\uDF16-\uDF34\uDF36-\uDF4E\uDF50-\uDF6E\uDF70-\uDF88\uDF8A-\uDFA8\uDFAA-\uDFC2\uDFC4-\uDFCB])
+	| ([\ud801][\uDC00-\uDC9D])
+	| ([\ud800][\uDC00-\uDC0B\uDC0D-\uDC26\uDC28-\uDC3A\uDC3C\uDC3D\uDC3F-\uDC4D\uDC50-\uDC5D\uDC80-\uDCFA\uDD40-\uDD74\uDE80-\uDE9C\uDEA0-\uDED0\uDF00-\uDF1E\uDF30-\uDF4A\uDF80-\uDF9D\uDFA0-\uDFC3\uDFC8-\uDFCF\uDFD1-\uDFD5])
+	| ([\ud803][\uDC00-\uDC48])
+	| ([\ud802][\uDC00-\uDC05\uDC08\uDC0A-\uDC35\uDC37\uDC38\uDC3C\uDC3F-\uDC55\uDD00-\uDD15\uDD20-\uDD39\uDE00\uDE10-\uDE13\uDE15-\uDE17\uDE19-\uDE33\uDE60-\uDE7C\uDF00-\uDF35\uDF40-\uDF55\uDF60-\uDF72])
+)
+FormatSupp = (
+	  ([\ud804][\uDCBD])
+	| ([\ud834][\uDD73-\uDD7A])
+	| ([\udb40][\uDC01\uDC20-\uDC7F])
+)
+ExtendSupp = (
+	  ([\ud804][\uDC00-\uDC02\uDC38-\uDC46\uDC80-\uDC82\uDCB0-\uDCBA])
+	| ([\ud834][\uDD65-\uDD69\uDD6D-\uDD72\uDD7B-\uDD82\uDD85-\uDD8B\uDDAA-\uDDAD\uDE42-\uDE44])
+	| ([\ud800][\uDDFD])
+	| ([\udb40][\uDD00-\uDDEF])
+	| ([\ud802][\uDE01-\uDE03\uDE05\uDE06\uDE0C-\uDE0F\uDE38-\uDE3A\uDE3F])
+)
+NumericSupp = (
+	  ([\ud804][\uDC66-\uDC6F])
+	| ([\ud835][\uDFCE-\uDFFF])
+	| ([\ud801][\uDCA0-\uDCA9])
+)
+KatakanaSupp = (
+	  ([\ud82c][\uDC00])
+)
+MidLetterSupp = (
+	  []
+)
+MidNumSupp = (
+	  []
+)
+MidNumLetSupp = (
+	  []
+)
+ExtendNumLetSupp = (
+	  []
+)
+ExtendNumLetSupp = (
+	  []
+)
+ComplexContextSupp = (
+	  []
+)
+HanSupp = (
+	  ([\ud87e][\uDC00-\uDE1D])
+	| ([\ud86b][\uDC00-\uDFFF])
+	| ([\ud86a][\uDC00-\uDFFF])
+	| ([\ud869][\uDC00-\uDED6\uDF00-\uDFFF])
+	| ([\ud868][\uDC00-\uDFFF])
+	| ([\ud86e][\uDC00-\uDC1D])
+	| ([\ud86d][\uDC00-\uDF34\uDF40-\uDFFF])
+	| ([\ud86c][\uDC00-\uDFFF])
+	| ([\ud863][\uDC00-\uDFFF])
+	| ([\ud862][\uDC00-\uDFFF])
+	| ([\ud861][\uDC00-\uDFFF])
+	| ([\ud860][\uDC00-\uDFFF])
+	| ([\ud867][\uDC00-\uDFFF])
+	| ([\ud866][\uDC00-\uDFFF])
+	| ([\ud865][\uDC00-\uDFFF])
+	| ([\ud864][\uDC00-\uDFFF])
+	| ([\ud858][\uDC00-\uDFFF])
+	| ([\ud859][\uDC00-\uDFFF])
+	| ([\ud85a][\uDC00-\uDFFF])
+	| ([\ud85b][\uDC00-\uDFFF])
+	| ([\ud85c][\uDC00-\uDFFF])
+	| ([\ud85d][\uDC00-\uDFFF])
+	| ([\ud85e][\uDC00-\uDFFF])
+	| ([\ud85f][\uDC00-\uDFFF])
+	| ([\ud850][\uDC00-\uDFFF])
+	| ([\ud851][\uDC00-\uDFFF])
+	| ([\ud852][\uDC00-\uDFFF])
+	| ([\ud853][\uDC00-\uDFFF])
+	| ([\ud854][\uDC00-\uDFFF])
+	| ([\ud855][\uDC00-\uDFFF])
+	| ([\ud856][\uDC00-\uDFFF])
+	| ([\ud857][\uDC00-\uDFFF])
+	| ([\ud849][\uDC00-\uDFFF])
+	| ([\ud848][\uDC00-\uDFFF])
+	| ([\ud84b][\uDC00-\uDFFF])
+	| ([\ud84a][\uDC00-\uDFFF])
+	| ([\ud84d][\uDC00-\uDFFF])
+	| ([\ud84c][\uDC00-\uDFFF])
+	| ([\ud84f][\uDC00-\uDFFF])
+	| ([\ud84e][\uDC00-\uDFFF])
+	| ([\ud841][\uDC00-\uDFFF])
+	| ([\ud840][\uDC00-\uDFFF])
+	| ([\ud843][\uDC00-\uDFFF])
+	| ([\ud842][\uDC00-\uDFFF])
+	| ([\ud845][\uDC00-\uDFFF])
+	| ([\ud844][\uDC00-\uDFFF])
+	| ([\ud847][\uDC00-\uDFFF])
+	| ([\ud846][\uDC00-\uDFFF])
+)
+HiraganaSupp = (
+	  ([\ud83c][\uDE00])
+	| ([\ud82c][\uDC01])
+)

Modified: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/std31/StandardTokenizerImpl31.jflex
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/std31/StandardTokenizerImpl31.jflex?rev=1154936&r1=1154935&r2=1154936&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/std31/StandardTokenizerImpl31.jflex (original)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/std31/StandardTokenizerImpl31.jflex Mon Aug  8 11:57:59 2011
@@ -39,7 +39,7 @@ import org.apache.lucene.analysis.tokena
 %function getNextToken
 %char
 
-%include src/java/org/apache/lucene/analysis/standard/SUPPLEMENTARY.jflex-macro
+%include src/java/org/apache/lucene/analysis/standard/std31/SUPPLEMENTARY.jflex-macro
 ALetter = ([\p{WB:ALetter}] | {ALetterSupp})
 Format =  ([\p{WB:Format}] | {FormatSupp})
 Numeric = ([\p{WB:Numeric}] | {NumericSupp})



Mime
View raw message