lucene-java-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From mikemcc...@apache.org
Subject svn commit: r564036 - in /lucene/java/trunk: ./ contrib/miscellaneous/src/test/org/apache/lucene/misc/ src/java/org/apache/lucene/analysis/standard/ src/test/org/apache/lucene/analysis/
Date Wed, 08 Aug 2007 22:26:55 GMT
Author: mikemccand
Date: Wed Aug  8 15:26:44 2007
New Revision: 564036

URL: http://svn.apache.org/viewvc?view=rev&rev=564036
Log:
LUCENE-966: sizable (~6X faster) speedups to StandardTokenizer by using JFlex instead of JavaCC

Added:
    lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java   (with props)
    lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex   (with props)
Removed:
    lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/CharStream.java
    lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/FastCharStream.java
    lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/ParseException.java
    lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.jj
    lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardTokenizerConstants.java
    lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardTokenizerTokenManager.java
    lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/Token.java
    lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/TokenMgrError.java
Modified:
    lucene/java/trunk/CHANGES.txt
    lucene/java/trunk/build.xml
    lucene/java/trunk/common-build.xml
    lucene/java/trunk/contrib/miscellaneous/src/test/org/apache/lucene/misc/ChainedFilterTest.java
    lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardFilter.java
    lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java
    lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/package.html
    lucene/java/trunk/src/test/org/apache/lucene/analysis/TestStandardAnalyzer.java

Modified: lucene/java/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/CHANGES.txt?view=diff&rev=564036&r1=564035&r2=564036
==============================================================================
--- lucene/java/trunk/CHANGES.txt (original)
+++ lucene/java/trunk/CHANGES.txt Wed Aug  8 15:26:44 2007
@@ -102,6 +102,11 @@
  6. LUCENE-939: Check explicitly for boundary conditions in FieldInfos
     and don't rely on exceptions. (Michael Busch)
 
+ 7. LUCENE-966: Very substantial speedups (~6X faster) for
+    StandardTokenizer (StandardAnalyzer) by using JFlex instead of
+    JavaCC to generate the tokenizer.
+    (Stanislaw Osinski via Mike McCandless)
+
 Documentation
 
 Build

Modified: lucene/java/trunk/build.xml
URL: http://svn.apache.org/viewvc/lucene/java/trunk/build.xml?view=diff&rev=564036&r1=564035&r2=564036
==============================================================================
--- lucene/java/trunk/build.xml (original)
+++ lucene/java/trunk/build.xml Wed Aug  8 15:26:44 2007
@@ -414,22 +414,7 @@
     </delete>
   </target>
 
-  <target name="javacc" depends="clean-javacc,javacc-StandardAnalyzer,javacc-QueryParser,javacc-HTMLParser"/>
-
-  <target name="javacc-StandardAnalyzer" depends="init,javacc-check" if="javacc.present">
-    <!-- generate this in a build directory so we can exclude ParseException -->
-    <mkdir dir="${build.dir}/gen/org/apache/lucene/analysis/standard"/>
-
-    <invoke-javacc target="src/java/org/apache/lucene/analysis/standard/StandardTokenizer.jj"
-                   outputDir="${build.dir}/gen/org/apache/lucene/analysis/standard"
-    />
-    <copy todir="src/java/org/apache/lucene/analysis/standard">
-      <fileset dir="${build.dir}/gen/org/apache/lucene/analysis/standard">
-        <include name="*.java"/>
-        <exclude name="ParseException.java"/>
-      </fileset>
-    </copy>
-  </target>
+  <target name="javacc" depends="clean-javacc,javacc-QueryParser,javacc-HTMLParser"/>
 
   <target name="javacc-QueryParser" depends="init,javacc-check" if="javacc.present">
     <invoke-javacc target="src/java/org/apache/lucene/queryParser/QueryParser.jj"
@@ -443,6 +428,30 @@
     />
   </target>
   
+  <!-- ================================================================== -->
+  <!-- Build the JFlex files into the source tree                         -->
+  <!-- ================================================================== -->
+
+  <target name="jflex" depends="clean-jflex,jflex-StandardAnalyzer" />
+
+  <target name="jflex-StandardAnalyzer" depends="init,jflex-check" if="jflex.present">
+    <taskdef classname="JFlex.anttask.JFlexTask" name="jflex">
+      <classpath location="${jflex.home}/lib/JFlex.jar" />
+    </taskdef>
+
+    <jflex file="src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex"
+           outdir="src/java/org/apache/lucene/analysis/standard"
+           nobak="on" />
+  </target>
+
+  <target name="clean-jflex">
+    <delete>
+      <fileset dir="src/java/org/apache/lucene/analysis/standard" includes="*.java">
+        <containsregexp expression="generated.*by.*JFlex"/>
+      </fileset>
+    </delete>
+  </target>
+
   <macrodef name="contrib-crawl">
     <attribute name="target" default=""/>
     <attribute name="failonerror" default="true"/>

Modified: lucene/java/trunk/common-build.xml
URL: http://svn.apache.org/viewvc/lucene/java/trunk/common-build.xml?view=diff&rev=564036&r1=564035&r2=564036
==============================================================================
--- lucene/java/trunk/common-build.xml (original)
+++ lucene/java/trunk/common-build.xml Wed Aug  8 15:26:44 2007
@@ -58,6 +58,7 @@
   <property name="maven.dist.dir" location="dist/maven"/>
 
   <property name="javacc.home" location="${common.dir}"/>
+  <property name="jflex.home" location="${common.dir}"/>
 
   <property name="junit.output.dir" location="${build.dir}/test"/>
   <property name="junit.reports" location="${build.dir}/test/reports"/>
@@ -97,6 +98,12 @@
     classpath="${javacc.home}/bin/lib/javacc.jar"
     />
 
+   <available
+    property="jflex.present"
+    classname="JFlex.anttask.JFlexTask"
+    classpath="${jflex.home}/lib/JFlex.jar"
+    />
+
   <available
     property="junit.present"
     classname="junit.framework.TestCase"
@@ -131,7 +138,21 @@
     </echo>
   </target>
 
-  <target name="init" depends="javacc-uptodate-check, javacc-notice">
+  <target name="init" depends="javacc-uptodate-check, javacc-notice, jflex-uptodate-check, jflex-notice">
+  </target>
+
+  <target name="jflex-uptodate-check">
+    <uptodate property="jflex.files.uptodate">
+      <srcfiles dir="src" includes="**/*.jflex" />
+      <mapper type="glob" from="*.jflex" to="*.java"/>
+    </uptodate>
+  </target>
+ 
+  <target name="jflex-notice" unless="jflex.files.uptodate">
+    <echo>
+      One or more of the JFlex .jflex files is newer than its corresponding
+      .java file.  Run the "jflex" target to regenerate the artifacts.
+    </echo>
   </target>
 
   <target name="javacc-check">
@@ -162,6 +183,28 @@
 
   </target>
 	
+  <target name="jflex-check">
+    <fail unless="jflex.present">
+      ##################################################################
+      JFlex not found.
+      JFlex Home: ${jflex.home}
+
+      Please download and install JFlex from:
+
+      &lt;http://jflex.de/download.html&gt;
+
+      Then, create a build.properties file either in your home
+      directory, or within the Lucene directory and set the jflex.home
+      property to the path where JFlex is installed. For example,
+      if you installed JFlex in /usr/local/java/jflex-1.4.1, then set the
+      jflex.home property to:
+
+      jflex.home=/usr/local/java/jflex-1.4.1
+
+      ##################################################################
+    </fail>
+  </target>
+
   <target name="compile-core" depends="init, clover"
           description="Compiles core classes">
     <compile

Modified: lucene/java/trunk/contrib/miscellaneous/src/test/org/apache/lucene/misc/ChainedFilterTest.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/miscellaneous/src/test/org/apache/lucene/misc/ChainedFilterTest.java?view=diff&rev=564036&r1=564035&r2=564036
==============================================================================
--- lucene/java/trunk/contrib/miscellaneous/src/test/org/apache/lucene/misc/ChainedFilterTest.java (original)
+++ lucene/java/trunk/contrib/miscellaneous/src/test/org/apache/lucene/misc/ChainedFilterTest.java Wed Aug  8 15:26:44 2007
@@ -18,8 +18,7 @@
  */
 
 import junit.framework.TestCase;
-import java.util.Calendar;
-import java.util.Date;
+import java.util.*;
 import java.text.ParseException;
 import java.text.SimpleDateFormat;
 import org.apache.lucene.index.IndexWriter;
@@ -131,7 +130,7 @@
   }
 
   private Date parseDate(String s) throws ParseException {
-    return new SimpleDateFormat("yyyy MMM dd").parse(s);
+    return new SimpleDateFormat("yyyy MMM dd", Locale.US).parse(s);
   }
 
 }

Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardFilter.java?view=diff&rev=564036&r1=564035&r2=564036
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardFilter.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardFilter.java Wed Aug  8 15:26:44 2007
@@ -17,12 +17,12 @@
  * limitations under the License.
  */
 
-import org.apache.lucene.analysis.*;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
 
 /** Normalizes tokens extracted with {@link StandardTokenizer}. */
 
-public final class StandardFilter extends TokenFilter
-  implements StandardTokenizerConstants  {
+public final class StandardFilter extends TokenFilter {
 
 
   /** Construct filtering <i>in</i>. */
@@ -30,9 +30,9 @@
     super(in);
   }
 
-  private static final String APOSTROPHE_TYPE = tokenImage[APOSTROPHE];
-  private static final String ACRONYM_TYPE = tokenImage[ACRONYM];
-  
+  private static final String APOSTROPHE_TYPE = StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.APOSTROPHE];
+  private static final String ACRONYM_TYPE = StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.ACRONYM];
+
   /** Returns the next token in the stream, or null at EOS.
    * <p>Removes <tt>'s</tt> from the end of words.
    * <p>Removes dots from acronyms.

Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java?view=diff&rev=564036&r1=564035&r2=564036
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java Wed Aug  8 15:26:44 2007
@@ -1,9 +1,29 @@
-/* Generated By:JavaCC: Do not edit this line. StandardTokenizer.java */
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
 package org.apache.lucene.analysis.standard;
 
-import java.io.*;
+import java.io.IOException;
+import java.io.Reader;
+
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.Tokenizer;
 
-/** A grammar-based tokenizer constructed with JavaCC.
+/** A grammar-based tokenizer constructed with JFlex
  *
  * <p> This should be a good tokenizer for most European-language documents:
  *
@@ -19,188 +39,47 @@
  * not suit your application, please consider copying this source code
  * directory to your project and maintaining your own grammar-based tokenizer.
  */
-public class StandardTokenizer extends org.apache.lucene.analysis.Tokenizer implements StandardTokenizerConstants {
 
-  /** Constructs a tokenizer for this Reader. */
-  public StandardTokenizer(Reader reader) {
-    this(new FastCharStream(reader));
-    this.input = reader;
-  }
-
-/** Returns the next token in the stream, or null at EOS.
- * <p>The returned token's type is set to an element of {@link
- * StandardTokenizerConstants#tokenImage}.
- */
-  final public org.apache.lucene.analysis.Token next() throws ParseException, IOException {
-  Token token = null;
-    switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
-    case ALPHANUM:
-      token = jj_consume_token(ALPHANUM);
-      break;
-    case APOSTROPHE:
-      token = jj_consume_token(APOSTROPHE);
-      break;
-    case ACRONYM:
-      token = jj_consume_token(ACRONYM);
-      break;
-    case COMPANY:
-      token = jj_consume_token(COMPANY);
-      break;
-    case EMAIL:
-      token = jj_consume_token(EMAIL);
-      break;
-    case HOST:
-      token = jj_consume_token(HOST);
-      break;
-    case NUM:
-      token = jj_consume_token(NUM);
-      break;
-    case CJ:
-      token = jj_consume_token(CJ);
-      break;
-    case 0:
-      token = jj_consume_token(0);
-      break;
-    default:
-      jj_la1[0] = jj_gen;
-      jj_consume_token(-1);
-      throw new ParseException();
-    }
-      if (token.kind == EOF) {
-        {if (true) return null;}
-      } else {
-        {if (true) return
-          new org.apache.lucene.analysis.Token(token.image,
-                                        token.beginColumn,token.endColumn,
-                                        tokenImage[token.kind]);}
-      }
-    throw new Error("Missing return statement in function");
-  }
-
-  public StandardTokenizerTokenManager token_source;
-  public Token token, jj_nt;
-  private int jj_ntk;
-  private int jj_gen;
-  final private int[] jj_la1 = new int[1];
-  static private int[] jj_la1_0;
-  static {
-      jj_la1_0();
-   }
-   private static void jj_la1_0() {
-      jj_la1_0 = new int[] {0x10ff,};
-   }
-
-  public StandardTokenizer(CharStream stream) {
-    token_source = new StandardTokenizerTokenManager(stream);
-    token = new Token();
-    jj_ntk = -1;
-    jj_gen = 0;
-    for (int i = 0; i < 1; i++) jj_la1[i] = -1;
-  }
-
-  public void ReInit(CharStream stream) {
-    token_source.ReInit(stream);
-    token = new Token();
-    jj_ntk = -1;
-    jj_gen = 0;
-    for (int i = 0; i < 1; i++) jj_la1[i] = -1;
-  }
-
-  public StandardTokenizer(StandardTokenizerTokenManager tm) {
-    token_source = tm;
-    token = new Token();
-    jj_ntk = -1;
-    jj_gen = 0;
-    for (int i = 0; i < 1; i++) jj_la1[i] = -1;
-  }
-
-  public void ReInit(StandardTokenizerTokenManager tm) {
-    token_source = tm;
-    token = new Token();
-    jj_ntk = -1;
-    jj_gen = 0;
-    for (int i = 0; i < 1; i++) jj_la1[i] = -1;
-  }
-
-  final private Token jj_consume_token(int kind) throws ParseException {
-    Token oldToken;
-    if ((oldToken = token).next != null) token = token.next;
-    else token = token.next = token_source.getNextToken();
-    jj_ntk = -1;
-    if (token.kind == kind) {
-      jj_gen++;
-      return token;
+public class StandardTokenizer extends Tokenizer {
+    /** A private instance of the JFlex-constructed scanner */
+    private final StandardTokenizerImpl scanner;
+
+    /**
+     * Creates a new instance of the {@link StandardTokenizer}. Attaches the
+     * <code>input</code> to a newly created JFlex scanner.
+     */
+    public StandardTokenizer(Reader input) {
+	this.input = input;
+	this.scanner = new StandardTokenizerImpl(input);
+    }
+
+    /*
+     * (non-Javadoc)
+     *
+     * @see org.apache.lucene.analysis.TokenStream#next()
+     */
+    public Token next() throws IOException {
+	int tokenType = scanner.getNextToken();
+
+	if (tokenType == StandardTokenizerImpl.YYEOF) {
+	    return null;
+	}
+
+	int startPosition = scanner.yychar();
+
+	final String tokenImage = scanner.yytext();
+	return new Token(tokenImage, startPosition, startPosition
+		+ tokenImage.length(),
+		StandardTokenizerImpl.TOKEN_TYPES[tokenType]);
+    }
+
+    /*
+     * (non-Javadoc)
+     *
+     * @see org.apache.lucene.analysis.TokenStream#reset()
+     */
+    public void reset() throws IOException {
+	super.reset();
+	scanner.yyreset(input);
     }
-    token = oldToken;
-    jj_kind = kind;
-    throw generateParseException();
-  }
-
-  final public Token getNextToken() {
-    if (token.next != null) token = token.next;
-    else token = token.next = token_source.getNextToken();
-    jj_ntk = -1;
-    jj_gen++;
-    return token;
-  }
-
-  final public Token getToken(int index) {
-    Token t = token;
-    for (int i = 0; i < index; i++) {
-      if (t.next != null) t = t.next;
-      else t = t.next = token_source.getNextToken();
-    }
-    return t;
-  }
-
-  final private int jj_ntk() {
-    if ((jj_nt=token.next) == null)
-      return (jj_ntk = (token.next=token_source.getNextToken()).kind);
-    else
-      return (jj_ntk = jj_nt.kind);
-  }
-
-  private java.util.Vector jj_expentries = new java.util.Vector();
-  private int[] jj_expentry;
-  private int jj_kind = -1;
-
-  public ParseException generateParseException() {
-    jj_expentries.removeAllElements();
-    boolean[] la1tokens = new boolean[16];
-    for (int i = 0; i < 16; i++) {
-      la1tokens[i] = false;
-    }
-    if (jj_kind >= 0) {
-      la1tokens[jj_kind] = true;
-      jj_kind = -1;
-    }
-    for (int i = 0; i < 1; i++) {
-      if (jj_la1[i] == jj_gen) {
-        for (int j = 0; j < 32; j++) {
-          if ((jj_la1_0[i] & (1<<j)) != 0) {
-            la1tokens[j] = true;
-          }
-        }
-      }
-    }
-    for (int i = 0; i < 16; i++) {
-      if (la1tokens[i]) {
-        jj_expentry = new int[1];
-        jj_expentry[0] = i;
-        jj_expentries.addElement(jj_expentry);
-      }
-    }
-    int[][] exptokseq = new int[jj_expentries.size()][];
-    for (int i = 0; i < jj_expentries.size(); i++) {
-      exptokseq[i] = (int[])jj_expentries.elementAt(i);
-    }
-    return new ParseException(token, exptokseq, tokenImage);
-  }
-
-  final public void enable_tracing() {
-  }
-
-  final public void disable_tracing() {
-  }
-
 }

Added: lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java?view=auto&rev=564036
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java (added)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java Wed Aug  8 15:26:44 2007
@@ -0,0 +1,639 @@
+/* The following code was generated by JFlex 1.4.1 on 8/8/07 10:18 PM */
+
+package org.apache.lucene.analysis.standard;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+class StandardTokenizerImpl {
+
+  /** This character denotes the end of file */
+  public static final int YYEOF = -1;
+
+  /** initial size of the lookahead buffer */
+  private static final int ZZ_BUFFERSIZE = 16384;
+
+  /** lexical states */
+  public static final int YYINITIAL = 0;
+
+  /** 
+   * Translates characters to character classes
+   */
+  private static final String ZZ_CMAP_PACKED = 
+    "\11\0\1\0\1\16\1\0\1\0\1\15\22\0\1\0\5\0\1\3"+
+    "\1\1\4\0\1\7\1\5\1\2\1\7\12\11\6\0\1\4\32\10"+
+    "\4\0\1\6\1\0\32\10\105\0\27\10\1\0\37\10\1\0\u0568\10"+
+    "\12\12\206\10\12\12\u026c\10\12\12\166\10\12\12\166\10\12\12\166\10"+
+    "\12\12\166\10\12\12\167\10\11\12\166\10\12\12\166\10\12\12\166\10"+
+    "\12\12\340\10\12\12\166\10\12\12\u0166\10\12\12\266\10\u0100\10\u0e00\10"+
+    "\u1040\0\u0150\14\140\0\20\14\u0100\0\200\14\200\0\u19c0\14\100\0\u5200\14"+
+    "\u0c00\0\u2bb0\13\u2150\0\u0200\14\u0465\0\73\14\75\10\43\0";
+
+  /** 
+   * Translates characters to character classes
+   */
+  private static final char [] ZZ_CMAP = zzUnpackCMap(ZZ_CMAP_PACKED);
+
+  /** 
+   * Translates DFA states to action switch labels.
+   */
+  private static final int [] ZZ_ACTION = zzUnpackAction();
+
+  private static final String ZZ_ACTION_PACKED_0 =
+    "\1\0\1\1\4\2\1\3\1\1\14\0\1\4\4\5"+
+    "\2\6\2\0\1\7\1\0\1\7\3\5\6\7\3\5"+
+    "\1\10\4\0\1\10\2\0\2\10\2\5\1\11";
+
+  private static int [] zzUnpackAction() {
+    int [] result = new int[57];
+    int offset = 0;
+    offset = zzUnpackAction(ZZ_ACTION_PACKED_0, offset, result);
+    return result;
+  }
+
+  private static int zzUnpackAction(String packed, int offset, int [] result) {
+    int i = 0;       /* index in packed string  */
+    int j = offset;  /* index in unpacked array */
+    int l = packed.length();
+    while (i < l) {
+      int count = packed.charAt(i++);
+      int value = packed.charAt(i++);
+      do result[j++] = value; while (--count > 0);
+    }
+    return j;
+  }
+
+
+  /** 
+   * Translates a state to a row index in the transition table
+   */
+  private static final int [] ZZ_ROWMAP = zzUnpackRowMap();
+
+  private static final String ZZ_ROWMAP_PACKED_0 =
+    "\0\0\0\17\0\36\0\55\0\74\0\113\0\17\0\132"+
+    "\0\151\0\170\0\207\0\226\0\245\0\264\0\303\0\322"+
+    "\0\341\0\360\0\377\0\u010e\0\u011d\0\u012c\0\u013b\0\u014a"+
+    "\0\u0159\0\207\0\u0168\0\u0177\0\u0186\0\u0195\0\u01a4\0\u01b3"+
+    "\0\u01c2\0\u01d1\0\u01e0\0\u01ef\0\u01fe\0\u020d\0\u021c\0\u022b"+
+    "\0\u023a\0\u0249\0\u0258\0\u0267\0\u0276\0\u0285\0\u0294\0\u02a3"+
+    "\0\u02b2\0\u02c1\0\u02d0\0\u02df\0\170\0\377\0\u02ee\0\u02fd"+
+    "\0\u030c";
+
+  private static int [] zzUnpackRowMap() {
+    int [] result = new int[57];
+    int offset = 0;
+    offset = zzUnpackRowMap(ZZ_ROWMAP_PACKED_0, offset, result);
+    return result;
+  }
+
+  private static int zzUnpackRowMap(String packed, int offset, int [] result) {
+    int i = 0;  /* index in packed string  */
+    int j = offset;  /* index in unpacked array */
+    int l = packed.length();
+    while (i < l) {
+      int high = packed.charAt(i++) << 16;
+      result[j++] = high | packed.charAt(i++);
+    }
+    return j;
+  }
+
+  /** 
+   * The transition table of the DFA
+   */
+  private static final int [] ZZ_TRANS = zzUnpackTrans();
+
+  private static final String ZZ_TRANS_PACKED_0 =
+    "\10\2\1\3\1\4\1\5\1\6\1\7\1\10\1\2"+
+    "\20\0\1\11\1\12\1\13\1\14\2\15\1\16\1\3"+
+    "\1\4\1\5\1\6\5\0\1\17\1\0\1\20\2\21"+
+    "\1\22\3\4\1\6\4\0\1\11\1\23\1\13\1\14"+
+    "\2\21\1\22\1\5\1\4\1\5\1\6\5\0\1\24"+
+    "\1\0\1\20\2\15\1\16\4\6\21\0\1\2\10\0"+
+    "\1\25\1\0\1\25\14\0\1\26\1\27\1\30\1\31"+
+    "\13\0\1\32\1\0\1\32\14\0\1\33\1\34\1\33"+
+    "\1\34\13\0\1\35\2\36\1\37\13\0\1\16\2\40"+
+    "\14\0\1\41\2\42\1\43\13\0\4\34\13\0\1\44"+
+    "\2\45\1\46\13\0\1\47\2\50\1\51\13\0\1\52"+
+    "\1\42\1\53\1\43\13\0\1\54\2\27\1\31\4\0"+
+    "\1\11\6\0\1\25\1\0\1\25\6\0\1\55\1\0"+
+    "\1\20\2\56\1\0\1\26\1\27\1\30\1\31\5\0"+
+    "\1\57\1\0\1\20\2\60\1\61\3\27\1\31\5\0"+
+    "\1\62\1\0\1\20\2\60\1\61\1\30\1\27\1\30"+
+    "\1\31\5\0\1\63\1\0\1\20\2\56\1\0\4\31"+
+    "\5\0\1\64\2\0\1\64\2\0\1\33\1\34\1\33"+
+    "\1\34\5\0\1\64\2\0\1\64\2\0\4\34\5\0"+
+    "\1\56\1\0\1\20\2\56\1\0\1\35\2\36\1\37"+
+    "\5\0\1\60\1\0\1\20\2\60\1\61\3\36\1\37"+
+    "\5\0\1\56\1\0\1\20\2\56\1\0\4\37\5\0"+
+    "\1\61\2\0\3\61\3\40\6\0\1\24\1\0\1\20"+
+    "\2\15\1\16\1\41\2\42\1\43\5\0\1\17\1\0"+
+    "\1\20\2\21\1\22\3\42\1\43\5\0\1\24\1\0"+
+    "\1\20\2\15\1\16\4\43\5\0\1\15\1\0\1\20"+
+    "\2\15\1\16\1\44\2\45\1\46\5\0\1\21\1\0"+
+    "\1\20\2\21\1\22\3\45\1\46\5\0\1\15\1\0"+
+    "\1\20\2\15\1\16\4\46\5\0\1\16\2\0\3\16"+
+    "\1\47\2\50\1\51\5\0\1\22\2\0\3\22\3\50"+
+    "\1\51\5\0\1\16\2\0\3\16\4\51\5\0\1\65"+
+    "\1\0\1\20\2\15\1\16\1\52\1\42\1\53\1\43"+
+    "\5\0\1\66\1\0\1\20\2\21\1\22\1\53\1\42"+
+    "\1\53\1\43\5\0\1\63\1\0\1\20\2\56\1\0"+
+    "\1\54\2\27\1\31\13\0\1\67\1\31\1\67\1\31"+
+    "\13\0\4\37\13\0\4\43\13\0\4\46\13\0\4\51"+
+    "\13\0\1\70\1\43\1\70\1\43\13\0\4\31\13\0"+
+    "\4\71\5\0\1\55\1\0\1\20\2\56\1\0\1\67"+
+    "\1\31\1\67\1\31\5\0\1\65\1\0\1\20\2\15"+
+    "\1\16\1\70\1\43\1\70\1\43\5\0\1\64\2\0"+
+    "\1\64\2\0\4\71\3\0";
+
+  private static int [] zzUnpackTrans() {
+    int [] result = new int[795];
+    int offset = 0;
+    offset = zzUnpackTrans(ZZ_TRANS_PACKED_0, offset, result);
+    return result;
+  }
+
+  private static int zzUnpackTrans(String packed, int offset, int [] result) {
+    int i = 0;       /* index in packed string  */
+    int j = offset;  /* index in unpacked array */
+    int l = packed.length();
+    while (i < l) {
+      int count = packed.charAt(i++);
+      int value = packed.charAt(i++);
+      value--;
+      do result[j++] = value; while (--count > 0);
+    }
+    return j;
+  }
+
+
+  /* error codes */
+  private static final int ZZ_UNKNOWN_ERROR = 0;
+  private static final int ZZ_NO_MATCH = 1;
+  private static final int ZZ_PUSHBACK_2BIG = 2;
+
+  /* error messages for the codes above */
+  private static final String ZZ_ERROR_MSG[] = {
+    "Unkown internal scanner error",
+    "Error: could not match input",
+    "Error: pushback value was too large"
+  };
+
+  /**
+   * ZZ_ATTRIBUTE[aState] contains the attributes of state <code>aState</code>
+   */
+  private static final int [] ZZ_ATTRIBUTE = zzUnpackAttribute();
+
+  private static final String ZZ_ATTRIBUTE_PACKED_0 =
+    "\1\0\1\11\4\1\1\11\1\1\14\0\7\1\2\0"+
+    "\1\1\1\0\16\1\4\0\1\1\2\0\5\1";
+
+  private static int [] zzUnpackAttribute() {
+    int [] result = new int[57];
+    int offset = 0;
+    offset = zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset, result);
+    return result;
+  }
+
+  private static int zzUnpackAttribute(String packed, int offset, int [] result) {
+    int i = 0;       /* index in packed string  */
+    int j = offset;  /* index in unpacked array */
+    int l = packed.length();
+    while (i < l) {
+      int count = packed.charAt(i++);
+      int value = packed.charAt(i++);
+      do result[j++] = value; while (--count > 0);
+    }
+    return j;
+  }
+
+  /** the input device */
+  private java.io.Reader zzReader;
+
+  /** the current state of the DFA */
+  private int zzState;
+
+  /** the current lexical state */
+  private int zzLexicalState = YYINITIAL;
+
+  /** this buffer contains the current text to be matched and is
+      the source of the yytext() string */
+  private char zzBuffer[] = new char[ZZ_BUFFERSIZE];
+
+  /** the textposition at the last accepting state */
+  private int zzMarkedPos;
+
+  /** the textposition at the last state to be included in yytext */
+  private int zzPushbackPos;
+
+  /** the current text position in the buffer */
+  private int zzCurrentPos;
+
+  /** startRead marks the beginning of the yytext() string in the buffer */
+  private int zzStartRead;
+
+  /** endRead marks the last character in the buffer, that has been read
+      from input */
+  private int zzEndRead;
+
+  /** number of newlines encountered up to the start of the matched text */
+  private int yyline;
+
+  /** the number of characters up to the start of the matched text */
+  private int yychar;
+
+  /**
+   * the number of characters from the last newline up to the start of the 
+   * matched text
+   */
+  private int yycolumn;
+
+  /** 
+   * zzAtBOL == true <=> the scanner is currently at the beginning of a line
+   */
+  private boolean zzAtBOL = true;
+
+  /** zzAtEOF == true <=> the scanner is at the EOF */
+  private boolean zzAtEOF;
+
+  /* user code: */
+
+public static final int ALPHANUM          = 0;
+public static final int APOSTROPHE        = 1;
+public static final int ACRONYM           = 2;
+public static final int COMPANY           = 3;
+public static final int EMAIL             = 4;
+public static final int HOST              = 5;
+public static final int NUM               = 6;
+public static final int CJ                = 7;
+
+public static final String [] TOKEN_TYPES = new String [] {
+    "<ALPHANUM>",
+    "<APOSTROPHE>",
+    "<ACRONYM>",
+    "<COMPANY>",
+    "<EMAIL>",
+    "<HOST>",
+    "<NUM>",
+    "<CJ>"
+};
+
+public final int yychar()
+{
+    return yychar;
+}
+
+
+  /**
+   * Creates a new scanner
+   * There is also a java.io.InputStream version of this constructor.
+   *
+   * @param   in  the java.io.Reader to read input from.
+   */
+  StandardTokenizerImpl(java.io.Reader in) {
+    this.zzReader = in;
+  }
+
+  /**
+   * Creates a new scanner.
+   * There is also java.io.Reader version of this constructor.
+   *
+   * @param   in  the java.io.Inputstream to read input from.
+   */
+  StandardTokenizerImpl(java.io.InputStream in) {
+    this(new java.io.InputStreamReader(in));
+  }
+
+  /** 
+   * Unpacks the compressed character translation table.
+   *
+   * @param packed   the packed character translation table
+   * @return         the unpacked character translation table
+   */
+  private static char [] zzUnpackCMap(String packed) {
+    char [] map = new char[0x10000];
+    int i = 0;  /* index in packed string  */
+    int j = 0;  /* index in unpacked array */
+    while (i < 156) {
+      int  count = packed.charAt(i++);
+      char value = packed.charAt(i++);
+      do map[j++] = value; while (--count > 0);
+    }
+    return map;
+  }
+
+
+  /**
+   * Refills the input buffer.
+   *
+   * @return      <code>false</code>, iff there was new input
+   *              (<code>true</code> signals end of stream was reached).
+   * 
+   * @exception   java.io.IOException  if any I/O-Error occurs
+   */
+  private boolean zzRefill() throws java.io.IOException {
+
+    /* first: make room (if you can) */
+    if (zzStartRead > 0) {
+      System.arraycopy(zzBuffer, zzStartRead,
+                       zzBuffer, 0,
+                       zzEndRead-zzStartRead);
+
+      /* translate stored positions */
+      zzEndRead-= zzStartRead;
+      zzCurrentPos-= zzStartRead;
+      zzMarkedPos-= zzStartRead;
+      zzPushbackPos-= zzStartRead;
+      zzStartRead = 0;
+    }
+
+    /* is the buffer big enough? */
+    if (zzCurrentPos >= zzBuffer.length) {
+      /* if not: blow it up (double the capacity) */
+      char newBuffer[] = new char[zzCurrentPos*2];
+      System.arraycopy(zzBuffer, 0, newBuffer, 0, zzBuffer.length);
+      zzBuffer = newBuffer;
+    }
+
+    /* finally: fill the buffer with new input */
+    int numRead = zzReader.read(zzBuffer, zzEndRead,
+                                            zzBuffer.length-zzEndRead);
+
+    if (numRead < 0) {
+      // read() returned -1: end of stream, no characters were added
+      return true;
+    }
+    else {
+      zzEndRead+= numRead;
+      return false;
+    }
+  }
+
+    
+  /**
+   * Closes the input stream.
+   *
+   * @throws java.io.IOException if closing the underlying reader fails
+   */
+  public final void yyclose() throws java.io.IOException {
+    zzAtEOF = true;            /* indicate end of file */
+    zzEndRead = zzStartRead;  /* invalidate buffer    */
+
+    if (zzReader != null)
+      zzReader.close();
+  }
+
+
+  /**
+   * Resets the scanner to read from a new input stream.
+   * Does not close the old reader.
+   *
+   * All internal variables are reset, the old input stream 
+   * <b>cannot</b> be reused (internal buffer is discarded and lost).
+   * Lexical state is set to <tt>ZZ_INITIAL</tt>.
+   *
+   * @param reader   the new input stream 
+   */
+  public final void yyreset(java.io.Reader reader) {
+    zzReader = reader;
+    zzAtBOL  = true;
+    zzAtEOF  = false;
+    // zzEndRead == zzStartRead makes the buffer look empty; the backing
+    // array is reused and its stale contents are never re-read
+    zzEndRead = zzStartRead = 0;
+    zzCurrentPos = zzMarkedPos = zzPushbackPos = 0;
+    yyline = yychar = yycolumn = 0;
+    zzLexicalState = YYINITIAL;
+  }
+
+
+  /**
+   * Returns the current lexical state
+   * (this grammar declares no extra states, so this stays YYINITIAL
+   * unless yybegin is called externally).
+   */
+  public final int yystate() {
+    return zzLexicalState;
+  }
+
+
+  /**
+   * Enters a new lexical state.
+   *
+   * @param newState the new lexical state
+   */
+  public final void yybegin(int newState) {
+    zzLexicalState = newState;  // takes effect on the next getNextToken() call
+  }
+
+
+  /**
+   * Returns the text matched by the current regular expression,
+   * i.e. the buffer span [zzStartRead, zzMarkedPos).
+   */
+  public final String yytext() {
+    return new String( zzBuffer, zzStartRead, zzMarkedPos-zzStartRead );
+  }
+
+
+  /**
+   * Returns the character at position <tt>pos</tt> from the 
+   * matched text. 
+   * 
+   * It is equivalent to yytext().charAt(pos), but faster
+   * (no String allocation; reads the buffer directly).
+   *
+   * @param pos the position of the character to fetch. 
+   *            A value from 0 to yylength()-1.
+   *
+   * @return the character at position pos
+   */
+  public final char yycharat(int pos) {
+    return zzBuffer[zzStartRead+pos];
+  }
+
+
+  /**
+   * Returns the length of the matched text region
+   * (in UTF-16 code units, same unit as the buffer).
+   */
+  public final int yylength() {
+    return zzMarkedPos-zzStartRead;
+  }
+
+
+  /**
+   * Reports an error that occurred while scanning.
+   *
+   * In a well-formed scanner (no or only correct usage of 
+   * yypushback(int) and a match-all fallback rule) this method 
+   * will only be called with things that "Can't Possibly Happen".
+   * If this method is called, something is seriously wrong
+   * (e.g. a JFlex bug producing a faulty scanner etc.).
+   *
+   * Usual syntax/scanner level error handling should be done
+   * in error fallback rules.
+   *
+   * @param   errorCode  the code of the error message to display
+   */
+  private void zzScanError(int errorCode) {
+    String message;
+    try {
+      message = ZZ_ERROR_MSG[errorCode];
+    }
+    catch (ArrayIndexOutOfBoundsException e) {
+      // unknown code: fall back to the generic message rather than fail here
+      message = ZZ_ERROR_MSG[ZZ_UNKNOWN_ERROR];
+    }
+
+    throw new Error(message);
+  } 
+
+
+  /**
+   * Pushes the specified amount of characters back into the input stream.
+   *
+   * They will be read again by the next call of the scanning method.
+   *
+   * @param number  the number of characters to be read again.
+   *                This number must not be greater than yylength()!
+   */
+  public void yypushback(int number)  {
+    if ( number > yylength() )
+      zzScanError(ZZ_PUSHBACK_2BIG);
+
+    // rewinding the match end is sufficient: scanning resumes at zzMarkedPos
+    zzMarkedPos -= number;
+  }
+
+
+  /**
+   * Resumes scanning until the next regular expression is matched,
+   * the end of input is encountered or an I/O-Error occurs.
+   *
+   * @return      the next token (one of the token-type constants, or YYEOF)
+   * @exception   java.io.IOException  if any I/O-Error occurs
+   */
+  public int getNextToken() throws java.io.IOException {
+    int zzInput;
+    int zzAction;
+
+    // cached fields: local copies of the hot fields, written back before
+    // anything that may change them (zzRefill) and after each match
+    int zzCurrentPosL;
+    int zzMarkedPosL;
+    int zzEndReadL = zzEndRead;
+    char [] zzBufferL = zzBuffer;
+    char [] zzCMapL = ZZ_CMAP;
+
+    int [] zzTransL = ZZ_TRANS;
+    int [] zzRowMapL = ZZ_ROWMAP;
+    int [] zzAttrL = ZZ_ATTRIBUTE;
+
+    // outer loop: one iteration per match attempt; actions that return a
+    // token type exit the loop, the ignore action loops around
+    while (true) {
+      zzMarkedPosL = zzMarkedPos;
+
+      // %char support: advance the character counter past the last match
+      yychar+= zzMarkedPosL-zzStartRead;
+
+      zzAction = -1;
+
+      // the next match starts where the previous one ended
+      zzCurrentPosL = zzCurrentPos = zzStartRead = zzMarkedPosL;
+  
+      zzState = zzLexicalState;
+
+
+      // inner loop: drive the DFA until no transition exists, remembering
+      // the last accepting state seen (longest-match semantics)
+      zzForAction: {
+        while (true) {
+    
+          if (zzCurrentPosL < zzEndReadL)
+            zzInput = zzBufferL[zzCurrentPosL++];
+          else if (zzAtEOF) {
+            zzInput = YYEOF;
+            break zzForAction;
+          }
+          else {
+            // store back cached positions
+            zzCurrentPos  = zzCurrentPosL;
+            zzMarkedPos   = zzMarkedPosL;
+            boolean eof = zzRefill();
+            // get translated positions and possibly new buffer
+            zzCurrentPosL  = zzCurrentPos;
+            zzMarkedPosL   = zzMarkedPos;
+            zzBufferL      = zzBuffer;
+            zzEndReadL     = zzEndRead;
+            if (eof) {
+              zzInput = YYEOF;
+              break zzForAction;
+            }
+            else {
+              zzInput = zzBufferL[zzCurrentPosL++];
+            }
+          }
+          // DFA step: translate the char to its class, then look up the
+          // transition for (state, class); -1 means no transition
+          int zzNext = zzTransL[ zzRowMapL[zzState] + zzCMapL[zzInput] ];
+          if (zzNext == -1) break zzForAction;
+          zzState = zzNext;
+
+          int zzAttributes = zzAttrL[zzState];
+          if ( (zzAttributes & 1) == 1 ) {
+            // accepting state: remember it and the match end;
+            // bit 8 set => this match cannot be extended, stop immediately
+            zzAction = zzState;
+            zzMarkedPosL = zzCurrentPosL;
+            if ( (zzAttributes & 8) == 8 ) break zzForAction;
+          }
+
+        }
+      }
+
+      // store back cached position
+      zzMarkedPos = zzMarkedPosL;
+
+      // dispatch the action of the last accepting state; the interleaved
+      // "case nn: break;" labels absorb fall-through from actions that do
+      // not return (here only case 1, the ignore rule, which loops around)
+      switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) {
+        case 5: 
+          { return HOST;
+          }
+        case 10: break;
+        case 8: 
+          { return ACRONYM;
+          }
+        case 11: break;
+        case 1: 
+          { /* ignore */
+          }
+        case 12: break;
+        case 7: 
+          { return NUM;
+          }
+        case 13: break;
+        case 3: 
+          { return CJ;
+          }
+        case 14: break;
+        case 2: 
+          { return ALPHANUM;
+          }
+        case 15: break;
+        case 6: 
+          { return COMPANY;
+          }
+        case 16: break;
+        case 4: 
+          { return APOSTROPHE;
+          }
+        case 17: break;
+        case 9: 
+          { return EMAIL;
+          }
+        case 18: break;
+        default: 
+          if (zzInput == YYEOF && zzStartRead == zzCurrentPos) {
+            zzAtEOF = true;
+            return YYEOF;
+          } 
+          else {
+            zzScanError(ZZ_NO_MATCH);
+          }
+      }
+    }
+  }
+
+
+}

Propchange: lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex?view=auto&rev=564036
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex (added)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex Wed Aug  8 15:26:44 2007
@@ -0,0 +1,121 @@
+package org.apache.lucene.analysis.standard;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+%%
+
+%class StandardTokenizerImpl
+%unicode
+%integer
+%function getNextToken
+%pack
+%char
+
+%{
+
+/* Token type ids returned by getNextToken(); each value must line up with
+   the index of the corresponding entry in TOKEN_TYPES below. */
+public static final int ALPHANUM          = 0;
+public static final int APOSTROPHE        = 1;
+public static final int ACRONYM           = 2;
+public static final int COMPANY           = 3;
+public static final int EMAIL             = 4;
+public static final int HOST              = 5;
+public static final int NUM               = 6;
+public static final int CJ                = 7;
+
+/* Human-readable names, indexed by the constants above. */
+public static final String [] TOKEN_TYPES = new String [] {
+    "<ALPHANUM>",
+    "<APOSTROPHE>",
+    "<ACRONYM>",
+    "<COMPANY>",
+    "<EMAIL>",
+    "<HOST>",
+    "<NUM>",
+    "<CJ>"
+};
+
+/** Character offset of the current token (the JFlex %char counter). */
+public final int yychar()
+{
+    return yychar;
+}
+%}
+
+// basic word: a sequence of digits & letters
+ALPHANUM   = ({LETTER}|{DIGIT}|{KOREAN})+
+
+// internal apostrophes: O'Reilly, you're, O'Reilly's
+// use a post-filter to remove possessives
+APOSTROPHE =  {ALPHA} ("'" {ALPHA})+
+
+// acronyms: U.S.A., I.B.M., etc.
+// use a post-filter to remove dots
+ACRONYM    =  {ALPHA} "." ({ALPHA} ".")+
+
+// company names like AT&T and Excite@Home.
+COMPANY    =  {ALPHA} ("&"|"@") {ALPHA}
+
+// email addresses
+EMAIL      =  {ALPHANUM} (("."|"-"|"_") {ALPHANUM})* "@" {ALPHANUM} (("."|"-") {ALPHANUM})+
+
+// hostname
+HOST       =  {ALPHANUM} ((".") {ALPHANUM})+
+
+// floating point, serial, model numbers, ip addresses, etc.
+// every other segment must have at least one digit
+NUM        = ({ALPHANUM} {P} {HAS_DIGIT}
+           | {HAS_DIGIT} {P} {ALPHANUM}
+           | {ALPHANUM} ({P} {HAS_DIGIT} {P} {ALPHANUM})+
+           | {HAS_DIGIT} ({P} {ALPHANUM} {P} {HAS_DIGIT})+
+           | {ALPHANUM} {P} {HAS_DIGIT} ({P} {ALPHANUM} {P} {HAS_DIGIT})+
+           | {HAS_DIGIT} {P} {ALPHANUM} ({P} {HAS_DIGIT} {P} {ALPHANUM})+)
+
+// punctuation
+P	         = ("_"|"-"|"/"|"."|",")
+
+// at least one digit
+HAS_DIGIT  =
+    ({LETTER}|{DIGIT})*
+    {DIGIT}
+    ({LETTER}|{DIGIT})*
+
+ALPHA      = ({LETTER})+
+
+
+LETTER     = [\u0041-\u005a\u0061-\u007a\u00c0-\u00d6\u00d8-\u00f6\u00f8-\u00ff\u0100-\u1fff\uffa0-\uffdc]
+
+DIGIT      = [\u0030-\u0039\u0660-\u0669\u06f0-\u06f9\u0966-\u096f\u09e6-\u09ef\u0a66-\u0a6f\u0ae6-\u0aef\u0b66-\u0b6f\u0be7-\u0bef\u0c66-\u0c6f\u0ce6-\u0cef\u0d66-\u0d6f\u0e50-\u0e59\u0ed0-\u0ed9\u1040-\u1049]
+
+KOREAN     = [\uac00-\ud7af\u1100-\u11ff]
+
+// Chinese, Japanese
+// NOTE(review): several of these ranges overlap (\u3040-\u318f already
+// covers \u3100-\u312f, \u3040-\u309F and part of \u30A0-\u30FF) —
+// harmless for matching, but could be simplified when regenerating
+CJ         = [\u3040-\u318f\u3100-\u312f\u3040-\u309F\u30A0-\u30FF\u31F0-\u31FF\u3300-\u337f\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff\uff65-\uff9f]
+
+WHITESPACE = \r\n | [ \r\n\t\f]
+
+%%
+
+// rules: listed in priority order; earlier rules win on equal-length matches
+{ALPHANUM}                                                     { return ALPHANUM; }
+{APOSTROPHE}                                                   { return APOSTROPHE; }
+{ACRONYM}                                                      { return ACRONYM; }
+{COMPANY}                                                      { return COMPANY; }
+{EMAIL}                                                        { return EMAIL; }
+{HOST}                                                         { return HOST; }
+{NUM}                                                          { return NUM; }
+{CJ}                                                           { return CJ; }
+
+/** Ignore the rest */
+. | {WHITESPACE}                                               { /* ignore */ }

Propchange: lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/package.html
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/package.html?view=diff&rev=564036&r1=564035&r2=564036
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/package.html (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/package.html Wed Aug  8 15:26:44 2007
@@ -2,14 +2,9 @@
 <html>
 <head>
    <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
-   <meta name="Author" content="Doug Cutting">
+   <meta name="Author" content="Stanislaw Osinski">
 </head>
 <body>
-A grammar-based tokenizer constructed with JavaCC.
-<p>Note that JavaCC defines lots of public classes, methods and fields
-that do not need to be public.&nbsp; These clutter the documentation.&nbsp;
-Sorry.
-<p>Note that because JavaCC defines a class named <tt>Token</tt>, <tt>org.apache.lucene.analysis.Token</tt>
-must always be fully qualified in source code in this package.
+A fast grammar-based tokenizer constructed with JFlex.
 </body>
 </html>

Modified: lucene/java/trunk/src/test/org/apache/lucene/analysis/TestStandardAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/test/org/apache/lucene/analysis/TestStandardAnalyzer.java?view=diff&rev=564036&r1=564035&r2=564036
==============================================================================
--- lucene/java/trunk/src/test/org/apache/lucene/analysis/TestStandardAnalyzer.java (original)
+++ lucene/java/trunk/src/test/org/apache/lucene/analysis/TestStandardAnalyzer.java Wed Aug  8 15:26:44 2007
@@ -23,89 +23,177 @@
 
 public class TestStandardAnalyzer extends TestCase {
 
-  public void assertAnalyzesTo(Analyzer a, String input, String[] expected) throws Exception {
-    TokenStream ts = a.tokenStream("dummy", new StringReader(input));
-    for (int i = 0; i < expected.length; i++) {
-      Token t = ts.next();
-      assertNotNull(t);
-      assertEquals(expected[i], t.termText());
-    }
-    assertNull(ts.next());
-    ts.close();
-  }
-
-
-  public void testStandard() throws Exception {
-    Analyzer a = new StandardAnalyzer();
-
-    // alphanumeric tokens
-    assertAnalyzesTo(a, "B2B", new String[]{"b2b"});
-    assertAnalyzesTo(a, "2B", new String[]{"2b"});
-
-    // underscores are delimiters, but not in email addresses (below)
-    assertAnalyzesTo(a, "word_having_underscore", new String[]{"word", "having", "underscore"});
-    assertAnalyzesTo(a, "word_with_underscore_and_stopwords", new String[]{"word", "underscore", "stopwords"});
-
-    // other delimiters: "-", "/", ","
-    assertAnalyzesTo(a, "some-dashed-phrase",   new String[]{"some", "dashed", "phrase" });
-    assertAnalyzesTo(a, "dogs,chase,cats", new String[]{"dogs", "chase", "cats"});
-    assertAnalyzesTo(a, "ac/dc", new String[]{"ac", "dc"});
-
-    // internal apostrophes: O'Reilly, you're, O'Reilly's
-    // possessives are actually removed by StardardFilter, not the tokenizer
-    assertAnalyzesTo(a, "O'Reilly", new String[]{"o'reilly"});
-    assertAnalyzesTo(a, "you're", new String[]{"you're"});
-    assertAnalyzesTo(a, "she's", new String[]{"she"});
-    assertAnalyzesTo(a, "Jim's", new String[]{"jim"});
-    assertAnalyzesTo(a, "don't", new String[]{"don't"});
-    assertAnalyzesTo(a, "O'Reilly's", new String[]{"o'reilly"});
-
-    // t and s had been stopwords in Lucene <= 2.0, which made it impossible
-    // to correctly search for these terms:
-    assertAnalyzesTo(a, "s-class", new String[]{"s", "class"});
-    assertAnalyzesTo(a, "t-com", new String[]{"t", "com"});
-    // 'a' is still a stopword:
-    assertAnalyzesTo(a, "a-class", new String[]{"class"});
-
-    // company names
-    assertAnalyzesTo(a, "AT&T", new String[]{"at&t"});
-    assertAnalyzesTo(a, "Excite@Home", new String[]{"excite@home"});
-
-    // domain names
-    assertAnalyzesTo(a, "www.nutch.org",   new String[]{"www.nutch.org" });
-
-    // email addresses, possibly with underscores, periods, etc
-    assertAnalyzesTo(a, "test@example.com", new String[]{"test@example.com"});
-    assertAnalyzesTo(a, "first.lastname@example.com", new String[]{"first.lastname@example.com"});
-    assertAnalyzesTo(a, "first_lastname@example.com", new String[]{"first_lastname@example.com"});
-
-    // floating point, serial, model numbers, ip addresses, etc.
-    // every other segment must have at least one digit
-    assertAnalyzesTo(a, "21.35", new String[]{"21.35"});
-    assertAnalyzesTo(a, "R2D2 C3PO", new String[]{"r2d2", "c3po"});
-    assertAnalyzesTo(a, "216.239.63.104",   new String[]{"216.239.63.104"});
-    assertAnalyzesTo(a, "1-2-3",   new String[]{"1-2-3"});
-    assertAnalyzesTo(a, "a1-b2-c3",   new String[]{"a1-b2-c3"});
-    assertAnalyzesTo(a, "a1-b-c3",   new String[]{"a1-b-c3"});
-
-    // numbers
-    assertAnalyzesTo(a, "David has 5000 bones", new String[]{"david", "has", "5000", "bones"});
-
-    // various
-    assertAnalyzesTo(a, "C embedded developers wanted", new String[]{"c", "embedded", "developers", "wanted" });
-    assertAnalyzesTo(a, "foo bar FOO BAR", new String[]{"foo", "bar", "foo", "bar"});
-    assertAnalyzesTo(a, "foo      bar .  FOO <> BAR", new String[]{"foo", "bar", "foo", "bar"});
-    assertAnalyzesTo(a, "\"QUOTED\" word", new String[]{"quoted", "word"});
-
-    // acronyms have their dots stripped
-    assertAnalyzesTo(a, "U.S.A.", new String[]{ "usa" });
-
-    // It would be nice to change the grammar in StandardTokenizer.jj to make "C#" and "C++" end up as tokens.
-    assertAnalyzesTo(a, "C++", new String[]{"c"});
-    assertAnalyzesTo(a, "C#", new String[]{"c"});
+    /* analyzer under test, shared by all test methods */
+    private Analyzer a = new StandardAnalyzer();
 
-    // Korean words
-    assertAnalyzesTo(a, "안녕하세요 한글입니다", new String[]{"안녕하세요", "한글입니다"});
+    /** Convenience overload: checks token images only, ignores token types. */
+    public void assertAnalyzesTo(Analyzer a, String input, String[] expected) throws Exception {
+	assertAnalyzesTo(a, input, expected, null);
+    }
+
+    /**
+     * Tokenizes input with the given analyzer and asserts the token images
+     * (and, if expectedTypes is non-null, the token types) in order, and
+     * that no extra tokens follow.
+     */
+    public void assertAnalyzesTo(Analyzer a, String input, String[] expectedImages, String[] expectedTypes) throws Exception {
+	TokenStream ts = a.tokenStream("dummy", new StringReader(input));
+	for (int i = 0; i < expectedImages.length; i++) {
+	    Token t = ts.next();
+	    assertNotNull(t);
+	    assertEquals(expectedImages[i], t.termText());
+	    if (expectedTypes != null)
+	    {
+		assertEquals(expectedTypes[i], t.type());
+	    }
+	}
+	assertNull(ts.next());
+	ts.close();
+    }
+
+
+    public void testAlphanumeric() throws Exception {
+	// alphanumeric tokens
+	assertAnalyzesTo(a, "B2B", new String[]{"b2b"});
+	assertAnalyzesTo(a, "2B", new String[]{"2b"});
+    }
+
+    public void testUnderscores() throws Exception {
+	// underscores are delimiters, but not in email addresses (below)
+	assertAnalyzesTo(a, "word_having_underscore", new String[]{"word", "having", "underscore"});
+	assertAnalyzesTo(a, "word_with_underscore_and_stopwords", new String[]{"word", "underscore", "stopwords"});
+    }
+
+    public void testDelimiters() throws Exception {
+	// other delimiters: "-", "/", ","
+	assertAnalyzesTo(a, "some-dashed-phrase",   new String[]{"some", "dashed", "phrase" });
+	assertAnalyzesTo(a, "dogs,chase,cats", new String[]{"dogs", "chase", "cats"});
+	assertAnalyzesTo(a, "ac/dc", new String[]{"ac", "dc"});
+    }
+
+    public void testApostrophes() throws Exception {
+	// internal apostrophes: O'Reilly, you're, O'Reilly's
+	// possessives are actually removed by StandardFilter, not the tokenizer
+	assertAnalyzesTo(a, "O'Reilly", new String[]{"o'reilly"});
+	assertAnalyzesTo(a, "you're", new String[]{"you're"});
+	assertAnalyzesTo(a, "she's", new String[]{"she"});
+	assertAnalyzesTo(a, "Jim's", new String[]{"jim"});
+	assertAnalyzesTo(a, "don't", new String[]{"don't"});
+	assertAnalyzesTo(a, "O'Reilly's", new String[]{"o'reilly"});
+    }
+
+    public void testTSADash() throws Exception {
+	// t and s had been stopwords in Lucene <= 2.0, which made it impossible
+	// to correctly search for these terms:
+	assertAnalyzesTo(a, "s-class", new String[]{"s", "class"});
+	assertAnalyzesTo(a, "t-com", new String[]{"t", "com"});
+	// 'a' is still a stopword:
+	assertAnalyzesTo(a, "a-class", new String[]{"class"});
+    }
+
+    public void testCompanyNames() throws Exception {
+	// company names (COMPANY rule: {ALPHA} ("&"|"@") {ALPHA})
+	assertAnalyzesTo(a, "AT&T", new String[]{"at&t"});
+	assertAnalyzesTo(a, "Excite@Home", new String[]{"excite@home"});
+    }
+
+    public void testDomainNames() throws Exception {
+	// domain names
+	assertAnalyzesTo(a, "www.nutch.org",   new String[]{"www.nutch.org" });
+    }
+
+    public void testEMailAddresses() throws Exception {
+	// email addresses, possibly with underscores, periods, etc
+	assertAnalyzesTo(a, "test@example.com", new String[]{"test@example.com"});
+	assertAnalyzesTo(a, "first.lastname@example.com", new String[]{"first.lastname@example.com"});
+	assertAnalyzesTo(a, "first_lastname@example.com", new String[]{"first_lastname@example.com"});
+    }
+
+    public void testNumeric() throws Exception {
+	// floating point, serial, model numbers, ip addresses, etc.
+	// every other segment must have at least one digit
+	assertAnalyzesTo(a, "21.35", new String[]{"21.35"});
+	assertAnalyzesTo(a, "R2D2 C3PO", new String[]{"r2d2", "c3po"});
+	assertAnalyzesTo(a, "216.239.63.104",   new String[]{"216.239.63.104"});
+	assertAnalyzesTo(a, "1-2-3",   new String[]{"1-2-3"});
+	assertAnalyzesTo(a, "a1-b2-c3",   new String[]{"a1-b2-c3"});
+	assertAnalyzesTo(a, "a1-b-c3",   new String[]{"a1-b-c3"});
+    }
+
+    public void testTextWithNumbers() throws Exception {
+	// numbers
+	assertAnalyzesTo(a, "David has 5000 bones", new String[]{"david", "has", "5000", "bones"});
+    }
+
+    public void testVariousText() throws Exception {
+	// various
+	assertAnalyzesTo(a, "C embedded developers wanted", new String[]{"c", "embedded", "developers", "wanted" });
+	assertAnalyzesTo(a, "foo bar FOO BAR", new String[]{"foo", "bar", "foo", "bar"});
+	assertAnalyzesTo(a, "foo      bar .  FOO <> BAR", new String[]{"foo", "bar", "foo", "bar"});
+	assertAnalyzesTo(a, "\"QUOTED\" word", new String[]{"quoted", "word"});
+    }
+
+    public void testAcronyms() throws Exception {
+	// acronyms have their dots stripped
+	assertAnalyzesTo(a, "U.S.A.", new String[]{ "usa" });
+    }
+
+    public void testCPlusPlusHash() throws Exception {
+	// It would be nice to change the grammar in StandardTokenizerImpl.jflex to make "C#" and "C++" end up as tokens.
+	assertAnalyzesTo(a, "C++", new String[]{"c"});
+	assertAnalyzesTo(a, "C#", new String[]{"c"});
+    }
+
+    public void testKorean() throws Exception {
+	// Korean words (KOREAN character class, tokenized like ALPHANUM)
+	assertAnalyzesTo(a, "안녕하세요 한글입니다", new String[]{"안녕하세요", "한글입니다"});
+    }
+
+    // Compliance with the "old" JavaCC-based analyzer, see:
+    // https://issues.apache.org/jira/browse/LUCENE-966#action_12516752
+    // These tests check token types as well as images.
+
+    public void testComplianceFileName() throws Exception {
+	assertAnalyzesTo(a, "2004.jpg",
+		new String[] { "2004.jpg" },
+		new String[] { "<HOST>" });
+    }
 
-  }
+    public void testComplianceNumericIncorrect() throws Exception {
+	// "Incorrect": matches the old analyzer, which typed this <HOST>, not <NUM>
+	assertAnalyzesTo(a, "62.46",
+		new String[] { "62.46" },
+		new String[] { "<HOST>" });
+    }
+
+    public void testComplianceNumericLong() throws Exception {
+	assertAnalyzesTo(a, "978-0-94045043-1",
+		new String[] { "978-0-94045043-1" },
+		new String[] { "<NUM>" });
+    }
+
+    public void testComplianceNumericFile() throws Exception {
+	assertAnalyzesTo(
+		a,
+		"78academyawards/rules/rule02.html",
+		new String[] { "78academyawards/rules/rule02.html" },
+		new String[] { "<NUM>" });
+    }
+
+    public void testComplianceNumericWithUnderscores() throws Exception {
+	assertAnalyzesTo(
+		a,
+		"2006-03-11t082958z_01_ban130523_rtridst_0_ozabs",
+		new String[] { "2006-03-11t082958z_01_ban130523_rtridst_0_ozabs" },
+		new String[] { "<NUM>" });
+    }
+
+    public void testComplianceNumericWithDash() throws Exception {
+	assertAnalyzesTo(a, "mid-20th", new String[] { "mid-20th" },
+		new String[] { "<NUM>" });
+    }
+
+    public void testComplianceManyTokens() throws Exception {
+	assertAnalyzesTo(
+		a,
+		"/money.cnn.com/magazines/fortune/fortune_archive/2007/03/19/8402357/index.htm "
+			+ "safari-0-sheikh-zayed-grand-mosque.jpg",
+		new String[] { "money.cnn.com", "magazines", "fortune",
+			"fortune", "archive/2007/03/19/8402357", "index.htm",
+			"safari-0-sheikh", "zayed", "grand", "mosque.jpg" },
+		new String[] { "<HOST>", "<ALPHANUM>", "<ALPHANUM>",
+			"<ALPHANUM>", "<NUM>", "<HOST>", "<NUM>", "<ALPHANUM>",
+			"<ALPHANUM>", "<HOST>" });
+    }
 }



Mime
View raw message