lucene-java-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From o...@apache.org
Subject svn commit: r431151 - in /lucene/java/trunk: ./ src/demo/org/apache/lucene/demo/html/ src/java/org/apache/lucene/analysis/standard/
Date Sun, 13 Aug 2006 07:02:27 GMT
Author: otis
Date: Sun Aug 13 00:02:26 2006
New Revision: 431151

URL: http://svn.apache.org/viewvc?rev=431151&view=rev
Log:
- LUCENE-478: Updated Unicode code point ranges for CJK

Modified:
    lucene/java/trunk/CHANGES.txt
    lucene/java/trunk/src/demo/org/apache/lucene/demo/html/HTMLParser.java
    lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/CharStream.java
    lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.jj
    lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardTokenizerTokenManager.java

Modified: lucene/java/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/CHANGES.txt?rev=431151&r1=431150&r2=431151&view=diff
==============================================================================
--- lucene/java/trunk/CHANGES.txt (original)
+++ lucene/java/trunk/CHANGES.txt Sun Aug 13 00:02:26 2006
@@ -12,7 +12,11 @@
     Note that this problem still exists for 'a', e.g. in 'a-class' as
     'a' continues to be a stopword.
     (Daniel Naber)
-    
+
+ 2. LUCENE-478: Updated the list of Unicode code point ranges for CJK (now
+    split into CJ and K) in StandardAnalyzer.
+    (John Want and Steven Rowe via Otis Gospodnetic)
+
 New features
 
  1. LUCENE-503: New ThaiAnalyzer and ThaiWordFilter in contrib/analyzers

Modified: lucene/java/trunk/src/demo/org/apache/lucene/demo/html/HTMLParser.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/demo/org/apache/lucene/demo/html/HTMLParser.java?rev=431151&r1=431150&r2=431151&view=diff
==============================================================================
--- lucene/java/trunk/src/demo/org/apache/lucene/demo/html/HTMLParser.java (original)
+++ lucene/java/trunk/src/demo/org/apache/lucene/demo/html/HTMLParser.java Sun Aug 13 00:02:26 2006
@@ -40,6 +40,12 @@
     }
   }
 
+  /**
+   * @deprecated Use HTMLParser(FileInputStream) instead
+   */
+  public HTMLParser(File file) throws FileNotFoundException {
+    this(new FileInputStream(file));
+  }
 
   public String getTitle() throws IOException, InterruptedException {
     if (pipeIn == null)

Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/CharStream.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/CharStream.java?rev=431151&r1=431150&r2=431151&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/CharStream.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/CharStream.java Sun Aug 13 00:02:26 2006
@@ -26,6 +26,20 @@
   char readChar() throws java.io.IOException;
 
   /**
+   * Returns the column position of the character last read.
+   * @deprecated 
+   * @see #getEndColumn
+   */
+  int getColumn();
+
+  /**
+   * Returns the line number of the character last read.
+   * @deprecated 
+   * @see #getEndLine
+   */
+  int getLine();
+
+  /**
    * Returns the column number of the last character for current token (being
    * matched after the last call to BeginTOken).
    */

Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.jj
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.jj?rev=431151&r1=431150&r2=431151&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.jj (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.jj Sun Aug 13 00:02:26 2006
@@ -103,21 +103,24 @@
        "\u00c0"-"\u00d6",
        "\u00d8"-"\u00f6",
        "\u00f8"-"\u00ff",
-       "\u0100"-"\u1fff"
+       "\u0100"-"\u1fff",
+       "\uffa0"-"\uffdc"
       ]
   >
 | < CJ:                                          // Chinese, Japanese
       [
        "\u3040"-"\u318f",
+       "\u31f0"-"\u31ff",
        "\u3300"-"\u337f",
-       "\u3400"-"\u3d2d",
+       "\u3400"-"\u4db5",
        "\u4e00"-"\u9fff",
-       "\uf900"-"\ufaff"
+       "\uf900"-"\ufaff",
+       "\uff65"-"\uff9f"
       ]
   >
 | < KOREAN:                                          // Korean
       [
-       "\uac00"-"\ud7af"
+       "\uac00"-"\ud7a3"
       ]
   >
 | < #DIGIT:					  // unicode digits

Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardTokenizerTokenManager.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardTokenizerTokenManager.java?rev=431151&r1=431150&r2=431151&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardTokenizerTokenManager.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardTokenizerTokenManager.java Sun Aug 13 00:02:26 2006
@@ -41,54 +41,60 @@
    jjCheckNAdd(jjnextStates[start + 1]);
 }
 static final long[] jjbitVec0 = {
-   0x1ff0000000000000L, 0xffffffffffffc000L, 0xffffffffL, 0x600000000000000L
+   0xfff0000000000000L, 0xffffffffffffdfffL, 0xffffffffL, 0x600000000000000L
 };
 static final long[] jjbitVec2 = {
    0x0L, 0xffffffffffffffffL, 0xffffffffffffffffL, 0xffffffffffffffffL
 };
 static final long[] jjbitVec3 = {
-   0xffffffffffffffffL, 0xffffffffffffffffL, 0xffffL, 0x0L
+   0xffffffffffffffffL, 0xffffffffffffffffL, 0xffffL, 0xffff000000000000L
 };
 static final long[] jjbitVec4 = {
    0xffffffffffffffffL, 0xffffffffffffffffL, 0x0L, 0x0L
 };
 static final long[] jjbitVec5 = {
-   0x3fffffffffffL, 0x0L, 0x0L, 0x0L
+   0xffffffffffffffffL, 0xffffffffffffffffL, 0x3fffffffffffffL, 0x0L
 };
 static final long[] jjbitVec6 = {
-   0x0L, 0x0L, 0xfffff00000000000L, 0x7fffffL
+   0x0L, 0xffffffe000000000L, 0xffffffffL, 0x0L
 };
 static final long[] jjbitVec7 = {
-   0xffffffffffffffffL, 0xffffffffffffffffL, 0xffffffffffffL, 0x0L
+   0x0L, 0x0L, 0xfffff00000000000L, 0x7fffffL
 };
 static final long[] jjbitVec8 = {
-   0xfffffffeL, 0x0L, 0x0L, 0x0L
+   0xffffffffffffffffL, 0xffffffffffffffffL, 0xfffffffffL, 0x0L
 };
 static final long[] jjbitVec9 = {
-   0x0L, 0x0L, 0x0L, 0xff7fffffff7fffffL
+   0xfffffffeL, 0x0L, 0x0L, 0x0L
 };
 static final long[] jjbitVec10 = {
-   0x1600L, 0x0L, 0x0L, 0x0L
+   0x0L, 0x0L, 0x0L, 0xff7fffffff7fffffL
 };
 static final long[] jjbitVec11 = {
-   0x0L, 0xffc000000000L, 0x0L, 0xffc000000000L
+   0x0L, 0x0L, 0xffffffff00000000L, 0x1fffffffL
 };
 static final long[] jjbitVec12 = {
-   0x0L, 0x3ff00000000L, 0x0L, 0x3ff000000000000L
+   0x1600L, 0x0L, 0x0L, 0x0L
 };
 static final long[] jjbitVec13 = {
-   0x0L, 0xffc000000000L, 0x0L, 0xff8000000000L
+   0x0L, 0xffc000000000L, 0x0L, 0xffc000000000L
 };
 static final long[] jjbitVec14 = {
-   0x0L, 0xffc000000000L, 0x0L, 0x0L
+   0x0L, 0x3ff00000000L, 0x0L, 0x3ff000000000000L
 };
 static final long[] jjbitVec15 = {
-   0x0L, 0x3ff0000L, 0x0L, 0x3ff0000L
+   0x0L, 0xffc000000000L, 0x0L, 0xff8000000000L
 };
 static final long[] jjbitVec16 = {
-   0x0L, 0x3ffL, 0x0L, 0x0L
+   0x0L, 0xffc000000000L, 0x0L, 0x0L
 };
 static final long[] jjbitVec17 = {
+   0x0L, 0x3ff0000L, 0x0L, 0x3ff0000L
+};
+static final long[] jjbitVec18 = {
+   0x0L, 0x3ffL, 0x0L, 0x0L
+};
+static final long[] jjbitVec19 = {
    0xfffffffeL, 0x0L, 0xfffff00000000000L, 0x7fffffL
 };
 private final int jjMoveNfa_0(int startState, int curPos)
@@ -1012,8 +1018,10 @@
          return ((jjbitVec3[i2] & l2) != 0L);
       case 51:
          return ((jjbitVec4[i2] & l2) != 0L);
-      case 61:
+      case 77:
          return ((jjbitVec5[i2] & l2) != 0L);
+      case 255:
+         return ((jjbitVec6[i2] & l2) != 0L);
       default : 
          if ((jjbitVec0[i1] & l1) != 0L)
             return true;
@@ -1025,9 +1033,9 @@
    switch(hiByte)
    {
       case 215:
-         return ((jjbitVec7[i2] & l2) != 0L);
+         return ((jjbitVec8[i2] & l2) != 0L);
       default : 
-         if ((jjbitVec6[i1] & l1) != 0L)
+         if ((jjbitVec7[i1] & l1) != 0L)
             return true;
          return false;
    }
@@ -1037,9 +1045,11 @@
    switch(hiByte)
    {
       case 0:
-         return ((jjbitVec9[i2] & l2) != 0L);
+         return ((jjbitVec10[i2] & l2) != 0L);
+      case 255:
+         return ((jjbitVec11[i2] & l2) != 0L);
       default : 
-         if ((jjbitVec8[i1] & l1) != 0L)
+         if ((jjbitVec9[i1] & l1) != 0L)
             return true;
          return false;
    }
@@ -1049,18 +1059,18 @@
    switch(hiByte)
    {
       case 6:
-         return ((jjbitVec12[i2] & l2) != 0L);
+         return ((jjbitVec14[i2] & l2) != 0L);
       case 11:
-         return ((jjbitVec13[i2] & l2) != 0L);
+         return ((jjbitVec15[i2] & l2) != 0L);
       case 13:
-         return ((jjbitVec14[i2] & l2) != 0L);
+         return ((jjbitVec16[i2] & l2) != 0L);
       case 14:
-         return ((jjbitVec15[i2] & l2) != 0L);
+         return ((jjbitVec17[i2] & l2) != 0L);
       case 16:
-         return ((jjbitVec16[i2] & l2) != 0L);
+         return ((jjbitVec18[i2] & l2) != 0L);
       default : 
-         if ((jjbitVec10[i1] & l1) != 0L)
-            if ((jjbitVec11[i2] & l2) == 0L)
+         if ((jjbitVec12[i1] & l1) != 0L)
+            if ((jjbitVec13[i2] & l2) == 0L)
                return false;
             else
             return true;
@@ -1072,11 +1082,13 @@
    switch(hiByte)
    {
       case 0:
-         return ((jjbitVec9[i2] & l2) != 0L);
+         return ((jjbitVec10[i2] & l2) != 0L);
       case 215:
-         return ((jjbitVec7[i2] & l2) != 0L);
+         return ((jjbitVec8[i2] & l2) != 0L);
+      case 255:
+         return ((jjbitVec11[i2] & l2) != 0L);
       default : 
-         if ((jjbitVec17[i1] & l1) != 0L)
+         if ((jjbitVec19[i1] & l1) != 0L)
             return true;
          return false;
    }



Mime
View raw message