lucene-java-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From ehatc...@apache.org
Subject svn commit: r353930 - in /lucene/java/trunk: ./ contrib/analyzers/src/java/org/apache/lucene/analysis/cn/ contrib/analyzers/src/test/org/apache/lucene/analysis/cn/
Date Sun, 04 Dec 2005 23:07:48 GMT
Author: ehatcher
Date: Sun Dec  4 15:07:42 2005
New Revision: 353930

URL: http://svn.apache.org/viewcvs?rev=353930&view=rev
Log:
Applied patch for LUCENE-324, correcting token offsets returned by ChineseTokenizer

Added:
    lucene/java/trunk/contrib/analyzers/src/test/org/apache/lucene/analysis/cn/
    lucene/java/trunk/contrib/analyzers/src/test/org/apache/lucene/analysis/cn/TestChineseTokenizer.java
Modified:
    lucene/java/trunk/CHANGES.txt
    lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/ChineseTokenizer.java

Modified: lucene/java/trunk/CHANGES.txt
URL: http://svn.apache.org/viewcvs/lucene/java/trunk/CHANGES.txt?rev=353930&r1=353929&r2=353930&view=diff
==============================================================================
--- lucene/java/trunk/CHANGES.txt (original)
+++ lucene/java/trunk/CHANGES.txt Sun Dec  4 15:07:42 2005
@@ -187,7 +187,7 @@
     It's very useful for searching across multiple fields.
     (Chuck Williams via Yonik Seeley, LUCENE-323)
 
-28. New class ISOLatin1AccentFilter that replaces accented characters in the ISO 
+28. New class ISOLatin1AccentFilter that replaces accented characters in the ISO
     Latin 1 character set by their unaccented equivalent.
     (Sven Duzont via Erik Hatcher)
 
@@ -195,7 +195,7 @@
     This is useful for data like zip codes, ids, and some product names.
     (Erik Hatcher)
 
-30. Copied LengthFilter from contrib area to core. Removes words that are too 
+30. Copied LengthFilter from contrib area to core. Removes words that are too
     long and too short from the stream.
     (David Spencer via Otis and Daniel)
 
@@ -306,8 +306,11 @@
     (Yonik Seeley, LUCENE-462)
 
 18. Fixed inefficient memory usage when loading an index into RAMDirectory.
-	(Volodymyr Bychkoviak via Bernhard, LUCENE-475)
-		
+    (Volodymyr Bychkoviak via Bernhard, LUCENE-475)
+
+19. Corrected term offsets returned by ChineseTokenizer.
+    (Ray Tsang via Erik Hatcher, LUCENE-324)
+
 Optimizations
      
  1. Disk usage (peak requirements during indexing and optimization)

Modified: lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/ChineseTokenizer.java
URL: http://svn.apache.org/viewcvs/lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/ChineseTokenizer.java?rev=353930&r1=353929&r2=353930&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/ChineseTokenizer.java
(original)
+++ lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/ChineseTokenizer.java
Sun Dec  4 15:07:42 2005
@@ -117,6 +117,7 @@
             case Character.OTHER_LETTER:
                 if (length>0) {
                     bufferIndex--;
+                    offset--;
                     return flush();
                 }
                 push(c);

Added: lucene/java/trunk/contrib/analyzers/src/test/org/apache/lucene/analysis/cn/TestChineseTokenizer.java
URL: http://svn.apache.org/viewcvs/lucene/java/trunk/contrib/analyzers/src/test/org/apache/lucene/analysis/cn/TestChineseTokenizer.java?rev=353930&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/src/test/org/apache/lucene/analysis/cn/TestChineseTokenizer.java
(added)
+++ lucene/java/trunk/contrib/analyzers/src/test/org/apache/lucene/analysis/cn/TestChineseTokenizer.java
Sun Dec  4 15:07:42 2005
@@ -0,0 +1,31 @@
+package org.apache.lucene.analysis.cn;
+
+import java.io.IOException;
+import java.io.StringReader;
+
+import junit.framework.TestCase;
+
+import org.apache.lucene.analysis.Token;
+
+/**
+ * @author rayt
+ */
+public class TestChineseTokenizer extends TestCase
+{
+    public void testOtherLetterOffset() throws IOException
+    {
+        String s = "a天b";
+        ChineseTokenizer tokenizer = new ChineseTokenizer(new StringReader(s));
+        Token token;
+
+        int correctStartOffset = 0;
+        int correctEndOffset = 1;
+        while ((token = tokenizer.next()) != null)
+        {
+            assertEquals(correctStartOffset, token.startOffset());
+            assertEquals(correctEndOffset, token.endOffset());
+            correctStartOffset++;
+            correctEndOffset++;
+        }
+    }
+}
\ No newline at end of file



Mime
View raw message