lucene-java-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From rm...@apache.org
Subject svn commit: r821322 - /lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseFilter.java
Date Sat, 03 Oct 2009 13:54:13 GMT
Author: rmuir
Date: Sat Oct  3 13:54:12 2009
New Revision: 821322

URL: http://svn.apache.org/viewvc?rev=821322&view=rev
Log:
LUCENE-1943: Improve performance of ChineseFilter

Modified:
    lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseFilter.java

Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseFilter.java?rev=821322&r1=821321&r2=821322&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseFilter.java	(original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseFilter.java	Sat Oct  3 13:54:12 2009
@@ -18,9 +18,9 @@
  */
 
 import java.io.IOException;
-import java.util.HashMap;
-import java.util.Map;
+import java.util.Arrays;
 
+import org.apache.lucene.analysis.CharArraySet;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.TermAttribute;
@@ -56,33 +56,32 @@
     };
 
 
-    private Map stopTable;
+    private CharArraySet stopTable;
 
     private TermAttribute termAtt;
     
     public ChineseFilter(TokenStream in) {
         super(in);
 
-        stopTable = new HashMap(STOP_WORDS.length);
-        for (int i = 0; i < STOP_WORDS.length; i++)
-            stopTable.put(STOP_WORDS[i], STOP_WORDS[i]);
+        stopTable = new CharArraySet(Arrays.asList(STOP_WORDS), false);
         termAtt = addAttribute(TermAttribute.class);
     }
 
     public boolean incrementToken() throws IOException {
 
         while (input.incrementToken()) {
-            String text = termAtt.term();
+            char text[] = termAtt.termBuffer();
+            int termLength = termAtt.termLength();
 
           // why not key off token type here assuming ChineseTokenizer comes first?
-            if (stopTable.get(text) == null) {
-                switch (Character.getType(text.charAt(0))) {
+            if (!stopTable.contains(text, 0, termLength)) {
+                switch (Character.getType(text[0])) {
 
                 case Character.LOWERCASE_LETTER:
                 case Character.UPPERCASE_LETTER:
 
                     // English word/token should larger than 1 character.
-                    if (text.length()>1) {
+                    if (termLength>1) {
                         return true;
                     }
                     break;



Mime
View raw message