lucene-java-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From sim...@apache.org
Subject svn commit: r823285 - in /lucene/java/trunk/contrib/analyzers/smartcn/src: java/org/apache/lucene/analysis/cn/smart/SmartChineseAnalyzer.java test/org/apache/lucene/analysis/cn/smart/TestSmartChineseAnalyzer.java
Date Thu, 08 Oct 2009 19:21:37 GMT
Author: simonw
Date: Thu Oct  8 19:21:36 2009
New Revision: 823285

URL: http://svn.apache.org/viewvc?rev=823285&view=rev
Log:
LUCENE-1965: Lazy Atomic Loading Stopwords in SmartCN

Modified:
    lucene/java/trunk/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SmartChineseAnalyzer.java
    lucene/java/trunk/contrib/analyzers/smartcn/src/test/org/apache/lucene/analysis/cn/smart/TestSmartChineseAnalyzer.java

Modified: lucene/java/trunk/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SmartChineseAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SmartChineseAnalyzer.java?rev=823285&r1=823284&r2=823285&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SmartChineseAnalyzer.java (original)
+++ lucene/java/trunk/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SmartChineseAnalyzer.java Thu Oct  8 19:21:36 2009
@@ -21,6 +21,7 @@
 import java.io.InputStream;
 import java.io.InputStreamReader;
 import java.io.Reader;
+import java.util.Collections;
 import java.util.Set;
 
 import org.apache.lucene.analysis.Analyzer;
@@ -59,6 +60,48 @@
 public class SmartChineseAnalyzer extends Analyzer {
 
   private final Set stopWords;
+  
+  private static final String DEFAULT_STOPWORD_FILE = "stopwords.txt";
+  
+  private static final String STOPWORD_FILE_COMMENT = "//";
+  
+  /**
+   * Returns an unmodifiable instance of the default stop-words set.
+   * @return an unmodifiable instance of the default stop-words set.
+   */
+  public static Set<String> getDefaultStopSet(){
+    return DefaultSetHolder.DEFAULT_STOP_SET;
+  }
+  
+  /**
+   * Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class 
+   * accesses the static final set the first time.;
+   */
+  private static class DefaultSetHolder {
+    static final Set<String> DEFAULT_STOP_SET;
+
+    static {
+      try {
+        DEFAULT_STOP_SET = loadDefaultStopWordSet();
+      } catch (IOException ex) {
+        // default set should always be present as it is part of the
+        // distribution (JAR)
+        throw new RuntimeException("Unable to load default stopword set");
+      }
+    }
+
+    static Set<String> loadDefaultStopWordSet() throws IOException {
+      InputStream stream = SmartChineseAnalyzer.class
+          .getResourceAsStream(DEFAULT_STOPWORD_FILE);
+      try {
+        InputStreamReader reader = new InputStreamReader(stream, "UTF-8");
+        // make sure it is unmodifiable as we expose it in the outer class
+        return Collections.unmodifiableSet(WordlistLoader.getWordSet(reader, STOPWORD_FILE_COMMENT));
+      } finally {
+        stream.close();
+      }
+    }
+  }
 
   /**
    * Create a new SmartChineseAnalyzer, using the default stopword list.
@@ -79,18 +122,8 @@
    * @param useDefaultStopWords true to use the default stopword list.
    */
   public SmartChineseAnalyzer(boolean useDefaultStopWords) {
-    if (useDefaultStopWords) {
-      try {
-      InputStream stream = this.getClass().getResourceAsStream("stopwords.txt");
-      InputStreamReader reader = new InputStreamReader(stream, "UTF-8");
-      stopWords = WordlistLoader.getWordSet(reader, "//");
-      } catch (IOException e) {
-        // TODO: throw IOException
-        throw new RuntimeException(e);
-      }
-    }else{
-      stopWords = null;
-    }
+    stopWords = useDefaultStopWords ? DefaultSetHolder.DEFAULT_STOP_SET
+        : Collections.EMPTY_SET;
   }
 
   /**
@@ -103,7 +136,7 @@
    * @param stopWords {@link Set} of stopwords to use.
    */
   public SmartChineseAnalyzer(Set stopWords) {
-    this.stopWords = stopWords;
+    this.stopWords = stopWords==null?Collections.EMPTY_SET:stopWords;
   }
 
   public TokenStream tokenStream(String fieldName, Reader reader) {
@@ -113,8 +146,8 @@
     // LowerCaseFilter is not needed, as SegTokenFilter lowercases Basic Latin text.
     // The porter stemming is too strict, this is not a bug, this is a feature:)
     result = new PorterStemFilter(result);
-    if (stopWords != null) {
-      result = new StopFilter(result, stopWords, false);
+    if (!stopWords.isEmpty()) {
+      result = new StopFilter(false,result, stopWords, false);
     }
     return result;
   }
@@ -133,8 +166,8 @@
       streams.tokenStream = new SentenceTokenizer(reader);
       streams.filteredTokenStream = new WordTokenFilter(streams.tokenStream);
       streams.filteredTokenStream = new PorterStemFilter(streams.filteredTokenStream);
-      if (stopWords != null) {
-        streams.filteredTokenStream = new StopFilter(streams.filteredTokenStream, stopWords, false);
+      if (!stopWords.isEmpty()) {
+        streams.filteredTokenStream = new StopFilter(false, streams.filteredTokenStream, stopWords, false);
       }
     } else {
       streams.tokenStream.reset(reader);

Modified: lucene/java/trunk/contrib/analyzers/smartcn/src/test/org/apache/lucene/analysis/cn/smart/TestSmartChineseAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/smartcn/src/test/org/apache/lucene/analysis/cn/smart/TestSmartChineseAnalyzer.java?rev=823285&r1=823284&r2=823285&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/smartcn/src/test/org/apache/lucene/analysis/cn/smart/TestSmartChineseAnalyzer.java (original)
+++ lucene/java/trunk/contrib/analyzers/smartcn/src/test/org/apache/lucene/analysis/cn/smart/TestSmartChineseAnalyzer.java Thu Oct  8 19:21:36 2009
@@ -25,6 +25,7 @@
 
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
 
 public class TestSmartChineseAnalyzer extends BaseTokenStreamTestCase {
   
@@ -33,6 +34,9 @@
     String sentence = "我购买了道具和服装。";
     String result[] = { "我", "购买", "了", "道具", "和", "服装" };
     assertAnalyzesTo(ca, sentence, result);
+    // set stop-words from the outer world - must yield same behavior
+    ca = new SmartChineseAnalyzer(SmartChineseAnalyzer.getDefaultStopSet());
+    assertAnalyzesTo(ca, sentence, result);
   }
   
   /*
@@ -63,11 +67,16 @@
    * if you don't supply (true) to the constructor, or use a different stopwords list,
    * then punctuation is indexed.
    */
-  public void testChineseStopWordsOff() throws Exception {  
-    Analyzer ca = new SmartChineseAnalyzer(false); /* doesnt load stopwords */
+  public void testChineseStopWordsOff() throws Exception {
+    Analyzer[] analyzers = new Analyzer[] {
+        new SmartChineseAnalyzer(false),/* doesn't load stopwords */
+        new SmartChineseAnalyzer(null) /* sets stopwords to empty set */};
     String sentence = "我购买了道具和服装。";
     String result[] = { "我", "购买", "了", "道具", "和", "服装", "," };
-    assertAnalyzesTo(ca, sentence, result);
+    for (Analyzer analyzer : analyzers) {
+      assertAnalyzesTo(analyzer, sentence, result);
+      assertAnalyzesToReuse(analyzer, sentence, result);
+    }
   }
   
   public void testChineseAnalyzer() throws Exception {



Mime
View raw message