lucene-java-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From rm...@apache.org
Subject svn commit: r805766 - in /lucene/java/trunk/contrib/analyzers/common/src: java/org/apache/lucene/analysis/br/ java/org/apache/lucene/analysis/cz/ java/org/apache/lucene/analysis/de/ java/org/apache/lucene/analysis/fr/ java/org/apache/lucene/analysis/nl...
Date Wed, 19 Aug 2009 11:56:32 GMT
Author: rmuir
Date: Wed Aug 19 11:56:31 2009
New Revision: 805766

URL: http://svn.apache.org/viewvc?rev=805766&view=rev
Log:
LUCENE-1794: Ensure analyzer options are applied immediately when using reusable token streams

Added:
    lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cz/customStopWordFile.txt
  (with props)
    lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/nl/customStemDict.txt
  (with props)
Modified:
    lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/br/BrazilianAnalyzer.java
    lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java
    lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java
    lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java
    lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/nl/DutchAnalyzer.java
    lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/br/TestBrazilianStemmer.java
    lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cz/TestCzechAnalyzer.java
    lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/de/TestGermanStemFilter.java
    lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/fr/TestFrenchAnalyzer.java
    lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/nl/TestDutchStemmer.java

Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/br/BrazilianAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/br/BrazilianAnalyzer.java?rev=805766&r1=805765&r2=805766&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/br/BrazilianAnalyzer.java
(original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/br/BrazilianAnalyzer.java
Wed Aug 19 11:56:31 2009
@@ -111,18 +111,21 @@
 	 */
 	public void setStemExclusionTable( String[] exclusionlist ) {
 		excltable = StopFilter.makeStopSet( exclusionlist );
+		setPreviousTokenStream(null); // force a new stemmer to be created
 	}
 	/**
 	 * Builds an exclusionlist from a {@link Map}.
 	 */
 	public void setStemExclusionTable( Map exclusionlist ) {
 		excltable = new HashSet(exclusionlist.keySet());
+		setPreviousTokenStream(null); // force a new stemmer to be created
 	}
 	/**
 	 * Builds an exclusionlist from the words contained in the given file.
 	 */
 	public void setStemExclusionTable( File exclusionlist ) throws IOException {
 		excltable = WordlistLoader.getWordSet( exclusionlist );
+		setPreviousTokenStream(null); // force a new stemmer to be created
 	}
 
 	/**

Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java?rev=805766&r1=805765&r2=805766&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java
(original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java
Wed Aug 19 11:56:31 2009
@@ -100,6 +100,7 @@
      * @param   encoding    Encoding used (win-1250, iso-8859-2, ...), null for default system encoding
      */
     public void loadStopWords( InputStream wordfile, String encoding ) {
+        setPreviousTokenStream(null); // force a new stopfilter to be created
         if ( wordfile == null ) {
             stoptable = new HashSet();
             return;
@@ -121,7 +122,9 @@
             }
 
         } catch ( IOException e ) {
-            stoptable = null;
+          // clear any previous table (if present)
+          // TODO: throw IOException
+          stoptable = new HashSet();
         }
     }
 

Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java?rev=805766&r1=805765&r2=805766&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java
(original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java
Wed Aug 19 11:56:31 2009
@@ -114,6 +114,7 @@
    */
   public void setStemExclusionTable(String[] exclusionlist) {
     exclusionSet = StopFilter.makeStopSet(exclusionlist);
+    setPreviousTokenStream(null); // force a new stemmer to be created
   }
 
   /**
@@ -121,6 +122,7 @@
    */
   public void setStemExclusionTable(Map exclusionlist) {
     exclusionSet = new HashSet(exclusionlist.keySet());
+    setPreviousTokenStream(null); // force a new stemmer to be created
   }
 
   /**
@@ -128,6 +130,7 @@
    */
   public void setStemExclusionTable(File exclusionlist) throws IOException {
     exclusionSet = WordlistLoader.getWordSet(exclusionlist);
+    setPreviousTokenStream(null); // force a new stemmer to be created
   }
 
   /**

Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java?rev=805766&r1=805765&r2=805766&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java
(original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java
Wed Aug 19 11:56:31 2009
@@ -111,6 +111,7 @@
    */
   public void setStemExclusionTable(String[] exclusionlist) {
     excltable = StopFilter.makeStopSet(exclusionlist);
+    setPreviousTokenStream(null); // force a new stemmer to be created
   }
 
   /**
@@ -118,6 +119,7 @@
    */
   public void setStemExclusionTable(Map exclusionlist) {
     excltable = new HashSet(exclusionlist.keySet());
+    setPreviousTokenStream(null); // force a new stemmer to be created
   }
 
   /**
@@ -126,6 +128,7 @@
    */
   public void setStemExclusionTable(File exclusionlist) throws IOException {
     excltable = new HashSet(WordlistLoader.getWordSet(exclusionlist));
+    setPreviousTokenStream(null); // force a new stemmer to be created
   }
 
   /**

Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/nl/DutchAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/nl/DutchAnalyzer.java?rev=805766&r1=805765&r2=805766&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/nl/DutchAnalyzer.java
(original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/nl/DutchAnalyzer.java
Wed Aug 19 11:56:31 2009
@@ -131,6 +131,7 @@
    */
   public void setStemExclusionTable(String[] exclusionlist) {
     excltable = StopFilter.makeStopSet(exclusionlist);
+    setPreviousTokenStream(null); // force a new stemmer to be created
   }
 
   /**
@@ -138,6 +139,7 @@
    */
   public void setStemExclusionTable(HashSet exclusionlist) {
     excltable = exclusionlist;
+    setPreviousTokenStream(null); // force a new stemmer to be created
   }
 
   /**
@@ -146,6 +148,7 @@
   public void setStemExclusionTable(File exclusionlist) {
     try {
       excltable = org.apache.lucene.analysis.WordlistLoader.getWordSet(exclusionlist);
+      setPreviousTokenStream(null); // force a new stemmer to be created
     } catch (IOException e) {
       // TODO: throw IOException
       throw new RuntimeException(e);
@@ -160,6 +163,7 @@
   public void setStemDictionary(File stemdictFile) {
     try {
       stemdict = org.apache.lucene.analysis.WordlistLoader.getStemDict(stemdictFile);
+      setPreviousTokenStream(null); // force a new stemmer to be created
     } catch (IOException e) {
       // TODO: throw IOException
       throw new RuntimeException(e);
@@ -210,7 +214,7 @@
       streams.source = new StandardTokenizer(reader);
       streams.result = new StandardFilter(streams.source);
       streams.result = new StopFilter(streams.result, stoptable);
-      streams.result = new DutchStemFilter(streams.result, excltable);
+      streams.result = new DutchStemFilter(streams.result, excltable, stemdict);
       setPreviousTokenStream(streams);
     } else {
       streams.source.reset(reader);

Modified: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/br/TestBrazilianStemmer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/br/TestBrazilianStemmer.java?rev=805766&r1=805765&r2=805766&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/br/TestBrazilianStemmer.java
(original)
+++ lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/br/TestBrazilianStemmer.java
Wed Aug 19 11:56:31 2009
@@ -139,6 +139,17 @@
     a.setStemExclusionTable(new String[] { "quintessência" });
     checkReuse(a, "quintessência", "quintessência"); // excluded words will be completely unchanged.
   }
+  
+  /* 
+   * Test that changes to the exclusion table are applied immediately
+   * when using reusable token streams.
+   */
+  public void testExclusionTableReuse() throws Exception {
+    BrazilianAnalyzer a = new BrazilianAnalyzer();
+    checkReuse(a, "quintessência", "quintessente");
+    a.setStemExclusionTable(new String[] { "quintessência" });
+    checkReuse(a, "quintessência", "quintessência");
+  }
 
   private void check(final String input, final String expected) throws IOException {
     Analyzer analyzer = new BrazilianAnalyzer(); 

Modified: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cz/TestCzechAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cz/TestCzechAnalyzer.java?rev=805766&r1=805765&r2=805766&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cz/TestCzechAnalyzer.java
(original)
+++ lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cz/TestCzechAnalyzer.java
Wed Aug 19 11:56:31 2009
@@ -17,6 +17,10 @@
  * limitations under the License.
  */
 
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
 import java.io.StringReader;
 
 import junit.framework.TestCase;
@@ -32,17 +36,55 @@
  *
  */
 public class TestCzechAnalyzer extends TestCase {
-
+  File dataDir = new File(System.getProperty("dataDir", "./bin"));
+  File customStopFile = new File(dataDir, "org/apache/lucene/analysis/cz/customStopWordFile.txt");
+  
   public void testStopWord() throws Exception {
     assertAnalyzesTo(new CzechAnalyzer(), "Pokud mluvime o volnem", new String[] { "mluvime", "volnem" });
   }
-  
+    
   public void testReusableTokenStream() throws Exception {
     Analyzer analyzer = new CzechAnalyzer();
     assertAnalyzesToReuse(analyzer, "Pokud mluvime o volnem", new String[] { "mluvime", "volnem" });
     assertAnalyzesToReuse(analyzer, "Česká Republika", new String[] { "česká", "republika" });
   }
 
+  /*
+   * An input stream that always throws IOException for testing.
+   */
+  private class UnreliableInputStream extends InputStream {
+    public int read() throws IOException {
+      throw new IOException();
+    }
+  }
+  
+  /*
+   * The loadStopWords method does not throw IOException on error,
+   * instead previously it set the stoptable to null (versus empty)
+   * this would cause a NPE when it is time to create the StopFilter.
+   */
+  public void testInvalidStopWordFile() throws Exception {
+    CzechAnalyzer cz = new CzechAnalyzer();
+    cz.loadStopWords(new UnreliableInputStream(), "UTF-8");
+    assertAnalyzesTo(cz, "Pokud mluvime o volnem",
+        new String[] { "pokud", "mluvime", "o", "volnem" });
+  }
+  
+  /* 
+   * Test that changes to the stop table via loadStopWords are applied immediately
+   * when using reusable token streams.
+   */
+  public void testStopWordFileReuse() throws Exception {
+    CzechAnalyzer cz = new CzechAnalyzer();
+    assertAnalyzesToReuse(cz, "Česká Republika", 
+      new String[] { "česká", "republika" });
+    
+    InputStream stopwords = new FileInputStream(customStopFile);
+    cz.loadStopWords(stopwords, "UTF-8");
+    
+    assertAnalyzesToReuse(cz, "Česká Republika", new String[] { "česká" });
+  }
+
   private void assertAnalyzesTo(Analyzer a, String input, String[] output) throws Exception {
     TokenStream ts = a.tokenStream("dummy", new StringReader(input));
     TermAttribute text = (TermAttribute) ts.getAttribute(TermAttribute.class);

Added: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cz/customStopWordFile.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cz/customStopWordFile.txt?rev=805766&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cz/customStopWordFile.txt
(added)
+++ lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cz/customStopWordFile.txt
Wed Aug 19 11:56:31 2009
@@ -0,0 +1,3 @@
+examplestopword
+anotherexamplestopword
+republika

Propchange: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cz/customStopWordFile.txt
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/de/TestGermanStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/de/TestGermanStemFilter.java?rev=805766&r1=805765&r2=805766&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/de/TestGermanStemFilter.java
(original)
+++ lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/de/TestGermanStemFilter.java
Wed Aug 19 11:56:31 2009
@@ -89,6 +89,17 @@
     checkReuse(new GermanSubclassAnalyzer(), "Tischen", "Tischen");
   }
 
+  /* 
+   * Test that changes to the exclusion table are applied immediately
+   * when using reusable token streams.
+   */
+  public void testExclusionTableReuse() throws Exception {
+    GermanAnalyzer a = new GermanAnalyzer();
+    checkReuse(a, "tischen", "tisch");
+    a.setStemExclusionTable(new String[] { "tischen" });
+    checkReuse(a, "tischen", "tischen");
+  }
+  
   private void check(final String input, final String expected) throws IOException {
     Analyzer a = new GermanAnalyzer();
     TokenStream tokenStream = a.tokenStream("dummy", new StringReader(input));

Modified: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/fr/TestFrenchAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/fr/TestFrenchAnalyzer.java?rev=805766&r1=805765&r2=805766&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/fr/TestFrenchAnalyzer.java
(original)
+++ lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/fr/TestFrenchAnalyzer.java
Wed Aug 19 11:56:31 2009
@@ -221,4 +221,14 @@
               "captif" });
 	}
 
+	/* 
+	 * Test that changes to the exclusion table are applied immediately
+	 * when using reusable token streams.
+	 */
+	public void testExclusionTableReuse() throws Exception {
+	  FrenchAnalyzer fa = new FrenchAnalyzer();
+	  assertAnalyzesToReuse(fa, "habitable", new String[] { "habit" });
+	  fa.setStemExclusionTable(new String[] { "habitable" });
+	  assertAnalyzesToReuse(fa, "habitable", new String[] { "habitable" });
+	}
 }

Modified: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/nl/TestDutchStemmer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/nl/TestDutchStemmer.java?rev=805766&r1=805765&r2=805766&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/nl/TestDutchStemmer.java
(original)
+++ lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/nl/TestDutchStemmer.java
Wed Aug 19 11:56:31 2009
@@ -17,6 +17,7 @@
  * limitations under the License.
  */
 
+import java.io.File;
 import java.io.IOException;
 import java.io.Reader;
 import java.io.StringReader;
@@ -35,6 +36,8 @@
  * 
  */
 public class TestDutchStemmer extends TestCase {
+  File dataDir = new File(System.getProperty("dataDir", "./bin"));
+  File customDictFile = new File(dataDir, "org/apache/lucene/analysis/nl/customStemDict.txt");
   
   public void testWithSnowballExamples() throws IOException {
 	 check("lichaamsziek", "lichaamsziek");
@@ -144,7 +147,28 @@
     checkReuse(a, "lichamelijkheden", "lichamelijkheden");
   }
  
-
+  /* 
+   * Test that changes to the exclusion table are applied immediately
+   * when using reusable token streams.
+   */
+  public void testExclusionTableReuse() throws Exception {
+    DutchAnalyzer a = new DutchAnalyzer();
+    checkReuse(a, "lichamelijk", "licham");
+    a.setStemExclusionTable(new String[] { "lichamelijk" });
+    checkReuse(a, "lichamelijk", "lichamelijk");
+  }
+  
+  /* 
+   * Test that changes to the dictionary stemming table are applied immediately
+   * when using reusable token streams.
+   */
+  public void testStemDictionaryReuse() throws Exception {
+    DutchAnalyzer a = new DutchAnalyzer();
+    checkReuse(a, "lichamelijk", "licham");
+    a.setStemDictionary(customDictFile);
+    checkReuse(a, "lichamelijk", "somethingentirelydifferent");
+  }
+  
   private void check(final String input, final String expected) throws IOException {
     Analyzer analyzer = new DutchAnalyzer(); 
     TokenStream stream = analyzer.tokenStream("dummy", new StringReader(input));

Added: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/nl/customStemDict.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/nl/customStemDict.txt?rev=805766&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/nl/customStemDict.txt
(added)
+++ lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/nl/customStemDict.txt
Wed Aug 19 11:56:31 2009
@@ -0,0 +1,3 @@
+lichamelijk	somethingentirelydifferent
+lichamelijke	licham
+lichamelijkheden	licham

Propchange: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/nl/customStemDict.txt
------------------------------------------------------------------------------
    svn:eol-style = native



Mime
View raw message