lucene-java-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From rm...@apache.org
Subject svn commit: r806886 - in /lucene/java/trunk/contrib: ./ analyzers/common/src/java/org/apache/lucene/analysis/el/ analyzers/common/src/java/org/apache/lucene/analysis/ru/ analyzers/common/src/test/org/apache/lucene/analysis/ru/
Date Sat, 22 Aug 2009 20:36:07 GMT
Author: rmuir
Date: Sat Aug 22 20:36:06 2009
New Revision: 806886

URL: http://svn.apache.org/viewvc?rev=806886&view=rev
Log:
LUCENE-1793: Deprecate custom encoding support in Greek and Russian analyzers

Modified:
    lucene/java/trunk/contrib/CHANGES.txt
    lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/el/GreekAnalyzer.java
    lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/el/GreekCharsets.java
    lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/el/GreekLowerCaseFilter.java
    lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java
    lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianCharsets.java
    lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianLetterTokenizer.java
    lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianLowerCaseFilter.java
    lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianStemFilter.java
    lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianStemmer.java
    lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ru/TestRussianAnalyzer.java

Modified: lucene/java/trunk/contrib/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/CHANGES.txt?rev=806886&r1=806885&r2=806886&view=diff
==============================================================================
--- lucene/java/trunk/contrib/CHANGES.txt (original)
+++ lucene/java/trunk/contrib/CHANGES.txt Sat Aug 22 20:36:06 2009
@@ -40,6 +40,10 @@
     The SpanScorer API (now QueryScorer) has also been improved to more closely
     match the API of the previous QueryScorer implementation.  (Mark Miller)  
 
+ 5. LUCENE-1793: Deprecate the custom encoding support in the Greek and Russian
+    Analyzers. If you need to index text in these encodings, please use Java's
+    character set conversion facilities (InputStreamReader, etc) during I/O, 
+    so that Lucene can analyze this text as Unicode instead.  (Robert Muir)  
 
 Bug fixes
 

Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/el/GreekAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/el/GreekAnalyzer.java?rev=806886&r1=806885&r2=806886&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/el/GreekAnalyzer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/el/GreekAnalyzer.java Sat Aug 22 20:36:06 2009
@@ -155,6 +155,7 @@
      * Charset for Greek letters.
      * Represents encoding for 24 lowercase Greek letters.
      * Predefined charsets can be taken from {@link GreekCharsets} class
+     * @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0
      */
     private char[] charset;
 
@@ -166,15 +167,27 @@
 
     /**
      * Builds an analyzer.
+     * @deprecated Use {@link #GreekAnalyzer()} instead.
      */
     public GreekAnalyzer(char[] charset)
     {
         this.charset = charset;
         stopSet = StopFilter.makeStopSet(makeStopWords(charset));
     }
+    
+    /**
+     * Builds an analyzer with the given stop words.
+     * @param stopwords Array of stopwords to use.
+     */
+    public GreekAnalyzer(String [] stopwords)
+    {
+    	charset = GreekCharsets.UnicodeGreek;
+    	stopSet = StopFilter.makeStopSet(stopwords);
+    }
 
     /**
      * Builds an analyzer with the given stop words.
+     * @deprecated Use {@link #GreekAnalyzer(String[])} instead.
      */
     public GreekAnalyzer(char[] charset, String[] stopwords)
     {
@@ -182,8 +195,11 @@
         stopSet = StopFilter.makeStopSet(stopwords);
     }
 
-    // Takes greek stop words and translates them to a String array, using
-    // the given charset
+    /**
+     * Takes greek stop words and translates them to a String array, using
+     * the given charset.
+     * @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0
+     */
     private static String[] makeStopWords(char[] charset)
     {
         String[] res = new String[GREEK_STOP_WORDS.length];
@@ -203,12 +219,22 @@
 
     /**
      * Builds an analyzer with the given stop words.
+     * @deprecated Use {@link #GreekAnalyzer(Map)} instead.
      */
     public GreekAnalyzer(char[] charset, Map stopwords)
     {
         this.charset = charset;
         stopSet = new HashSet(stopwords.keySet());
     }
+    
+    /**
+     * Builds an analyzer with the given stop words.
+     */
+    public GreekAnalyzer(Map stopwords)
+    {
+    	charset = GreekCharsets.UnicodeGreek;
+    	stopSet = new HashSet(stopwords.keySet());
+    }
 
     /**
     * Creates a {@link TokenStream} which tokenizes all the text in the provided {@link Reader}.

Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/el/GreekCharsets.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/el/GreekCharsets.java?rev=806886&r1=806885&r2=806886&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/el/GreekCharsets.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/el/GreekCharsets.java Sat Aug 22 20:36:06 2009
@@ -24,6 +24,7 @@
  * including accented ones. One should be able to add other encoding schemes (see RFC 1947) by adding
  * the definition of a new charset as well as the required logic in the toLowerCase() method.
  * </p>
+ * @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0
  */
 public class GreekCharsets
 {

Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/el/GreekLowerCaseFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/el/GreekLowerCaseFilter.java?rev=806886&r1=806885&r2=806886&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/el/GreekLowerCaseFilter.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/el/GreekLowerCaseFilter.java Sat Aug 22 20:36:06 2009
@@ -28,16 +28,27 @@
  */
 public final class GreekLowerCaseFilter extends TokenFilter
 {
+    /**
+     * @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0
+     */
     char[] charset;
 
     private TermAttribute termAtt;
     
+    /**
+     * @deprecated Use {@link #GreekLowerCaseFilter(TokenStream)} instead.
+     */
     public GreekLowerCaseFilter(TokenStream in, char[] charset)
     {
         super(in);
         this.charset = charset;
         termAtt = (TermAttribute) addAttribute(TermAttribute.class);
     }
+    
+    public GreekLowerCaseFilter(TokenStream in)
+    {
+    	this(in, GreekCharsets.UnicodeGreek);
+    }
 
     public boolean incrementToken() throws IOException {
       if (input.incrementToken()) {

Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java?rev=806886&r1=806885&r2=806886&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java Sat Aug 22 20:36:06 2009
@@ -190,6 +190,7 @@
      * Charset for Russian letters.
      * Represents encoding for 32 lowercase Russian letters.
      * Predefined charsets can be taken from RussianCharSets class
+     * @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0
      */
     private char[] charset;
 
@@ -202,6 +203,7 @@
 
     /**
      * Builds an analyzer.
+     * @deprecated Use {@link #RussianAnalyzer()} instead.
      */
     public RussianAnalyzer(char[] charset)
     {
@@ -211,15 +213,27 @@
 
     /**
      * Builds an analyzer with the given stop words.
+     * @deprecated Use {@link #RussianAnalyzer(String[])} instead.
      */
     public RussianAnalyzer(char[] charset, String[] stopwords)
     {
         this.charset = charset;
         stopSet = StopFilter.makeStopSet(stopwords);
     }
+    
+    /**
+     * Builds an analyzer with the given stop words.
+     */
+    public RussianAnalyzer(String[] stopwords)
+    {
+    	this.charset = RussianCharsets.UnicodeRussian;
+    	stopSet = StopFilter.makeStopSet(stopwords);
+    }
 
-    // Takes russian stop words and translates them to a String array, using
-    // the given charset
+    /** Takes russian stop words and translates them to a String array, using
+     * the given charset.
+     * @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0
+     */
     private static String[] makeStopWords(char[] charset)
     {
         String[] res = new String[RUSSIAN_STOP_WORDS.length];
@@ -240,12 +254,23 @@
     /**
      * Builds an analyzer with the given stop words.
      * @todo create a Set version of this ctor
+     * @deprecated Use {@link #RussianAnalyzer(Map)} instead.
      */
     public RussianAnalyzer(char[] charset, Map stopwords)
     {
         this.charset = charset;
         stopSet = new HashSet(stopwords.keySet());
     }
+    
+    /**
+     * Builds an analyzer with the given stop words.
+     * @todo create a Set version of this ctor
+     */
+    public RussianAnalyzer(Map stopwords)
+    {
+    	charset = RussianCharsets.UnicodeRussian;
+    	stopSet = new HashSet(stopwords.keySet());
+    }
 
     /**
      * Creates a {@link TokenStream} which tokenizes all the text in the 

Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianCharsets.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianCharsets.java?rev=806886&r1=806885&r2=806886&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianCharsets.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianCharsets.java Sat Aug 22 20:36:06 2009
@@ -24,7 +24,7 @@
  * One should be able to add other encoding schemes (like ISO-8859-5 or customized) by adding a new charset
  * and adding logic to toLowerCase() method for that charset.
  * </p>
- *
+ * @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0
  * @version $Id$
  */
 public class RussianCharsets

Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianLetterTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianLetterTokenizer.java?rev=806886&r1=806885&r2=806886&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianLetterTokenizer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianLetterTokenizer.java Sat Aug 22 20:36:06 2009
@@ -37,14 +37,25 @@
 
 public class RussianLetterTokenizer extends CharTokenizer
 {
-    /** Construct a new LetterTokenizer. */
+    /** 
+     * Charset this tokenizer uses.
+     * @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0
+     */
     private char[] charset;
 
+    /**
+     * @deprecated Use {@link #RussianLetterTokenizer(Reader)} instead. 
+     */
     public RussianLetterTokenizer(Reader in, char[] charset)
     {
         super(in);
         this.charset = charset;
     }
+    
+    public RussianLetterTokenizer(Reader in)
+    {
+    	this(in, RussianCharsets.UnicodeRussian);
+    }
 
     /**
      * Collects only characters which satisfy
@@ -52,6 +63,7 @@
      */
     protected boolean isTokenChar(char c)
     {
+    	/* in the next release, this can be implemented as isLetter(c) or [0-9] */
         if (Character.isLetter(c))
             return true;
         for (int i = 0; i < charset.length; i++)

Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianLowerCaseFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianLowerCaseFilter.java?rev=806886&r1=806885&r2=806886&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianLowerCaseFilter.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianLowerCaseFilter.java Sat Aug 22 20:36:06 2009
@@ -31,16 +31,27 @@
  */
 public final class RussianLowerCaseFilter extends TokenFilter
 {
+    /**
+     * @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0
+     */
     char[] charset;
 
     private TermAttribute termAtt;
 
+    /**
+     * @deprecated Use {@link #RussianLowerCaseFilter(TokenStream)} instead.
+     */
     public RussianLowerCaseFilter(TokenStream in, char[] charset)
     {
         super(in);
         this.charset = charset;
         termAtt = (TermAttribute) addAttribute(TermAttribute.class);
     }
+    
+    public RussianLowerCaseFilter(TokenStream in)
+    {
+    	this(in, RussianCharsets.UnicodeRussian);
+    }
 
     public final boolean incrementToken() throws IOException
     {

Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianStemFilter.java?rev=806886&r1=806885&r2=806886&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianStemFilter.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianStemFilter.java Sat Aug 22 20:36:06 2009
@@ -42,6 +42,9 @@
 
     private TermAttribute termAtt;
 
+    /**
+     * @deprecated Use {@link #RussianStemFilter(TokenStream)} instead.
+     */
     public RussianStemFilter(TokenStream in, char[] charset)
     {
         super(in);
@@ -49,6 +52,10 @@
         termAtt = (TermAttribute) addAttribute(TermAttribute.class);
     }
 
+    public RussianStemFilter(TokenStream in)
+    {
+    	this(in, RussianCharsets.UnicodeRussian);
+    }
     /**
      * Returns the next token in the stream, or null at EOS
      */

Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianStemmer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianStemmer.java?rev=806886&r1=806885&r2=806886&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianStemmer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianStemmer.java Sat Aug 22 20:36:06 2009
@@ -25,6 +25,9 @@
  */
 class RussianStemmer
 {
+    /**
+     * @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0 
+     */
     private char[] charset;
 
     // positions of RV, R1 and R2 respectively
@@ -255,6 +258,7 @@
 
     /**
      * RussianStemmer constructor comment.
+     * @deprecated Use {@link #RussianStemmer()} instead.
      */
     public RussianStemmer(char[] charset)
     {
@@ -529,6 +533,7 @@
      * Insert the method's description here.
      * Creation date: (16/03/2002 10:58:42 PM)
      * @param newCharset char[]
+     * @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0
      */
     public void setCharset(char[] newCharset)
     {
@@ -620,6 +625,7 @@
 
     /**
      * Static method for stemming with different charsets
+     * @deprecated Use {@link #stemWord(String)} instead.
      */
     public static String stem(String theWord, char[] charset)
     {
@@ -627,4 +633,14 @@
         stemmer.setCharset(charset);
         return stemmer.stem(theWord);
     }
+    
+    /**
+     * Static method for stemming.
+     */
+    public static String stemWord(String theWord)
+    {
+        RussianStemmer stemmer = new RussianStemmer();
+        stemmer.setCharset(RussianCharsets.UnicodeRussian);
+        return stemmer.stem(theWord);
+    }
 }

Modified: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ru/TestRussianAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ru/TestRussianAnalyzer.java?rev=806886&r1=806885&r2=806886&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ru/TestRussianAnalyzer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ru/TestRussianAnalyzer.java Sat Aug 22 20:36:06 2009
@@ -60,7 +60,7 @@
 
     public void testUnicode() throws IOException
     {
-        RussianAnalyzer ra = new RussianAnalyzer(RussianCharsets.UnicodeRussian);
+        RussianAnalyzer ra = new RussianAnalyzer();
         inWords =
             new InputStreamReader(
                 new FileInputStream(new File(dataDir, "/org/apache/lucene/analysis/ru/testUTF8.txt")),
@@ -75,8 +75,7 @@
 
         RussianLetterTokenizer sample =
             new RussianLetterTokenizer(
-                sampleUnicode,
-                RussianCharsets.UnicodeRussian);
+                sampleUnicode);
 
         TermAttribute text = (TermAttribute) in.getAttribute(TermAttribute.class);
         TermAttribute sampleText = (TermAttribute) sample.getAttribute(TermAttribute.class);



Mime
View raw message