lucene-java-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From ehatc...@apache.org
Subject svn commit: r164686 - in /lucene/java/trunk/contrib/analyzers/src: java/org/apache/lucene/analysis/el/ test/org/apache/lucene/analysis/el/
Date Mon, 25 Apr 2005 23:23:38 GMT
Author: ehatcher
Date: Mon Apr 25 16:23:37 2005
New Revision: 164686

URL: http://svn.apache.org/viewcvs?rev=164686&view=rev
Log:
add GreekAnalyzer, contributed by Panagiotis Astithas (past@ebs.gr)

Added:
    lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/el/
    lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/el/GreekAnalyzer.java
    lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/el/GreekCharsets.java
    lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/el/GreekLowerCaseFilter.java
    lucene/java/trunk/contrib/analyzers/src/test/org/apache/lucene/analysis/el/
    lucene/java/trunk/contrib/analyzers/src/test/org/apache/lucene/analysis/el/GreekAnalyzerTest.java

Added: lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/el/GreekAnalyzer.java
URL: http://svn.apache.org/viewcvs/lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/el/GreekAnalyzer.java?rev=164686&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/el/GreekAnalyzer.java
(added)
+++ lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/el/GreekAnalyzer.java
Mon Apr 25 16:23:37 2005
@@ -0,0 +1,222 @@
+package org.apache.lucene.analysis.el;
+
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
+
+import java.io.Reader;
+import java.util.HashSet;
+import java.util.Hashtable;
+import java.util.Set;
+
+/**
+ * Analyzer for the Greek language. Supports an external list of stopwords (words
+ * that will not be indexed at all).
+ * A default set of stopwords is used unless an alternative list is specified.
+ * <p>
+ * Text is tokenized with a StandardTokenizer, normalized by GreekLowerCaseFilter
+ * and then filtered against the stop set.
+ *
+ * @author  Panagiotis Astithas, past@ebs.gr
+ */
+public final class GreekAnalyzer extends Analyzer
+{
+    // Symbolic names for the Greek letters. Each value is an index into a
+    // charset array (see GreekCharsets.java), not a character code, so the same
+    // stop-word table can be rendered in any of the supported encodings.
+    private static final char A = 6;
+    private static final char B = 7;
+    private static final char G = 8;
+    private static final char D = 9;
+    private static final char E = 10;
+    private static final char Z = 11;
+    private static final char H = 12;
+    private static final char TH = 13;
+    private static final char I = 14;
+    private static final char K = 15;
+    private static final char L = 16;
+    private static final char M = 17;
+    private static final char N = 18;
+    private static final char KS = 19;
+    private static final char O = 20;
+    private static final char P = 21;
+    private static final char R = 22;
+    private static final char S = 24;    // index 23 is the final sigma, which is skipped
+    private static final char T = 25;
+    private static final char Y = 26;
+    private static final char F = 27;
+    private static final char X = 28;
+    private static final char PS = 29;
+    private static final char W = 30;
+
+    /**
+     * List of typical Greek stopwords, spelled with the letter indexes above.
+     * The words are unaccented and use the non-final sigma.
+     */
+    private static final char[][] GREEK_STOP_WORDS = {
+        {O},
+        {H},
+        {T, O},
+        {O, I},
+        {T, A},
+        {T, O, Y},
+        {T, H, S},
+        {T, W, N},
+        {T, O, N},
+        {T, H, N},
+        {K, A, I},
+        {K, I},
+        {K},
+        {E, I, M, A, I},
+        {E, I, S, A, I},
+        {E, I, N, A, I},
+        {E, I, M, A, S, T, E},
+        {E, I, S, T, E},
+        {S, T, O},
+        {S, T, O, N},
+        {S, T, H},
+        {S, T, H, N},
+        {M, A},
+        {A, L, L, A},
+        {A, P, O},
+        {G, I, A},
+        {P, R, O, S},
+        {M, E},
+        {S, E},
+        {W, S},
+        {P, A, R, A},
+        {A, N, T, I},
+        {K, A, T, A},
+        {M, E, T, A},
+        {TH, A},
+        {N, A},
+        {D, E},
+        {D, E, N},
+        {M, H},
+        {M, H, N},
+        {E, P, I},
+        {E, N, W},
+        {E, A, N},
+        {A, N},
+        {T, O, T, E},
+        {P, O, Y},
+        {P, W, S},
+        {P, O, I, O, S},
+        {P, O, I, A},
+        {P, O, I, O},
+        {P, O, I, O, I},
+        {P, O, I, E, S},
+        {P, O, I, W, N},
+        {P, O, I, O, Y, S},
+        {A, Y, T, O, S},
+        {A, Y, T, H},
+        {A, Y, T, O},
+        {A, Y, T, O, I},
+        {A, Y, T, W, N},
+        {A, Y, T, O, Y, S},
+        {A, Y, T, E, S},
+        {A, Y, T, A},
+        {E, K, E, I, N, O, S},
+        {E, K, E, I, N, H},
+        {E, K, E, I, N, O},
+        {E, K, E, I, N, O, I},
+        {E, K, E, I, N, E, S},
+        {E, K, E, I, N, A},
+        {E, K, E, I, N, W, N},
+        {E, K, E, I, N, O, Y, S},
+        {O, P, W, S},
+        {O, M, W, S},
+        {I, S, W, S},
+        {O, S, O},
+        {O, T, I}
+    };
+
+    /**
+     * Contains the stopwords used with the StopFilter.
+     */
+    private final Set stopSet;
+
+    /**
+     * Charset for Greek letters: a table of character codes that the letter
+     * indexes above point into. Predefined charsets can be taken from the
+     * GreekCharsets class.
+     */
+    private final char[] charset;
+
+    /**
+     * Builds an analyzer for Unicode text with the default stop words.
+     */
+    public GreekAnalyzer()
+    {
+        this(GreekCharsets.UnicodeGreek);
+    }
+
+    /**
+     * Builds an analyzer for the given charset with the default stop words,
+     * rendered in that charset.
+     *
+     * @param charset  the charset table to use (see GreekCharsets)
+     */
+    public GreekAnalyzer(char[] charset)
+    {
+        this.charset = charset;
+        this.stopSet = StopFilter.makeStopSet(makeStopWords(charset));
+    }
+
+    /**
+     * Builds an analyzer with the given stop words.
+     *
+     * @param charset    the charset table to use (see GreekCharsets)
+     * @param stopwords  the words to drop from the token stream
+     */
+    public GreekAnalyzer(char[] charset, String[] stopwords)
+    {
+        this.charset = charset;
+        this.stopSet = StopFilter.makeStopSet(stopwords);
+    }
+
+    /**
+     * Builds an analyzer with the given stop words.
+     *
+     * @param charset    the charset table to use (see GreekCharsets)
+     * @param stopwords  a table whose keys are the words to drop
+     */
+    public GreekAnalyzer(char[] charset, Hashtable stopwords)
+    {
+        this.charset = charset;
+        this.stopSet = new HashSet(stopwords.keySet());
+    }
+
+    // Takes the Greek stop words and translates them to a String array by
+    // looking every letter index up in the given charset.
+    private static String[] makeStopWords(char[] charset)
+    {
+        String[] words = new String[GREEK_STOP_WORDS.length];
+        for (int i = 0; i < words.length; i++)
+        {
+            char[] indexes = GREEK_STOP_WORDS[i];
+            char[] letters = new char[indexes.length];
+            for (int j = 0; j < indexes.length; j++)
+            {
+                letters[j] = charset[indexes[j]];
+            }
+            words[i] = new String(letters);
+        }
+        return words;
+    }
+
+    /**
+     * Creates a TokenStream which tokenizes all the text in the provided Reader.
+     *
+     * @param fieldName  name of the field being analyzed (not used here)
+     * @param reader     source of the text to analyze
+     * @return  A TokenStream built from a StandardTokenizer filtered with
+     *                  GreekLowerCaseFilter and StopFilter
+     */
+    public TokenStream tokenStream(String fieldName, Reader reader)
+    {
+        TokenStream result = new StandardTokenizer(reader);
+        // lower-case first so that tokens can match the lower-case stop words
+        result = new GreekLowerCaseFilter(result, charset);
+        result = new StopFilter(result, stopSet);
+        return result;
+    }
+}

Added: lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/el/GreekCharsets.java
URL: http://svn.apache.org/viewcvs/lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/el/GreekCharsets.java?rev=164686&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/el/GreekCharsets.java
(added)
+++ lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/el/GreekCharsets.java
Mon Apr 25 16:23:37 2005
@@ -0,0 +1,481 @@
+package org.apache.lucene.analysis.el;
+
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * GreekCharsets class contains encodings schemes (charsets) and toLowerCase() method implementation
+ * for greek characters in Unicode, ISO-8859-7 and Microsoft Windows CP1253.
+ * Each encoding scheme contains lowercase (positions 0-35) and uppercase (position 36-68)
characters,
+ * including accented ones. One should be able to add other encoding schemes (see RFC 1947)
by adding
+ * the definition of a new charset as well as the required logic in the toLowerCase() method.
+ *
+ * @author  Panagiotis Astithas, past@ebs.gr
+ */
+public class GreekCharsets
+{
+    // Unicode Greek charset
+    public static char[] UnicodeGreek = {
+    	// lower case
+        '\u0390',
+        '\u03AC',
+        '\u03AD',
+        '\u03AE',
+        '\u03AF',
+        '\u03B0',
+        '\u03B1',
+        '\u03B2',
+        '\u03B3',
+        '\u03B4',
+        '\u03B5',
+        '\u03B6',
+        '\u03B7',
+        '\u03B8',
+        '\u03B9',
+        '\u03BA',
+        '\u03BB',
+        '\u03BC',
+        '\u03BD',
+        '\u03BE',
+        '\u03BF',
+        '\u03C0',
+        '\u03C1',
+        '\u03C2',
+        '\u03C3',
+        '\u03C4',
+        '\u03C5',
+        '\u03C6',
+        '\u03C7',
+        '\u03C8',
+        '\u03C9',
+        '\u03CA',
+        '\u03CB',
+        '\u03CC',
+        '\u03CD',
+        '\u03CE',
+        // upper case
+        '\u0386',
+        '\u0388',
+        '\u0389',
+        '\u038A',
+        '\u038C',
+        '\u038E',
+        '\u038F',
+        '\u0391',
+        '\u0392',
+        '\u0393',
+        '\u0394',
+        '\u0395',
+        '\u0396',
+        '\u0397',
+        '\u0398',
+        '\u0399',
+        '\u039A',
+        '\u039B',
+        '\u039C',
+        '\u039D',
+        '\u039E',
+        '\u039F',
+        '\u03A0',
+        '\u03A1',
+        '\u03A3',
+        '\u03A4',
+        '\u03A5',
+        '\u03A6',
+        '\u03A7',
+        '\u03A8',
+        '\u03A9',
+        '\u03AA',
+        '\u03AB'
+    };
+
+    // ISO-8859-7 charset (ELOT-928)
+    public static char[] ISO = {
+       	// lower case
+        0xc0,
+        0xdc,
+        0xdd,
+        0xde,
+        0xdf,
+        0xe0,
+        0xe1,
+        0xe2,
+        0xe3,
+        0xe4,
+        0xe5,
+        0xe6,
+        0xe7,
+        0xe8,
+        0xe9,
+        0xea,
+        0xeb,
+        0xec,
+        0xed,
+        0xee,
+        0xef,
+        0xf0,
+        0xf1,
+        0xf2,
+        0xf3,
+        0xf4,
+        0xf5,
+        0xf6,
+        0xf7,
+        0xf8,
+        0xf9,
+        0xfa,
+		0xfb,
+		0xfc,
+		0xfd,
+		0xfe,
+        // upper case
+        0xb6,
+        0xb8,
+        0xb9,
+        0xba,
+        0xbc,
+        0xbe,
+        0xbf,
+        0xc1,
+        0xc2,
+        0xc3,
+        0xc4,
+        0xc5,
+        0xc6,
+        0xc7,
+        0xc8,
+        0xc9,
+        0xca,
+        0xcb,
+        0xcc,
+        0xcd,
+        0xce,
+        0xcf,
+        0xd0,
+        0xd1,
+        0xd3,
+        0xd4,
+        0xd5,
+        0xd6,
+        0xd7,
+        0xd8,
+        0xd9,
+        0xda,
+		0xdb
+    };
+
+    // CP1253 charset
+    public static char[] CP1253 = {
+       	// lower case
+        0xc0,
+        0xdc,
+        0xdd,
+        0xde,
+        0xdf,
+        0xe0,
+        0xe1,
+        0xe2,
+        0xe3,
+        0xe4,
+        0xe5,
+        0xe6,
+        0xe7,
+        0xe8,
+        0xe9,
+        0xea,
+        0xeb,
+        0xec,
+        0xed,
+        0xee,
+        0xef,
+        0xf0,
+        0xf1,
+        0xf2,
+        0xf3,
+        0xf4,
+        0xf5,
+        0xf6,
+        0xf7,
+        0xf8,
+        0xf9,
+        0xfa,
+		0xfb,
+		0xfc,
+		0xfd,
+		0xfe,
+        // upper case
+        0xa2,
+        0xb8,
+        0xb9,
+        0xba,
+        0xbc,
+        0xbe,
+        0xbf,
+        0xc1,
+        0xc2,
+        0xc3,
+        0xc4,
+        0xc5,
+        0xc6,
+        0xc7,
+        0xc8,
+        0xc9,
+        0xca,
+        0xcb,
+        0xcc,
+        0xcd,
+        0xce,
+        0xcf,
+        0xd0,
+        0xd1,
+        0xd3,
+        0xd4,
+        0xd5,
+        0xd6,
+        0xd7,
+        0xd8,
+        0xd9,
+        0xda,
+		0xdb
+    };
+
+    public static char toLowerCase(char letter, char[] charset)
+    {
+        if (charset == UnicodeGreek) {
+        	// First deal with lower case, not accented letters
+            if (letter >= '\u03B1' && letter <= '\u03C9')
+            {
+            	// Special case 'small final sigma', where we return 'small sigma'
+                if (letter == '\u03C2') {
+                	return '\u03C3';
+                } else {
+                	return letter;
+                }
+            }
+            // Then deal with lower case, accented letters
+            // alpha with acute
+            if (letter == '\u03AC') {
+            	return '\u03B1';
+            }
+            // epsilon with acute
+            if (letter == '\u03AD') {
+            	return '\u03B5';
+            }
+            // eta with acute
+            if (letter == '\u03AE') {
+            	return '\u03B7';
+            }
+            // iota with acute, iota with diaeresis, iota with acute and diaeresis
+            if (letter == '\u03AF' || letter == '\u03CA' || letter == '\u0390') {
+            	return '\u03B9';
+            }
+            // upsilon with acute, upsilon with diaeresis, upsilon with acute and diaeresis
+            if (letter == '\u03CD' || letter == '\u03CB' || letter == '\u03B0') {
+            	return '\u03C5';
+            }
+            // omicron with acute
+            if (letter == '\u03CC') {
+            	return '\u03BF';
+            }
+            // omega with acute
+            if (letter == '\u03CE') {
+            	return '\u03C9';
+            }
+            // After that, deal with upper case, not accented letters
+            if (letter >= '\u0391' && letter <= '\u03A9')
+            {
+                return (char) (letter + 32);
+            }
+            // Finally deal with upper case, accented letters
+            // alpha with acute
+            if (letter == '\u0386') {
+            	return '\u03B1';
+            }
+            // epsilon with acute
+            if (letter == '\u0388') {
+            	return '\u03B5';
+            }
+            // eta with acute
+            if (letter == '\u0389') {
+            	return '\u03B7';
+            }
+            // iota with acute, iota with diaeresis
+            if (letter == '\u038A' || letter == '\u03AA') {
+            	return '\u03B9';
+            }
+            // upsilon with acute, upsilon with diaeresis
+            if (letter == '\u038E' || letter == '\u03AB') {
+            	return '\u03C5';
+            }
+            // omicron with acute
+            if (letter == '\u038C') {
+            	return '\u03BF';
+            }
+            // omega with acute
+            if (letter == '\u038F') {
+            	return '\u03C9';
+            }
+        } else if (charset == ISO) {
+        	// First deal with lower case, not accented letters
+            if (letter >= 0xe1 && letter <= 0xf9)
+            {
+            	// Special case 'small final sigma', where we return 'small sigma'
+                if (letter == 0xf2) {
+                	return 0xf3;
+                } else {
+                	return letter;
+                }
+            }
+            // Then deal with lower case, accented letters
+            // alpha with acute
+            if (letter == 0xdc) {
+            	return 0xe1;
+            }
+            // epsilon with acute
+            if (letter == 0xdd) {
+            	return 0xe5;
+            }
+            // eta with acute
+            if (letter == 0xde) {
+            	return 0xe7;
+            }
+            // iota with acute, iota with diaeresis, iota with acute and diaeresis
+            if (letter == 0xdf || letter == 0xfa || letter == 0xc0) {
+            	return '\u03B9';
+            }
+            // upsilon with acute, upsilon with diaeresis, upsilon with acute and diaeresis
+            if (letter == 0xfd || letter == 0xfb || letter == 0xe0) {
+            	return 0xf5;
+            }
+            // omicron with acute
+            if (letter == 0xfc) {
+            	return 0xef;
+            }
+            // omega with acute
+            if (letter == 0xfe) {
+            	return 0xf9;
+            }
+            // After that, deal with upper case, not accented letters
+            if (letter >= 0xc1 && letter <= 0xd9) {
+                return (char) (letter + 32);
+            }
+            // Finally deal with upper case, accented letters
+            // alpha with acute
+            if (letter == 0xb6) {
+            	return 0xe1;
+            }
+            // epsilon with acute
+            if (letter == 0xb8) {
+            	return 0xe5;
+            }
+            // eta with acute
+            if (letter == 0xb9) {
+            	return 0xe7;
+            }
+            // iota with acute, iota with diaeresis
+            if (letter == 0xba || letter == 0xda) {
+            	return 0xe9;
+            }
+            // upsilon with acute, upsilon with diaeresis
+            if (letter == 0xbe || letter == 0xdb) {
+            	return 0xf5;
+            }
+            // omicron with acute
+            if (letter == 0xbc) {
+            	return 0xef;
+            }
+            // omega with acute
+            if (letter == 0xbf) {
+            	return 0xf9;
+            }
+        } else if (charset == CP1253) {
+        	// First deal with lower case, not accented letters
+            if (letter >= 0xe1 && letter <= 0xf9)
+            {
+            	// Special case 'small final sigma', where we return 'small sigma'
+                if (letter == 0xf2) {
+                	return 0xf3;
+                } else {
+                	return letter;
+                }
+            }
+            // Then deal with lower case, accented letters
+            // alpha with acute
+            if (letter == 0xdc) {
+            	return 0xe1;
+            }
+            // epsilon with acute
+            if (letter == 0xdd) {
+            	return 0xe5;
+            }
+            // eta with acute
+            if (letter == 0xde) {
+            	return 0xe7;
+            }
+            // iota with acute, iota with diaeresis, iota with acute and diaeresis
+            if (letter == 0xdf || letter == 0xfa || letter == 0xc0) {
+            	return '\u03B9';
+            }
+            // upsilon with acute, upsilon with diaeresis, upsilon with acute and diaeresis
+            if (letter == 0xfd || letter == 0xfb || letter == 0xe0) {
+            	return 0xf5;
+            }
+            // omicron with acute
+            if (letter == 0xfc) {
+            	return 0xef;
+            }
+            // omega with acute
+            if (letter == 0xfe) {
+            	return 0xf9;
+            }
+            // After that, deal with upper case, not accented letters
+            if (letter >= 0xc1 && letter <= 0xd9) {
+                return (char) (letter + 32);
+            }
+            // Finally deal with upper case, accented letters
+            // alpha with acute
+            if (letter == 0xa2) {
+            	return 0xe1;
+            }
+            // epsilon with acute
+            if (letter == 0xb8) {
+            	return 0xe5;
+            }
+            // eta with acute
+            if (letter == 0xb9) {
+            	return 0xe7;
+            }
+            // iota with acute, iota with diaeresis
+            if (letter == 0xba || letter == 0xda) {
+            	return 0xe9;
+            }
+            // upsilon with acute, upsilon with diaeresis
+            if (letter == 0xbe || letter == 0xdb) {
+            	return 0xf5;
+            }
+            // omicron with acute
+            if (letter == 0xbc) {
+            	return 0xef;
+            }
+            // omega with acute
+            if (letter == 0xbf) {
+            	return 0xf9;
+            }
+        }
+
+        return Character.toLowerCase(letter);
+    }
+}

Added: lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/el/GreekLowerCaseFilter.java
URL: http://svn.apache.org/viewcvs/lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/el/GreekLowerCaseFilter.java?rev=164686&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/el/GreekLowerCaseFilter.java
(added)
+++ lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/el/GreekLowerCaseFilter.java
Mon Apr 25 16:23:37 2005
@@ -0,0 +1,59 @@
+package org.apache.lucene.analysis.el;
+
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
+
+/**
+ * Normalizes token text to lower case, analyzing given ("greek") charset.
+ * Every character of a token is passed through GreekCharsets.toLowerCase().
+ *
+ * @author  Panagiotis Astithas, past@ebs.gr
+ */
+public final class GreekLowerCaseFilter extends TokenFilter
+{
+    // charset table handed to GreekCharsets.toLowerCase() for every character
+    private final char[] charset;
+
+    /**
+     * Builds a filter over the given stream.
+     *
+     * @param in       the token stream to normalize
+     * @param charset  the charset table to use (see GreekCharsets)
+     */
+    public GreekLowerCaseFilter(TokenStream in, char[] charset)
+    {
+        super(in);
+        this.charset = charset;
+    }
+
+    /**
+     * Returns the next token of the wrapped stream with its text normalized.
+     *
+     * @return a new token carrying the normalized text together with the
+     *         offsets, type and position increment of the source token, or
+     *         null when the wrapped stream is exhausted
+     * @throws java.io.IOException if the wrapped stream fails
+     */
+    public final Token next() throws java.io.IOException
+    {
+        Token source = input.next();
+        if (source == null)
+        {
+            return null;
+        }
+
+        char[] letters = source.termText().toCharArray();
+        for (int i = 0; i < letters.length; i++)
+        {
+            letters[i] = GreekCharsets.toLowerCase(letters[i], charset);
+        }
+
+        // Rebuild the token around the new text, keeping the type assigned by
+        // the tokenizer and the position increment so that later filters and
+        // phrase queries see the same stream layout as before.
+        Token lowered = new Token(new String(letters), source.startOffset(),
+                                  source.endOffset(), source.type());
+        lowered.setPositionIncrement(source.getPositionIncrement());
+        return lowered;
+    }
+}

Added: lucene/java/trunk/contrib/analyzers/src/test/org/apache/lucene/analysis/el/GreekAnalyzerTest.java
URL: http://svn.apache.org/viewcvs/lucene/java/trunk/contrib/analyzers/src/test/org/apache/lucene/analysis/el/GreekAnalyzerTest.java?rev=164686&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/src/test/org/apache/lucene/analysis/el/GreekAnalyzerTest.java
(added)
+++ lucene/java/trunk/contrib/analyzers/src/test/org/apache/lucene/analysis/el/GreekAnalyzerTest.java
Mon Apr 25 16:23:37 2005
@@ -0,0 +1,75 @@
+package org.apache.lucene.analysis.el;
+
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.StringReader;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
+
+import junit.framework.TestCase;
+
+
+/**
+ * A unit test class for verifying the correct operation of the GreekAnalyzer.
+ *
+ * @author past
+ */
+public class GreekAnalyzerTest extends TestCase {
+
+    /**
+     * A helper method copied from org.apache.lucene.analysis.TestAnalyzers.
+     * Checks that the analyzer produces exactly the expected tokens, in order.
+     *
+     * @param a         the Analyzer to test
+     * @param input     an input String to analyze
+     * @param output    a String[] with the results of the analysis
+     * @throws Exception in case an error occurs
+     */
+    private void assertAnalyzesTo(Analyzer a, String input, String[] output) throws Exception {
+        TokenStream ts = a.tokenStream("dummy", new StringReader(input));
+        try {
+            for (int i = 0; i < output.length; i++) {
+                Token t = ts.next();
+                assertNotNull(t);
+                // expected value first, so that failure messages read correctly
+                assertEquals(output[i], t.termText());
+            }
+            // the stream must not yield anything beyond the expected tokens
+            assertNull(ts.next());
+        } finally {
+            ts.close();
+        }
+    }
+
+    /**
+     * Test the analysis of various greek strings.
+     *
+     * @throws Exception in case an error occurs
+     */
+    public void testAnalyzer() throws Exception {
+        Analyzer a = new GreekAnalyzer();
+        // Verify the correct analysis of capitals and small accented letters
+        assertAnalyzesTo(a,
+                "\u039c\u03af\u03b1 \u03b5\u03be\u03b1\u03b9\u03c1\u03b5\u03c4\u03b9\u03ba\u03ac "
+                + "\u03ba\u03b1\u03bb\u03ae \u03ba\u03b1\u03b9 \u03c0\u03bb\u03bf\u03cd\u03c3\u03b9\u03b1 "
+                + "\u03c3\u03b5\u03b9\u03c1\u03ac \u03c7\u03b1\u03c1\u03b1\u03ba\u03c4\u03ae\u03c1\u03c9\u03bd "
+                + "\u03c4\u03b7\u03c2 \u0395\u03bb\u03bb\u03b7\u03bd\u03b9\u03ba\u03ae\u03c2 "
+                + "\u03b3\u03bb\u03ce\u03c3\u03c3\u03b1\u03c2",
+                new String[] {
+                    "\u03bc\u03b9\u03b1",
+                    "\u03b5\u03be\u03b1\u03b9\u03c1\u03b5\u03c4\u03b9\u03ba\u03b1",
+                    "\u03ba\u03b1\u03bb\u03b7",
+                    "\u03c0\u03bb\u03bf\u03c5\u03c3\u03b9\u03b1",
+                    "\u03c3\u03b5\u03b9\u03c1\u03b1",
+                    "\u03c7\u03b1\u03c1\u03b1\u03ba\u03c4\u03b7\u03c1\u03c9\u03bd",
+                    "\u03b5\u03bb\u03bb\u03b7\u03bd\u03b9\u03ba\u03b7\u03c3",
+                    "\u03b3\u03bb\u03c9\u03c3\u03c3\u03b1\u03c3"
+                });
+        // Verify the correct analysis of small letters with diaeresis and the elimination
+        // of punctuation marks
+        assertAnalyzesTo(a,
+                "\u03a0\u03c1\u03bf\u03ca\u03cc\u03bd\u03c4\u03b1 (\u03ba\u03b1\u03b9)     "
+                + "[\u03c0\u03bf\u03bb\u03bb\u03b1\u03c0\u03bb\u03ad\u03c2]\t-\t"
+                + "\u0391\u039d\u0391\u0393\u039a\u0395\u03a3",
+                new String[] {
+                    "\u03c0\u03c1\u03bf\u03b9\u03bf\u03bd\u03c4\u03b1",
+                    "\u03c0\u03bf\u03bb\u03bb\u03b1\u03c0\u03bb\u03b5\u03c3",
+                    "\u03b1\u03bd\u03b1\u03b3\u03ba\u03b5\u03c3"
+                });
+        // Verify the correct analysis of capital accented letters and capital letters with diaeresis,
+        // as well as the elimination of stop words
+        assertAnalyzesTo(a,
+                "\u03a0\u03a1\u039f\u03ab\u03a0\u039f\u0398\u0395\u03a3\u0395\u0399\u03a3  "
+                + "\u0386\u03c8\u03bf\u03b3\u03bf\u03c2, \u03bf \u03bc\u03b5\u03c3\u03c4\u03cc\u03c2 "
+                + "\u03ba\u03b1\u03b9 \u03bf\u03b9 \u03ac\u03bb\u03bb\u03bf\u03b9",
+                new String[] {
+                    "\u03c0\u03c1\u03bf\u03c5\u03c0\u03bf\u03b8\u03b5\u03c3\u03b5\u03b9\u03c3",
+                    "\u03b1\u03c8\u03bf\u03b3\u03bf\u03c3",
+                    "\u03bc\u03b5\u03c3\u03c4\u03bf\u03c3",
+                    "\u03b1\u03bb\u03bb\u03bf\u03b9"
+                });
+    }
+
+}



Mime
View raw message