lucene-java-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From hoss...@apache.org
Subject svn commit: r411863 - in /lucene/java/trunk: ./ contrib/analyzers/src/java/org/apache/lucene/analysis/th/ contrib/analyzers/src/test/org/apache/lucene/analysis/th/
Date Mon, 05 Jun 2006 17:29:02 GMT
Author: hossman
Date: Mon Jun  5 10:29:01 2006
New Revision: 411863

URL: http://svn.apache.org/viewvc?rev=411863&view=rev
Log:
LUCENE-503: New ThaiAnalyzer and ThaiWordFilter

Added:
    lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/th/
    lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/th/ThaiAnalyzer.java
    lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/th/ThaiWordFilter.java
    lucene/java/trunk/contrib/analyzers/src/test/org/apache/lucene/analysis/th/
    lucene/java/trunk/contrib/analyzers/src/test/org/apache/lucene/analysis/th/TestThaiAnalyzer.java
Modified:
    lucene/java/trunk/CHANGES.txt

Modified: lucene/java/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/CHANGES.txt?rev=411863&r1=411862&r2=411863&view=diff
==============================================================================
--- lucene/java/trunk/CHANGES.txt (original)
+++ lucene/java/trunk/CHANGES.txt Mon Jun  5 10:29:01 2006
@@ -6,7 +6,8 @@
 
 New features
 
- 1.
+ 1. New ThaiAnalyzer and ThaiWordFilter in contrib/analyzers
+    (Samphan Raruenrom via Chris Hostetter)
 
 API Changes
 

Added: lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/th/ThaiAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/th/ThaiAnalyzer.java?rev=411863&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/th/ThaiAnalyzer.java (added)
+++ lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/th/ThaiAnalyzer.java Mon Jun  5 10:29:01 2006
@@ -0,0 +1,40 @@
+package org.apache.lucene.analysis.th;
+
+/**
+ * Copyright 2006 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.Reader;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.StopAnalyzer;
+import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.standard.StandardFilter;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
+
+/**
+ * Analyzer for Thai language. It uses java.text.BreakIterator to break words.
+ * @author Samphan Raruenrom <samphan@osdev.co.th> for To-Be-One Technology Co., Ltd.
+ * @version 0.2
+ */
+public class ThaiAnalyzer extends Analyzer {
+  public TokenStream tokenStream(String fieldName, Reader reader) {
+	  TokenStream ts = new StandardTokenizer(reader);
+    ts = new StandardFilter(ts);
+    ts = new ThaiWordFilter(ts);
+    ts = new StopFilter(ts, StopAnalyzer.ENGLISH_STOP_WORDS);
+    return ts;
+  }
+}

Added: lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/th/ThaiWordFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/th/ThaiWordFilter.java?rev=411863&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/th/ThaiWordFilter.java (added)
+++ lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/th/ThaiWordFilter.java Mon Jun  5 10:29:01 2006
@@ -0,0 +1,71 @@
+package org.apache.lucene.analysis.th;
+
+/**
+ * Copyright 2006 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.Locale;
+import java.lang.Character.UnicodeBlock;
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import java.text.BreakIterator;
+
+/**
+ * TokenFilter that uses java.text.BreakIterator to break each
+ * Token that is Thai into separate Token(s) for each Thai word.
+ * @author Samphan Raruenrom <samphan@osdev.co.th> for To-Be-One Technology Co., Ltd.
+ * @version 0.2
+ */
+public class ThaiWordFilter extends TokenFilter {
+  
+  private BreakIterator breaker = null;
+  private Token thaiToken = null;
+  
+  public ThaiWordFilter(TokenStream input) {
+    super(input);
+    breaker = BreakIterator.getWordInstance(new Locale("th"));
+  }
+  
+  public Token next() throws IOException {
+    if (thaiToken != null) {
+      String text = thaiToken.termText();
+      int start = breaker.current();
+      int end = breaker.next();
+      if (end != BreakIterator.DONE) {
+        return new Token(text.substring(start, end), 
+            thaiToken.startOffset()+start, thaiToken.startOffset()+end, thaiToken.type());
+      }
+      thaiToken = null;
+    }
+    Token tk = input.next();
+    if (tk == null) {
+      return null;
+    }
+    String text = tk.termText();
+    if (UnicodeBlock.of(text.charAt(0)) != UnicodeBlock.THAI) {
+      return new Token(text.toLowerCase(), tk.startOffset(), tk.endOffset(), tk.type());
+    }
+    thaiToken = tk;
+    breaker.setText(text);
+    int end = breaker.next();
+    if (end != BreakIterator.DONE) {
+      return new Token(text.substring(0, end), 
+          thaiToken.startOffset(), thaiToken.startOffset()+end, thaiToken.type());
+    }
+    return null;
+  }
+}

Added: lucene/java/trunk/contrib/analyzers/src/test/org/apache/lucene/analysis/th/TestThaiAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/src/test/org/apache/lucene/analysis/th/TestThaiAnalyzer.java?rev=411863&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/src/test/org/apache/lucene/analysis/th/TestThaiAnalyzer.java (added)
+++ lucene/java/trunk/contrib/analyzers/src/test/org/apache/lucene/analysis/th/TestThaiAnalyzer.java Mon Jun  5 10:29:01 2006
@@ -0,0 +1,69 @@
+package org.apache.lucene.analysis.th;
+
+/**
+ * Copyright 2006 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.StringReader;
+import junit.framework.TestCase;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
+
+/**
+ * Test case for ThaiAnalyzer, modified from TestFrenchAnalyzer
+ *
+ * @author    Samphan Raruenrom <samphan@osdev.co.th>
+ * @version   0.1
+ */
+
+public class TestThaiAnalyzer extends TestCase {
+
+	public void assertAnalyzesTo(Analyzer a, String input, String[] output)
+		throws Exception {
+
+		TokenStream ts = a.tokenStream("dummy", new StringReader(input));
+
+		for (int i = 0; i < output.length; i++) {
+			Token t = ts.next();
+			assertNotNull(t);
+			assertEquals(t.termText(), output[i]);
+		}
+		assertNull(ts.next());
+		ts.close();
+	}
+
+	public void testAnalyzer() throws Exception {
+		ThaiAnalyzer analyzer = new ThaiAnalyzer();
+	
+		assertAnalyzesTo(analyzer, "", new String[] {});
+
+		assertAnalyzesTo(
+			analyzer,
+			"การที่ได้ต้องแสดงว่างานดี",
+			new String[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี"});
+
+		assertAnalyzesTo(
+			analyzer,
+			"บริษัทชื่อ XY&Z - คุยกับ xyz@demo.com",
+			new String[] { "บริษัท", "ชื่อ", "xy&z", "คุย", "กับ", "xyz@demo.com" });
+
+    // English stop words
+		assertAnalyzesTo(
+			analyzer,
+			"ประโยคว่า The quick brown fox jumped over the lazy dogs",
+			new String[] { "ประโยค", "ว่า", "quick", "brown", "fox", "jumped", "over", "lazy", "dogs" });
+	}
+}



Mime
View raw message