lucene-java-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From sim...@apache.org
Subject svn commit: r887535 - in /lucene/java/trunk/contrib: ./ analyzers/common/src/java/org/apache/lucene/analysis/tr/ analyzers/common/src/test/org/apache/lucene/analysis/tr/
Date Sat, 05 Dec 2009 12:46:05 GMT
Author: simonw
Date: Sat Dec  5 12:46:05 2009
New Revision: 887535

URL: http://svn.apache.org/viewvc?rev=887535&view=rev
Log:
LUCENE-2102: Add Turkish LowerCaseFilter which handles Turkish and Azeri unique casing behavior
correctly.

Added:
    lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/tr/
    lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/tr/TurkishLowerCaseFilter.java
  (with props)
    lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/tr/package.html
  (with props)
    lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/tr/
    lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/tr/TestTurkishLowerCaseFilter.java
  (with props)
Modified:
    lucene/java/trunk/contrib/CHANGES.txt

Modified: lucene/java/trunk/contrib/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/CHANGES.txt?rev=887535&r1=887534&r2=887535&view=diff
==============================================================================
--- lucene/java/trunk/contrib/CHANGES.txt (original)
+++ lucene/java/trunk/contrib/CHANGES.txt Sat Dec  5 12:46:05 2009
@@ -16,6 +16,10 @@
    reader.  (Eirik Bjørsnøs via Mike McCandless)
 
 New features
+
+ * LUCENE-2102: Add a Turkish LowerCase Filter. TurkishLowerCaseFilter handles
+   Turkish and Azeri unique casing behavior correctly.
+   (Ahmet Arslan, Robert Muir via Simon Willnauer)
  
  * LUCENE-2039: Add a extensible query parser to contrib/misc.
    ExtendableQueryParser enables arbitrary parser extensions based on a

Added: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/tr/TurkishLowerCaseFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/tr/TurkishLowerCaseFilter.java?rev=887535&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/tr/TurkishLowerCaseFilter.java
(added)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/tr/TurkishLowerCaseFilter.java
Sat Dec  5 12:46:05 2009
@@ -0,0 +1,125 @@
+package org.apache.lucene.analysis.tr;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+
+/**
+ * Normalizes Turkish token text to lower case.
+ * <p>
+ * Turkish and Azeri have unique casing behavior for some characters. This
+ * filter applies Turkish lowercase rules. For more information, see <a
+ * href="http://en.wikipedia.org/wiki/Turkish_dotted_and_dotless_I"
+ * >http://en.wikipedia.org/wiki/Turkish_dotted_and_dotless_I</a>
+ * </p>
+ */
+public final class TurkishLowerCaseFilter extends TokenFilter {
+  private static final int LATIN_CAPITAL_LETTER_I = '\u0049';
+  private static final int LATIN_SMALL_LETTER_I = '\u0069';
+  private static final int LATIN_SMALL_LETTER_DOTLESS_I = '\u0131';
+  private static final int COMBINING_DOT_ABOVE = '\u0307';
+  private final TermAttribute termAtt;
+  
+  /**
+   * Create a new TurkishLowerCaseFilter, that normalizes Turkish token text 
+   * to lower case.
+   * 
+   * @param in TokenStream to filter
+   */
+  public TurkishLowerCaseFilter(TokenStream in) {
+    super(in);
+    termAtt = addAttribute(TermAttribute.class);
+  }
+  
+  @Override
+  public final boolean incrementToken() throws IOException {
+    boolean iOrAfter = false;
+    
+    if (input.incrementToken()) {
+      final char[] buffer = termAtt.termBuffer();
+      int length = termAtt.termLength();
+      for (int i = 0; i < length;) {
+        final int ch = Character.codePointAt(buffer, i);
+    
+        iOrAfter = (ch == LATIN_CAPITAL_LETTER_I || 
+            (iOrAfter && Character.getType(ch) == Character.NON_SPACING_MARK));
+        
+        if (iOrAfter) { // all the special I turkish handling happens here.
+          switch(ch) {
+            // remove COMBINING_DOT_ABOVE to mimic composed lowercase
+            case COMBINING_DOT_ABOVE:
+              length = delete(buffer, i, length);
+              continue;
+            // i itself, it depends if it is followed by COMBINING_DOT_ABOVE
+            // if it is, we will make it small i and later remove the dot
+            case LATIN_CAPITAL_LETTER_I:
+              if (isBeforeDot(buffer, i + 1, length)) {
+                buffer[i] = LATIN_SMALL_LETTER_I;
+              } else {
+                buffer[i] = LATIN_SMALL_LETTER_DOTLESS_I;
+                // below is an optimization. no COMBINING_DOT_ABOVE follows,
+                // so don't waste time calculating Character.getType(), etc
+                iOrAfter = false;
+              }
+              i++;
+              continue;
+          }
+        }
+        
+        i += Character.toChars(Character.toLowerCase(ch), buffer, i);
+      }
+      
+      termAtt.setTermLength(length);
+      return true;
+    } else
+      return false;
+  }
+  
+  
+  /**
+   * lookahead for a combining dot above.
+   * other NSMs may be in between.
+   */
+  private boolean isBeforeDot(char s[], int pos, int len) {
+    for (int i = pos; i < len;) {
+      final int ch = Character.codePointAt(s, i);
+      if (Character.getType(ch) != Character.NON_SPACING_MARK)
+        return false;
+      if (ch == COMBINING_DOT_ABOVE)
+        return true;
+      i += Character.charCount(ch);
+    }
+    
+    return false;
+  }
+  
+  /**
+   * delete a character in-place.
+   * rarely happens, only if COMBINING_DOT_ABOVE is found after an i
+   */
+  private int delete(char s[], int pos, int len) {
+    if (pos < len) 
+      System.arraycopy(s, pos + 1, s, pos, len - pos - 1);
+    
+    return len - 1;
+  }
+}

Propchange: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/tr/TurkishLowerCaseFilter.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/tr/TurkishLowerCaseFilter.java
------------------------------------------------------------------------------
    svn:keywords = Date Author Id Revision HeadURL

Added: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/tr/package.html
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/tr/package.html?rev=887535&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/tr/package.html
(added)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/tr/package.html
Sat Dec  5 12:46:05 2009
@@ -0,0 +1,31 @@
+<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<html><head></head>
+<body>
+Support for Turkish.
+<p>
+This package contains just the TokenStream for handling turkish casing,
+for a stemmer please see the snowball package. 
+</p>
+<p>
+WARNING: SnowballAnalyzer uses LowerCaseFilter by default, even when the
+language is set to Turkish, so you will need to construct your own
+analyzer that combines TurkishLowerCaseFilter and SnowballFilter.
+</p>
+</body>
+</html>
\ No newline at end of file

Propchange: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/tr/package.html
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/tr/package.html
------------------------------------------------------------------------------
    svn:keywords = Date Author Id Revision HeadURL

Added: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/tr/TestTurkishLowerCaseFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/tr/TestTurkishLowerCaseFilter.java?rev=887535&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/tr/TestTurkishLowerCaseFilter.java
(added)
+++ lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/tr/TestTurkishLowerCaseFilter.java
Sat Dec  5 12:46:05 2009
@@ -0,0 +1,65 @@
+package org.apache.lucene.analysis.tr;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.StringReader;
+
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.WhitespaceTokenizer;
+
+/**
+ * Test the Turkish lowercase filter.
+ */
+public class TestTurkishLowerCaseFilter extends BaseTokenStreamTestCase {
+  
+  /**
+   * Test composed forms
+   */
+  public void testTurkishLowerCaseFilter() throws Exception {
+    TokenStream stream = new WhitespaceTokenizer(new StringReader(
+        "\u0130STANBUL \u0130ZM\u0130R ISPARTA"));
+    TurkishLowerCaseFilter filter = new TurkishLowerCaseFilter(stream);
+    assertTokenStreamContents(filter, new String[] {"istanbul", "izmir",
+        "\u0131sparta",});
+  }
+  
+  /**
+   * Test decomposed forms
+   */
+  public void testDecomposed() throws Exception {
+    TokenStream stream = new WhitespaceTokenizer(new StringReader(
+        "\u0049\u0307STANBUL \u0049\u0307ZM\u0049\u0307R ISPARTA"));
+    TurkishLowerCaseFilter filter = new TurkishLowerCaseFilter(stream);
+    assertTokenStreamContents(filter, new String[] {"istanbul", "izmir",
+        "\u0131sparta",});
+  }
+  
+  /**
+   * Test decomposed forms with additional accents
+   * In this example, U+0049 + U+0316 + U+0307 is canonically equivalent
+   * to U+0130 + U+0316, and is lowercased the same way.
+   */
+  public void testDecomposed2() throws Exception {
+    TokenStream stream = new WhitespaceTokenizer(new StringReader(
+        "\u0049\u0316\u0307STANBUL \u0049\u0307ZM\u0049\u0307R I\u0316SPARTA"));
+    TurkishLowerCaseFilter filter = new TurkishLowerCaseFilter(stream);
+    assertTokenStreamContents(filter, new String[] {"i\u0316stanbul", "izmir",
+        "\u0131\u0316sparta",});
+  }
+}

Propchange: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/tr/TestTurkishLowerCaseFilter.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/tr/TestTurkishLowerCaseFilter.java
------------------------------------------------------------------------------
    svn:keywords = Date Author Id Revision HeadURL



Mime
View raw message