lucene-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From rm...@apache.org
Subject svn commit: r937039 - in /lucene/dev/trunk/lucene/contrib: CHANGES.txt icu/src/java/org/apache/lucene/analysis/icu/ICUTransformFilter.java icu/src/java/overview.html icu/src/test/org/apache/lucene/analysis/icu/TestICUTransformFilter.java
Date Thu, 22 Apr 2010 19:55:32 GMT
Author: rmuir
Date: Thu Apr 22 19:55:31 2010
New Revision: 937039

URL: http://svn.apache.org/viewvc?rev=937039&view=rev
Log:
LUCENE-2409: Add ICUTransformFilter to support ICU transforms

Added:
    lucene/dev/trunk/lucene/contrib/icu/src/java/org/apache/lucene/analysis/icu/ICUTransformFilter.java
  (with props)
    lucene/dev/trunk/lucene/contrib/icu/src/test/org/apache/lucene/analysis/icu/TestICUTransformFilter.java
  (with props)
Modified:
    lucene/dev/trunk/lucene/contrib/CHANGES.txt
    lucene/dev/trunk/lucene/contrib/icu/src/java/overview.html

Modified: lucene/dev/trunk/lucene/contrib/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/contrib/CHANGES.txt?rev=937039&r1=937038&r2=937039&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/contrib/CHANGES.txt (original)
+++ lucene/dev/trunk/lucene/contrib/CHANGES.txt Thu Apr 22 19:55:31 2010
@@ -133,6 +133,10 @@ New features
    does a more thorough job of normalizing unicode text for search.
    (Robert Haschart, Robert Muir)
 
+ * LUCENE-2409: Add ICUTransformFilter, which transforms text in a context
+   sensitive way, either from ICU built-in rules (such as Traditional-Simplified),
+   or from rules you write yourself.  (Robert Muir)
+
 Build
 
  * LUCENE-2124: Moved the JDK-based collation support from contrib/collation 

Added: lucene/dev/trunk/lucene/contrib/icu/src/java/org/apache/lucene/analysis/icu/ICUTransformFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/contrib/icu/src/java/org/apache/lucene/analysis/icu/ICUTransformFilter.java?rev=937039&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/contrib/icu/src/java/org/apache/lucene/analysis/icu/ICUTransformFilter.java (added)
+++ lucene/dev/trunk/lucene/contrib/icu/src/java/org/apache/lucene/analysis/icu/ICUTransformFilter.java Thu Apr 22 19:55:31 2010
@@ -0,0 +1,184 @@
+package org.apache.lucene.analysis.icu;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+
+import com.ibm.icu.text.Replaceable;
+import com.ibm.icu.text.Transliterator;
+import com.ibm.icu.text.UTF16;
+import com.ibm.icu.text.UnicodeSet;
+
+/**
+ * A {@link TokenFilter} that transforms text with ICU.
+ * <p>
+ * ICU provides text-transformation functionality via its Transliteration API.
+ * Although script conversion is its most common use, a Transliterator can
+ * actually perform a more general class of tasks. In fact, Transliterator
+ * defines a very general API which specifies only that a segment of the input
+ * text is replaced by new text. The particulars of this conversion are
+ * determined entirely by subclasses of Transliterator.
+ * </p>
+ * <p>
+ * Some useful transformations for search are built-in:
+ * <ul>
+ * <li>Conversion from Traditional to Simplified Chinese characters
+ * <li>Conversion from Hiragana to Katakana
+ * <li>Conversion from Fullwidth to Halfwidth forms.
+ * <li>Script conversions, for example Serbian Cyrillic to Latin
+ * </ul>
+ * </p>
+ * <p>
+ * Example usage: <blockquote>stream = new ICUTransformFilter(stream,
+ * Transliterator.getInstance("Traditional-Simplified"));</blockquote>
+ * </p>
+ * For more details, see the <a
+ * href="http://userguide.icu-project.org/transforms/general">ICU User
+ * Guide</a>.
+ */
+public final class ICUTransformFilter extends TokenFilter {
+  // Transliterator applied to the term text of every token
+  private final Transliterator transform;
+
+  // Reusable position object, reset for every token
+  private final Transliterator.Position position = new Transliterator.Position();
+
+  // term attribute, will be updated in place with the transformed text.
+  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+
+  // Exposes the term attribute through ICU's Replaceable interface.
+  private final ReplaceableTermAttribute replaceableAttribute = new ReplaceableTermAttribute();
+
+  /**
+   * Create a new ICUTransformFilter that transforms text on the given stream.
+   * 
+   * @param input {@link TokenStream} to filter.
+   * @param transform Transliterator to transform the text; a filter may be set on it here.
+   */
+  public ICUTransformFilter(TokenStream input, Transliterator transform) {
+    super(input);
+    this.transform = transform;
+
+    /* 
+     * This is cheating, but speeds things up a lot: a rule-based transform without a filter
+     * gets its source set installed as one. With pkg-private APIs we could probably do better.
+     */
+    if (transform.getFilter() == null && transform instanceof com.ibm.icu.text.RuleBasedTransliterator) {
+      final UnicodeSet sourceSet = transform.getSourceSet();
+      if (sourceSet != null && !sourceSet.isEmpty())
+        transform.setFilter(sourceSet);
+    }
+  }
+
+  @Override
+  public boolean incrementToken() throws IOException {
+    /*
+     * Wrap the term in the Replaceable, reset the position to span it, and transliterate.
+     */
+    if (input.incrementToken()) {
+      replaceableAttribute.setText(termAtt);
+      
+      final int length = termAtt.length(); 
+      position.start = 0;
+      position.limit = length;
+      position.contextStart = 0;
+      position.contextLimit = length;
+
+      transform.filteredTransliterate(replaceableAttribute, position, false);
+      return true;
+    } else {
+      return false;
+    }
+  }
+  
+  /**
+   * Wrap a {@link CharTermAttribute} with the Replaceable API; edits go to its buffer.
+   */
+  final class ReplaceableTermAttribute implements Replaceable {
+    private char buffer[];
+    private int length;
+    private CharTermAttribute token;
+
+    void setText(final CharTermAttribute token) {
+      this.token = token;
+      this.buffer = token.buffer();
+      this.length = token.length();
+    }
+
+    public int char32At(int pos) {
+      return UTF16.charAt(buffer, 0, length, pos);
+    }
+
+    public char charAt(int pos) {
+      return buffer[pos];
+    }
+
+    public void copy(int start, int limit, int dest) {
+      char text[] = new char[limit - start];
+      getChars(start, limit, text, 0);
+      replace(dest, dest, text, 0, limit - start);
+    }
+
+    public void getChars(int srcStart, int srcLimit, char[] dst, int dstStart) {
+      System.arraycopy(buffer, srcStart, dst, dstStart, srcLimit - srcStart);
+    }
+
+    public boolean hasMetaData() {
+      return false;
+    }
+
+    public int length() {
+      return length;
+    }
+
+    public void replace(int start, int limit, String text) {
+      final int charsLen = text.length();
+      final int newLength = shiftForReplace(start, limit, charsLen);
+      // insert the replacement text
+      text.getChars(0, charsLen, buffer, start);
+      token.setLength(length = newLength);
+    }
+
+    public void replace(int start, int limit, char[] text, int charsStart,
+        int charsLen) {
+      // shift text if necessary for the replacement
+      final int newLength = shiftForReplace(start, limit, charsLen);
+      // insert the replacement text
+      System.arraycopy(text, charsStart, buffer, start, charsLen);
+      token.setLength(length = newLength);
+    }
+
+    /** shift text (if necessary) for a replacement operation; returns the resulting length */
+    private int shiftForReplace(int start, int limit, int charsLen) {
+      final int replacementLength = limit - start;
+      final int newLength = length - replacementLength + charsLen;
+      // resize if necessary
+      if (newLength > length)
+        buffer = token.resizeBuffer(newLength);
+      // if the substring being replaced is longer or shorter than the
+      // replacement, need to shift the tail after it to its new offset
+      if (replacementLength != charsLen && limit < length)
+        System.arraycopy(buffer, limit, buffer, start + charsLen, length - limit);
+      return newLength;
+    }
+  }
+}

Propchange: lucene/dev/trunk/lucene/contrib/icu/src/java/org/apache/lucene/analysis/icu/ICUTransformFilter.java
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: lucene/dev/trunk/lucene/contrib/icu/src/java/overview.html
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/contrib/icu/src/java/overview.html?rev=937039&r1=937038&r2=937039&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/contrib/icu/src/java/overview.html (original)
+++ lucene/dev/trunk/lucene/contrib/icu/src/java/overview.html Thu Apr 22 19:55:31 2010
@@ -38,6 +38,8 @@ APIs. This module exposes the following 
   Unicode's Default Caseless Matching algorithm.</li>
   <li><a href="#searchfolding">Search Term Folding</a>: Removes distinctions
   (such as accent marks) between similar characters for a loose or fuzzy search.</li>
+  <li><a href="#transform">Text Transformation</a>: Transforms Unicode text in
+  a context-sensitive fashion: e.g. mapping Traditional to Simplified Chinese</li>
 </ul>
 <hr/>
 <h1><a name="collation">Collation</a></h1>
@@ -286,6 +288,42 @@ many character foldings recursively.
   TokenStream tokenstream = new ICUFoldingFilter(tokenizer);
 </pre></code>
 <hr/>
+<h1><a name="transform">Text Transformation</a></h1>
+<p>
+ICU provides text-transformation functionality via its Transliteration API. This allows
+you to transform text in a variety of ways, taking context into account.
+</p>
+<p>
+For more information, see the 
+<a href="http://userguide.icu-project.org/transforms/general">User's Guide</a>
+and 
+<a href="http://userguide.icu-project.org/transforms/general/rules">Rule Tutorial</a>.
+</p>
+<h2>Use Cases</h2>
+<ul>
+  <li>
+    Convert Traditional to Simplified 
+  </li>
+  <li>
+    Transliterate between different writing systems: e.g. Romanization
+  </li>
+</ul>
+<h2>Example Usages</h2>
+<h3>Convert Traditional to Simplified</h3>
+<code><pre>
+  /**
+   * This filter will map Traditional Chinese to Simplified Chinese
+   */
+  TokenStream tokenstream = new ICUTransformFilter(tokenizer, Transliterator.getInstance("Traditional-Simplified"));
+</pre></code>
+<h3>Transliterate Serbian Cyrillic to Serbian Latin</h3>
+  <code><pre>
+  /**
+   * This filter will map Serbian Cyrillic to Serbian Latin according to BGN rules
+   */
+  TokenStream tokenstream = new ICUTransformFilter(tokenizer, Transliterator.getInstance("Serbian-Latin/BGN"));
+</pre></code>
+<hr/>
 <h1><a name="backcompat">Backwards Compatibility</a></h1>
 <p>
 This module exists to provide up-to-date Unicode functionality that supports

Added: lucene/dev/trunk/lucene/contrib/icu/src/test/org/apache/lucene/analysis/icu/TestICUTransformFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/contrib/icu/src/test/org/apache/lucene/analysis/icu/TestICUTransformFilter.java?rev=937039&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/contrib/icu/src/test/org/apache/lucene/analysis/icu/TestICUTransformFilter.java (added)
+++ lucene/dev/trunk/lucene/contrib/icu/src/test/org/apache/lucene/analysis/icu/TestICUTransformFilter.java Thu Apr 22 19:55:31 2010
@@ -0,0 +1,86 @@
+package org.apache.lucene.analysis.icu;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.StringReader;
+
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.KeywordTokenizer;
+import org.apache.lucene.analysis.TokenStream;
+
+import com.ibm.icu.text.Transliterator;
+import com.ibm.icu.text.UnicodeSet;
+
+
+/**
+ * Test the ICUTransformFilter with some basic examples.
+ */
+public class TestICUTransformFilter extends BaseTokenStreamTestCase {
+  
+  public void testBasicFunctionality() throws Exception {
+    checkToken(Transliterator.getInstance("Traditional-Simplified"), 
+        "簡化字", "简化字"); 
+    checkToken(Transliterator.getInstance("Katakana-Hiragana"), 
+        "ヒラガナ", "ひらがな");
+    checkToken(Transliterator.getInstance("Fullwidth-Halfwidth"), 
+        "アルアノリウ", "ｱﾙｱﾉﾘｳ");
+    checkToken(Transliterator.getInstance("Any-Latin"), 
+        "Αλφαβητικός Κατάλογος", "Alphabētikós Katálogos");
+    checkToken(Transliterator.getInstance("NFD; [:Nonspacing Mark:] Remove"), 
+        "Alphabētikós Katálogos", "Alphabetikos Katalogos");
+    checkToken(Transliterator.getInstance("Han-Latin"),
+        "中国", "zhōng guó");
+  }
+  
+  public void testCustomFunctionality() throws Exception {
+    String rules = "a > b; b > c;"; // convert a's to b's and b's to c's
+    checkToken(Transliterator.createFromRules("test", rules, Transliterator.FORWARD), "abacadaba", "bcbcbdbcb");
+  }
+  
+  public void testCustomFunctionality2() throws Exception {
+    String rules = "c { a > b; a > d;"; // an a preceded by c becomes b; any other a becomes d
+    checkToken(Transliterator.createFromRules("test", rules, Transliterator.FORWARD), "caa", "cbd");
+  }
+  
+  public void testOptimizer() throws Exception {
+    String rules = "a > b; b > c;"; // convert a's to b's and b's to c's
+    Transliterator custom = Transliterator.createFromRules("test", rules, Transliterator.FORWARD);
+    assertNull(custom.getFilter());
+    new ICUTransformFilter(new KeywordTokenizer(new StringReader("")), custom);
+    assertEquals(new UnicodeSet("[ab]"), custom.getFilter());
+  }
+  
+  public void testOptimizer2() throws Exception {
+    checkToken(Transliterator.getInstance("Traditional-Simplified; CaseFold"), 
+        "ABCDE", "abcde");
+  }
+  
+  public void testOptimizerSurrogate() throws Exception {
+    String rules = "\\U00020087 > x;"; // convert CJK UNIFIED IDEOGRAPH-20087 to an x
+    Transliterator custom = Transliterator.createFromRules("test", rules, Transliterator.FORWARD);
+    assertNull(custom.getFilter());
+    new ICUTransformFilter(new KeywordTokenizer(new StringReader("")), custom);
+    assertEquals(new UnicodeSet("[\\U00020087]"), custom.getFilter());
+  }
+
+  private void checkToken(Transliterator transform, String input, String expected) throws IOException {
+    TokenStream ts = new ICUTransformFilter(new KeywordTokenizer((new StringReader(input))), transform);
+    assertTokenStreamContents(ts, new String[] { expected });
+  }
+}

Propchange: lucene/dev/trunk/lucene/contrib/icu/src/test/org/apache/lucene/analysis/icu/TestICUTransformFilter.java
------------------------------------------------------------------------------
    svn:eol-style = native



Mime
View raw message