lucene-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From r...@apache.org
Subject svn commit: r1556617 - in /lucene/dev/branches/branch_4x/lucene: ./ analysis/common/src/java/org/apache/lucene/analysis/core/ analysis/common/src/java/org/apache/lucene/analysis/util/ analysis/common/src/test/org/apache/lucene/analysis/core/
Date Wed, 08 Jan 2014 19:51:53 GMT
Author: ryan
Date: Wed Jan  8 19:51:53 2014
New Revision: 1556617

URL: http://svn.apache.org/r1556617
Log:
LUCENE-5369: Added an UpperCaseFilter to make UPPERCASE tokens

Added:
    lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/UpperCaseFilter.java
    lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/UpperCaseFilterFactory.java
Modified:
    lucene/dev/branches/branch_4x/lucene/CHANGES.txt
    lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/CharacterUtils.java
    lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestAnalyzers.java

Modified: lucene/dev/branches/branch_4x/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/CHANGES.txt?rev=1556617&r1=1556616&r2=1556617&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/CHANGES.txt (original)
+++ lucene/dev/branches/branch_4x/lucene/CHANGES.txt Wed Jan  8 19:51:53 2014
@@ -28,6 +28,9 @@ New Features
 
 * LUCENE-5379: Add Analyzer for Kurdish.  (Robert Muir)
 
+* LUCENE-5369: Added an UpperCaseFilter to make UPPERCASE tokens. (ryan)
+
+
 Build
 
 * LUCENE-5217: Maven config: get dependencies from Ant+Ivy config; disable

Added: lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/UpperCaseFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/UpperCaseFilter.java?rev=1556617&view=auto
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/UpperCaseFilter.java
(added)
+++ lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/UpperCaseFilter.java
Wed Jan  8 19:51:53 2014
@@ -0,0 +1,62 @@
+package org.apache.lucene.analysis.core;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.util.CharacterUtils;
+import org.apache.lucene.util.Version;
+
+/**
+ * Normalizes token text to UPPER CASE.
+ * <a name="version"/>
+ * <p>You must specify the required {@link Version}
+ * compatibility when creating UpperCaseFilter
+ * 
+ * <p><b>NOTE:</b> In Unicode, this transformation may lose information
when the
+ * upper case character represents more than one lower case character. Use this filter
+ * when you require uppercase tokens.  Use the {@link LowerCaseFilter} for 
+ * general search matching
+ */
+public final class UpperCaseFilter extends TokenFilter {
+  private final CharacterUtils charUtils;
+  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+  
+  /**
+   * Create a new UpperCaseFilter, that normalizes token text to upper case.
+   * 
+   * @param matchVersion See <a href="#version">above</a>
+   * @param in TokenStream to filter
+   */
+  public UpperCaseFilter(Version matchVersion, TokenStream in) {
+    super(in);
+    charUtils = CharacterUtils.getInstance(matchVersion);
+  }
+  
+  @Override
+  public final boolean incrementToken() throws IOException {
+    if (input.incrementToken()) {
+      charUtils.toUpperCase(termAtt.buffer(), 0, termAtt.length());
+      return true;
+    } else
+      return false;
+  }
+}

Added: lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/UpperCaseFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/UpperCaseFilterFactory.java?rev=1556617&view=auto
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/UpperCaseFilterFactory.java
(added)
+++ lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/UpperCaseFilterFactory.java
Wed Jan  8 19:51:53 2014
@@ -0,0 +1,63 @@
+package org.apache.lucene.analysis.core;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.Map;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.core.UpperCaseFilter;
+import org.apache.lucene.analysis.util.AbstractAnalysisFactory;
+import org.apache.lucene.analysis.util.MultiTermAwareComponent;
+import org.apache.lucene.analysis.util.TokenFilterFactory;
+
+/**
+ * Factory for {@link UpperCaseFilter}. 
+ * <pre class="prettyprint">
+ * &lt;fieldType name="text_uppercase" class="solr.TextField" positionIncrementGap="100"&gt;
+ *   &lt;analyzer&gt;
+ *     &lt;tokenizer class="solr.WhitespaceTokenizerFactory"/&gt;
+ *     &lt;filter class="solr.UpperCaseFilterFactory"/&gt;
+ *   &lt;/analyzer&gt;
+ * &lt;/fieldType&gt;</pre>
+ * 
+ * <p><b>NOTE:</b> In Unicode, this transformation may lose information
when the
+ * upper case character represents more than one lower case character. Use this filter
+ * when you require uppercase tokens.  Use the {@link LowerCaseFilterFactory} for 
+ * general search matching
+ */
+public class UpperCaseFilterFactory extends TokenFilterFactory implements MultiTermAwareComponent
{
+  
+  /** Creates a new UpperCaseFilterFactory */
+  public UpperCaseFilterFactory(Map<String,String> args) {
+    super(args);
+    assureMatchVersion();
+    if (!args.isEmpty()) {
+      throw new IllegalArgumentException("Unknown parameters: " + args);
+    }
+  }
+
+  @Override
+  public UpperCaseFilter create(TokenStream input) {
+    return new UpperCaseFilter(luceneMatchVersion,input);
+  }
+
+  @Override
+  public AbstractAnalysisFactory getMultiTermComponent() {
+    return this;
+  }
+}

Modified: lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/CharacterUtils.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/CharacterUtils.java?rev=1556617&r1=1556616&r2=1556617&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/CharacterUtils.java
(original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/CharacterUtils.java
Wed Jan  8 19:51:53 2014
@@ -132,6 +132,23 @@ public abstract class CharacterUtils {
      }
   }
 
+  /**
+   * Converts each unicode codepoint to UpperCase via {@link Character#toUpperCase(int)}
starting 
+   * at the given offset.
+   * @param buffer the char buffer to UPPERCASE
+   * @param offset the offset to start at
+   * @param limit the max char in the buffer to lower case
+   */
+  public final void toUpperCase(final char[] buffer, final int offset, final int limit) {
+    assert buffer.length >= limit;
+    assert offset <=0 && offset <= buffer.length;
+    for (int i = offset; i < limit;) {
+      i += Character.toChars(
+              Character.toUpperCase(
+                  codePointAt(buffer, i, limit)), buffer, i);
+     }
+  }
+
   /** Converts a sequence of Java characters to a sequence of unicode code points.
    *  @return the number of code points written to the destination buffer */
   public final int toCodePoints(char[] src, int srcOff, int srcLen, int[] dest, int destOff)
{

Modified: lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestAnalyzers.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestAnalyzers.java?rev=1556617&r1=1556616&r2=1556617&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestAnalyzers.java
(original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestAnalyzers.java
Wed Jan  8 19:51:53 2014
@@ -129,6 +129,17 @@ public class TestAnalyzers extends BaseT
     
   }
   
+  private static class UpperCaseWhitespaceAnalyzer extends Analyzer {
+
+    @Override
+    public TokenStreamComponents createComponents(String fieldName, Reader reader) {
+      Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader);
+      return new TokenStreamComponents(tokenizer, new UpperCaseFilter(TEST_VERSION_CURRENT,
tokenizer));
+    }
+    
+  }
+  
+  
   /**
    * Test that LowercaseFilter handles entire unicode range correctly
    */
@@ -148,6 +159,27 @@ public class TestAnalyzers extends BaseT
     assertAnalyzesTo(a, "AbaC\uDC16AdaBa", 
         new String [] { "abac\uDC16adaba" });
   }
+
+  /**
+   * Test that LowercaseFilter handles entire unicode range correctly
+   */
+  public void testUpperCaseFilter() throws IOException {
+    Analyzer a = new UpperCaseWhitespaceAnalyzer();
+    // BMP
+    assertAnalyzesTo(a, "AbaCaDabA", new String[] { "ABACADABA" });
+    // supplementary
+    assertAnalyzesTo(a, "\ud801\udc3e\ud801\udc3e\ud801\udc3e\ud801\udc3e",
+          new String[] {"\ud801\udc16\ud801\udc16\ud801\udc16\ud801\udc16"});
+    assertAnalyzesTo(a, "AbaCa\ud801\udc3eDabA", 
+         new String[] { "ABACA\ud801\udc16DABA" });
+    // unpaired lead surrogate
+    assertAnalyzesTo(a, "AbaC\uD801AdaBa", 
+        new String [] { "ABAC\uD801ADABA" });
+    // unpaired trail surrogate
+    assertAnalyzesTo(a, "AbaC\uDC16AdaBa", 
+        new String [] { "ABAC\uDC16ADABA" });
+  }
+  
   
   /**
    * Test that LowercaseFilter handles the lowercasing correctly if the term



Mime
View raw message