lucene-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From rm...@apache.org
Subject svn commit: r1583525 - in /lucene/dev/trunk/lucene: ./ analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ analysis/common/src/resources/META-INF/services/ analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/
Date Tue, 01 Apr 2014 04:05:15 GMT
Author: rmuir
Date: Tue Apr  1 04:05:14 2014
New Revision: 1583525

URL: http://svn.apache.org/r1583525
Log:
LUCENE-5558: Add TruncateTokenFilter

Added:
    lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/TruncateTokenFilter.java
  (with props)
    lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/TruncateTokenFilterFactory.java
  (with props)
    lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestTruncateTokenFilter.java
  (with props)
    lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestTruncateTokenFilterFactory.java
  (with props)
Modified:
    lucene/dev/trunk/lucene/CHANGES.txt
    lucene/dev/trunk/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory

Modified: lucene/dev/trunk/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/CHANGES.txt?rev=1583525&r1=1583524&r2=1583525&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/CHANGES.txt (original)
+++ lucene/dev/trunk/lucene/CHANGES.txt Tue Apr  1 04:05:14 2014
@@ -135,6 +135,9 @@ New Features
   resort the hits from a first pass search using a Sort or an
   Expression. (Simon Willnauer, Robert Muir, Mike McCandless)
 
+* LUCENE-5558: Add TruncateTokenFilter which truncates terms to
+  the specified length.  (Ahmet Arslan via Robert Muir)
+
 API Changes
 
 * LUCENE-5454: Add RandomAccessOrds, an optional extension of SortedSetDocValues

Added: lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/TruncateTokenFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/TruncateTokenFilter.java?rev=1583525&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/TruncateTokenFilter.java
(added)
+++ lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/TruncateTokenFilter.java
Tue Apr  1 04:05:14 2014
@@ -0,0 +1,58 @@
+package org.apache.lucene.analysis.miscellaneous;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
+
+import java.io.IOException;
+
+/**
+ * A token filter for truncating the terms into a specific length.
+ * Fixed prefix truncation, as a stemming method, produces good results on Turkish language.
+ * It is reported that F5, using first 5 characters, produced best results in
+ * <a href="http://www.users.muohio.edu/canf/papers/JASIST2008offPrint.pdf">
+ * Information Retrieval on Turkish Texts</a>
+ */
+public final class TruncateTokenFilter extends TokenFilter {
+
+  private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class);
+  private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
+
+  private final int length;
+
+  public TruncateTokenFilter(TokenStream input, int length) {
+    super(input);
+    if (length < 1)
+      throw new IllegalArgumentException("length parameter must be a positive number: " +
length);
+    this.length = length;
+  }
+
+  @Override
+  public final boolean incrementToken() throws IOException {
+    if (input.incrementToken()) {
+      if (!keywordAttr.isKeyword() && termAttribute.length() > length)
+        termAttribute.setLength(length);
+      return true;
+    } else {
+      return false;
+    }
+  }
+}
\ No newline at end of file

Added: lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/TruncateTokenFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/TruncateTokenFilterFactory.java?rev=1583525&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/TruncateTokenFilterFactory.java
(added)
+++ lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/TruncateTokenFilterFactory.java
Tue Apr  1 04:05:14 2014
@@ -0,0 +1,59 @@
+package org.apache.lucene.analysis.miscellaneous;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.util.TokenFilterFactory;
+
+import java.util.Map;
+
+/**
+ * Factory for {@link org.apache.lucene.analysis.miscellaneous.TruncateTokenFilter}. The
following type is recommended for "<i>diacritics-insensitive search</i>" for Turkish.
+ * <pre class="prettyprint">
+ * &lt;fieldType name="text_tr_ascii_f5" class="solr.TextField" positionIncrementGap="100"&gt;
+ *   &lt;analyzer&gt;
+ *     &lt;tokenizer class="solr.StandardTokenizerFactory"/&gt;
+ *     &lt;filter class="solr.ApostropheFilterFactory"/&gt;
+ *     &lt;filter class="solr.TurkishLowerCaseFilterFactory"/&gt;
+ *     &lt;filter class="solr.ASCIIFoldingFilterFactory" preserveOriginal="true"/&gt;
+ *     &lt;filter class="solr.KeywordRepeatFilterFactory"/&gt;
+ *     &lt;filter class="solr.TruncateTokenFilterFactory" prefixLength="5"/&gt;
+ *     &lt;filter class="solr.RemoveDuplicatesTokenFilterFactory"/&gt;
+ *   &lt;/analyzer&gt;
+ * &lt;/fieldType&gt;</pre>
+ */
+public class TruncateTokenFilterFactory extends TokenFilterFactory {
+
+  public static final String PREFIX_LENGTH_KEY = "prefixLength";
+  private final byte prefixLength;
+
+  public TruncateTokenFilterFactory(Map<String, String> args) {
+    super(args);
+    prefixLength = Byte.parseByte(get(args, PREFIX_LENGTH_KEY, "5"));
+    if (prefixLength < 1)
+      throw new IllegalArgumentException(PREFIX_LENGTH_KEY + " parameter must be a positive
number: " + prefixLength);
+    if (!args.isEmpty()) {
+      throw new IllegalArgumentException("Unknown parameter(s): " + args);
+    }
+  }
+
+  @Override
+  public TokenStream create(TokenStream input) {
+    return new TruncateTokenFilter(input, prefixLength);
+  }
+}

Modified: lucene/dev/trunk/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory?rev=1583525&r1=1583524&r2=1583525&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory
(original)
+++ lucene/dev/trunk/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory
Tue Apr  1 04:05:14 2014
@@ -69,6 +69,7 @@ org.apache.lucene.analysis.miscellaneous
 org.apache.lucene.analysis.miscellaneous.RemoveDuplicatesTokenFilterFactory
 org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilterFactory
 org.apache.lucene.analysis.miscellaneous.TrimFilterFactory
+org.apache.lucene.analysis.miscellaneous.TruncateTokenFilterFactory
 org.apache.lucene.analysis.miscellaneous.WordDelimiterFilterFactory
 org.apache.lucene.analysis.miscellaneous.ScandinavianFoldingFilterFactory
 org.apache.lucene.analysis.miscellaneous.ScandinavianNormalizationFilterFactory

Added: lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestTruncateTokenFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestTruncateTokenFilter.java?rev=1583525&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestTruncateTokenFilter.java
(added)
+++ lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestTruncateTokenFilter.java
Tue Apr  1 04:05:14 2014
@@ -0,0 +1,39 @@
+package org.apache.lucene.analysis.miscellaneous;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.TokenStream;
+import org.junit.Test;
+
+/**
+ * Test the truncate token filter.
+ */
+public class TestTruncateTokenFilter extends BaseTokenStreamTestCase {
+
+  public void testTruncating() throws Exception {
+    TokenStream stream = whitespaceMockTokenizer("abcdefg 1234567 ABCDEFG abcde abc 12345
123");
+    stream = new TruncateTokenFilter(stream, 5);
+    assertTokenStreamContents(stream, new String[]{"abcde", "12345", "ABCDE", "abcde", "abc",
"12345", "123"});
+  }
+
+  @Test(expected = IllegalArgumentException.class)
+  public void testNonPositiveLength() throws Exception {
+    new TruncateTokenFilter(whitespaceMockTokenizer("length must be a positive number"),
-48);
+  }
+}

Added: lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestTruncateTokenFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestTruncateTokenFilterFactory.java?rev=1583525&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestTruncateTokenFilterFactory.java
(added)
+++ lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestTruncateTokenFilterFactory.java
Tue Apr  1 04:05:14 2014
@@ -0,0 +1,73 @@
+package org.apache.lucene.analysis.miscellaneous;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.util.BaseTokenStreamFactoryTestCase;
+
+import java.io.Reader;
+import java.io.StringReader;
+
+/**
+ * Simple tests to ensure the simple truncation filter factory is working.
+ */
+public class TestTruncateTokenFilterFactory extends BaseTokenStreamFactoryTestCase {
+  /**
+   * Ensure the filter actually truncates text.
+   */
+  public void testTruncating() throws Exception {
+    Reader reader = new StringReader("abcdefg 1234567 ABCDEFG abcde abc 12345 123");
+    TokenStream stream = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+    ((Tokenizer) stream).setReader(reader);
+    stream = tokenFilterFactory("Truncate",
+        TruncateTokenFilterFactory.PREFIX_LENGTH_KEY, "5").create(stream);
+    assertTokenStreamContents(stream, new String[]{"abcde", "12345", "ABCDE", "abcde", "abc",
"12345", "123"});
+  }
+
+  /**
+   * Test that bogus arguments result in exception
+   */
+  public void testBogusArguments() throws Exception {
+    try {
+      tokenFilterFactory("Truncate",
+          TruncateTokenFilterFactory.PREFIX_LENGTH_KEY, "5",
+          "bogusArg", "bogusValue");
+      fail();
+    } catch (IllegalArgumentException expected) {
+      assertTrue(expected.getMessage().contains("Unknown parameter(s):"));
+    }
+  }
+
+  /**
+   * Test that negative prefix length result in exception
+   */
+  public void testNonPositivePrefixLengthArgument() throws Exception {
+    try {
+      tokenFilterFactory("Truncate",
+          TruncateTokenFilterFactory.PREFIX_LENGTH_KEY, "-5"
+      );
+      fail();
+    } catch (IllegalArgumentException expected) {
+      assertTrue(expected.getMessage().contains(TruncateTokenFilterFactory.PREFIX_LENGTH_KEY
+ " parameter must be a positive number: -5"));
+    }
+  }
+}
+
+



Mime
View raw message