lucene-java-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From gsing...@apache.org
Subject svn commit: r725691 - in /lucene/java/trunk: ./ contrib/analyzers/src/java/org/apache/lucene/analysis/position/ contrib/analyzers/src/test/org/apache/lucene/analysis/position/
Date Thu, 11 Dec 2008 14:17:44 GMT
Author: gsingers
Date: Thu Dec 11 06:17:44 2008
New Revision: 725691

URL: http://svn.apache.org/viewvc?rev=725691&view=rev
Log:
LUCENE-1380: Add PositionFilter

Added:
    lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/position/
    lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/position/PositionFilter.java
  (with props)
    lucene/java/trunk/contrib/analyzers/src/test/org/apache/lucene/analysis/position/
    lucene/java/trunk/contrib/analyzers/src/test/org/apache/lucene/analysis/position/PositionFilterTest.java
  (with props)
Modified:
    lucene/java/trunk/CHANGES.txt

Modified: lucene/java/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/CHANGES.txt?rev=725691&r1=725690&r2=725691&view=diff
==============================================================================
--- lucene/java/trunk/CHANGES.txt (original)
+++ lucene/java/trunk/CHANGES.txt Thu Dec 11 06:17:44 2008
@@ -1,4 +1,4 @@
-Lucene Change Log
+Lucene Change Log
 $Id$
 
 ======================= Trunk (not yet released) =======================
@@ -879,6 +879,8 @@
     the query parser ignores position increments).
     (Doron Cohen)
 
+13. LUCENE-1380: Added TokenFilter for setting position increment in special cases related to the ShingleFilter (Mck SembWever, Steve Rowe, Karl Wettin via Grant Ingersoll)
+
 
 
 Optimizations

Added: lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/position/PositionFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/position/PositionFilter.java?rev=725691&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/position/PositionFilter.java (added)
+++ lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/position/PositionFilter.java Thu Dec 11 06:17:44 2008
@@ -0,0 +1,79 @@
+package org.apache.lucene.analysis.position;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Token;
+
+/** Sets the positionIncrement of all tokens to the configured "positionIncrement",
+ * except the first returned token, which retains its original positionIncrement value.
+ * The default positionIncrement value is zero.
+ */
+public class PositionFilter extends TokenFilter {
+
+  /** Position increment to assign to all but the first token - default = 0 */
+  private int positionIncrement = 0;
+  
+  /** True once the first token has been returned with its original increment; cleared by reset() **/
+  private boolean firstTokenPositioned = false;
+
+  /**
+   * Constructs a PositionFilter that assigns a position increment of zero to
+   * all but the first token from the given input stream.
+   * 
+   * @param input the input stream
+   */
+  public PositionFilter(final TokenStream input) {
+    super(input);
+  }
+
+  /**
+   * Constructs a PositionFilter that assigns the given position increment to
+   * all but the first token from the given input stream.
+   * 
+   * @param input the input stream
+   * @param positionIncrement position increment to assign to all but the first
+   *  token from the input stream
+   */
+  public PositionFilter(final TokenStream input, final int positionIncrement) {
+    this(input);
+    this.positionIncrement = positionIncrement;
+  }
+
+  public Token next(Token reusableToken) throws IOException {
+
+    assert reusableToken != null;
+    reusableToken = input.next(reusableToken);
+    if (null != reusableToken) {
+      if (firstTokenPositioned) {
+        reusableToken.setPositionIncrement(positionIncrement);
+      } else {
+        firstTokenPositioned = true;
+      }
+    }
+    return reusableToken;
+  }
+
+  public void reset() throws IOException {
+    super.reset();
+    firstTokenPositioned = false;
+  }
+}

Propchange: lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/position/PositionFilter.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/java/trunk/contrib/analyzers/src/test/org/apache/lucene/analysis/position/PositionFilterTest.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/src/test/org/apache/lucene/analysis/position/PositionFilterTest.java?rev=725691&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/src/test/org/apache/lucene/analysis/position/PositionFilterTest.java (added)
+++ lucene/java/trunk/contrib/analyzers/src/test/org/apache/lucene/analysis/position/PositionFilterTest.java Thu Dec 11 06:17:44 2008
@@ -0,0 +1,174 @@
+package org.apache.lucene.analysis.position;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import junit.framework.TestCase;
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.shingle.ShingleFilter;
+
+public class PositionFilterTest extends TestCase {
+
+  public class TestTokenStream extends TokenStream {
+
+    protected int index = 0;
+    protected Token[] testToken;
+
+    public TestTokenStream(Token[] testToken) {
+      super();
+      this.testToken = testToken;
+    }
+
+    public Token next(final Token reusableToken) throws IOException {
+      assert reusableToken != null;
+      if (index < testToken.length) {
+        return testToken[index++];
+      } else {
+        return null;
+      }
+    }
+    public void reset() {
+      index = 0;
+    }
+  }
+
+  public static void main(String[] args) {
+    junit.textui.TestRunner.run(PositionFilterTest.class);
+  }
+  public static final Token[] TEST_TOKEN = new Token[]{
+    createToken("please"),
+    createToken("divide"),
+    createToken("this"),
+    createToken("sentence"),
+    createToken("into"),
+    createToken("shingles"),
+  };
+  public static final int[] TEST_TOKEN_POSITION_INCREMENTS = new int[]{
+    1, 0, 0, 0, 0, 0
+  };
+  public static final int[] TEST_TOKEN_NON_ZERO_POSITION_INCREMENTS = new int[]{
+    1, 5, 5, 5, 5, 5
+  };
+
+  public static final Token[] SIX_GRAM_NO_POSITIONS_TOKENS = new Token[]{
+    createToken("please"),
+    createToken("please divide"),
+    createToken("please divide this"),
+    createToken("please divide this sentence"),
+    createToken("please divide this sentence into"),
+    createToken("please divide this sentence into shingles"),
+    createToken("divide"),
+    createToken("divide this"),
+    createToken("divide this sentence"),
+    createToken("divide this sentence into"),
+    createToken("divide this sentence into shingles"),
+    createToken("this"),
+    createToken("this sentence"),
+    createToken("this sentence into"),
+    createToken("this sentence into shingles"),
+    createToken("sentence"),
+    createToken("sentence into"),
+    createToken("sentence into shingles"),
+    createToken("into"),
+    createToken("into shingles"),
+    createToken("shingles"),
+  };
+  public static final int[] SIX_GRAM_NO_POSITIONS_INCREMENTS = new int[]{
+    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+  };
+  public static final String[] SIX_GRAM_NO_POSITIONS_TYPES = new String[]{
+    "word", "shingle", "shingle", "shingle", "shingle", "shingle",
+    "word", "shingle", "shingle", "shingle", "shingle",
+    "word", "shingle", "shingle", "shingle",
+    "word", "shingle", "shingle",
+    "word", "shingle",
+    "word"
+  };
+
+  public void testFilter() throws IOException {
+
+    filterTest(new PositionFilter(new TestTokenStream(TEST_TOKEN)),
+               TEST_TOKEN,
+               TEST_TOKEN_POSITION_INCREMENTS);
+  }
+
+  public void testNonZeroPositionIncrement() throws IOException {
+    
+    filterTest(new PositionFilter(new TestTokenStream(TEST_TOKEN), 5),
+               TEST_TOKEN,
+               TEST_TOKEN_NON_ZERO_POSITION_INCREMENTS);
+  }
+  
+  public void testReset() throws IOException {
+
+    PositionFilter filter = new PositionFilter(new TestTokenStream(TEST_TOKEN));
+    filterTest(filter, TEST_TOKEN, TEST_TOKEN_POSITION_INCREMENTS);
+    filter.reset();
+    // Make sure that the reset filter provides correct position increments
+    filterTest(filter, TEST_TOKEN, TEST_TOKEN_POSITION_INCREMENTS);
+  }
+  
+  /** Tests ShingleFilter up to six shingles against six terms.
+   *  Tests PositionFilter setting all but the first positionIncrement to zero.
+   * @throws java.io.IOException @see TokenStream#next(Token)
+   */
+  public void test6GramFilterNoPositions() throws IOException {
+
+    ShingleFilter filter = new ShingleFilter(new TestTokenStream(TEST_TOKEN), 6);
+    filterTest(new PositionFilter(filter),
+               SIX_GRAM_NO_POSITIONS_TOKENS,
+               SIX_GRAM_NO_POSITIONS_INCREMENTS);
+  }
+
+  protected TokenStream filterTest(final TokenStream filter,
+                                   final Token[] tokensToCompare,
+                                   final int[] positionIncrements)
+      throws IOException {
+
+    int i = 0;
+    final Token reusableToken = new Token();
+
+    for (Token nextToken = filter.next(reusableToken)
+        ; i < tokensToCompare.length
+        ; nextToken = filter.next(reusableToken)) {
+
+      if (null != nextToken) {
+        final String termText = nextToken.term();
+        final String goldText = tokensToCompare[i].term();
+
+        assertEquals("Wrong termText", goldText, termText);
+        assertEquals("Wrong positionIncrement for token \"" + termText + "\"",
+                     positionIncrements[i], nextToken.getPositionIncrement());
+      }else{
+        assertNull(tokensToCompare[i]);
+      }
+      i++;
+    }
+    return filter;
+  }
+
+  private static Token createToken(String term) {
+    final Token token = new Token();
+    if (null != term) {
+      token.setTermBuffer(term);
+    }
+    return token;
+  }
+}

Propchange: lucene/java/trunk/contrib/analyzers/src/test/org/apache/lucene/analysis/position/PositionFilterTest.java
------------------------------------------------------------------------------
    svn:eol-style = native



Mime
View raw message