lucene-java-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From rm...@apache.org
Subject svn commit: r805769 - in /lucene/java/trunk/contrib: CHANGES.txt analyzers/common/src/java/org/apache/lucene/analysis/reverse/ReverseStringFilter.java analyzers/common/src/test/org/apache/lucene/analysis/reverse/TestReverseStringFilter.java
Date Wed, 19 Aug 2009 12:07:16 GMT
Author: rmuir
Date: Wed Aug 19 12:07:15 2009
New Revision: 805769

URL: http://svn.apache.org/viewvc?rev=805769&view=rev
Log:
LUCENE-1813: Add option to ReverseStringFilter to mark reversed tokens

Modified:
    lucene/java/trunk/contrib/CHANGES.txt
    lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/reverse/ReverseStringFilter.java
    lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/reverse/TestReverseStringFilter.java

Modified: lucene/java/trunk/contrib/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/CHANGES.txt?rev=805769&r1=805768&r2=805769&view=diff
==============================================================================
--- lucene/java/trunk/contrib/CHANGES.txt (original)
+++ lucene/java/trunk/contrib/CHANGES.txt Wed Aug 19 12:07:15 2009
@@ -147,6 +147,9 @@
 
 16. LUCENE-1628: Added Persian analyzer.  (Robert Muir)
 
+17. LUCENE-1813: Add option to ReverseStringFilter to mark reversed tokens.
+    (Andrzej Bialecki via Robert Muir)
+
 Optimizations
 
   1. LUCENE-1643: Re-use the collation key (RawCollationKey) for

Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/reverse/ReverseStringFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/reverse/ReverseStringFilter.java?rev=805769&r1=805768&r2=805769&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/reverse/ReverseStringFilter.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/reverse/ReverseStringFilter.java Wed Aug 19 12:07:15 2009
@@ -24,22 +24,82 @@
 import java.io.IOException;
 
 /**
- * Reverse token string e.g. "country" => "yrtnuoc".
- *
+ * Reverse token string, for example "country" => "yrtnuoc".
+ * <p>
+ * If <code>marker</code> is supplied, then tokens will be also prepended by
+ * that character. For example, with a marker of &#x5C;u0001, "country" =>
+ * "&#x5C;u0001yrtnuoc". This is useful when implementing efficient leading
+ * wildcards search.
+ * </p>
+ * 
  * @version $Id$
  */
 public final class ReverseStringFilter extends TokenFilter {
 
   private TermAttribute termAtt;
-
+  private final char marker;
+  private static final char NOMARKER = '\uFFFF';
+  
+  /**
+   * Example marker character: U+0001 (START OF HEADING) 
+   */
+  public static final char START_OF_HEADING_MARKER = '\u0001';
+  
+  /**
+   * Example marker character: U+001F (INFORMATION SEPARATOR ONE)
+   */
+  public static final char INFORMATION_SEPARATOR_MARKER = '\u001F';
+  
+  /**
+   * Example marker character: U+EC00 (PRIVATE USE AREA: EC00) 
+   */
+  public static final char PUA_EC00_MARKER = '\uEC00';
+  
+  /**
+   * Example marker character: U+200F (RIGHT-TO-LEFT MARK)
+   */
+  public static final char RTL_DIRECTION_MARKER = '\u200F';
+  
+  /**
+   * Create a new ReverseStringFilter that reverses all tokens in the 
+   * supplied {@link TokenStream}.
+   * <p>
+   * The reversed tokens will not be marked. 
+   * </p>
+   * 
+   * @param in {@link TokenStream} to filter
+   */
   public ReverseStringFilter(TokenStream in) {
+    this(in, NOMARKER);
+  }
+
+  /**
+   * Create a new ReverseStringFilter that reverses and marks all tokens in the
+   * supplied {@link TokenStream}.
+   * <p>
+   * The reversed tokens will be prepended (marked) by the <code>marker</code>
+   * character.
+   * </p>
+   * 
+   * @param in {@link TokenStream} to filter
+   * @param marker A character used to mark reversed tokens
+   */
+  public ReverseStringFilter(TokenStream in, char marker) {
     super(in);
+    this.marker = marker;
     termAtt = (TermAttribute) addAttribute(TermAttribute.class);
   }
 
   public boolean incrementToken() throws IOException {
     if (input.incrementToken()) {
-      reverse( termAtt.termBuffer(), termAtt.termLength() );
+      int len = termAtt.termLength();
+      if (marker != NOMARKER) {
+        len++;
+        termAtt.resizeTermBuffer(len);
+        termAtt.termBuffer()[len - 1] = marker;
+      }
+      reverse( termAtt.termBuffer(), len );
+      termAtt.setTermLength(len);
       return true;
     } else {
       return false;

Modified: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/reverse/TestReverseStringFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/reverse/TestReverseStringFilter.java?rev=805769&r1=805768&r2=805769&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/reverse/TestReverseStringFilter.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/reverse/TestReverseStringFilter.java Wed Aug 19 12:07:15 2009
@@ -42,6 +42,25 @@
     assertEquals("yad", text.term());
     assertFalse(filter.incrementToken());
   }
+  
+  public void testFilterWithMark() throws Exception {
+    TokenStream stream = new WhitespaceTokenizer(new StringReader(
+        "Do have a nice day")); // 1-4 length string
+    ReverseStringFilter filter = new ReverseStringFilter(stream, '\u0001');
+    TermAttribute text = (TermAttribute) filter
+        .getAttribute(TermAttribute.class);
+    assertTrue(filter.incrementToken());
+    assertEquals("\u0001oD", text.term());
+    assertTrue(filter.incrementToken());
+    assertEquals("\u0001evah", text.term());
+    assertTrue(filter.incrementToken());
+    assertEquals("\u0001a", text.term());
+    assertTrue(filter.incrementToken());
+    assertEquals("\u0001ecin", text.term());
+    assertTrue(filter.incrementToken());
+    assertEquals("\u0001yad", text.term());
+    assertFalse(filter.incrementToken());
+  }
 
   public void testReverseString() throws Exception {
     assertEquals( "A", ReverseStringFilter.reverse( "A" ) );



Mime
View raw message