lucene-solr-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From yo...@apache.org
Subject svn commit: r883386 - in /lucene/solr/trunk: CHANGES.txt src/java/org/apache/solr/analysis/ReversedWildcardFilter.java src/java/org/apache/solr/search/SolrQueryParser.java src/test/org/apache/solr/analysis/TestReversedWildcardFilterFactory.java
Date Mon, 23 Nov 2009 16:09:05 GMT
Author: yonik
Date: Mon Nov 23 16:09:05 2009
New Revision: 883386

URL: http://svn.apache.org/viewvc?rev=883386&view=rev
Log:
SOLR-1593: fix reverse wildcard filter for surrogate pairs

Modified:
    lucene/solr/trunk/CHANGES.txt
    lucene/solr/trunk/src/java/org/apache/solr/analysis/ReversedWildcardFilter.java
    lucene/solr/trunk/src/java/org/apache/solr/search/SolrQueryParser.java
    lucene/solr/trunk/src/test/org/apache/solr/analysis/TestReversedWildcardFilterFactory.java

Modified: lucene/solr/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/CHANGES.txt?rev=883386&r1=883385&r2=883386&view=diff
==============================================================================
--- lucene/solr/trunk/CHANGES.txt (original)
+++ lucene/solr/trunk/CHANGES.txt Mon Nov 23 16:09:05 2009
@@ -74,6 +74,11 @@
   fl=score to the parameter list instead of appending score to the
   existing field list. (yonik)
 
+* SOLR-1593: ReversedWildcardFilter didn't work for surrogate pairs
+  (i.e. code points outside of the BMP), resulting in incorrect
+  matching.  This change requires reindexing for any content with
+  such characters.  (Robert Muir, yonik)
+
 
 Other Changes
 ----------------------

Modified: lucene/solr/trunk/src/java/org/apache/solr/analysis/ReversedWildcardFilter.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/java/org/apache/solr/analysis/ReversedWildcardFilter.java?rev=883386&r1=883385&r2=883386&view=diff
==============================================================================
--- lucene/solr/trunk/src/java/org/apache/solr/analysis/ReversedWildcardFilter.java (original)
+++ lucene/solr/trunk/src/java/org/apache/solr/analysis/ReversedWildcardFilter.java Mon Nov 23 16:09:05 2009
@@ -20,7 +20,6 @@
 
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.reverse.ReverseStringFilter;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 
@@ -73,13 +72,79 @@
     }
     char [] buffer = termAtt.resizeTermBuffer(oldLen + 1);
     buffer[oldLen] = markerChar;
-    //String reversed = reverseAndMark(value, markerChar);
-    ReverseStringFilter.reverse(buffer, oldLen + 1);
+    reverse(buffer, 0, oldLen + 1);
 
     posAtt.setPositionIncrement(origOffset);
     termAtt.setTermBuffer(buffer, 0, oldLen +1);
     return true;
   }
   
-   
+
+  /**
+   * Partially reverses the given input buffer in-place from the given offset
+   * up to the given length, keeping surrogate pairs in the correct (non-reversed) order.
+   * @param buffer the input char array to reverse
+   * @param start the offset from where to reverse the buffer
+   * @param len the length in the buffer up to where the
+   *        buffer should be reversed
+   */
+  public static void reverse(final char[] buffer, final int start, final int len) {
+    /* modified version of Apache Harmony AbstractStringBuilder reverse0() */
+    if (len < 2)
+      return;
+    int end = (start + len) - 1;
+    char frontHigh = buffer[start];
+    char endLow = buffer[end];
+    boolean allowFrontSur = true, allowEndSur = true;
+    final int mid = start + (len >> 1);
+    for (int i = start; i < mid; ++i, --end) {
+      final char frontLow = buffer[i + 1];
+      final char endHigh = buffer[end - 1];
+      final boolean surAtFront = allowFrontSur
+          && Character.isSurrogatePair(frontHigh, frontLow);
+      if (surAtFront && (len < 3)) {
+        // nothing to do since surAtFront is allowed and 1 char left
+        return;
+      }
+      final boolean surAtEnd = allowEndSur
+          && Character.isSurrogatePair(endHigh, endLow);
+      allowFrontSur = allowEndSur = true;
+      if (surAtFront == surAtEnd) {
+        if (surAtFront) {
+          // both surrogates
+          buffer[end] = frontLow;
+          buffer[--end] = frontHigh;
+          buffer[i] = endHigh;
+          buffer[++i] = endLow;
+          frontHigh = buffer[i + 1];
+          endLow = buffer[end - 1];
+        } else {
+          // neither surrogates
+          buffer[end] = frontHigh;
+          buffer[i] = endLow;
+          frontHigh = frontLow;
+          endLow = endHigh;
+        }
+      } else {
+        if (surAtFront) {
+          // surrogate only at the front
+          buffer[end] = frontLow;
+          buffer[i] = endLow;
+          endLow = endHigh;
+          allowFrontSur = false;
+        } else {
+          // surrogate only at the end
+          buffer[end] = frontHigh;
+          buffer[i] = endHigh;
+          frontHigh = frontLow;
+          allowEndSur = false;
+        }
+      }
+    }
+    if ((len & 0x01) == 1 && !(allowFrontSur && allowEndSur)) {
+      // only if odd length
+      buffer[end] = allowFrontSur ? endLow : frontHigh;
+    }
+  }
+
 }

Modified: lucene/solr/trunk/src/java/org/apache/solr/search/SolrQueryParser.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/java/org/apache/solr/search/SolrQueryParser.java?rev=883386&r1=883385&r2=883386&view=diff
==============================================================================
--- lucene/solr/trunk/src/java/org/apache/solr/search/SolrQueryParser.java (original)
+++ lucene/solr/trunk/src/java/org/apache/solr/search/SolrQueryParser.java Mon Nov 23 16:09:05 2009
@@ -27,10 +27,7 @@
 import org.apache.lucene.search.*;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.reverse.ReverseStringFilter;
-import org.apache.solr.analysis.ReversedWildcardFilter;
-import org.apache.solr.analysis.ReversedWildcardFilterFactory;
-import org.apache.solr.analysis.TokenFilterFactory;
-import org.apache.solr.analysis.TokenizerChain;
+import org.apache.solr.analysis.*;
 import org.apache.solr.common.SolrException;
 import org.apache.solr.schema.FieldType;
 import org.apache.solr.schema.IndexSchema;
@@ -193,7 +190,12 @@
     String type = schema.getFieldType(field).getTypeName();
     ReversedWildcardFilterFactory factory = leadingWildcards.get(type);
     if (factory != null && factory.shouldReverse(termStr)) {
-      termStr = ReverseStringFilter.reverse(termStr + factory.getMarkerChar());
+      int len = termStr.length();
+      char[] chars = new char[len+1];
+      chars[0] = factory.getMarkerChar();      
+      termStr.getChars(0, len, chars, 1);
+      ReversedWildcardFilter.reverse(chars, 1, len);
+      termStr = new String(chars);
     }
     Query q = super.getWildcardQuery(field, termStr);
     if (q instanceof WildcardQuery) {

Modified: lucene/solr/trunk/src/test/org/apache/solr/analysis/TestReversedWildcardFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/test/org/apache/solr/analysis/TestReversedWildcardFilterFactory.java?rev=883386&r1=883385&r2=883386&view=diff
==============================================================================
--- lucene/solr/trunk/src/test/org/apache/solr/analysis/TestReversedWildcardFilterFactory.java (original)
+++ lucene/solr/trunk/src/test/org/apache/solr/analysis/TestReversedWildcardFilterFactory.java Mon Nov 23 16:09:05 2009
@@ -77,8 +77,8 @@
   
   public void testIndexingAnalysis() throws Exception {
     Analyzer a = schema.getAnalyzer();
-    String text = "one two three";
-    String expected1 = "one \u0001eno two \u0001owt three \u0001eerht";
+    String text = "one two three si\uD834\uDD1Ex";
+    String expected1 = "one \u0001eno two \u0001owt three \u0001eerht si\uD834\uDD1Ex \u0001x\uD834\uDD1Eis";
     List<Token> expectedTokens1 = getTokens(
             new WhitespaceTokenizer(new StringReader(expected1)));
     // set positionIncrements and offsets in expected tokens
@@ -86,10 +86,10 @@
       Token t = expectedTokens1.get(i);
       t.setPositionIncrement(0);
     }
-    String expected2 = "\u0001eno \u0001owt \u0001eerht";
+    String expected2 = "\u0001eno \u0001owt \u0001eerht \u0001x\uD834\uDD1Eis";
     List<Token> expectedTokens2 = getTokens(
             new WhitespaceTokenizer(new StringReader(expected2)));
-    String expected3 = "one two three";
+    String expected3 = "one two three si\uD834\uDD1Ex";
     List<Token> expectedTokens3 = getTokens(
             new WhitespaceTokenizer(new StringReader(expected3)));
     // field one
@@ -116,10 +116,10 @@
     // XXX note: this should be false, but for now we return true for any field,
     // XXX if at least one field uses the reversing
     assertTrue(parserThree.getAllowLeadingWildcard());
-    String text = "one +two *hree f*ur fiv*";
-    String expectedOne = "one:one +one:two one:\u0001eerh* one:\u0001ru*f one:fiv*";
-    String expectedTwo = "two:one +two:two two:\u0001eerh* two:\u0001ru*f two:fiv*";
-    String expectedThree = "three:one +three:two three:*hree three:f*ur three:fiv*";
+    String text = "one +two *hree f*ur fiv* *si\uD834\uDD1Ex";
+    String expectedOne = "one:one +one:two one:\u0001eerh* one:\u0001ru*f one:fiv* one:\u0001x\uD834\uDD1Eis*";
+    String expectedTwo = "two:one +two:two two:\u0001eerh* two:\u0001ru*f two:fiv* two:\u0001x\uD834\uDD1Eis*";
+    String expectedThree = "three:one +three:two three:*hree three:f*ur three:fiv* three:*si\uD834\uDD1Ex";
     Query q = parserOne.parse(text);
     assertEquals(expectedOne, q.toString());
     q = parserTwo.parse(text);



Mime
View raw message