Return-Path: Delivered-To: apmail-lucene-solr-commits-archive@minotaur.apache.org Received: (qmail 13874 invoked from network); 23 Nov 2009 16:09:29 -0000 Received: from hermes.apache.org (HELO mail.apache.org) (140.211.11.3) by minotaur.apache.org with SMTP; 23 Nov 2009 16:09:29 -0000 Received: (qmail 41376 invoked by uid 500); 23 Nov 2009 16:09:29 -0000 Delivered-To: apmail-lucene-solr-commits-archive@lucene.apache.org Received: (qmail 41312 invoked by uid 500); 23 Nov 2009 16:09:28 -0000 Mailing-List: contact solr-commits-help@lucene.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: solr-dev@lucene.apache.org Delivered-To: mailing list solr-commits@lucene.apache.org Received: (qmail 41303 invoked by uid 99); 23 Nov 2009 16:09:28 -0000 Received: from athena.apache.org (HELO athena.apache.org) (140.211.11.136) by apache.org (qpsmtpd/0.29) with ESMTP; Mon, 23 Nov 2009 16:09:28 +0000 X-ASF-Spam-Status: No, hits=-2.6 required=5.0 tests=AWL,BAYES_00 X-Spam-Check-By: apache.org Received: from [140.211.11.4] (HELO eris.apache.org) (140.211.11.4) by apache.org (qpsmtpd/0.29) with ESMTP; Mon, 23 Nov 2009 16:09:25 +0000 Received: by eris.apache.org (Postfix, from userid 65534) id 8FFAA238899C; Mon, 23 Nov 2009 16:09:05 +0000 (UTC) Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Subject: svn commit: r883386 - in /lucene/solr/trunk: CHANGES.txt src/java/org/apache/solr/analysis/ReversedWildcardFilter.java src/java/org/apache/solr/search/SolrQueryParser.java src/test/org/apache/solr/analysis/TestReversedWildcardFilterFactory.java Date: Mon, 23 Nov 2009 16:09:05 -0000 To: solr-commits@lucene.apache.org From: yonik@apache.org X-Mailer: svnmailer-1.0.8 Message-Id: <20091123160905.8FFAA238899C@eris.apache.org> Author: yonik Date: Mon Nov 23 16:09:05 2009 New Revision: 883386 URL: http://svn.apache.org/viewvc?rev=883386&view=rev Log: SOLR-1593: fix reverse wildcard filter for surrogate pairs Modified: lucene/solr/trunk/CHANGES.txt lucene/solr/trunk/src/java/org/apache/solr/analysis/ReversedWildcardFilter.java lucene/solr/trunk/src/java/org/apache/solr/search/SolrQueryParser.java lucene/solr/trunk/src/test/org/apache/solr/analysis/TestReversedWildcardFilterFactory.java Modified: lucene/solr/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/solr/trunk/CHANGES.txt?rev=883386&r1=883385&r2=883386&view=diff ============================================================================== --- lucene/solr/trunk/CHANGES.txt (original) +++ lucene/solr/trunk/CHANGES.txt Mon Nov 23 16:09:05 2009 @@ -74,6 +74,11 @@ fl=score to the parameter list instead of appending score to the existing field list. (yonik) +* SOLR-1593: ReverseWildcardFilter didn't work for surrogate pairs + (i.e. code points outside of the BMP), resulting in incorrect + matching. This change requires reindexing for any content with + such characters. (Robert Muir, yonik) + Other Changes ---------------------- Modified: lucene/solr/trunk/src/java/org/apache/solr/analysis/ReversedWildcardFilter.java URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/java/org/apache/solr/analysis/ReversedWildcardFilter.java?rev=883386&r1=883385&r2=883386&view=diff ============================================================================== --- lucene/solr/trunk/src/java/org/apache/solr/analysis/ReversedWildcardFilter.java (original) +++ lucene/solr/trunk/src/java/org/apache/solr/analysis/ReversedWildcardFilter.java Mon Nov 23 16:09:05 2009 @@ -20,7 +20,6 @@ import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.reverse.ReverseStringFilter; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.analysis.tokenattributes.TermAttribute; @@ -73,13 +72,79 @@ } char [] buffer = termAtt.resizeTermBuffer(oldLen + 1); buffer[oldLen] = markerChar; - //String reversed = reverseAndMark(value, markerChar); - ReverseStringFilter.reverse(buffer, oldLen + 1); + reverse(buffer, 0, oldLen + 1); posAtt.setPositionIncrement(origOffset); termAtt.setTermBuffer(buffer, 0, oldLen +1); return true; } - + + /** + * Partially reverses the given input buffer in-place from the given offset + * up to the given length, keeping surrogate pairs in the correct (non-reversed) order. + * @param buffer the input char array to reverse + * @param start the offset from where to reverse the buffer + * @param len the length in the buffer up to where the + * buffer should be reversed + */ + public static void reverse(final char[] buffer, final int start, final int len) { + /* modified version of Apache Harmony AbstractStringBuilder reverse0() */ + if (len < 2) + return; + int end = (start + len) - 1; + char frontHigh = buffer[start]; + char endLow = buffer[end]; + boolean allowFrontSur = true, allowEndSur = true; + final int mid = start + (len >> 1); + for (int i = start; i < mid; ++i, --end) { + final char frontLow = buffer[i + 1]; + final char endHigh = buffer[end - 1]; + final boolean surAtFront = allowFrontSur + && Character.isSurrogatePair(frontHigh, frontLow); + if (surAtFront && (len < 3)) { + // nothing to do since surAtFront is allowed and 1 char left + return; + } + final boolean surAtEnd = allowEndSur + && Character.isSurrogatePair(endHigh, endLow); + allowFrontSur = allowEndSur = true; + if (surAtFront == surAtEnd) { + if (surAtFront) { + // both surrogates + buffer[end] = frontLow; + buffer[--end] = frontHigh; + buffer[i] = endHigh; + buffer[++i] = endLow; + frontHigh = buffer[i + 1]; + endLow = buffer[end - 1]; + } else { + // neither surrogates + buffer[end] = frontHigh; + buffer[i] = endLow; + frontHigh = frontLow; + endLow = endHigh; + } + } else { + if (surAtFront) { + // surrogate only at the front + buffer[end] = frontLow; + buffer[i] = endLow; + endLow = endHigh; + allowFrontSur = false; + } else { + // surrogate only at the end + buffer[end] = frontHigh; + buffer[i] = endHigh; + frontHigh = frontLow; + allowEndSur = false; + } + } + } + if ((len & 0x01) == 1 && !(allowFrontSur && allowEndSur)) { + // only if odd length + buffer[end] = allowFrontSur ? endLow : frontHigh; + } + } + } Modified: lucene/solr/trunk/src/java/org/apache/solr/search/SolrQueryParser.java URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/java/org/apache/solr/search/SolrQueryParser.java?rev=883386&r1=883385&r2=883386&view=diff ============================================================================== --- lucene/solr/trunk/src/java/org/apache/solr/search/SolrQueryParser.java (original) +++ lucene/solr/trunk/src/java/org/apache/solr/search/SolrQueryParser.java Mon Nov 23 16:09:05 2009 @@ -27,10 +27,7 @@ import org.apache.lucene.search.*; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.reverse.ReverseStringFilter; -import org.apache.solr.analysis.ReversedWildcardFilter; -import org.apache.solr.analysis.ReversedWildcardFilterFactory; -import org.apache.solr.analysis.TokenFilterFactory; -import org.apache.solr.analysis.TokenizerChain; +import org.apache.solr.analysis.*; import org.apache.solr.common.SolrException; import org.apache.solr.schema.FieldType; import org.apache.solr.schema.IndexSchema; @@ -193,7 +190,12 @@ String type = schema.getFieldType(field).getTypeName(); ReversedWildcardFilterFactory factory = leadingWildcards.get(type); if (factory != null && factory.shouldReverse(termStr)) { - termStr = ReverseStringFilter.reverse(termStr + factory.getMarkerChar()); + int len = termStr.length(); + char[] chars = new char[len+1]; + chars[0] = factory.getMarkerChar(); + termStr.getChars(0, len, chars, 1); + ReversedWildcardFilter.reverse(chars, 1, len); + termStr = new String(chars); } Query q = super.getWildcardQuery(field, termStr); if (q instanceof WildcardQuery) { Modified: lucene/solr/trunk/src/test/org/apache/solr/analysis/TestReversedWildcardFilterFactory.java URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/test/org/apache/solr/analysis/TestReversedWildcardFilterFactory.java?rev=883386&r1=883385&r2=883386&view=diff ============================================================================== --- lucene/solr/trunk/src/test/org/apache/solr/analysis/TestReversedWildcardFilterFactory.java (original) +++ lucene/solr/trunk/src/test/org/apache/solr/analysis/TestReversedWildcardFilterFactory.java Mon Nov 23 16:09:05 2009 @@ -77,8 +77,8 @@ public void testIndexingAnalysis() throws Exception { Analyzer a = schema.getAnalyzer(); - String text = "one two three"; - String expected1 = "one \u0001eno two \u0001owt three \u0001eerht"; + String text = "one two three si\uD834\uDD1Ex"; + String expected1 = "one \u0001eno two \u0001owt three \u0001eerht si\uD834\uDD1Ex \u0001x\uD834\uDD1Eis"; List expectedTokens1 = getTokens( new WhitespaceTokenizer(new StringReader(expected1))); // set positionIncrements and offsets in expected tokens @@ -86,10 +86,10 @@ Token t = expectedTokens1.get(i); t.setPositionIncrement(0); } - String expected2 = "\u0001eno \u0001owt \u0001eerht"; + String expected2 = "\u0001eno \u0001owt \u0001eerht \u0001x\uD834\uDD1Eis"; List expectedTokens2 = getTokens( new WhitespaceTokenizer(new StringReader(expected2))); - String expected3 = "one two three"; + String expected3 = "one two three si\uD834\uDD1Ex"; List expectedTokens3 = getTokens( new WhitespaceTokenizer(new StringReader(expected3))); // field one @@ -116,10 +116,10 @@ // XXX note: this should be false, but for now we return true for any field, // XXX if at least one field uses the reversing assertTrue(parserThree.getAllowLeadingWildcard()); - String text = "one +two *hree f*ur fiv*"; - String expectedOne = "one:one +one:two one:\u0001eerh* one:\u0001ru*f one:fiv*"; - String expectedTwo = "two:one +two:two two:\u0001eerh* two:\u0001ru*f two:fiv*"; - String expectedThree = "three:one +three:two three:*hree three:f*ur three:fiv*"; + String text = "one +two *hree f*ur fiv* *si\uD834\uDD1Ex"; + String expectedOne = "one:one +one:two one:\u0001eerh* one:\u0001ru*f one:fiv* one:\u0001x\uD834\uDD1Eis*"; + String expectedTwo = "two:one +two:two two:\u0001eerh* two:\u0001ru*f two:fiv* two:\u0001x\uD834\uDD1Eis*"; + String expectedThree = "three:one +three:two three:*hree three:f*ur three:fiv* three:*si\uD834\uDD1Ex"; Query q = parserOne.parse(text); assertEquals(expectedOne, q.toString()); q = parserTwo.parse(text);