Return-Path: X-Original-To: archive-asf-public-internal@cust-asf2.ponee.io Delivered-To: archive-asf-public-internal@cust-asf2.ponee.io Received: from cust-asf.ponee.io (cust-asf.ponee.io [163.172.22.183]) by cust-asf2.ponee.io (Postfix) with ESMTP id C8FAE200C39 for ; Wed, 8 Feb 2017 11:22:14 +0100 (CET) Received: by cust-asf.ponee.io (Postfix) id C79FC160B4E; Wed, 8 Feb 2017 10:22:14 +0000 (UTC) Delivered-To: archive-asf-public@cust-asf.ponee.io Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by cust-asf.ponee.io (Postfix) with SMTP id EF87F160B67 for ; Wed, 8 Feb 2017 11:22:13 +0100 (CET) Received: (qmail 51855 invoked by uid 500); 8 Feb 2017 10:22:13 -0000 Mailing-List: contact commits-help@lucene.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@lucene.apache.org Delivered-To: mailing list commits@lucene.apache.org Received: (qmail 51821 invoked by uid 99); 8 Feb 2017 10:22:12 -0000 Received: from git1-us-west.apache.org (HELO git1-us-west.apache.org) (140.211.11.23) by apache.org (qpsmtpd/0.29) with ESMTP; Wed, 08 Feb 2017 10:22:12 +0000 Received: by git1-us-west.apache.org (ASF Mail Server at git1-us-west.apache.org, from userid 33) id AF068DFE1C; Wed, 8 Feb 2017 10:22:12 +0000 (UTC) Content-Type: text/plain; charset="us-ascii" MIME-Version: 1.0 Content-Transfer-Encoding: 8bit From: jpountz@apache.org To: commits@lucene.apache.org Date: Wed, 08 Feb 2017 10:22:13 -0000 Message-Id: <5f18c200d4f1421db875a3e8ec5c1457@git.apache.org> In-Reply-To: References: X-Mailer: ASF-Git Admin Mailer Subject: [2/3] lucene-solr:branch_5_5: LUCENE-6974: Fixed DecimalDigitFilter in case of supplementary code points. archived-at: Wed, 08 Feb 2017 10:22:15 -0000 LUCENE-6974: Fixed DecimalDigitFilter in case of supplementary code points. 
Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/91147a84 Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/91147a84 Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/91147a84 Branch: refs/heads/branch_5_5 Commit: 91147a84515e0d84201a36a0aedaab40806a02a0 Parents: 28d405c Author: Adrien Grand Authored: Tue Oct 18 10:38:51 2016 +0200 Committer: Adrien Grand Committed: Wed Feb 8 10:59:12 2017 +0100 ---------------------------------------------------------------------- lucene/CHANGES.txt | 3 + .../analysis/core/DecimalDigitFilter.java | 2 +- .../analysis/core/TestDecimalDigitFilter.java | 151 +++++++++++++++++-- 3 files changed, 140 insertions(+), 16 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/91147a84/lucene/CHANGES.txt ---------------------------------------------------------------------- diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index af76924..41eea64 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -24,6 +24,9 @@ Bug Fixes * LUCENE-7547: JapaneseTokenizerFactory was failing to close the dictionary file it opened (Markus via Mike McCandless) +* LUCENE-6974: Fixed DecimalDigitFilter in case of supplementary code points. 
+ (Hossman) + Other * LUCENE-6989: Backport MMapDirectory's unmapping code from Lucene 6.4 to use http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/91147a84/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/DecimalDigitFilter.java ---------------------------------------------------------------------- diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/DecimalDigitFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/DecimalDigitFilter.java index b81d42f..de459cf 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/DecimalDigitFilter.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/DecimalDigitFilter.java @@ -52,7 +52,7 @@ public final class DecimalDigitFilter extends TokenFilter { buffer[i] = (char) ('0' + Character.getNumericValue(ch)); // if the original was supplementary, shrink the string if (ch > 0xFFFF) { - length = StemmerUtil.delete(buffer, ++i, length); + length = StemmerUtil.delete(buffer, i+1, length); termAtt.setLength(length); } } http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/91147a84/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestDecimalDigitFilter.java ---------------------------------------------------------------------- diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestDecimalDigitFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestDecimalDigitFilter.java index ae25193..e5e18ef 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestDecimalDigitFilter.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestDecimalDigitFilter.java @@ -21,14 +21,42 @@ import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.BaseTokenStreamTestCase; import org.apache.lucene.analysis.MockTokenizer; import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.search.DocIdSetIterator; 
+import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.Bits; +import org.apache.lucene.util.SparseFixedBitSet; import org.apache.lucene.util.TestUtil; +import java.util.Random; + +import org.junit.AfterClass; +import org.junit.BeforeClass; + /** * Tests for {@link DecimalDigitFilter} */ public class TestDecimalDigitFilter extends BaseTokenStreamTestCase { private Analyzer tokenized; private Analyzer keyword; + + private static SparseFixedBitSet DECIMAL_DIGIT_CODEPOINTS; + + @BeforeClass + public static void init_DECIMAL_DIGIT_CODEPOINTS() { + DECIMAL_DIGIT_CODEPOINTS = new SparseFixedBitSet(Character.MAX_CODE_POINT); + for (int codepoint = Character.MIN_CODE_POINT; codepoint < Character.MAX_CODE_POINT; codepoint++) { + if (Character.isDigit(codepoint)) { + DECIMAL_DIGIT_CODEPOINTS.set(codepoint); + } + } + assert 0 < DECIMAL_DIGIT_CODEPOINTS.cardinality(); + } + + @AfterClass + public static void destroy_DECIMAL_DIGIT_CODEPOINTS() { + DECIMAL_DIGIT_CODEPOINTS = null; + } + @Override public void setUp() throws Exception { @@ -64,30 +92,83 @@ public class TestDecimalDigitFilter extends BaseTokenStreamTestCase { } /** - * test all digits in different locations of strings. + * test that double struck digits are normalized */ - public void testRandom() throws Exception { - for (int codepoint = Character.MIN_CODE_POINT; codepoint < Character.MAX_CODE_POINT; codepoint++) { - if (Character.isDigit(codepoint)) { - // add some a-z before/after the string - String prefix = TestUtil.randomSimpleString(random()); - String suffix = TestUtil.randomSimpleString(random()); + public void testDoubleStruck() throws Exception { + // MATHEMATICAL DOUBLE-STRUCK DIGIT ... 
1, 9, 8, 4 + final String input = "𝟙 𝟡 𝟠 𝟜"; + final String expected = "1 9 8 4"; + checkOneTerm(keyword, input, expected); + checkOneTerm(keyword, input.replaceAll("\\s",""), expected.replaceAll("\\s","")); + } + + /** + * test sequences of digits mixed with other random simple string data + */ + public void testRandomSequences() throws Exception { + + // test numIters random strings containing a sequence of numDigits codepoints + final int numIters = atLeast(5); + for (int iter = 0; iter < numIters; iter++) { + final int numDigits = atLeast(20); + final StringBuilder expected = new StringBuilder(); + final StringBuilder actual = new StringBuilder(); + for (int digitCounter = 0; digitCounter < numDigits; digitCounter++) { - StringBuilder expected = new StringBuilder(); + // increased odds of 0 length random string prefix + final String prefix = random().nextBoolean() ? "" : TestUtil.randomSimpleString(random()); expected.append(prefix); + actual.append(prefix); + + int codepoint = getRandomDecimalDigit(random()); + int value = Character.getNumericValue(codepoint); assert value >= 0 && value <= 9; expected.append(Integer.toString(value)); - expected.append(suffix); - - StringBuilder actual = new StringBuilder(); - actual.append(prefix); actual.appendCodePoint(codepoint); - actual.append(suffix); - - checkOneTerm(keyword, actual.toString(), expected.toString()); } + // occasional suffix, increased odds of 0 length random string + final String suffix = random().nextBoolean() ? "" : TestUtil.randomSimpleString(random()); + expected.append(suffix); + actual.append(suffix); + + checkOneTerm(keyword, actual.toString(), expected.toString()); } + + } + + /** + * test each individual digit in different locations of strings. 
+ */ + public void testRandom() throws Exception { + int numCodePointsChecked = 0; // sanity check + for (int codepoint = DECIMAL_DIGIT_CODEPOINTS.nextSetBit(0); + codepoint != DocIdSetIterator.NO_MORE_DOCS; + codepoint = DECIMAL_DIGIT_CODEPOINTS.nextSetBit(codepoint+1)) { + + assert Character.isDigit(codepoint); + + // add some a-z before/after the string + String prefix = TestUtil.randomSimpleString(random()); + String suffix = TestUtil.randomSimpleString(random()); + + StringBuilder expected = new StringBuilder(); + expected.append(prefix); + int value = Character.getNumericValue(codepoint); + assert value >= 0 && value <= 9; + expected.append(Integer.toString(value)); + expected.append(suffix); + + StringBuilder actual = new StringBuilder(); + actual.append(prefix); + actual.appendCodePoint(codepoint); + actual.append(suffix); + + checkOneTerm(keyword, actual.toString(), expected.toString()); + + numCodePointsChecked++; + } + assert DECIMAL_DIGIT_CODEPOINTS.cardinality() == numCodePointsChecked; } /** @@ -103,4 +184,44 @@ public class TestDecimalDigitFilter extends BaseTokenStreamTestCase { public void testRandomStrings() throws Exception { checkRandomData(random(), tokenized, 1000*RANDOM_MULTIPLIER); } + + /** returns a pseudo-random codepoint which is a Decimal Digit */ + public static int getRandomDecimalDigit(Random r) { + final int aprox = TestUtil.nextInt(r, 0, DECIMAL_DIGIT_CODEPOINTS.length()-1); + + if (DECIMAL_DIGIT_CODEPOINTS.get(aprox)) { // lucky guess + assert Character.isDigit(aprox); + return aprox; + } + + // seek up and down for closest set bit + final int lower = DECIMAL_DIGIT_CODEPOINTS.prevSetBit(aprox); + final int higher = DECIMAL_DIGIT_CODEPOINTS.nextSetBit(aprox); + + // sanity check edge cases + if (lower < 0) { + assert higher != DocIdSetIterator.NO_MORE_DOCS; + assert Character.isDigit(higher); + return higher; + } + if (higher == DocIdSetIterator.NO_MORE_DOCS) { + assert 0 <= lower; + assert Character.isDigit(lower); + return lower; 
+ } + + // which is closer? + final int cmp = Integer.compare(aprox - lower, higher - aprox); + + if (0 == cmp) { + // dead even, flip a coin + final int result = random().nextBoolean() ? lower : higher; + assert Character.isDigit(result); + return result; + } + + final int result = (cmp < 0) ? lower : higher; + assert Character.isDigit(result); + return result; + } }