Return-Path: X-Original-To: archive-asf-public-internal@cust-asf2.ponee.io Delivered-To: archive-asf-public-internal@cust-asf2.ponee.io Received: from cust-asf.ponee.io (cust-asf.ponee.io [163.172.22.183]) by cust-asf2.ponee.io (Postfix) with ESMTP id A7356200B79 for ; Wed, 24 Aug 2016 01:18:04 +0200 (CEST) Received: by cust-asf.ponee.io (Postfix) id A5939160AC5; Tue, 23 Aug 2016 23:18:04 +0000 (UTC) Delivered-To: archive-asf-public@cust-asf.ponee.io Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by cust-asf.ponee.io (Postfix) with SMTP id C4EE6160AC6 for ; Wed, 24 Aug 2016 01:18:03 +0200 (CEST) Received: (qmail 44848 invoked by uid 500); 23 Aug 2016 23:17:55 -0000 Mailing-List: contact commits-help@lucenenet.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: lucene-net-dev@lucenenet.apache.org Delivered-To: mailing list commits@lucenenet.apache.org Received: (qmail 43745 invoked by uid 99); 23 Aug 2016 23:17:55 -0000 Received: from git1-us-west.apache.org (HELO git1-us-west.apache.org) (140.211.11.23) by apache.org (qpsmtpd/0.29) with ESMTP; Tue, 23 Aug 2016 23:17:55 +0000 Received: by git1-us-west.apache.org (ASF Mail Server at git1-us-west.apache.org, from userid 33) id 51601DFCC0; Tue, 23 Aug 2016 23:17:55 +0000 (UTC) Content-Type: text/plain; charset="us-ascii" MIME-Version: 1.0 Content-Transfer-Encoding: 8bit From: synhershko@apache.org To: commits@lucenenet.apache.org Date: Tue, 23 Aug 2016 23:18:41 -0000 Message-Id: <0db897629e1643bda49f00e2b3b031ef@git.apache.org> In-Reply-To: <7ea169ebc34c46fb8a7c1c3199804cae@git.apache.org> References: <7ea169ebc34c46fb8a7c1c3199804cae@git.apache.org> X-Mailer: ASF-Git Admin Mailer Subject: [48/50] [abbrv] lucenenet git commit: Fix for CharTokenizer.IsTokenChar() to revert the parameter back to int as was intended. A char cannot represent a surrogate pair, which makes it impossible to use IsTokenChar() with surrogate pairs. archived-at: Tue, 23 Aug 2016 23:18:04 -0000 Fix for CharTokenizer.IsTokenChar() to revert the parameter back to int as was intended. A char cannot represent a surrogate pair, which makes it impossible to use IsTokenChar() with surrogate pairs. Project: http://git-wip-us.apache.org/repos/asf/lucenenet/repo Commit: http://git-wip-us.apache.org/repos/asf/lucenenet/commit/053d3efc Tree: http://git-wip-us.apache.org/repos/asf/lucenenet/tree/053d3efc Diff: http://git-wip-us.apache.org/repos/asf/lucenenet/diff/053d3efc Branch: refs/heads/analysis-work Commit: 053d3efcb647dac4c681ddf3999eda18b3964b11 Parents: c36a0bd Author: Shad Storhaug Authored: Tue Aug 23 14:37:12 2016 +0700 Committer: Shad Storhaug Committed: Tue Aug 23 16:05:50 2016 +0700 ---------------------------------------------------------------------- .../Analysis/Ar/ArabicLetterTokenizer.cs | 5 +++-- src/Lucene.Net.Analysis.Common/Analysis/Core/LetterTokenizer.cs | 2 +- .../Analysis/Core/WhitespaceTokenizer.cs | 4 ++-- src/Lucene.Net.Analysis.Common/Analysis/In/IndicTokenizer.cs | 2 +- .../Analysis/Ru/RussianLetterTokenizer.cs | 5 +++-- src/Lucene.Net.Analysis.Common/Analysis/Util/CharTokenizer.cs | 4 ++-- .../Analysis/Util/TestCharTokenizers.cs | 2 +- 7 files changed, 13 insertions(+), 11 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/lucenenet/blob/053d3efc/src/Lucene.Net.Analysis.Common/Analysis/Ar/ArabicLetterTokenizer.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Ar/ArabicLetterTokenizer.cs b/src/Lucene.Net.Analysis.Common/Analysis/Ar/ArabicLetterTokenizer.cs index 9e36d25..5fa5827 100644 --- a/src/Lucene.Net.Analysis.Common/Analysis/Ar/ArabicLetterTokenizer.cs +++ b/src/Lucene.Net.Analysis.Common/Analysis/Ar/ArabicLetterTokenizer.cs @@ -1,4 +1,5 @@ using Lucene.Net.Analysis.Core; +using Lucene.Net.Support; using Lucene.Net.Util; using System; using System.Globalization; @@ -74,9 +75,9 @@ namespace Lucene.Net.Analysis.Ar /// /// Allows for Letter category or NonspacingMark category /// - protected override bool IsTokenChar(char c) + protected override bool IsTokenChar(int c) { - return base.IsTokenChar(c) || char.GetUnicodeCategory((char)c) == UnicodeCategory.NonSpacingMark; + return base.IsTokenChar(c) || Character.GetType(c) == UnicodeCategory.NonSpacingMark; } } } \ No newline at end of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/053d3efc/src/Lucene.Net.Analysis.Common/Analysis/Core/LetterTokenizer.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Core/LetterTokenizer.cs b/src/Lucene.Net.Analysis.Common/Analysis/Core/LetterTokenizer.cs index 9a0b57d..9d3dc2b 100644 --- a/src/Lucene.Net.Analysis.Common/Analysis/Core/LetterTokenizer.cs +++ b/src/Lucene.Net.Analysis.Common/Analysis/Core/LetterTokenizer.cs @@ -75,7 +75,7 @@ namespace Lucene.Net.Analysis.Core /// Collects only characters which satisfy /// . /// - protected override bool IsTokenChar(char c) + protected override bool IsTokenChar(int c) { return Character.IsLetter(c); } http://git-wip-us.apache.org/repos/asf/lucenenet/blob/053d3efc/src/Lucene.Net.Analysis.Common/Analysis/Core/WhitespaceTokenizer.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Core/WhitespaceTokenizer.cs b/src/Lucene.Net.Analysis.Common/Analysis/Core/WhitespaceTokenizer.cs index 1567daf..5ccdbbf 100644 --- a/src/Lucene.Net.Analysis.Common/Analysis/Core/WhitespaceTokenizer.cs +++ b/src/Lucene.Net.Analysis.Common/Analysis/Core/WhitespaceTokenizer.cs @@ -69,9 +69,9 @@ namespace Lucene.Net.Analysis.Core /// Collects only characters which do not satisfy /// . /// - protected override bool IsTokenChar(char c) + protected override bool IsTokenChar(int c) { - return !char.IsWhiteSpace(c); + return !char.IsWhiteSpace((char)c); } } } \ No newline at end of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/053d3efc/src/Lucene.Net.Analysis.Common/Analysis/In/IndicTokenizer.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Common/Analysis/In/IndicTokenizer.cs b/src/Lucene.Net.Analysis.Common/Analysis/In/IndicTokenizer.cs index 5117267..2de7baa 100644 --- a/src/Lucene.Net.Analysis.Common/Analysis/In/IndicTokenizer.cs +++ b/src/Lucene.Net.Analysis.Common/Analysis/In/IndicTokenizer.cs @@ -41,7 +41,7 @@ namespace Lucene.Net.Analysis.In { } - protected override bool IsTokenChar(char c) // LUCENENET TODO: Change parameter back to int (for codepoint) rather than a single char since this could contain surrogate pairs + protected override bool IsTokenChar(int c) { UnicodeCategory category = Character.GetType(c); http://git-wip-us.apache.org/repos/asf/lucenenet/blob/053d3efc/src/Lucene.Net.Analysis.Common/Analysis/Ru/RussianLetterTokenizer.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Ru/RussianLetterTokenizer.cs b/src/Lucene.Net.Analysis.Common/Analysis/Ru/RussianLetterTokenizer.cs index e48c33f..15db0f7 100644 --- a/src/Lucene.Net.Analysis.Common/Analysis/Ru/RussianLetterTokenizer.cs +++ b/src/Lucene.Net.Analysis.Common/Analysis/Ru/RussianLetterTokenizer.cs @@ -1,4 +1,5 @@ using Lucene.Net.Analysis.Util; +using Lucene.Net.Support; using Lucene.Net.Util; using System; using System.IO; @@ -73,9 +74,9 @@ namespace Lucene.Net.Analysis.Ru /// Collects only characters which satisfy /// . /// - protected override bool IsTokenChar(char c) + protected override bool IsTokenChar(int c) { - return char.IsLetter(c) || (c >= DIGIT_0 && c <= DIGIT_9); + return Character.IsLetter(c) || (c >= DIGIT_0 && c <= DIGIT_9); } } } \ No newline at end of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/053d3efc/src/Lucene.Net.Analysis.Common/Analysis/Util/CharTokenizer.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Util/CharTokenizer.cs b/src/Lucene.Net.Analysis.Common/Analysis/Util/CharTokenizer.cs index b4ea553..14047ca 100644 --- a/src/Lucene.Net.Analysis.Common/Analysis/Util/CharTokenizer.cs +++ b/src/Lucene.Net.Analysis.Common/Analysis/Util/CharTokenizer.cs @@ -120,7 +120,7 @@ namespace Lucene.Net.Analysis.Util /// predicate. Codepoints for which this is false are used to define token /// boundaries and are not included in tokens. /// - protected abstract bool IsTokenChar(char c); + protected abstract bool IsTokenChar(int c); /// /// Called on each token character to normalize it before it is added to the @@ -166,7 +166,7 @@ namespace Lucene.Net.Analysis.Util int charCount = Character.CharCount(c); bufferIndex += charCount; - if (IsTokenChar((char)c)) // if it's a token char + if (IsTokenChar(c)) // if it's a token char { if (length == 0) // start of token { http://git-wip-us.apache.org/repos/asf/lucenenet/blob/053d3efc/src/Lucene.Net.Tests.Analysis.Common/Analysis/Util/TestCharTokenizers.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Util/TestCharTokenizers.cs b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Util/TestCharTokenizers.cs index d452d83..40ae0bb 100644 --- a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Util/TestCharTokenizers.cs +++ b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Util/TestCharTokenizers.cs @@ -273,7 +273,7 @@ namespace Lucene.Net.Tests.Analysis.Common.Analysis.Util { } - protected override bool IsTokenChar(char c) + protected override bool IsTokenChar(int c) { if (char.IsNumber((char)c)) {