Return-Path: X-Original-To: archive-asf-public-internal@cust-asf2.ponee.io Delivered-To: archive-asf-public-internal@cust-asf2.ponee.io Received: from cust-asf.ponee.io (cust-asf.ponee.io [163.172.22.183]) by cust-asf2.ponee.io (Postfix) with ESMTP id A6B84200C0F for ; Thu, 2 Feb 2017 21:15:55 +0100 (CET) Received: by cust-asf.ponee.io (Postfix) id A5391160B68; Thu, 2 Feb 2017 20:15:55 +0000 (UTC) Delivered-To: archive-asf-public@cust-asf.ponee.io Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by cust-asf.ponee.io (Postfix) with SMTP id A34B3160B61 for ; Thu, 2 Feb 2017 21:15:54 +0100 (CET) Received: (qmail 76405 invoked by uid 500); 2 Feb 2017 20:15:53 -0000 Mailing-List: contact commits-help@lucenenet.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: lucene-net-dev@lucenenet.apache.org Delivered-To: mailing list commits@lucenenet.apache.org Received: (qmail 76269 invoked by uid 99); 2 Feb 2017 20:15:53 -0000 Received: from git1-us-west.apache.org (HELO git1-us-west.apache.org) (140.211.11.23) by apache.org (qpsmtpd/0.29) with ESMTP; Thu, 02 Feb 2017 20:15:53 +0000 Received: by git1-us-west.apache.org (ASF Mail Server at git1-us-west.apache.org, from userid 33) id 3950EDFB95; Thu, 2 Feb 2017 20:15:53 +0000 (UTC) Content-Type: text/plain; charset="us-ascii" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit From: nightowl888@apache.org To: commits@lucenenet.apache.org Date: Thu, 02 Feb 2017 20:15:54 -0000 Message-Id: <4f2534dab43342678737de998d9c3cd1@git.apache.org> In-Reply-To: References: X-Mailer: ASF-Git Admin Mailer Subject: [2/6] lucenenet git commit: Lucene.Net.Analysis.Cn refactor: member accessibility and documentation comments archived-at: Thu, 02 Feb 2017 20:15:55 -0000 Lucene.Net.Analysis.Cn refactor: member accessibility and documentation comments Project: http://git-wip-us.apache.org/repos/asf/lucenenet/repo Commit: http://git-wip-us.apache.org/repos/asf/lucenenet/commit/3e97f31e 
Tree: http://git-wip-us.apache.org/repos/asf/lucenenet/tree/3e97f31e Diff: http://git-wip-us.apache.org/repos/asf/lucenenet/diff/3e97f31e Branch: refs/heads/api-work Commit: 3e97f31e190f7c3a7781a45e9807e609a1e06393 Parents: 0986545 Author: Shad Storhaug Authored: Thu Feb 2 23:22:53 2017 +0700 Committer: Shad Storhaug Committed: Fri Feb 3 01:13:42 2017 +0700 ---------------------------------------------------------------------- .../Analysis/Cn/ChineseAnalyzer.cs | 23 +++++----- .../Analysis/Cn/ChineseFilter.cs | 37 ++++++++-------- .../Analysis/Cn/ChineseFilterFactory.cs | 6 +-- .../Analysis/Cn/ChineseTokenizer.cs | 45 ++++++++++---------- .../Analysis/Cn/ChineseTokenizerFactory.cs | 8 ++-- 5 files changed, 61 insertions(+), 58 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/lucenenet/blob/3e97f31e/src/Lucene.Net.Analysis.Common/Analysis/Cn/ChineseAnalyzer.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Cn/ChineseAnalyzer.cs b/src/Lucene.Net.Analysis.Common/Analysis/Cn/ChineseAnalyzer.cs index 5dc0aa6..de0b5e7 100644 --- a/src/Lucene.Net.Analysis.Common/Analysis/Cn/ChineseAnalyzer.cs +++ b/src/Lucene.Net.Analysis.Common/Analysis/Cn/ChineseAnalyzer.cs @@ -21,21 +21,22 @@ namespace Lucene.Net.Analysis.Cn */ ///

- /// An <seealso cref="Analyzer"/> that tokenizes text with <seealso cref="ChineseTokenizer"/> and - /// filters with <seealso cref="ChineseFilter"/>

- /// @deprecated (3.1) Use <seealso cref="StandardAnalyzer"/> instead, which has the same functionality. + /// An <see cref="Analyzer"/> that tokenizes text with <see cref="ChineseTokenizer"/> and + /// filters with <see cref="ChineseFilter"/> + /// + /// @deprecated (3.1) Use <see cref="StandardAnalyzer"/> instead, which has the same functionality. /// This analyzer will be removed in Lucene 5.0 [Obsolete("(3.1) Use StandardAnalyzer instead, which has the same functionality.")] public sealed class ChineseAnalyzer : Analyzer - ///

- /// Creates - /// <seealso cref="Analyzer.TokenStreamComponents"/> - /// used to tokenize all the text in the provided <seealso cref="TextReader"/>. - ///

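- /// <seealso cref="Analyzer.TokenStreamComponents"/> - /// built from a <seealso cref="ChineseTokenizer"/> filtered with - /// <seealso cref="ChineseFilter"/> {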

+ /// Creates + /// <see cref="Analyzer.TokenStreamComponents"/> + /// used to tokenize all the text in the provided <see cref="TextReader"/>. + ///

+ /// + /// built from a filtered with + /// protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader) { Tokenizer source = new ChineseTokenizer(reader); http://git-wip-us.apache.org/repos/asf/lucenenet/blob/3e97f31e/src/Lucene.Net.Analysis.Common/Analysis/Cn/ChineseFilter.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Cn/ChineseFilter.cs b/src/Lucene.Net.Analysis.Common/Analysis/Cn/ChineseFilter.cs index 9b3b95a..61e6576 100644 --- a/src/Lucene.Net.Analysis.Common/Analysis/Cn/ChineseFilter.cs +++ b/src/Lucene.Net.Analysis.Common/Analysis/Cn/ChineseFilter.cs @@ -25,28 +25,32 @@ namespace Lucene.Net.Analysis.Cn */ ///

- /// A <seealso cref="TokenFilter"/> with a stop word table. - ///

Numeric tokens are removed. - ///
English tokens must be larger than 1 character. - ///
One Chinese character as one Chinese word. - ///

+ /// A with a stop word table. + /// + /// Numeric tokens are removed. + /// English tokens must be larger than 1 character. + /// One Chinese character as one Chinese word. + /// /// TO DO: - ///

Add Chinese stop words, such as \ue400 - ///
Dictionary based Chinese word extraction - ///
Intelligent Chinese word extraction - ///

+ /// + /// Add Chinese stop words, such as \ue400 + /// Dictionary based Chinese word extraction + /// Intelligent Chinese word extraction + /// ///

- /// @deprecated (3.1) Use instead, which has the same functionality. + /// @deprecated (3.1) Use instead, which has the same functionality. /// This filter will be removed in Lucene 5.0 [Obsolete("(3.1) Use StopFilter instead, which has the same functionality.")] public sealed class ChineseFilter : TokenFilter { - // Only English now, Chinese to be added later. - public static readonly string[] STOP_WORDS = new string[] { "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into", "is", "it", "no", "not", "of", "on", "or", "such", "that", "the", "their", "then", "there", "these", "they", "this", "to", "was", "will", "with" }; - + public static readonly string[] STOP_WORDS = new string[] { + "and", "are", "as", "at", "be", "but", "by", + "for", "if", "in", "into", "is", "it", + "no", "not", "of", "on", "or", "such", + "that", "the", "their", "then", "there", "these", + "they", "this", "to", "was", "will", "with" + }; private CharArraySet stopTable; @@ -55,13 +59,12 @@ namespace Lucene.Net.Analysis.Cn public ChineseFilter(TokenStream @in) : base(@in) { - stopTable = new CharArraySet(LuceneVersion.LUCENE_CURRENT, Arrays.AsList(STOP_WORDS), false); termAtt = AddAttribute(); } + public override bool IncrementToken() { - while (m_input.IncrementToken()) { char[] text = termAtt.Buffer; http://git-wip-us.apache.org/repos/asf/lucenenet/blob/3e97f31e/src/Lucene.Net.Analysis.Common/Analysis/Cn/ChineseFilterFactory.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Cn/ChineseFilterFactory.cs b/src/Lucene.Net.Analysis.Common/Analysis/Cn/ChineseFilterFactory.cs index d3e30e5..98ddee9 100644 --- a/src/Lucene.Net.Analysis.Common/Analysis/Cn/ChineseFilterFactory.cs +++ b/src/Lucene.Net.Analysis.Common/Analysis/Cn/ChineseFilterFactory.cs @@ -22,14 +22,14 @@ namespace Lucene.Net.Analysis.Cn */ ///

- /// Factory for <seealso cref="ChineseFilter"/>

- /// @deprecated Use instead. + /// Factory for + /// @deprecated Use instead. [Obsolete("Use StopFilterFactory instead.")] public class ChineseFilterFactory : TokenFilterFactory { ///

- /// Creates a new ChineseFilterFactory

+ /// Creates a new public ChineseFilterFactory(IDictionary args) : base(args) { if (args.Count > 0) http://git-wip-us.apache.org/repos/asf/lucenenet/blob/3e97f31e/src/Lucene.Net.Analysis.Common/Analysis/Cn/ChineseTokenizer.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Cn/ChineseTokenizer.cs b/src/Lucene.Net.Analysis.Common/Analysis/Cn/ChineseTokenizer.cs index 4ae7ff8..eb500bb 100644 --- a/src/Lucene.Net.Analysis.Common/Analysis/Cn/ChineseTokenizer.cs +++ b/src/Lucene.Net.Analysis.Common/Analysis/Cn/ChineseTokenizer.cs @@ -26,27 +26,28 @@ namespace Lucene.Net.Analysis.Cn /// Tokenize Chinese text as individual chinese characters. /// /// - /// The difference between ChineseTokenizer and - /// CJKTokenizer is that they have different + /// The difference between and + /// is that they have different /// token parsing logic. /// /// /// For example, if the Chinese text /// "C1C2C3C4" is to be indexed: - ///

The tokens returned from ChineseTokenizer are C1, C2, C3, C4. - ///
The tokens returned from the CJKTokenizer are C1C2, C2C3, C3C4. - ///

+ /// + /// The tokens returned from ChineseTokenizer are C1, C2, C3, C4. + /// The tokens returned from the CJKTokenizer are C1C2, C2C3, C3C4. + /// /// /// - /// Therefore the index created by CJKTokenizer is much larger. + /// Therefore the index created by is much larger. /// /// /// The problem is that when searching for C1, C1C2, C1C3, - /// C4C2, C1C2C3 ... the ChineseTokenizer works, but the - /// CJKTokenizer will not work. - /// - /// @deprecated (3.1) Use instead, which has the same functionality. + /// C4C2, C1C2C3 ... the works, but the + /// will not work. + /// + /// + /// @deprecated (3.1) Use instead, which has the same functionality. /// This filter will be removed in Lucene 5.0 [Obsolete("(3.1) Use StandardTokenizer instead, which has the same functionality.")] public sealed class ChineseTokenizer : Tokenizer @@ -82,9 +83,8 @@ namespace Lucene.Net.Analysis.Cn private ICharTermAttribute termAtt; private IOffsetAttribute offsetAtt; - private void push(char c) + private void Push(char c) { - if (length == 0) // start of token { start = offset - 1; @@ -93,9 +93,8 @@ namespace Lucene.Net.Analysis.Cn } - private bool flush() + private bool Flush() { - if (length > 0) { //System.out.println(new String(buffer, 0, @@ -132,7 +131,7 @@ namespace Lucene.Net.Analysis.Cn if (dataLen <= 0) { offset--; - return flush(); + return Flush(); } else { @@ -145,10 +144,10 @@ namespace Lucene.Net.Analysis.Cn case UnicodeCategory.DecimalDigitNumber: case UnicodeCategory.LowercaseLetter: case UnicodeCategory.UppercaseLetter: - push(c); + Push(c); if (length == MAX_WORD_LEN) { - return flush(); + return Flush(); } break; @@ -157,22 +156,22 @@ namespace Lucene.Net.Analysis.Cn { bufferIndex--; offset--; - return flush(); + return Flush(); } - push(c); - return flush(); + Push(c); + return Flush(); default: if (length > 0) { - return flush(); + return Flush(); } break; } } } - public override void End() + public override sealed void End() { base.End(); // set final offset 
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/3e97f31e/src/Lucene.Net.Analysis.Common/Analysis/Cn/ChineseTokenizerFactory.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Cn/ChineseTokenizerFactory.cs b/src/Lucene.Net.Analysis.Common/Analysis/Cn/ChineseTokenizerFactory.cs index 2eef7be..b71906e 100644 --- a/src/Lucene.Net.Analysis.Common/Analysis/Cn/ChineseTokenizerFactory.cs +++ b/src/Lucene.Net.Analysis.Common/Analysis/Cn/ChineseTokenizerFactory.cs @@ -24,14 +24,14 @@ namespace Lucene.Net.Analysis.Cn */ ///

- /// Factory for <seealso cref="ChineseTokenizer"/>

- /// @deprecated Use instead. + /// Factory for + /// + /// @deprecated Use instead. [Obsolete("Use StandardTokenizerFactory instead.")] public class ChineseTokenizerFactory : TokenizerFactory { - ///

- /// Creates a new ChineseTokenizerFactory

+ /// Creates a new public ChineseTokenizerFactory(IDictionary args) : base(args) {