From: nightowl888@apache.org
To: commits@lucenenet.apache.org
Date: Thu, 02 Feb 2017 14:55:01 -0000
Subject: [6/6] lucenenet git commit: Lucene.Net.Analysis.Cjk refactor: member accessibility and documentation comments

Lucene.Net.Analysis.Cjk refactor: member accessibility and documentation comments

Project: http://git-wip-us.apache.org/repos/asf/lucenenet/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucenenet/commit/917b4fdf
Tree: http://git-wip-us.apache.org/repos/asf/lucenenet/tree/917b4fdf
Diff: http://git-wip-us.apache.org/repos/asf/lucenenet/diff/917b4fdf

Branch: refs/heads/api-work
Commit: 917b4fdf53f978f32219cef6edf31f3c30b84dea
Parents: 7fdbd66
Author: Shad Storhaug
Authored: Thu Feb 2 21:53:51 2017 +0700
Committer: Shad Storhaug
Committed: Thu Feb 2 21:53:51 2017 +0700

----------------------------------------------------------------------
 .../Analysis/Cjk/CJKAnalyzer.cs            | 12 ++---
 .../Analysis/Cjk/CJKBigramFilter.cs        | 32 ++++++-------
 .../Analysis/Cjk/CJKBigramFilterFactory.cs | 13 +++---
 .../Analysis/Cjk/CJKTokenizer.cs           | 14 +++---
 .../Analysis/Cjk/CJKTokenizerFactory.cs    | 13 +++---
 .../Analysis/Cjk/CJKWidthFilter.cs         | 49 ++++++++++++++------
 .../Analysis/Cjk/CJKWidthFilterFactory.cs  | 10 ++--
 7 files changed, 82 insertions(+), 61 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucenenet/blob/917b4fdf/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKAnalyzer.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKAnalyzer.cs b/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKAnalyzer.cs
index 0fcc42c..28c7a52 100644
--- a/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKAnalyzer.cs
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKAnalyzer.cs
@@ -25,16 +25,16 @@ namespace Lucene.Net.Analysis.Cjk
      */
     ///
-    /// An that tokenizes text with ,
-    /// normalizes content with , folds case with
-    /// , forms bigrams of CJK with ,
-    /// and filters stopwords with
+    /// An <see cref="Analyzer"/> that tokenizes text with <see cref="StandardTokenizer"/>,
+    /// normalizes content with <see cref="CJKWidthFilter"/>, folds case with
+    /// <see cref="LowerCaseFilter"/>, forms bigrams of CJK with <see cref="CJKBigramFilter"/>,
+    /// and filters stopwords with <see cref="StopFilter"/>
     ///
     public sealed class CJKAnalyzer : StopwordAnalyzerBase
     {
         ///
         /// File containing default CJK stopwords.
-        ///
+        ///
         /// Currently it contains some common English words that are not usually
         /// useful for searching and some double-byte interpunctions.
         ///
@@ -72,7 +72,7 @@ namespace Lucene.Net.Analysis.Cjk
         }
         ///
-        /// Builds an analyzer which removes words in .
+        /// Builds an analyzer which removes words in <see cref="DefaultStopSet"/>.
         ///
         public CJKAnalyzer(LuceneVersion matchVersion)
             : this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET)


http://git-wip-us.apache.org/repos/asf/lucenenet/blob/917b4fdf/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKBigramFilter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKBigramFilter.cs b/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKBigramFilter.cs
index 4b8cb17..443ea04 100644
--- a/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKBigramFilter.cs
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKBigramFilter.cs
@@ -23,18 +23,18 @@ namespace Lucene.Net.Analysis.Cjk
      */
     ///
-    /// Forms bigrams of CJK terms that are generated from StandardTokenizer
+    /// Forms bigrams of CJK terms that are generated from <see cref="StandardTokenizer"/>
     /// or ICUTokenizer.
     ///
     /// CJK types are set by these tokenizers, but you can also use
-    /// to explicitly control which
+    /// <see cref="CJKBigramFilter(TokenStream, int)"/> to explicitly control which
     /// of the CJK scripts are turned into bigrams.
     ///
     ///
     /// By default, when a CJK character has no adjacent characters to form
     /// a bigram, it is output in unigram form. If you want to always output
     /// both unigrams and bigrams, set the outputUnigrams
-    /// flag in .
+    /// flag in <see cref="CJKBigramFilter(TokenStream, int, bool)"/>.
     /// This can be used for a combined unigram+bigram approach.
     ///
     ///
@@ -90,22 +90,22 @@ namespace Lucene.Net.Analysis.Cjk
         private readonly IPositionLengthAttribute posLengthAtt;
         // buffers containing codepoint and offsets in parallel
-        internal int[] buffer = new int[8];
-        internal int[] startOffset = new int[8];
-        internal int[] endOffset = new int[8];
+        private int[] buffer = new int[8];
+        private int[] startOffset = new int[8];
+        private int[] endOffset = new int[8];
         // length of valid buffer
-        internal int bufferLen;
+        private int bufferLen;
         // current buffer index
-        internal int index;
+        private int index;
         // the last end offset, to determine if we should bigram across tokens
-        internal int lastEndOffset;
+        private int lastEndOffset;
         private bool exhausted;
         ///
-        /// Calls {@link CJKBigramFilter#CJKBigramFilter(TokenStream, int)
-        /// CJKBigramFilter(in, HAN | HIRAGANA | KATAKANA | HANGUL)}
+        /// Calls <see cref="CJKBigramFilter(TokenStream, int)"/>
+        /// CJKBigramFilter(in, HAN | HIRAGANA | KATAKANA | HANGUL)
         ///
         public CJKBigramFilter(TokenStream @in)
             : this(@in, HAN | HIRAGANA | KATAKANA | HANGUL)
@@ -113,8 +113,8 @@ namespace Lucene.Net.Analysis.Cjk
         }
         ///
-        /// Calls {@link CJKBigramFilter#CJKBigramFilter(TokenStream, int, boolean)
-        /// CJKBigramFilter(in, flags, false)}
+        /// Calls <see cref="CJKBigramFilter(TokenStream, int, bool)"/>
+        /// CJKBigramFilter(in, flags, false)
         ///
         public CJKBigramFilter(TokenStream @in, int flags)
             : this(@in, flags, false)
@@ -122,10 +122,10 @@ namespace Lucene.Net.Analysis.Cjk
         }
         ///
-        /// Create a new CJKBigramFilter, specifying which writing systems should be bigrammed,
+        /// Create a new <see cref="CJKBigramFilter"/>, specifying which writing systems should be bigrammed,
         /// and whether or not unigrams should also be output.
-        /// OR'ed set from , ,
-        /// ,
+        /// OR'ed set from <see cref="HAN"/>, <see cref="HIRAGANA"/>,
+        /// <see cref="KATAKANA"/>, <see cref="HANGUL"/>
         /// true if unigrams for the selected writing systems should also be output.
         /// when this is false, this is only done when there are no adjacent characters to form
         /// a bigram.
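For reference, a minimal usage sketch of the chain described in the CJKAnalyzer summary above. This is not part of the commit; it assumes the Lucene.Net 4.8 API shapes on this branch (LuceneVersion.LUCENE_48, Analyzer.GetTokenStream, AddAttribute<T>), so adjust member names if the api-work refactoring has moved them.

    using System;
    using System.IO;
    using Lucene.Net.Analysis;
    using Lucene.Net.Analysis.Cjk;
    using Lucene.Net.Analysis.TokenAttributes;
    using Lucene.Net.Util;

    public static class CjkAnalyzerSketch
    {
        public static void Main()
        {
            Analyzer analyzer = new CJKAnalyzer(LuceneVersion.LUCENE_48);
            // CJK runs come out as overlapping bigrams; Latin terms are lowercased.
            using (TokenStream ts = analyzer.GetTokenStream("body", new StringReader("Lucene 北京大学")))
            {
                ICharTermAttribute termAtt = ts.AddAttribute<ICharTermAttribute>();
                ts.Reset();
                while (ts.IncrementToken())
                {
                    Console.WriteLine(termAtt.ToString()); // lucene, 北京, 京大, 大学
                }
                ts.End();
            }
        }
    }
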
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/917b4fdf/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKBigramFilterFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKBigramFilterFactory.cs b/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKBigramFilterFactory.cs
index 8fd34fd..b9e4d97 100644
--- a/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKBigramFilterFactory.cs
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKBigramFilterFactory.cs
@@ -21,8 +21,8 @@ namespace Lucene.Net.Analysis.Cjk
      */
     ///
-    /// Factory for .
-    ///
+    /// Factory for <see cref="CJKBigramFilter"/>.
+    ///
     /// <fieldType name="text_cjk" class="solr.TextField">
     ///   <analyzer>
     ///     <tokenizer class="solr.StandardTokenizerFactory"/>
@@ -32,15 +32,16 @@ namespace Lucene.Net.Analysis.Cjk
     ///       han="true" hiragana="true" 
     ///       katakana="true" hangul="true" outputUnigrams="false" />
     ///   </analyzer>
-    /// </fieldType>
+    /// </fieldType>
+    ///
     ///
     public class CJKBigramFilterFactory : TokenFilterFactory
     {
-        internal readonly int flags;
-        internal readonly bool outputUnigrams;
+        private readonly int flags;
+        private readonly bool outputUnigrams;
         ///
-        /// Creates a new CJKBigramFilterFactory
+        /// Creates a new <see cref="CJKBigramFilterFactory"/>
         public CJKBigramFilterFactory(IDictionary<string, string> args)
             : base(args)
         {
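The fieldType example above can also be wired up programmatically. The sketch below is not part of this commit and shows only the bigram step; the argument names mirror the XML attributes, and the factory API (an IDictionary<string, string> args constructor plus Create(TokenStream)) is assumed from the surrounding 4.8 code.

    using System.Collections.Generic;
    using System.IO;
    using Lucene.Net.Analysis;
    using Lucene.Net.Analysis.Cjk;
    using Lucene.Net.Analysis.Standard;
    using Lucene.Net.Util;

    public static class CjkBigramFactorySketch
    {
        public static TokenStream BuildChain(TextReader reader)
        {
            TokenStream stream = new StandardTokenizer(LuceneVersion.LUCENE_48, reader);

            // Same settings as the solr.CJKBigramFilterFactory element above.
            var args = new Dictionary<string, string>
            {
                ["han"] = "true",
                ["hiragana"] = "true",
                ["katakana"] = "true",
                ["hangul"] = "true",
                ["outputUnigrams"] = "false"
            };

            // The factory consumes the recognized arguments and wraps the stream
            // in a CJKBigramFilter configured with the matching flags.
            return new CJKBigramFilterFactory(args).Create(stream);
        }
    }
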
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/917b4fdf/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKTokenizer.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKTokenizer.cs b/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKTokenizer.cs
index 1ff4f07..160306d 100644
--- a/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKTokenizer.cs
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKTokenizer.cs
@@ -32,11 +32,11 @@ namespace Lucene.Net.Analysis.Cjk
     /// Example: "java C1C2C3C4" will be segmented to: "java" "C1C2" "C2C3" "C3C4".
     ///
     /// Additionally, the following is applied to Latin text (such as English):
-    ///
-    /// Text is converted to lowercase.
-    /// Numeric digits, '+', '#', and '_' are tokenized as letters.
-    /// Full-width forms are converted to half-width forms.
-    ///
+    ///
+    /// Text is converted to lowercase.
+    /// Numeric digits, '+', '#', and '_' are tokenized as letters.
+    /// Full-width forms are converted to half-width forms.
+    ///
     ///
     /// For more info on Asian language (Chinese, Japanese, and Korean) text segmentation:
     /// please search google
@@ -145,7 +145,7 @@ namespace Lucene.Net.Analysis.Cjk
         ///
         /// false for end of stream, true otherwise
         ///
-        /// - throw IOException when read error
+        /// when read error
         /// happened in the InputStream
         ///
         public override bool IncrementToken()
@@ -347,7 +347,7 @@ namespace Lucene.Net.Analysis.Cjk
             }
         }
-        public override void End()
+        public override sealed void End()
        {
            base.End();
            // set final offset
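The segmentation described above ("C1C2" "C2C3" "C3C4") can be seen by driving the tokenizer directly. The sketch below is not part of this commit; it assumes a CJKTokenizer(TextReader) constructor and the usual attribute loop, and is only illustrative, since StandardTokenizer plus CJKBigramFilter is the preferred replacement.

    using System;
    using System.IO;
    using Lucene.Net.Analysis.Cjk;
    using Lucene.Net.Analysis.TokenAttributes;

    public static class CjkTokenizerSketch
    {
        public static void Main()
        {
            // "java" passes through lowercased; the 4-character CJK run becomes
            // the overlapping bigrams "北京", "京大", "大学".
#pragma warning disable 618 // CJKTokenizer is deprecated in favor of CJKBigramFilter
            var tokenizer = new CJKTokenizer(new StringReader("java 北京大学"));
#pragma warning restore 618
            var termAtt = tokenizer.AddAttribute<ICharTermAttribute>();
            tokenizer.Reset();
            while (tokenizer.IncrementToken())
            {
                Console.WriteLine(termAtt.ToString());
            }
            tokenizer.End();
            tokenizer.Dispose();
        }
    }
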
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/917b4fdf/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKTokenizerFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKTokenizerFactory.cs b/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKTokenizerFactory.cs
index 220a7d6..c33f3a6 100644
--- a/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKTokenizerFactory.cs
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKTokenizerFactory.cs
@@ -25,20 +25,21 @@ namespace Lucene.Net.Analysis.Cjk
     ///
-    /// Factory for .
-    ///
+    /// Factory for <see cref="CJKTokenizer"/>.
+    ///
     /// <fieldType name="text_cjk" class="solr.TextField" positionIncrementGap="100">
     ///   <analyzer>
     ///     <tokenizer class="solr.CJKTokenizerFactory"/>
     ///   </analyzer>
-    /// </fieldType>
-    /// @deprecated Use instead.
+    /// </fieldType>
+    ///
+    ///
+    /// @deprecated Use <see cref="CJKBigramFilterFactory"/> instead.
     [Obsolete("Use CJKBigramFilterFactory instead.")]
     public class CJKTokenizerFactory : TokenizerFactory
     {
-        ///
-        /// Creates a new CJKTokenizerFactory
+        /// Creates a new <see cref="CJKTokenizerFactory"/>
         public CJKTokenizerFactory(IDictionary<string, string> args)
             : base(args)
         {
             if (args.Count > 0)
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/917b4fdf/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKWidthFilter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKWidthFilter.cs b/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKWidthFilter.cs
index 331de6b..64018e2 100644
--- a/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKWidthFilter.cs
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKWidthFilter.cs
@@ -22,11 +22,11 @@ namespace Lucene.Net.Analysis.Cjk
      */
     ///
-    /// A that normalizes CJK width differences:
-    ///
-    /// Folds fullwidth ASCII variants into the equivalent basic latin
-    /// Folds halfwidth Katakana variants into the equivalent kana
-    ///
+    /// A <see cref="TokenFilter"/> that normalizes CJK width differences:
+    ///
+    /// Folds fullwidth ASCII variants into the equivalent basic latin
+    /// Folds halfwidth Katakana variants into the equivalent kana
+    ///
     ///
     /// NOTE: this filter can be viewed as a (practical) subset of NFKC/NFKD
     /// Unicode normalization. See the normalization support in the ICU package
@@ -37,13 +37,22 @@ namespace Lucene.Net.Analysis.Cjk
     {
         private ICharTermAttribute termAtt;
-        /* halfwidth kana mappings: 0xFF65-0xFF9D
-         *
-         * note: 0xFF9C and 0xFF9D are only mapped to 0x3099 and 0x309A
-         * as a fallback when they cannot properly combine with a preceding
-         * character into a composed form.
-         */
-        private static readonly char[] KANA_NORM = new char[] { (char)0x30fb, (char)0x30f2, (char)0x30a1, (char)0x30a3, (char)0x30a5, (char)0x30a7, (char)0x30a9, (char)0x30e3, (char)0x30e5, (char)0x30e7, (char)0x30c3, (char)0x30fc, (char)0x30a2, (char)0x30a4, (char)0x30a6, (char)0x30a8, (char)0x30aa, (char)0x30ab, (char)0x30ad, (char)0x30af, (char)0x30b1, (char)0x30b3, (char)0x30b5, (char)0x30b7, (char)0x30b9, (char)0x30bb, (char)0x30bd, (char)0x30bf, (char)0x30c1, (char)0x30c4, (char)0x30c6, (char)0x30c8, (char)0x30ca, (char)0x30cb, (char)0x30cc, (char)0x30cd, (char)0x30ce, (char)0x30cf, (char)0x30d2, (char)0x30d5, (char)0x30d8, (char)0x30db, (char)0x30de, (char)0x30df, (char)0x30e0, (char)0x30e1, (char)0x30e2, (char)0x30e4, (char)0x30e6, (char)0x30e8, (char)0x30e9, (char)0x30ea, (char)0x30eb, (char)0x30ec, (char)0x30ed, (char)0x30ef, (char)0x30f3, (char)0x3099, (char)0x309A };
+        ///
+        /// halfwidth kana mappings: 0xFF65-0xFF9D
+        ///
+        /// note: 0xFF9C and 0xFF9D are only mapped to 0x3099 and 0x309A
+        /// as a fallback when they cannot properly combine with a preceding
+        /// character into a composed form.
+        ///
+        private static readonly char[] KANA_NORM = new char[] {
+            (char)0x30fb, (char)0x30f2, (char)0x30a1, (char)0x30a3, (char)0x30a5, (char)0x30a7, (char)0x30a9, (char)0x30e3, (char)0x30e5,
+            (char)0x30e7, (char)0x30c3, (char)0x30fc, (char)0x30a2, (char)0x30a4, (char)0x30a6, (char)0x30a8, (char)0x30aa, (char)0x30ab,
+            (char)0x30ad, (char)0x30af, (char)0x30b1, (char)0x30b3, (char)0x30b5, (char)0x30b7, (char)0x30b9, (char)0x30bb, (char)0x30bd,
+            (char)0x30bf, (char)0x30c1, (char)0x30c4, (char)0x30c6, (char)0x30c8, (char)0x30ca, (char)0x30cb, (char)0x30cc, (char)0x30cd,
+            (char)0x30ce, (char)0x30cf, (char)0x30d2, (char)0x30d5, (char)0x30d8, (char)0x30db, (char)0x30de, (char)0x30df, (char)0x30e0,
+            (char)0x30e1, (char)0x30e2, (char)0x30e4, (char)0x30e6, (char)0x30e8, (char)0x30e9, (char)0x30ea, (char)0x30eb, (char)0x30ec,
+            (char)0x30ed, (char)0x30ef, (char)0x30f3, (char)0x3099, (char)0x309A
+        };
         public CJKWidthFilter(TokenStream input)
             : base(input)
@@ -87,10 +96,20 @@ namespace Lucene.Net.Analysis.Cjk
             }
         }
-        /* kana combining diffs: 0x30A6-0x30FD */
-        private static readonly sbyte[] KANA_COMBINE_VOICED = new sbyte[] { 78, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 };
+        /// kana combining diffs: 0x30A6-0x30FD
+        private static readonly sbyte[] KANA_COMBINE_VOICED = new sbyte[] {
+            78, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
+            0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1,
+            0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+            0, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1
+        };
-        private static readonly sbyte[] KANA_COMBINE_HALF_VOICED = new sbyte[] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 2, 0, 0, 2, 0, 0, 2, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+        private static readonly sbyte[] KANA_COMBINE_HALF_VOICED = new sbyte[] {
+            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 2, 0, 0, 2,
+            0, 0, 2, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+        };
         ///
         /// returns true if we successfully combined the voice mark
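To make the width folding above concrete, a small sketch (not part of this commit) that runs a whitespace-tokenized string through CJKWidthFilter; WhitespaceTokenizer and the attribute loop are assumed from the 4.8 API.

    using System;
    using System.IO;
    using Lucene.Net.Analysis;
    using Lucene.Net.Analysis.Cjk;
    using Lucene.Net.Analysis.Core;
    using Lucene.Net.Analysis.TokenAttributes;
    using Lucene.Net.Util;

    public static class CjkWidthSketch
    {
        public static void Main()
        {
            // Fullwidth ASCII ("Ｌｕｃｅｎｅ") folds to basic Latin; halfwidth
            // katakana with a voice mark ("ｶﾞｲﾄﾞ") folds to composed kana ("ガイド").
            TokenStream ts = new WhitespaceTokenizer(LuceneVersion.LUCENE_48,
                new StringReader("Ｌｕｃｅｎｅ ｶﾞｲﾄﾞ"));
            ts = new CJKWidthFilter(ts);
            ICharTermAttribute termAtt = ts.AddAttribute<ICharTermAttribute>();
            ts.Reset();
            while (ts.IncrementToken())
            {
                Console.WriteLine(termAtt.ToString()); // "Lucene", "ガイド"
            }
            ts.End();
            ts.Dispose();
        }
    }
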
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/917b4fdf/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKWidthFilterFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKWidthFilterFactory.cs b/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKWidthFilterFactory.cs
index dfe8f2e..9c956e6 100644
--- a/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKWidthFilterFactory.cs
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKWidthFilterFactory.cs
@@ -21,8 +21,8 @@ namespace Lucene.Net.Analysis.Cjk
      */
     ///
-    /// Factory for .
-    ///
+    /// Factory for <see cref="CJKWidthFilter"/>.
+    ///
     /// <fieldType name="text_cjk" class="solr.TextField">
     ///   <analyzer>
     ///     <tokenizer class="solr.StandardTokenizerFactory"/>
@@ -30,13 +30,13 @@ namespace Lucene.Net.Analysis.Cjk
     ///     <filter class="solr.LowerCaseFilterFactory"/>
     ///     <filter class="solr.CJKBigramFilterFactory"/>
     ///   </analyzer>
-    /// </fieldType>
+    /// </fieldType>
+    ///
     ///
     public class CJKWidthFilterFactory : TokenFilterFactory, IMultiTermAwareComponent
     {
-        ///
-        /// Creates a new CJKWidthFilterFactory
+        /// Creates a new <see cref="CJKWidthFilterFactory"/>
         public CJKWidthFilterFactory(IDictionary<string, string> args)
             : base(args)
         {
             if (args.Count > 0)