Return-Path: X-Original-To: archive-asf-public-internal@cust-asf2.ponee.io Delivered-To: archive-asf-public-internal@cust-asf2.ponee.io Received: from cust-asf.ponee.io (cust-asf.ponee.io [163.172.22.183]) by cust-asf2.ponee.io (Postfix) with ESMTP id EE045200C11 for ; Sat, 4 Feb 2017 21:32:27 +0100 (CET) Received: by cust-asf.ponee.io (Postfix) id EC931160B54; Sat, 4 Feb 2017 20:32:27 +0000 (UTC) Delivered-To: archive-asf-public@cust-asf.ponee.io Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by cust-asf.ponee.io (Postfix) with SMTP id AF20D160B68 for ; Sat, 4 Feb 2017 21:32:25 +0100 (CET) Received: (qmail 98145 invoked by uid 500); 4 Feb 2017 20:32:21 -0000 Mailing-List: contact commits-help@lucenenet.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: lucene-net-dev@lucenenet.apache.org Delivered-To: mailing list commits@lucenenet.apache.org Received: (qmail 96894 invoked by uid 99); 4 Feb 2017 20:32:20 -0000 Received: from git1-us-west.apache.org (HELO git1-us-west.apache.org) (140.211.11.23) by apache.org (qpsmtpd/0.29) with ESMTP; Sat, 04 Feb 2017 20:32:20 +0000 Received: by git1-us-west.apache.org (ASF Mail Server at git1-us-west.apache.org, from userid 33) id B55F1E038D; Sat, 4 Feb 2017 20:32:20 +0000 (UTC) Content-Type: text/plain; charset="us-ascii" MIME-Version: 1.0 Content-Transfer-Encoding: 8bit From: nightowl888@apache.org To: commits@lucenenet.apache.org Date: Sat, 04 Feb 2017 20:32:58 -0000 Message-Id: <65588d4106674d8585263e948ff39ce5@git.apache.org> In-Reply-To: <4b623b494bc34c2780b01bf40ded92c7@git.apache.org> References: <4b623b494bc34c2780b01bf40ded92c7@git.apache.org> X-Mailer: ASF-Git Admin Mailer Subject: [39/39] lucenenet git commit: Lucene.Net.Analysis.Ngram - renamed NGram in Git archived-at: Sat, 04 Feb 2017 20:32:28 -0000 Lucene.Net.Analysis.Ngram - renamed NGram in Git Project: http://git-wip-us.apache.org/repos/asf/lucenenet/repo Commit: 
http://git-wip-us.apache.org/repos/asf/lucenenet/commit/ab81d913 Tree: http://git-wip-us.apache.org/repos/asf/lucenenet/tree/ab81d913 Diff: http://git-wip-us.apache.org/repos/asf/lucenenet/diff/ab81d913 Branch: refs/heads/api-work Commit: ab81d91313149500e6c88b4ceabd6ff5aa4e0d63 Parents: 3201465 Author: Shad Storhaug Authored: Sun Feb 5 03:17:39 2017 +0700 Committer: Shad Storhaug Committed: Sun Feb 5 03:29:11 2017 +0700 ---------------------------------------------------------------------- .../Analysis/NGram/EdgeNGramFilterFactory.cs | 60 +++ .../Analysis/NGram/EdgeNGramTokenFilter.cs | 245 ++++++++++++ .../Analysis/NGram/EdgeNGramTokenizer.cs | 72 ++++ .../Analysis/NGram/EdgeNGramTokenizerFactory.cs | 75 ++++ .../NGram/Lucene43EdgeNGramTokenizer.cs | 297 ++++++++++++++ .../Analysis/NGram/Lucene43NGramTokenizer.cs | 173 ++++++++ .../Analysis/NGram/NGramFilterFactory.cs | 56 +++ .../Analysis/NGram/NGramTokenFilter.cs | 252 ++++++++++++ .../Analysis/NGram/NGramTokenizer.cs | 319 +++++++++++++++ .../Analysis/NGram/NGramTokenizerFactory.cs | 70 ++++ .../Analysis/Ngram/EdgeNGramFilterFactory.cs | 60 --- .../Analysis/Ngram/EdgeNGramTokenFilter.cs | 245 ------------ .../Analysis/Ngram/EdgeNGramTokenizer.cs | 72 ---- .../Analysis/Ngram/EdgeNGramTokenizerFactory.cs | 75 ---- .../Ngram/Lucene43EdgeNGramTokenizer.cs | 297 -------------- .../Analysis/Ngram/Lucene43NGramTokenizer.cs | 173 -------- .../Analysis/Ngram/NGramFilterFactory.cs | 56 --- .../Analysis/Ngram/NGramTokenFilter.cs | 252 ------------ .../Analysis/Ngram/NGramTokenizer.cs | 319 --------------- .../Analysis/Ngram/NGramTokenizerFactory.cs | 70 ---- .../Analysis/NGram/EdgeNGramTokenFilterTest.cs | 390 +++++++++++++++++++ .../Analysis/NGram/EdgeNGramTokenizerTest.cs | 278 +++++++++++++ .../Analysis/NGram/NGramTokenFilterTest.cs | 249 ++++++++++++ .../Analysis/NGram/NGramTokenizerTest.cs | 303 ++++++++++++++ .../Analysis/NGram/TestNGramFilters.cs | 196 ++++++++++ .../Analysis/Ngram/EdgeNGramTokenFilterTest.cs | 
390 ------------------- .../Analysis/Ngram/EdgeNGramTokenizerTest.cs | 278 ------------- .../Analysis/Ngram/NGramTokenFilterTest.cs | 249 ------------ .../Analysis/Ngram/NGramTokenizerTest.cs | 303 -------------- .../Analysis/Ngram/TestNGramFilters.cs | 196 ---------- 30 files changed, 3035 insertions(+), 3035 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/lucenenet/blob/ab81d913/src/Lucene.Net.Analysis.Common/Analysis/NGram/EdgeNGramFilterFactory.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Common/Analysis/NGram/EdgeNGramFilterFactory.cs b/src/Lucene.Net.Analysis.Common/Analysis/NGram/EdgeNGramFilterFactory.cs new file mode 100644 index 0000000..70b44d3 --- /dev/null +++ b/src/Lucene.Net.Analysis.Common/Analysis/NGram/EdgeNGramFilterFactory.cs @@ -0,0 +1,60 @@ +using Lucene.Net.Analysis.Util; +using System.Collections.Generic; + +namespace Lucene.Net.Analysis.NGram +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + /// + /// Creates new instances of . 
+ /// + /// <fieldType name="text_edgngrm" class="solr.TextField" positionIncrementGap="100"> + /// <analyzer> + /// <tokenizer class="solr.WhitespaceTokenizerFactory"/> + /// <filter class="solr.EdgeNGramFilterFactory" minGramSize="1" maxGramSize="1"/> + /// </analyzer> + /// </fieldType> + /// + public class EdgeNGramFilterFactory : TokenFilterFactory + { + private readonly int maxGramSize; + private readonly int minGramSize; + private readonly string side; + + /// + /// Creates a new + public EdgeNGramFilterFactory(IDictionary args) + : base(args) + { + minGramSize = GetInt(args, "minGramSize", EdgeNGramTokenFilter.DEFAULT_MIN_GRAM_SIZE); + maxGramSize = GetInt(args, "maxGramSize", EdgeNGramTokenFilter.DEFAULT_MAX_GRAM_SIZE); + side = Get(args, "side", EdgeNGramTokenFilter.Side.FRONT.ToString()); + if (args.Count > 0) + { + throw new System.ArgumentException("Unknown parameters: " + args); + } + } + + public override TokenStream Create(TokenStream input) + { +#pragma warning disable 612, 618 + return new EdgeNGramTokenFilter(m_luceneMatchVersion, input, side, minGramSize, maxGramSize); +#pragma warning restore 612, 618 + } + } +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/ab81d913/src/Lucene.Net.Analysis.Common/Analysis/NGram/EdgeNGramTokenFilter.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Common/Analysis/NGram/EdgeNGramTokenFilter.cs b/src/Lucene.Net.Analysis.Common/Analysis/NGram/EdgeNGramTokenFilter.cs new file mode 100644 index 0000000..8cf8172 --- /dev/null +++ b/src/Lucene.Net.Analysis.Common/Analysis/NGram/EdgeNGramTokenFilter.cs @@ -0,0 +1,245 @@ +using Lucene.Net.Analysis.TokenAttributes; +using Lucene.Net.Analysis.Util; +using Lucene.Net.Util; +using System; + +namespace Lucene.Net.Analysis.NGram +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + /// + /// Tokenizes the given token into n-grams of given size(s). + /// + /// This create n-grams from the beginning edge or ending edge of a input token. + /// + /// As of Lucene 4.4, this filter does not support + /// (you can use up-front and + /// afterward to get the same behavior), handles supplementary characters + /// correctly and does not update offsets anymore. 
+ /// + /// + public sealed class EdgeNGramTokenFilter : TokenFilter + { + public const Side DEFAULT_SIDE = Side.FRONT; + public const int DEFAULT_MAX_GRAM_SIZE = 1; + public const int DEFAULT_MIN_GRAM_SIZE = 1; + + /// + /// Specifies which side of the input the n-gram should be generated from + public enum Side + { + /// + /// Get the n-gram from the front of the input + FRONT, + + /// + /// Get the n-gram from the end of the input + [System.Obsolete] + BACK, + } + + /// + /// Get the appropriate from a string + /// + public static Side GetSide(string sideName) + { + Side result; + if (!Enum.TryParse(sideName, true, out result)) + { + result = Side.FRONT; + } + return result; + } + + private readonly LuceneVersion version; + private readonly CharacterUtils charUtils; + private readonly int minGram; + private readonly int maxGram; + private Side side; + private char[] curTermBuffer; + private int curTermLength; + private int curCodePointCount; + private int curGramSize; + private int tokStart; + private int tokEnd; // only used if the length changed before this filter + private bool updateOffsets; // never if the length changed before this filter + private int savePosIncr; + private int savePosLen; + + private readonly ICharTermAttribute termAtt; + private readonly IOffsetAttribute offsetAtt; + private readonly IPositionIncrementAttribute posIncrAtt; + private readonly IPositionLengthAttribute posLenAtt; + + /// + /// Creates that can generate n-grams in the sizes of the given range + /// + /// the Lucene match version - See + /// holding the input to be tokenized + /// the from which to chop off an n-gram + /// the smallest n-gram to generate + /// the largest n-gram to generate + [Obsolete] + public EdgeNGramTokenFilter(LuceneVersion version, TokenStream input, Side side, int minGram, int maxGram) + : base(input) + { + + //if (version == null) + //{ + // throw new System.ArgumentException("version must not be null"); + //} + + if 
(version.OnOrAfter(LuceneVersion.LUCENE_44) && side == Side.BACK) + { + throw new System.ArgumentException("Side.BACK is not supported anymore as of Lucene 4.4, use ReverseStringFilter up-front and afterward"); + } + + if (!Enum.IsDefined(typeof(Side), side)) + { + throw new System.ArgumentException("sideLabel must be either front or back"); + } + + if (minGram < 1) + { + throw new System.ArgumentException("minGram must be greater than zero"); + } + + if (minGram > maxGram) + { + throw new System.ArgumentException("minGram must not be greater than maxGram"); + } + + this.version = version; + this.charUtils = version.OnOrAfter(LuceneVersion.LUCENE_44) ? CharacterUtils.GetInstance(version) : CharacterUtils.Java4Instance; + this.minGram = minGram; + this.maxGram = maxGram; + this.side = side; + + this.termAtt = AddAttribute(); + this.offsetAtt = AddAttribute(); + this.posIncrAtt = AddAttribute(); + this.posLenAtt = AddAttribute(); + } + + /// + /// Creates that can generate n-grams in the sizes of the given range + /// + /// the Lucene match version - See + /// holding the input to be tokenized + /// the name of the from which to chop off an n-gram + /// the smallest n-gram to generate + /// the largest n-gram to generate + [Obsolete] + public EdgeNGramTokenFilter(LuceneVersion version, TokenStream input, string sideLabel, int minGram, int maxGram) + : this(version, input, GetSide(sideLabel), minGram, maxGram) + { + } + + /// + /// Creates that can generate n-grams in the sizes of the given range + /// + /// the Lucene match version - See + /// holding the input to be tokenized + /// the smallest n-gram to generate + /// the largest n-gram to generate + public EdgeNGramTokenFilter(LuceneVersion version, TokenStream input, int minGram, int maxGram) +#pragma warning disable 612, 618 + : this(version, input, Side.FRONT, minGram, maxGram) +#pragma warning restore 612, 618 + { + } + + public override sealed bool IncrementToken() + { + while (true) + { + if (curTermBuffer 
== null) + { + if (!m_input.IncrementToken()) + { + return false; + } + else + { + curTermBuffer = (char[])termAtt.Buffer.Clone(); + curTermLength = termAtt.Length; + curCodePointCount = charUtils.CodePointCount(termAtt.ToString()); + curGramSize = minGram; + tokStart = offsetAtt.StartOffset; + tokEnd = offsetAtt.EndOffset; +#pragma warning disable 612, 618 + if (version.OnOrAfter(LuceneVersion.LUCENE_44)) +#pragma warning restore 612, 618 + { + // Never update offsets + updateOffsets = false; + } + else + { + // if length by start + end offsets doesn't match the term text then assume + // this is a synonym and don't adjust the offsets. + updateOffsets = (tokStart + curTermLength) == tokEnd; + } + savePosIncr += posIncrAtt.PositionIncrement; + savePosLen = posLenAtt.PositionLength; + } + } + if (curGramSize <= maxGram) // if we have hit the end of our n-gram size range, quit + { + if (curGramSize <= curCodePointCount) // if the remaining input is too short, we can't generate any n-grams + { + // grab gramSize chars from front or back + int start = side == Side.FRONT ? 
0 : charUtils.OffsetByCodePoints(curTermBuffer, 0, curTermLength, curTermLength, -curGramSize); + int end = charUtils.OffsetByCodePoints(curTermBuffer, 0, curTermLength, start, curGramSize); + ClearAttributes(); + if (updateOffsets) + { + offsetAtt.SetOffset(tokStart + start, tokStart + end); + } + else + { + offsetAtt.SetOffset(tokStart, tokEnd); + } + // first ngram gets increment, others don't + if (curGramSize == minGram) + { + posIncrAtt.PositionIncrement = savePosIncr; + savePosIncr = 0; + } + else + { + posIncrAtt.PositionIncrement = 0; + } + posLenAtt.PositionLength = savePosLen; + termAtt.CopyBuffer(curTermBuffer, start, end - start); + curGramSize++; + return true; + } + } + curTermBuffer = null; + } + } + + public override void Reset() + { + base.Reset(); + curTermBuffer = null; + savePosIncr = 0; + } + } +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/ab81d913/src/Lucene.Net.Analysis.Common/Analysis/NGram/EdgeNGramTokenizer.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Common/Analysis/NGram/EdgeNGramTokenizer.cs b/src/Lucene.Net.Analysis.Common/Analysis/NGram/EdgeNGramTokenizer.cs new file mode 100644 index 0000000..ed2cb3d --- /dev/null +++ b/src/Lucene.Net.Analysis.Common/Analysis/NGram/EdgeNGramTokenizer.cs @@ -0,0 +1,72 @@ +using Lucene.Net.Util; +using System.IO; + +namespace Lucene.Net.Analysis.NGram +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + /// + /// Tokenizes the input from an edge into n-grams of given size(s). + /// + /// This create n-grams from the beginning edge or ending edge of a input token. + /// + /// As of Lucene 4.4, this tokenizer + /// + /// can handle maxGram larger than 1024 chars, but beware that this will result in increased memory usage + /// doesn't trim the input, + /// sets position increments equal to 1 instead of 1 for the first token and 0 for all other ones + /// doesn't support backward n-grams anymore. + /// supports pre-tokenization, + /// correctly handles supplementary characters. + /// + /// + /// Although highly discouraged, it is still possible + /// to use the old behavior through . 
+ /// + /// + public class EdgeNGramTokenizer : NGramTokenizer + { + public const int DEFAULT_MAX_GRAM_SIZE = 1; + public const int DEFAULT_MIN_GRAM_SIZE = 1; + + /// + /// Creates that can generate n-grams in the sizes of the given range + /// + /// the Lucene match version - See + /// holding the input to be tokenized + /// the smallest n-gram to generate + /// the largest n-gram to generate + public EdgeNGramTokenizer(LuceneVersion version, TextReader input, int minGram, int maxGram) + : base(version, input, minGram, maxGram, true) + { + } + + /// + /// Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range + /// + /// the Lucene match version - See + /// to use + /// holding the input to be tokenized + /// the smallest n-gram to generate + /// the largest n-gram to generate + public EdgeNGramTokenizer(LuceneVersion version, AttributeSource.AttributeFactory factory, TextReader input, int minGram, int maxGram) + : base(version, factory, input, minGram, maxGram, true) + { + } + } +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/ab81d913/src/Lucene.Net.Analysis.Common/Analysis/NGram/EdgeNGramTokenizerFactory.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Common/Analysis/NGram/EdgeNGramTokenizerFactory.cs b/src/Lucene.Net.Analysis.Common/Analysis/NGram/EdgeNGramTokenizerFactory.cs new file mode 100644 index 0000000..00325f5 --- /dev/null +++ b/src/Lucene.Net.Analysis.Common/Analysis/NGram/EdgeNGramTokenizerFactory.cs @@ -0,0 +1,75 @@ +using Lucene.Net.Analysis.Util; +using Lucene.Net.Util; +using System; +using System.Collections.Generic; +using System.IO; + +namespace Lucene.Net.Analysis.NGram +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + /// + /// Creates new instances of . + /// + /// <fieldType name="text_edgngrm" class="solr.TextField" positionIncrementGap="100"> + /// <analyzer> + /// <tokenizer class="solr.EdgeNGramTokenizerFactory" minGramSize="1" maxGramSize="1"/> + /// </analyzer> + /// </fieldType> + /// + public class EdgeNGramTokenizerFactory : TokenizerFactory + { + private readonly int maxGramSize; + private readonly int minGramSize; + private readonly string side; + + /// + /// Creates a new + public EdgeNGramTokenizerFactory(IDictionary args) : base(args) + { + minGramSize = GetInt(args, "minGramSize", EdgeNGramTokenizer.DEFAULT_MIN_GRAM_SIZE); + maxGramSize = GetInt(args, "maxGramSize", EdgeNGramTokenizer.DEFAULT_MAX_GRAM_SIZE); + side = Get(args, "side", EdgeNGramTokenFilter.Side.FRONT.ToString()); + if (args.Count > 0) + { + throw new System.ArgumentException("Unknown parameters: " + args); + } + } + + public override Tokenizer Create(AttributeSource.AttributeFactory factory, TextReader input) + { +#pragma warning disable 612, 618 + if (m_luceneMatchVersion.OnOrAfter(LuceneVersion.LUCENE_44)) +#pragma warning restore 612, 618 + { + EdgeNGramTokenFilter.Side sideEnum; + if (!Enum.TryParse(this.side, true, out sideEnum)) + { + throw new System.ArgumentException(typeof(EdgeNGramTokenizer).Name + " does not support backward n-grams as of Lucene 4.4"); + } + return new 
EdgeNGramTokenizer(m_luceneMatchVersion, input, minGramSize, maxGramSize); + } + else + { +#pragma warning disable 612, 618 + return new Lucene43EdgeNGramTokenizer(m_luceneMatchVersion, input, side, minGramSize, maxGramSize); +#pragma warning restore 612, 618 + } + } + } +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/ab81d913/src/Lucene.Net.Analysis.Common/Analysis/NGram/Lucene43EdgeNGramTokenizer.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Common/Analysis/NGram/Lucene43EdgeNGramTokenizer.cs b/src/Lucene.Net.Analysis.Common/Analysis/NGram/Lucene43EdgeNGramTokenizer.cs new file mode 100644 index 0000000..4dadbed --- /dev/null +++ b/src/Lucene.Net.Analysis.Common/Analysis/NGram/Lucene43EdgeNGramTokenizer.cs @@ -0,0 +1,297 @@ +using Lucene.Net.Analysis.TokenAttributes; +using Lucene.Net.Util; +using System; +using System.IO; + +namespace Lucene.Net.Analysis.NGram +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + /// + /// Old version of which doesn't handle correctly + /// supplementary characters. 
+ /// + [Obsolete] + public sealed class Lucene43EdgeNGramTokenizer : Tokenizer + { + public const Side DEFAULT_SIDE = Side.FRONT; + public const int DEFAULT_MAX_GRAM_SIZE = 1; + public const int DEFAULT_MIN_GRAM_SIZE = 1; + + private ICharTermAttribute termAtt; + private IOffsetAttribute offsetAtt; + private IPositionIncrementAttribute posIncrAtt; + + /// + /// Specifies which side of the input the n-gram should be generated from + public enum Side + { + /// + /// Get the n-gram from the front of the input + FRONT, + + /// + /// Get the n-gram from the end of the input + BACK, + } + + // Get the appropriate Side from a string + public static Side GetSide(string sideName) + { + Side result; + if (!Enum.TryParse(sideName, true, out result)) + { + result = Side.FRONT; + } + return result; + } + + private int minGram; + private int maxGram; + private int gramSize; + private Side side; + private bool started; + private int inLen; // length of the input AFTER trim() + private int charsRead; // length of the input + private string inStr; + + + /// + /// Creates that can generate n-grams in the sizes of the given range + /// + /// the Lucene match version - See + /// holding the input to be tokenized + /// the from which to chop off an n-gram + /// the smallest n-gram to generate + /// the largest n-gram to generate + [Obsolete] + public Lucene43EdgeNGramTokenizer(LuceneVersion version, TextReader input, Side side, int minGram, int maxGram) + : base(input) + { + Init(version, side, minGram, maxGram); + } + + /// + /// Creates that can generate n-grams in the sizes of the given range + /// + /// the Lucene match version - See + /// to use + /// holding the input to be tokenized + /// the from which to chop off an n-gram + /// the smallest n-gram to generate + /// the largest n-gram to generate + [Obsolete] + public Lucene43EdgeNGramTokenizer(LuceneVersion version, AttributeFactory factory, TextReader input, Side side, int minGram, int maxGram) + : base(factory, input) + { 
+ Init(version, side, minGram, maxGram); + } + + /// + /// Creates that can generate n-grams in the sizes of the given range + /// + /// the Lucene match version - See + /// holding the input to be tokenized + /// the name of the from which to chop off an n-gram + /// the smallest n-gram to generate + /// the largest n-gram to generate + [Obsolete] + public Lucene43EdgeNGramTokenizer(LuceneVersion version, TextReader input, string sideLabel, int minGram, int maxGram) + : this(version, input, GetSide(sideLabel), minGram, maxGram) + { + } + + /// + /// Creates that can generate n-grams in the sizes of the given range + /// + /// the Lucene match version - See + /// to use + /// holding the input to be tokenized + /// the name of the from which to chop off an n-gram + /// the smallest n-gram to generate + /// the largest n-gram to generate + [Obsolete] + public Lucene43EdgeNGramTokenizer(LuceneVersion version, AttributeFactory factory, TextReader input, string sideLabel, int minGram, int maxGram) + : this(version, factory, input, GetSide(sideLabel), minGram, maxGram) + { + } + + /// + /// Creates that can generate n-grams in the sizes of the given range + /// + /// the Lucene match version - See + /// holding the input to be tokenized + /// the smallest n-gram to generate + /// the largest n-gram to generate + public Lucene43EdgeNGramTokenizer(LuceneVersion version, TextReader input, int minGram, int maxGram) + : this(version, input, Side.FRONT, minGram, maxGram) + { + } + + /// + /// Creates that can generate n-grams in the sizes of the given range + /// + /// the Lucene match version - See + /// to use + /// holding the input to be tokenized + /// the smallest n-gram to generate + /// the largest n-gram to generate + public Lucene43EdgeNGramTokenizer(LuceneVersion version, AttributeFactory factory, TextReader input, int minGram, int maxGram) + : this(version, factory, input, Side.FRONT, minGram, maxGram) + { + } + + private void Init(LuceneVersion version, Side 
side, int minGram, int maxGram) + { + //if (version == null) + //{ + // throw new System.ArgumentException("version must not be null"); + //} + + if (!Enum.IsDefined(typeof(Side), side)) + { + throw new System.ArgumentException("sideLabel must be either front or back"); + } + + if (minGram < 1) + { + throw new System.ArgumentException("minGram must be greater than zero"); + } + + if (minGram > maxGram) + { + throw new System.ArgumentException("minGram must not be greater than maxGram"); + } + + if (version.OnOrAfter(LuceneVersion.LUCENE_44)) + { + if (side == Side.BACK) + { + throw new System.ArgumentException("Side.BACK is not supported anymore as of Lucene 4.4"); + } + } + else + { + maxGram = Math.Min(maxGram, 1024); + } + + this.minGram = minGram; + this.maxGram = maxGram; + this.side = side; + this.termAtt = AddAttribute(); + this.offsetAtt = AddAttribute(); + this.posIncrAtt = AddAttribute(); + } + + /// + /// Returns the next token in the stream, or null at EOS. + public override bool IncrementToken() + { + ClearAttributes(); + // if we are just starting, read the whole input + if (!started) + { + started = true; + gramSize = minGram; + int limit = side == Side.FRONT ? 
maxGram : 1024; + char[] chars = new char[Math.Min(1024, limit)]; + charsRead = 0; + // TODO: refactor to a shared readFully somewhere: + bool exhausted = false; + while (charsRead < limit) + { + int inc = m_input.Read(chars, charsRead, chars.Length - charsRead); + if (inc <= 0) + { + exhausted = true; + break; + } + charsRead += inc; + if (charsRead == chars.Length && charsRead < limit) + { + chars = ArrayUtil.Grow(chars); + } + } + + inStr = new string(chars, 0, charsRead); + inStr = inStr.Trim(); + + if (!exhausted) + { + // Read extra throwaway chars so that on end() we + // report the correct offset: + var throwaway = new char[1024]; + while (true) + { + int inc = m_input.Read(throwaway, 0, throwaway.Length); + if (inc <= 0) + { + break; + } + charsRead += inc; + } + } + + inLen = inStr.Length; + if (inLen == 0) + { + return false; + } + posIncrAtt.PositionIncrement = 1; + } + else + { + posIncrAtt.PositionIncrement = 0; + } + + // if the remaining input is too short, we can't generate any n-grams + if (gramSize > inLen) + { + return false; + } + + // if we have hit the end of our n-gram size range, quit + if (gramSize > maxGram || gramSize > inLen) + { + return false; + } + + // grab gramSize chars from front or back + int start = side == Side.FRONT ? 
0 : inLen - gramSize; + int end = start + gramSize; + termAtt.SetEmpty().Append(inStr, start, end); + offsetAtt.SetOffset(CorrectOffset(start), CorrectOffset(end)); + gramSize++; + return true; + } + + public override void End() + { + base.End(); + // set final offset + int finalOffset = CorrectOffset(charsRead); + this.offsetAtt.SetOffset(finalOffset, finalOffset); + } + + public override void Reset() + { + base.Reset(); + started = false; + } + } +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/ab81d913/src/Lucene.Net.Analysis.Common/Analysis/NGram/Lucene43NGramTokenizer.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Common/Analysis/NGram/Lucene43NGramTokenizer.cs b/src/Lucene.Net.Analysis.Common/Analysis/NGram/Lucene43NGramTokenizer.cs new file mode 100644 index 0000000..b806345 --- /dev/null +++ b/src/Lucene.Net.Analysis.Common/Analysis/NGram/Lucene43NGramTokenizer.cs @@ -0,0 +1,173 @@ +using Lucene.Net.Analysis.TokenAttributes; +using System; +using System.IO; + +namespace Lucene.Net.Analysis.NGram +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + /// + /// Old broken version of . 
/// <summary>
/// Old broken version of <see cref="NGramTokenizer"/>.
/// </summary>
[Obsolete]
public sealed class Lucene43NGramTokenizer : Tokenizer
{
    public const int DEFAULT_MIN_NGRAM_SIZE = 1;
    public const int DEFAULT_MAX_NGRAM_SIZE = 2;

    private int minGram, maxGram;
    private int gramSize;
    private int pos;
    private int inLen;     // length of the input AFTER trim()
    private int charsRead; // length of the input
    private string inStr;
    private bool started;

    private ICharTermAttribute termAtt;
    private IOffsetAttribute offsetAtt;

    /// <summary>
    /// Creates <see cref="Lucene43NGramTokenizer"/> with given min and max n-grams.
    /// </summary>
    /// <param name="input"><see cref="TextReader"/> holding the input to be tokenized</param>
    /// <param name="minGram">the smallest n-gram to generate</param>
    /// <param name="maxGram">the largest n-gram to generate</param>
    public Lucene43NGramTokenizer(TextReader input, int minGram, int maxGram)
        : base(input)
    {
        Init(minGram, maxGram);
    }

    /// <summary>
    /// Creates <see cref="Lucene43NGramTokenizer"/> with given min and max n-grams.
    /// </summary>
    /// <param name="factory"><see cref="AttributeFactory"/> to use</param>
    /// <param name="input"><see cref="TextReader"/> holding the input to be tokenized</param>
    /// <param name="minGram">the smallest n-gram to generate</param>
    /// <param name="maxGram">the largest n-gram to generate</param>
    public Lucene43NGramTokenizer(AttributeFactory factory, TextReader input, int minGram, int maxGram)
        : base(factory, input)
    {
        Init(minGram, maxGram);
    }

    /// <summary>
    /// Creates <see cref="Lucene43NGramTokenizer"/> with default min and max n-grams.
    /// </summary>
    /// <param name="input"><see cref="TextReader"/> holding the input to be tokenized</param>
    public Lucene43NGramTokenizer(TextReader input)
        : this(input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE)
    {
    }

    private void Init(int minGram, int maxGram)
    {
        if (minGram < 1)
        {
            throw new System.ArgumentException("minGram must be greater than zero");
        }
        if (minGram > maxGram)
        {
            throw new System.ArgumentException("minGram must not be greater than maxGram");
        }
        this.minGram = minGram;
        this.maxGram = maxGram;
        termAtt = AddAttribute<ICharTermAttribute>();
        offsetAtt = AddAttribute<IOffsetAttribute>();
    }

    /// <summary>
    /// Returns the next token in the stream, or false at EOS.
    /// </summary>
    public override bool IncrementToken()
    {
        ClearAttributes();
        if (!started)
        {
            started = true;
            gramSize = minGram;
            char[] chars = new char[1024];
            charsRead = 0;
            // TODO: refactor to a shared readFully somewhere:
            while (charsRead < chars.Length)
            {
                int inc = m_input.Read(chars, charsRead, chars.Length - charsRead);
                // FIX: unlike Java's Reader, .NET's TextReader.Read returns 0
                // (never -1) at end of stream; checking for -1 looped forever at EOF.
                if (inc <= 0)
                {
                    break;
                }
                charsRead += inc;
            }
            inStr = (new string(chars, 0, charsRead)).Trim(); // remove any trailing empty strings

            if (charsRead == chars.Length)
            {
                // Read extra throwaway chars so that on End() we
                // report the correct offset:
                var throwaway = new char[1024];
                while (true)
                {
                    int inc = m_input.Read(throwaway, 0, throwaway.Length);
                    if (inc <= 0) // 0 means end of stream in .NET (see note above)
                    {
                        break;
                    }
                    charsRead += inc;
                }
            }

            inLen = inStr.Length;
            if (inLen == 0)
            {
                return false;
            }
        }

        if (pos + gramSize > inLen) // if we hit the end of the string
        {
            pos = 0;                // reset to beginning of string
            gramSize++;             // increase n-gram size
            if (gramSize > maxGram) // we are done
            {
                return false;
            }
            if (pos + gramSize > inLen)
            {
                return false;
            }
        }

        int oldPos = pos;
        pos++;
        termAtt.SetEmpty().Append(inStr, oldPos, oldPos + gramSize);
        offsetAtt.SetOffset(CorrectOffset(oldPos), CorrectOffset(oldPos + gramSize));
        return true;
    }

    public override void End()
    {
        base.End();
        // set final offset
        int finalOffset = CorrectOffset(charsRead);
        this.offsetAtt.SetOffset(finalOffset, finalOffset);
    }

    public override void Reset()
    {
        base.Reset();
        started = false;
        pos = 0;
    }
}
0000000..ca1d0bc --- /dev/null +++ b/src/Lucene.Net.Analysis.Common/Analysis/NGram/NGramFilterFactory.cs @@ -0,0 +1,56 @@ +using Lucene.Net.Analysis.Util; +using System.Collections.Generic; + +namespace Lucene.Net.Analysis.NGram +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + /// + /// Factory for . 
/// <summary>
/// Factory for <see cref="NGramTokenFilter"/>.
/// <code>
/// &lt;fieldType name="text_ngrm" class="solr.TextField" positionIncrementGap="100"&gt;
///   &lt;analyzer&gt;
///     &lt;tokenizer class="solr.WhitespaceTokenizerFactory"/&gt;
///     &lt;filter class="solr.NGramFilterFactory" minGramSize="1" maxGramSize="2"/&gt;
///   &lt;/analyzer&gt;
/// &lt;/fieldType&gt;
/// </code>
/// </summary>
public class NGramFilterFactory : TokenFilterFactory
{
    private readonly int maxGramSize;
    private readonly int minGramSize;

    /// <summary>
    /// Creates a new <see cref="NGramFilterFactory"/>.
    /// </summary>
    /// <param name="args">factory arguments; recognized keys are consumed, any
    /// leftover key is reported as unknown</param>
    public NGramFilterFactory(IDictionary<string, string> args)
        : base(args)
    {
        minGramSize = GetInt(args, "minGramSize", NGramTokenFilter.DEFAULT_MIN_NGRAM_SIZE);
        maxGramSize = GetInt(args, "maxGramSize", NGramTokenFilter.DEFAULT_MAX_NGRAM_SIZE);
        if (args.Count > 0)
        {
            throw new System.ArgumentException("Unknown parameters: " + args);
        }
    }

    public override TokenStream Create(TokenStream input)
    {
        return new NGramTokenFilter(m_luceneMatchVersion, input, minGramSize, maxGramSize);
    }
}
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + /// + /// Tokenizes the input into n-grams of the given size(s). + /// You must specify the required compatibility when + /// creating a . As of Lucene 4.4, this token filters: + /// + /// handles supplementary characters correctly, + /// emits all n-grams for the same token at the same position, + /// does not modify offsets, + /// sorts n-grams by their offset in the original token first, then + /// increasing length (meaning that "abc" will give "a", "ab", "abc", "b", "bc", + /// "c"). + /// + /// + /// You can make this filter use the old behavior by providing a version < + /// in the constructor but this is not recommended as + /// it will lead to broken s that will cause highlighting + /// bugs. + /// + /// If you were using this to perform partial highlighting, + /// this won't work anymore since this filter doesn't update offsets. You should + /// modify your analysis chain to use , and potentially + /// override to perform pre-tokenization. 
/// <summary>
/// Tokenizes the input into n-grams of the given size(s).
/// <para/>
/// You must specify the required <see cref="LuceneVersion"/> compatibility when
/// creating a <see cref="NGramTokenFilter"/>. As of Lucene 4.4, this token filter:
/// <list type="bullet">
///     <item><description>handles supplementary characters correctly,</description></item>
///     <item><description>emits all n-grams for the same token at the same position,</description></item>
///     <item><description>does not modify offsets,</description></item>
///     <item><description>sorts n-grams by their offset in the original token first, then
///         increasing length (meaning that "abc" will give "a", "ab", "abc", "b", "bc",
///         "c").</description></item>
/// </list>
/// <para>
/// You can make this filter use the old behavior by providing a version &lt;
/// <see cref="LuceneVersion.LUCENE_44"/> in the constructor but this is not recommended as
/// it will lead to broken <see cref="TokenStream"/>s that will cause highlighting bugs.
/// </para>
/// <para>
/// If you were using this <see cref="TokenFilter"/> to perform partial highlighting,
/// this won't work anymore since this filter doesn't update offsets. You should
/// modify your analysis chain to use <see cref="NGramTokenizer"/>, and potentially
/// override <see cref="NGramTokenizer.IsTokenChar(int)"/> to perform pre-tokenization.
/// </para>
/// </summary>
public sealed class NGramTokenFilter : TokenFilter
{
    public const int DEFAULT_MIN_NGRAM_SIZE = 1;
    public const int DEFAULT_MAX_NGRAM_SIZE = 2;

    private readonly int minGram, maxGram;

    private char[] curTermBuffer;
    private int curTermLength;
    private int curCodePointCount;
    private int curGramSize;
    private int curPos;
    private int curPosInc, curPosLen;
    private int tokStart;
    private int tokEnd;
    private bool hasIllegalOffsets; // only if the length changed before this filter

    private readonly LuceneVersion version;
    private readonly CharacterUtils charUtils;
    private readonly ICharTermAttribute termAtt;
    private readonly IPositionIncrementAttribute posIncAtt;
    private readonly IPositionLengthAttribute posLenAtt;
    private readonly IOffsetAttribute offsetAtt;

    /// <summary>
    /// Creates <see cref="NGramTokenFilter"/> with given min and max n-grams.
    /// </summary>
    /// <param name="version">Lucene version to enable correct position increments.
    /// See <see cref="NGramTokenFilter"/> for details.</param>
    /// <param name="input"><see cref="TokenStream"/> holding the input to be tokenized</param>
    /// <param name="minGram">the smallest n-gram to generate</param>
    /// <param name="maxGram">the largest n-gram to generate</param>
    public NGramTokenFilter(LuceneVersion version, TokenStream input, int minGram, int maxGram)
        : base(new CodepointCountFilter(version, input, minGram, int.MaxValue))
    {
        this.version = version;
        this.charUtils = version.OnOrAfter(
#pragma warning disable 612, 618
            LuceneVersion.LUCENE_44) ?
#pragma warning restore 612, 618
            CharacterUtils.GetInstance(version) : CharacterUtils.Java4Instance;
        if (minGram < 1)
        {
            throw new System.ArgumentException("minGram must be greater than zero");
        }
        if (minGram > maxGram)
        {
            throw new System.ArgumentException("minGram must not be greater than maxGram");
        }
        this.minGram = minGram;
        this.maxGram = maxGram;
#pragma warning disable 612, 618
        if (version.OnOrAfter(LuceneVersion.LUCENE_44))
#pragma warning restore 612, 618
        {
            posIncAtt = AddAttribute<IPositionIncrementAttribute>();
            posLenAtt = AddAttribute<IPositionLengthAttribute>();
        }
        else
        {
            // Pre-4.4 behavior: position attributes are no-ops so position
            // increments/lengths are not tracked.
            posIncAtt = new PositionIncrementAttributeAnonymousInnerClassHelper(this);
            posLenAtt = new PositionLengthAttributeAnonymousInnerClassHelper(this);
        }
        termAtt = AddAttribute<ICharTermAttribute>();
        offsetAtt = AddAttribute<IOffsetAttribute>();
    }

    // No-op position increment attribute used for pre-4.4 compatibility.
    private class PositionIncrementAttributeAnonymousInnerClassHelper : PositionIncrementAttribute
    {
        private readonly NGramTokenFilter outerInstance;

        public PositionIncrementAttributeAnonymousInnerClassHelper(NGramTokenFilter outerInstance)
        {
            this.outerInstance = outerInstance;
        }

        public override int PositionIncrement
        {
            set
            {
            }
            get
            {
                return 0;
            }
        }
    }

    // No-op position length attribute used for pre-4.4 compatibility.
    private class PositionLengthAttributeAnonymousInnerClassHelper : PositionLengthAttribute
    {
        private readonly NGramTokenFilter outerInstance;

        public PositionLengthAttributeAnonymousInnerClassHelper(NGramTokenFilter outerInstance)
        {
            this.outerInstance = outerInstance;
        }

        public override int PositionLength
        {
            set
            {
            }
            get
            {
                return 0;
            }
        }
    }

    /// <summary>
    /// Creates <see cref="NGramTokenFilter"/> with default min and max n-grams.
    /// </summary>
    /// <param name="version">Lucene version to enable correct position increments.
    /// See <see cref="NGramTokenFilter"/> for details.</param>
    /// <param name="input"><see cref="TokenStream"/> holding the input to be tokenized</param>
    public NGramTokenFilter(LuceneVersion version, TokenStream input)
        : this(version, input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE)
    {
    }

    /// <summary>
    /// Returns the next token in the stream, or false at EOS.
    /// </summary>
    public override sealed bool IncrementToken()
    {
        while (true)
        {
            if (curTermBuffer == null)
            {
                if (!m_input.IncrementToken())
                {
                    return false;
                }
                else
                {
                    curTermBuffer = (char[])termAtt.Buffer.Clone();
                    curTermLength = termAtt.Length;
                    curCodePointCount = charUtils.CodePointCount(termAtt.ToString());
                    curGramSize = minGram;
                    curPos = 0;
                    curPosInc = posIncAtt.PositionIncrement;
                    curPosLen = posLenAtt.PositionLength;
                    tokStart = offsetAtt.StartOffset;
                    tokEnd = offsetAtt.EndOffset;
                    // if length by start + end offsets doesn't match the term text then assume
                    // this is a synonym and don't adjust the offsets.
                    hasIllegalOffsets = (tokStart + curTermLength) != tokEnd;
                }
            }
#pragma warning disable 612, 618
            if (version.OnOrAfter(LuceneVersion.LUCENE_44))
#pragma warning restore 612, 618
            {
                if (curGramSize > maxGram || (curPos + curGramSize) > curCodePointCount)
                {
                    ++curPos;
                    curGramSize = minGram;
                }
                if ((curPos + curGramSize) <= curCodePointCount)
                {
                    ClearAttributes();
                    int start = charUtils.OffsetByCodePoints(curTermBuffer, 0, curTermLength, 0, curPos);
                    int end = charUtils.OffsetByCodePoints(curTermBuffer, 0, curTermLength, start, curGramSize);
                    termAtt.CopyBuffer(curTermBuffer, start, end - start);
                    posIncAtt.PositionIncrement = curPosInc;
                    curPosInc = 0;
                    posLenAtt.PositionLength = curPosLen;
                    offsetAtt.SetOffset(tokStart, tokEnd);
                    curGramSize++;
                    return true;
                }
            }
            else
            {
                // Old (pre-4.4) order: emit by increasing gram size, then position.
                while (curGramSize <= maxGram)
                {
                    while (curPos + curGramSize <= curTermLength) // while there is input
                    {
                        ClearAttributes();
                        termAtt.CopyBuffer(curTermBuffer, curPos, curGramSize);
                        if (hasIllegalOffsets)
                        {
                            offsetAtt.SetOffset(tokStart, tokEnd);
                        }
                        else
                        {
                            offsetAtt.SetOffset(tokStart + curPos, tokStart + curPos + curGramSize);
                        }
                        curPos++;
                        return true;
                    }
                    curGramSize++; // increase n-gram size
                    curPos = 0;
                }
            }
            curTermBuffer = null;
        }
    }

    public override void Reset()
    {
        base.Reset();
        curTermBuffer = null;
    }
}
+ /// + /// For example, "abcde" would be tokenized as (minGram=2, maxGram=3): + /// + /// + /// Term + /// Position increment + /// Position length + /// Offsets + /// + /// + /// ab + /// 1 + /// 1 + /// [0,2[ + /// + /// + /// abc + /// 1 + /// 1 + /// [0,3[ + /// + /// + /// bc + /// 1 + /// 1 + /// [1,3[ + /// + /// + /// bcd + /// 1 + /// 1 + /// [1,4[ + /// + /// + /// cd + /// 1 + /// 1 + /// [2,4[ + /// + /// + /// cde + /// 1 + /// 1 + /// [2,5[ + /// + /// + /// de + /// 1 + /// 1 + /// [3,5[ + /// + /// + /// + /// This tokenizer changed a lot in Lucene 4.4 in order to: + /// + /// tokenize in a streaming fashion to support streams which are larger + /// than 1024 chars (limit of the previous version), + /// count grams based on unicode code points instead of java chars (and + /// never split in the middle of surrogate pairs), + /// give the ability to pre-tokenize the stream () + /// before computing n-grams. + /// + /// + /// Additionally, this class doesn't trim trailing whitespaces and emits + /// tokens in a different order, tokens are now emitted by increasing start + /// offsets while they used to be emitted by increasing lengths (which prevented + /// from supporting large input streams). + /// + /// Although highly discouraged, it is still possible + /// to use the old behavior through . 
/// <summary>
/// Tokenizes the input into n-grams of the given size(s).
/// On the contrary to <see cref="NGramTokenFilter"/>, this class sets offsets so
/// that characters between startOffset and endOffset in the original stream are
/// the same as the term chars.
/// <para>
/// For example, "abcde" would be tokenized as (minGram=2, maxGram=3):
/// ab, abc, bc, bcd, cd, cde, de — each with position increment 1 and position
/// length 1, offsets [0,2[, [0,3[, [1,3[, [1,4[, [2,4[, [2,5[, [3,5[.
/// </para>
/// <para>
/// This tokenizer changed a lot in Lucene 4.4 in order to:
/// <list type="bullet">
///     <item><description>tokenize in a streaming fashion to support streams which are larger
///         than 1024 chars (limit of the previous version),</description></item>
///     <item><description>count grams based on unicode code points instead of java chars (and
///         never split in the middle of surrogate pairs),</description></item>
///     <item><description>give the ability to pre-tokenize the stream (<see cref="IsTokenChar(int)"/>)
///         before computing n-grams.</description></item>
/// </list>
/// </para>
/// <para>
/// Additionally, this class doesn't trim trailing whitespaces and emits
/// tokens in a different order, tokens are now emitted by increasing start
/// offsets while they used to be emitted by increasing lengths (which prevented
/// from supporting large input streams).
/// </para>
/// <para>Although highly discouraged, it is still possible
/// to use the old behavior through <see cref="Lucene43NGramTokenizer"/>.
/// </para>
/// </summary>
// non-sealed to allow for overriding IsTokenChar, but all other methods should be sealed
public class NGramTokenizer : Tokenizer
{
    public const int DEFAULT_MIN_NGRAM_SIZE = 1;
    public const int DEFAULT_MAX_NGRAM_SIZE = 2;

    private CharacterUtils charUtils;
    private CharacterUtils.CharacterBuffer charBuffer;
    private int[] buffer;                // like charBuffer, but converted to code points
    private int bufferStart, bufferEnd;  // remaining slice in buffer
    private int offset;
    private int gramSize;
    private int minGram, maxGram;
    private bool exhausted;
    private int lastCheckedChar;  // last offset in the buffer that we checked
    private int lastNonTokenChar; // last offset that we found to not be a token char
    private bool edgesOnly;       // leading edges n-grams only

    private ICharTermAttribute termAtt;
    private IPositionIncrementAttribute posIncAtt;
    private IPositionLengthAttribute posLenAtt;
    private IOffsetAttribute offsetAtt;

    internal NGramTokenizer(LuceneVersion version, TextReader input, int minGram, int maxGram, bool edgesOnly)
        : base(input)
    {
        Init(version, minGram, maxGram, edgesOnly);
    }

    /// <summary>
    /// Creates <see cref="NGramTokenizer"/> with given min and max n-grams.
    /// </summary>
    /// <param name="version">the lucene compatibility version</param>
    /// <param name="input"><see cref="TextReader"/> holding the input to be tokenized</param>
    /// <param name="minGram">the smallest n-gram to generate</param>
    /// <param name="maxGram">the largest n-gram to generate</param>
    public NGramTokenizer(LuceneVersion version, TextReader input, int minGram, int maxGram)
        : this(version, input, minGram, maxGram, false)
    {
    }

    internal NGramTokenizer(LuceneVersion version, AttributeFactory factory, TextReader input, int minGram, int maxGram, bool edgesOnly)
        : base(factory, input)
    {
        Init(version, minGram, maxGram, edgesOnly);
    }

    /// <summary>
    /// Creates <see cref="NGramTokenizer"/> with given min and max n-grams.
    /// </summary>
    /// <param name="version">the lucene compatibility version</param>
    /// <param name="factory"><see cref="AttributeFactory"/> to use</param>
    /// <param name="input"><see cref="TextReader"/> holding the input to be tokenized</param>
    /// <param name="minGram">the smallest n-gram to generate</param>
    /// <param name="maxGram">the largest n-gram to generate</param>
    public NGramTokenizer(LuceneVersion version, AttributeFactory factory, TextReader input, int minGram, int maxGram)
        : this(version, factory, input, minGram, maxGram, false)
    {
    }

    /// <summary>
    /// Creates <see cref="NGramTokenizer"/> with default min and max n-grams.
    /// </summary>
    /// <param name="version">the lucene compatibility version</param>
    /// <param name="input"><see cref="TextReader"/> holding the input to be tokenized</param>
    public NGramTokenizer(LuceneVersion version, TextReader input)
        : this(version, input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE)
    {
    }

    private void Init(LuceneVersion version, int minGram, int maxGram, bool edgesOnly)
    {
#pragma warning disable 612, 618
        if (!version.OnOrAfter(LuceneVersion.LUCENE_44))
#pragma warning restore 612, 618
        {
            throw new System.ArgumentException("This class only works with Lucene 4.4+. To emulate the old (broken) behavior of NGramTokenizer, use Lucene43NGramTokenizer/Lucene43EdgeNGramTokenizer");
        }
        // NOTE: always true given the guard above; the conditional mirrors the
        // upstream Lucene source.
#pragma warning disable 612, 618
        charUtils = version.OnOrAfter(LuceneVersion.LUCENE_44) ?
#pragma warning restore 612, 618
            CharacterUtils.GetInstance(version) : CharacterUtils.Java4Instance;
        if (minGram < 1)
        {
            throw new System.ArgumentException("minGram must be greater than zero");
        }
        if (minGram > maxGram)
        {
            throw new System.ArgumentException("minGram must not be greater than maxGram");
        }
        termAtt = AddAttribute<ICharTermAttribute>();
        posIncAtt = AddAttribute<IPositionIncrementAttribute>();
        posLenAtt = AddAttribute<IPositionLengthAttribute>();
        offsetAtt = AddAttribute<IOffsetAttribute>();
        this.minGram = minGram;
        this.maxGram = maxGram;
        this.edgesOnly = edgesOnly;
        // 2 * maxGram in case all code points require 2 chars and + 1024 for buffering to not keep polling the Reader
        charBuffer = CharacterUtils.NewCharacterBuffer(2 * maxGram + 1024);
        buffer = new int[charBuffer.Buffer.Length];

        // Make the term att large enough
        termAtt.ResizeBuffer(2 * maxGram);
    }

    public override sealed bool IncrementToken()
    {
        ClearAttributes();

        // termination of this loop is guaranteed by the fact that every iteration
        // either advances the buffer (calls Consume()) or increases gramSize
        while (true)
        {
            // compact
            if (bufferStart >= bufferEnd - maxGram - 1 && !exhausted)
            {
                Array.Copy(buffer, bufferStart, buffer, 0, bufferEnd - bufferStart);
                bufferEnd -= bufferStart;
                lastCheckedChar -= bufferStart;
                lastNonTokenChar -= bufferStart;
                bufferStart = 0;

                // fill in remaining space
                exhausted = !charUtils.Fill(charBuffer, m_input, buffer.Length - bufferEnd);
                // convert to code points
                bufferEnd += charUtils.ToCodePoints(charBuffer.Buffer, 0, charBuffer.Length, buffer, bufferEnd);
            }

            // should we go to the next offset?
            if (gramSize > maxGram || (bufferStart + gramSize) > bufferEnd)
            {
                if (bufferStart + 1 + minGram > bufferEnd)
                {
                    Debug.Assert(exhausted);
                    return false;
                }
                Consume();
                gramSize = minGram;
            }

            UpdateLastNonTokenChar();

            // retry if the token to be emitted was going to not only contain token chars
            bool termContainsNonTokenChar = lastNonTokenChar >= bufferStart && lastNonTokenChar < (bufferStart + gramSize);
            bool isEdgeAndPreviousCharIsTokenChar = edgesOnly && lastNonTokenChar != bufferStart - 1;
            if (termContainsNonTokenChar || isEdgeAndPreviousCharIsTokenChar)
            {
                Consume();
                gramSize = minGram;
                continue;
            }

            int length = charUtils.ToChars(buffer, bufferStart, gramSize, termAtt.Buffer, 0);
            termAtt.Length = length;
            posIncAtt.PositionIncrement = 1;
            posLenAtt.PositionLength = 1;
            offsetAtt.SetOffset(CorrectOffset(offset), CorrectOffset(offset + length));
            ++gramSize;
            return true;
        }
    }

    private void UpdateLastNonTokenChar()
    {
        int termEnd = bufferStart + gramSize - 1;
        if (termEnd > lastCheckedChar)
        {
            for (int i = termEnd; i > lastCheckedChar; --i)
            {
                if (!IsTokenChar(buffer[i]))
                {
                    lastNonTokenChar = i;
                    break;
                }
            }
            lastCheckedChar = termEnd;
        }
    }

    /// <summary>
    /// Consume one code point.
    /// </summary>
    private void Consume()
    {
        offset += Character.CharCount(buffer[bufferStart++]);
    }

    /// <summary>
    /// Only collect characters which satisfy this condition.
    /// </summary>
    protected virtual bool IsTokenChar(int chr)
    {
        return true;
    }

    public override sealed void End()
    {
        base.End();
        Debug.Assert(bufferStart <= bufferEnd);
        int endOffset = offset;
        for (int i = bufferStart; i < bufferEnd; ++i)
        {
            endOffset += Character.CharCount(buffer[i]);
        }
        endOffset = CorrectOffset(endOffset);
        // set final offset
        offsetAtt.SetOffset(endOffset, endOffset);
    }

    public override sealed void Reset()
    {
        base.Reset();
        bufferStart = bufferEnd = buffer.Length;
        lastNonTokenChar = lastCheckedChar = bufferStart - 1;
        offset = 0;
        gramSize = minGram;
        exhausted = false;
        charBuffer.Reset();
    }
}
/// <summary>
/// Factory for <see cref="NGramTokenizer"/>.
/// <code>
/// &lt;fieldType name="text_ngrm" class="solr.TextField" positionIncrementGap="100"&gt;
///   &lt;analyzer&gt;
///     &lt;tokenizer class="solr.NGramTokenizerFactory" minGramSize="1" maxGramSize="2"/&gt;
///   &lt;/analyzer&gt;
/// &lt;/fieldType&gt;
/// </code>
/// </summary>
public class NGramTokenizerFactory : TokenizerFactory
{
    private readonly int maxGramSize;
    private readonly int minGramSize;

    /// <summary>
    /// Creates a new <see cref="NGramTokenizerFactory"/>.
    /// </summary>
    /// <param name="args">factory arguments; recognized keys are consumed, any
    /// leftover key is reported as unknown</param>
    public NGramTokenizerFactory(IDictionary<string, string> args)
        : base(args)
    {
        minGramSize = GetInt(args, "minGramSize", NGramTokenizer.DEFAULT_MIN_NGRAM_SIZE);
        maxGramSize = GetInt(args, "maxGramSize", NGramTokenizer.DEFAULT_MAX_NGRAM_SIZE);
        if (args.Count > 0)
        {
            throw new System.ArgumentException("Unknown parameters: " + args);
        }
    }

    /// <summary>
    /// Creates the <see cref="Tokenizer"/> of n-grams from the given <paramref name="input"/>
    /// and <see cref="AttributeSource.AttributeFactory"/>.
    /// </summary>
    public override Tokenizer Create(AttributeSource.AttributeFactory factory, TextReader input)
    {
#pragma warning disable 612, 618
        if (m_luceneMatchVersion.OnOrAfter(LuceneVersion.LUCENE_44))
#pragma warning restore 612, 618
        {
            return new NGramTokenizer(m_luceneMatchVersion, factory, input, minGramSize, maxGramSize);
        }
        else
        {
            // Pre-4.4 match versions fall back to the old (broken) tokenizer.
#pragma warning disable 612, 618
            return new Lucene43NGramTokenizer(factory, input, minGramSize, maxGramSize);
#pragma warning restore 612, 618
        }
    }
}
- */ - - /// - /// Creates new instances of . - /// - /// <fieldType name="text_edgngrm" class="solr.TextField" positionIncrementGap="100"> - /// <analyzer> - /// <tokenizer class="solr.WhitespaceTokenizerFactory"/> - /// <filter class="solr.EdgeNGramFilterFactory" minGramSize="1" maxGramSize="1"/> - /// </analyzer> - /// </fieldType> - /// - public class EdgeNGramFilterFactory : TokenFilterFactory - { - private readonly int maxGramSize; - private readonly int minGramSize; - private readonly string side; - - /// - /// Creates a new - public EdgeNGramFilterFactory(IDictionary args) - : base(args) - { - minGramSize = GetInt(args, "minGramSize", EdgeNGramTokenFilter.DEFAULT_MIN_GRAM_SIZE); - maxGramSize = GetInt(args, "maxGramSize", EdgeNGramTokenFilter.DEFAULT_MAX_GRAM_SIZE); - side = Get(args, "side", EdgeNGramTokenFilter.Side.FRONT.ToString()); - if (args.Count > 0) - { - throw new System.ArgumentException("Unknown parameters: " + args); - } - } - - public override TokenStream Create(TokenStream input) - { -#pragma warning disable 612, 618 - return new EdgeNGramTokenFilter(m_luceneMatchVersion, input, side, minGramSize, maxGramSize); -#pragma warning restore 612, 618 - } - } -} \ No newline at end of file