Return-Path: X-Original-To: archive-asf-public-internal@cust-asf2.ponee.io Delivered-To: archive-asf-public-internal@cust-asf2.ponee.io Received: from cust-asf.ponee.io (cust-asf.ponee.io [163.172.22.183]) by cust-asf2.ponee.io (Postfix) with ESMTP id 2AA11200C61 for ; Tue, 25 Apr 2017 13:50:52 +0200 (CEST) Received: by cust-asf.ponee.io (Postfix) id 2910C160B9E; Tue, 25 Apr 2017 11:50:52 +0000 (UTC) Delivered-To: archive-asf-public@cust-asf.ponee.io Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by cust-asf.ponee.io (Postfix) with SMTP id EAAFB160BBA for ; Tue, 25 Apr 2017 13:50:49 +0200 (CEST) Received: (qmail 52246 invoked by uid 500); 25 Apr 2017 11:50:44 -0000 Mailing-List: contact commits-help@lucenenet.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: lucene-net-dev@lucenenet.apache.org Delivered-To: mailing list commits@lucenenet.apache.org Received: (qmail 51506 invoked by uid 99); 25 Apr 2017 11:50:43 -0000 Received: from git1-us-west.apache.org (HELO git1-us-west.apache.org) (140.211.11.23) by apache.org (qpsmtpd/0.29) with ESMTP; Tue, 25 Apr 2017 11:50:43 +0000 Received: by git1-us-west.apache.org (ASF Mail Server at git1-us-west.apache.org, from userid 33) id CB551DFFAB; Tue, 25 Apr 2017 11:50:42 +0000 (UTC) Content-Type: text/plain; charset="us-ascii" MIME-Version: 1.0 Content-Transfer-Encoding: 8bit From: nightowl888@apache.org To: commits@lucenenet.apache.org Date: Tue, 25 Apr 2017 11:50:45 -0000 Message-Id: <684d44f247494af4ba0bec75859d3dfb@git.apache.org> In-Reply-To: References: X-Mailer: ASF-Git Admin Mailer Subject: [04/52] [abbrv] lucenenet git commit: SWEEP: Moved BreakIterator-dependent functionality to a common Lucene.Net.Icu library so we can manage the icu.net dependency from one place and not make the majority of the users deal with it when they don't need to archived-at: Tue, 25 Apr 2017 11:50:52 -0000 SWEEP: Moved BreakIterator-dependent functionality to a common Lucene.Net.Icu library so we can manage the icu.net dependency from one place and not make the majority of the users deal with it when they don't need to Project: http://git-wip-us.apache.org/repos/asf/lucenenet/repo Commit: http://git-wip-us.apache.org/repos/asf/lucenenet/commit/b1fdcca3 Tree: http://git-wip-us.apache.org/repos/asf/lucenenet/tree/b1fdcca3 Diff: http://git-wip-us.apache.org/repos/asf/lucenenet/diff/b1fdcca3 Branch: refs/heads/master Commit: b1fdcca3b3c3f418dfe37aafeda6f4dab75fb6d4 Parents: 63c599e Author: Shad Storhaug Authored: Mon Apr 17 01:38:10 2017 +0700 Committer: Shad Storhaug Committed: Mon Apr 17 01:38:10 2017 +0700 ---------------------------------------------------------------------- Lucene.Net.Portable.sln | 20 + Lucene.Net.sln | 52 +++ NuGet.config | 1 + src/IcuBreakIterator.cs | 394 ----------------- .../Analysis/Th/ThaiAnalyzer.cs | 2 +- .../Lucene.Net.Analysis.Common.csproj | 3 - src/Lucene.Net.Analysis.Common/project.json | 6 +- .../Lucene.Net.Highlighter.csproj | 5 +- .../DefaultPassageFormatter.cs | 4 +- .../PostingsHighlight/MultiTermHighlighting.cs | 4 +- .../PostingsHighlight/Passage.cs | 4 +- .../PostingsHighlight/PassageFormatter.cs | 4 +- .../PostingsHighlight/PassageScorer.cs | 4 +- .../Properties/AssemblyInfo.cs | 2 + src/Lucene.Net.Highlighter/project.json | 6 +- src/Lucene.Net.Icu/Analysis/Th/stopwords.txt | 119 ++++++ src/Lucene.Net.Icu/Lucene.Net.Icu.csproj | 124 ++++++ src/Lucene.Net.Icu/Lucene.Net.Icu.project.json | 11 + src/Lucene.Net.Icu/Lucene.Net.Icu.xproj | 19 + src/Lucene.Net.Icu/Properties/AssemblyInfo.cs | 31 ++ src/Lucene.Net.Icu/Support/BreakIterator.cs | 231 ++++++++++ src/Lucene.Net.Icu/Support/CharacterIterator.cs | 50 +++ src/Lucene.Net.Icu/Support/IcuBreakIterator.cs | 394 +++++++++++++++++ .../Support/StringCharacterIterator.cs | 232 ++++++++++ src/Lucene.Net.Icu/project.json | 63 +++ .../Lucene.Net.Tests.Highlighter.csproj | 3 +- .../TestBreakIterator.cs | 421 ------------------- src/Lucene.Net.Tests.Highlighter/project.json | 4 +- .../Lucene.Net.Tests.Icu.csproj | 121 ++++++ .../Lucene.Net.Tests.Icu.project.json | 12 + .../Lucene.Net.Tests.Icu.xproj | 22 + .../Properties/AssemblyInfo.cs | 21 + .../Search/PostingsHighlight/CambridgeMA.utf8 | 1 + .../Support/TestApiConsistency.cs | 126 ++++++ .../Support/TestExceptionSerialization.cs | 54 +++ .../Support/TestIcuBreakIterator.cs | 421 +++++++++++++++++++ src/Lucene.Net.Tests.Icu/project.json | 67 +++ src/Lucene.Net/Lucene.Net.csproj | 3 - src/Lucene.Net/Properties/AssemblyInfo.cs | 2 + src/Lucene.Net/Support/BreakIterator.cs | 231 ---------- src/Lucene.Net/Support/CharacterIterator.cs | 50 --- .../Support/StringCharacterIterator.cs | 232 ---------- 42 files changed, 2220 insertions(+), 1356 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b1fdcca3/Lucene.Net.Portable.sln ---------------------------------------------------------------------- diff --git a/Lucene.Net.Portable.sln b/Lucene.Net.Portable.sln index 8044aed..7f4edad 100644 --- a/Lucene.Net.Portable.sln +++ b/Lucene.Net.Portable.sln @@ -79,6 +79,10 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "build", "build", "{EFA10A77 build\build.ps1 = build\build.ps1 EndProjectSection EndProject +Project("{8BB2217D-0F2D-49D1-97BC-3654ED321F3B}") = "Lucene.Net.Icu", "src\Lucene.Net.Icu\Lucene.Net.Icu.xproj", "{44A5341B-0F52-429D-977A-C35E10ECCADF}" +EndProject +Project("{8BB2217D-0F2D-49D1-97BC-3654ED321F3B}") = "Lucene.Net.Tests.Icu", "src\Lucene.Net.Tests.Icu\Lucene.Net.Tests.Icu.xproj", "{32FD3471-E862-4055-B969-79C12A656366}" +EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|Any CPU = Debug|Any CPU @@ -367,6 +371,22 @@ Global {C708701D-4318-469F-9822-49A80386CFEA}.Release|Any CPU.Build.0 = Release|Any CPU {C708701D-4318-469F-9822-49A80386CFEA}.Release|x86.ActiveCfg = Release|Any CPU {C708701D-4318-469F-9822-49A80386CFEA}.Release|x86.Build.0 = Release|Any CPU + {44A5341B-0F52-429D-977A-C35E10ECCADF}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {44A5341B-0F52-429D-977A-C35E10ECCADF}.Debug|Any CPU.Build.0 = Debug|Any CPU + {44A5341B-0F52-429D-977A-C35E10ECCADF}.Debug|x86.ActiveCfg = Debug|Any CPU + {44A5341B-0F52-429D-977A-C35E10ECCADF}.Debug|x86.Build.0 = Debug|Any CPU + {44A5341B-0F52-429D-977A-C35E10ECCADF}.Release|Any CPU.ActiveCfg = Release|Any CPU + {44A5341B-0F52-429D-977A-C35E10ECCADF}.Release|Any CPU.Build.0 = Release|Any CPU + {44A5341B-0F52-429D-977A-C35E10ECCADF}.Release|x86.ActiveCfg = Release|Any CPU + {44A5341B-0F52-429D-977A-C35E10ECCADF}.Release|x86.Build.0 = Release|Any CPU + {32FD3471-E862-4055-B969-79C12A656366}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {32FD3471-E862-4055-B969-79C12A656366}.Debug|Any CPU.Build.0 = Debug|Any CPU + {32FD3471-E862-4055-B969-79C12A656366}.Debug|x86.ActiveCfg = Debug|Any CPU + {32FD3471-E862-4055-B969-79C12A656366}.Debug|x86.Build.0 = Debug|Any CPU + {32FD3471-E862-4055-B969-79C12A656366}.Release|Any CPU.ActiveCfg = Release|Any CPU + {32FD3471-E862-4055-B969-79C12A656366}.Release|Any CPU.Build.0 = Release|Any CPU + {32FD3471-E862-4055-B969-79C12A656366}.Release|x86.ActiveCfg = Release|Any CPU + {32FD3471-E862-4055-B969-79C12A656366}.Release|x86.Build.0 = Release|Any CPU EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b1fdcca3/Lucene.Net.sln ---------------------------------------------------------------------- diff --git a/Lucene.Net.sln b/Lucene.Net.sln index b218f0d..66e91a6 100644 --- a/Lucene.Net.sln +++ b/Lucene.Net.sln @@ -88,6 +88,10 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "build", "build", "{9811D53E build\build.ps1 = build\build.ps1 EndProjectSection EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Lucene.Net.Icu", "src\Lucene.Net.Icu\Lucene.Net.Icu.csproj", "{349CB7C9-7534-4E1D-9B0A-5521441AF0AE}" +EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Lucene.Net.Tests.Icu", "src\Lucene.Net.Tests.Icu\Lucene.Net.Tests.Icu.csproj", "{D5AA1A22-1B28-4DF6-BFDA-02519A189839}" +EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|Any CPU = Debug|Any CPU @@ -849,6 +853,54 @@ Global {FBCD6AFE-0A5C-4399-8044-99C58D2912D1}.Release35|Mixed Platforms.Build.0 = Release|Any CPU {FBCD6AFE-0A5C-4399-8044-99C58D2912D1}.Release35|x86.ActiveCfg = Release|Any CPU {FBCD6AFE-0A5C-4399-8044-99C58D2912D1}.Release35|x86.Build.0 = Release|Any CPU + {349CB7C9-7534-4E1D-9B0A-5521441AF0AE}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {349CB7C9-7534-4E1D-9B0A-5521441AF0AE}.Debug|Any CPU.Build.0 = Debug|Any CPU + {349CB7C9-7534-4E1D-9B0A-5521441AF0AE}.Debug|Mixed Platforms.ActiveCfg = Debug|Any CPU + {349CB7C9-7534-4E1D-9B0A-5521441AF0AE}.Debug|Mixed Platforms.Build.0 = Debug|Any CPU + {349CB7C9-7534-4E1D-9B0A-5521441AF0AE}.Debug|x86.ActiveCfg = Debug|Any CPU + {349CB7C9-7534-4E1D-9B0A-5521441AF0AE}.Debug|x86.Build.0 = Debug|Any CPU + {349CB7C9-7534-4E1D-9B0A-5521441AF0AE}.Debug35|Any CPU.ActiveCfg = Debug|Any CPU + {349CB7C9-7534-4E1D-9B0A-5521441AF0AE}.Debug35|Any CPU.Build.0 = Debug|Any CPU + {349CB7C9-7534-4E1D-9B0A-5521441AF0AE}.Debug35|Mixed Platforms.ActiveCfg = Debug|Any CPU + {349CB7C9-7534-4E1D-9B0A-5521441AF0AE}.Debug35|Mixed Platforms.Build.0 = Debug|Any CPU + {349CB7C9-7534-4E1D-9B0A-5521441AF0AE}.Debug35|x86.ActiveCfg = Debug|Any CPU + {349CB7C9-7534-4E1D-9B0A-5521441AF0AE}.Debug35|x86.Build.0 = Debug|Any CPU + {349CB7C9-7534-4E1D-9B0A-5521441AF0AE}.Release|Any CPU.ActiveCfg = Release|Any CPU + {349CB7C9-7534-4E1D-9B0A-5521441AF0AE}.Release|Any CPU.Build.0 = Release|Any CPU + {349CB7C9-7534-4E1D-9B0A-5521441AF0AE}.Release|Mixed Platforms.ActiveCfg = Release|Any CPU + {349CB7C9-7534-4E1D-9B0A-5521441AF0AE}.Release|Mixed Platforms.Build.0 = Release|Any CPU + {349CB7C9-7534-4E1D-9B0A-5521441AF0AE}.Release|x86.ActiveCfg = Release|Any CPU + {349CB7C9-7534-4E1D-9B0A-5521441AF0AE}.Release|x86.Build.0 = Release|Any CPU + {349CB7C9-7534-4E1D-9B0A-5521441AF0AE}.Release35|Any CPU.ActiveCfg = Release|Any CPU + {349CB7C9-7534-4E1D-9B0A-5521441AF0AE}.Release35|Any CPU.Build.0 = Release|Any CPU + {349CB7C9-7534-4E1D-9B0A-5521441AF0AE}.Release35|Mixed Platforms.ActiveCfg = Release|Any CPU + {349CB7C9-7534-4E1D-9B0A-5521441AF0AE}.Release35|Mixed Platforms.Build.0 = Release|Any CPU + {349CB7C9-7534-4E1D-9B0A-5521441AF0AE}.Release35|x86.ActiveCfg = Release|Any CPU + {349CB7C9-7534-4E1D-9B0A-5521441AF0AE}.Release35|x86.Build.0 = Release|Any CPU + {D5AA1A22-1B28-4DF6-BFDA-02519A189839}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {D5AA1A22-1B28-4DF6-BFDA-02519A189839}.Debug|Any CPU.Build.0 = Debug|Any CPU + {D5AA1A22-1B28-4DF6-BFDA-02519A189839}.Debug|Mixed Platforms.ActiveCfg = Debug|Any CPU + {D5AA1A22-1B28-4DF6-BFDA-02519A189839}.Debug|Mixed Platforms.Build.0 = Debug|Any CPU + {D5AA1A22-1B28-4DF6-BFDA-02519A189839}.Debug|x86.ActiveCfg = Debug|Any CPU + {D5AA1A22-1B28-4DF6-BFDA-02519A189839}.Debug|x86.Build.0 = Debug|Any CPU + {D5AA1A22-1B28-4DF6-BFDA-02519A189839}.Debug35|Any CPU.ActiveCfg = Debug|Any CPU + {D5AA1A22-1B28-4DF6-BFDA-02519A189839}.Debug35|Any CPU.Build.0 = Debug|Any CPU + {D5AA1A22-1B28-4DF6-BFDA-02519A189839}.Debug35|Mixed Platforms.ActiveCfg = Debug|Any CPU + {D5AA1A22-1B28-4DF6-BFDA-02519A189839}.Debug35|Mixed Platforms.Build.0 = Debug|Any CPU + {D5AA1A22-1B28-4DF6-BFDA-02519A189839}.Debug35|x86.ActiveCfg = Debug|Any CPU + {D5AA1A22-1B28-4DF6-BFDA-02519A189839}.Debug35|x86.Build.0 = Debug|Any CPU + {D5AA1A22-1B28-4DF6-BFDA-02519A189839}.Release|Any CPU.ActiveCfg = Release|Any CPU + {D5AA1A22-1B28-4DF6-BFDA-02519A189839}.Release|Any CPU.Build.0 = Release|Any CPU + {D5AA1A22-1B28-4DF6-BFDA-02519A189839}.Release|Mixed Platforms.ActiveCfg = Release|Any CPU + {D5AA1A22-1B28-4DF6-BFDA-02519A189839}.Release|Mixed Platforms.Build.0 = Release|Any CPU + {D5AA1A22-1B28-4DF6-BFDA-02519A189839}.Release|x86.ActiveCfg = Release|Any CPU + {D5AA1A22-1B28-4DF6-BFDA-02519A189839}.Release|x86.Build.0 = Release|Any CPU + {D5AA1A22-1B28-4DF6-BFDA-02519A189839}.Release35|Any CPU.ActiveCfg = Release|Any CPU + {D5AA1A22-1B28-4DF6-BFDA-02519A189839}.Release35|Any CPU.Build.0 = Release|Any CPU + {D5AA1A22-1B28-4DF6-BFDA-02519A189839}.Release35|Mixed Platforms.ActiveCfg = Release|Any CPU + {D5AA1A22-1B28-4DF6-BFDA-02519A189839}.Release35|Mixed Platforms.Build.0 = Release|Any CPU + {D5AA1A22-1B28-4DF6-BFDA-02519A189839}.Release35|x86.ActiveCfg = Release|Any CPU + {D5AA1A22-1B28-4DF6-BFDA-02519A189839}.Release35|x86.Build.0 = Release|Any CPU EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b1fdcca3/NuGet.config ---------------------------------------------------------------------- diff --git a/NuGet.config b/NuGet.config index 8df6c0f..e0c6211 100644 --- a/NuGet.config +++ b/NuGet.config @@ -2,6 +2,7 @@ + http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b1fdcca3/src/IcuBreakIterator.cs ---------------------------------------------------------------------- diff --git a/src/IcuBreakIterator.cs b/src/IcuBreakIterator.cs deleted file mode 100644 index cc0f7cd..0000000 --- a/src/IcuBreakIterator.cs +++ /dev/null @@ -1,394 +0,0 @@ -#if FEATURE_BREAKITERATOR -using Lucene.Net.Support; -using System; -using System.Collections.Generic; -using System.Globalization; -using System.Linq; -using System.Text; - -namespace Lucene.Net -{ - /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - /// - /// A implementation that encapsulates the functionality - /// of icu.net's static class. A - /// provides methods to move forward, reverse, and randomly through a set of text breaks - /// defined by the enumeration. - /// - // LUCENENET specific type - internal class IcuBreakIterator : BreakIterator - { - private readonly Icu.Locale locale; - private readonly Icu.BreakIterator.UBreakIteratorType type; - - private List boundaries = new List(); - private int currentBoundaryIndex; // Index (not the value) of the current boundary in boundaries - private string text; - - /// - /// The start offset for the string, if supplied by a - /// - protected int m_start; - - /// - /// The end offset for the string, if supplied by a - /// - protected int m_end; - - private bool enableHacks = false; - - public IcuBreakIterator(Icu.BreakIterator.UBreakIteratorType type) - : this(type, CultureInfo.CurrentCulture) - { - } - - public IcuBreakIterator(Icu.BreakIterator.UBreakIteratorType type, CultureInfo locale) - { - if (locale == null) - throw new ArgumentNullException("locale"); - this.locale = new Icu.Locale(locale.Name); - this.type = type; - } - - - public virtual bool EnableHacks - { - get { return enableHacks; } - set { enableHacks = value; } - } - - /// - /// Sets the current iteration position to the beginning of the text. - /// - /// The offset of the beginning of the text. - public override int First() - { - currentBoundaryIndex = 0; - return ReturnCurrent(); - } - - /// - /// Sets the current iteration position to the end of the text. - /// - /// The text's past-the-end offset. - public override int Last() - { - currentBoundaryIndex = boundaries.Count - 1; - return ReturnCurrent(); - } - - /// - /// Advances the iterator either forward or backward the specified number of steps. - /// Negative values move backward, and positive values move forward. This is - /// equivalent to repeatedly calling or . - /// - /// The number of steps to move. The sign indicates the direction - /// (negative is backwards, and positive is forwards). - /// The character offset of the boundary position n boundaries away from - /// the current one. - public override int Next(int n) - { - int result = Current; - while (n > 0) - { - result = Next(); - --n; - } - while (n < 0) - { - result = Previous(); - ++n; - } - return result; - } - - /// - /// Advances the iterator to the next boundary position. - /// - /// The position of the first boundary after this one. - public override int Next() - { - if (currentBoundaryIndex >= boundaries.Count - 1 || boundaries.Count == 0) - { - return DONE; - } - currentBoundaryIndex++; - return ReturnCurrent(); - } - - /// - /// Advances the iterator backwards, to the last boundary preceding this one. - /// - /// The position of the last boundary position preceding this one. - public override int Previous() - { - if (currentBoundaryIndex == 0 || boundaries.Count == 0) - { - return DONE; - } - currentBoundaryIndex--; - return ReturnCurrent(); - } - - /// - /// Throw unless begin <= offset < end. - /// - /// - private void CheckOffset(int offset) - { - if (offset < m_start || offset > m_end) - { - throw new ArgumentException("offset out of bounds"); - } - } - - /// - /// Sets the iterator to refer to the first boundary position following - /// the specified position. - /// - /// The position from which to begin searching for a break position. - /// The position of the first break after the current position. - public override int Following(int offset) - { - CheckOffset(offset); - - if (boundaries.Count == 0) - { - return DONE; - } - - int following = GetLowestIndexGreaterThan(offset); - if (following == -1) - { - currentBoundaryIndex = boundaries.Count - 1; - return DONE; - } - else - { - currentBoundaryIndex = following; - } - return ReturnCurrent(); - } - - private int GetLowestIndexGreaterThan(int offset) - { - int index = boundaries.BinarySearch(offset); - if (index < 0) - { - return ~index; - } - else if (index + 1 < boundaries.Count) - { - return index + 1; - } - - return -1; - } - - /// - /// Sets the iterator to refer to the last boundary position before the - /// specified position. - /// - /// The position to begin searching for a break from. - /// The position of the last boundary before the starting position. - public override int Preceding(int offset) - { - CheckOffset(offset); - - if (boundaries.Count == 0) - { - return DONE; - } - - int preceeding = GetHighestIndexLessThan(offset); - if (preceeding == -1) - { - currentBoundaryIndex = 0; - return DONE; - } - else - { - currentBoundaryIndex = preceeding; - } - return ReturnCurrent(); - } - - private int GetHighestIndexLessThan(int offset) - { - int index = boundaries.BinarySearch(offset); - if (index < 0) - { - return ~index - 1; - } - else - { - // NOTE: This is intentionally allowed to return -1 in the case - // where index == 0. This state indicates we are before the first boundary. - return index - 1; - } - } - - /// - /// Returns the current iteration position. - /// - public override int Current - { - get { return ReturnCurrent(); } - } - - /// - /// Gets the text being analyzed. - /// - public override string Text - { - get - { - return text; - } - } - - /// - /// Set the iterator to analyze a new piece of text. This function resets - /// the current iteration position to the beginning of the text. - /// - /// The text to analyze. - public override void SetText(string newText) - { - text = newText; - currentBoundaryIndex = 0; - m_start = 0; - m_end = newText.Length; - - LoadBoundaries(m_start, m_end); - } - - public override void SetText(CharacterIterator newText) - { - text = newText.GetTextAsString(); - currentBoundaryIndex = 0; - m_start = newText.BeginIndex; - m_end = newText.EndIndex; - - LoadBoundaries(m_start, m_end); - } - - private void LoadBoundaries(int start, int end) - { - IEnumerable icuBoundaries; - string offsetText = text.Substring(start, end - start); - -#if !NETSTANDARD - try - { -#endif - if (type == Icu.BreakIterator.UBreakIteratorType.WORD) - { - if (enableHacks) - { - // LUCENENET TODO: HACK - replacing hyphen with "a" so hyphenated words aren't broken - offsetText = offsetText.Replace("-", "a"); - } - - icuBoundaries = Icu.BreakIterator.GetWordBoundaries(locale, offsetText, true); - } - else - { - if (enableHacks && type == Icu.BreakIterator.UBreakIteratorType.SENTENCE) - { - // LUCENENET TODO: HACK - newline character causes incorrect sentence breaking. - offsetText = offsetText.Replace("\n", " "); - // LUCENENET TODO: HACK - the ICU sentence logic doesn't work (in English anyway) when sentences don't - // begin with capital letters. - offsetText = CapitalizeFirst(offsetText); - } - - icuBoundaries = Icu.BreakIterator.GetBoundaries(type, locale, offsetText); - } -#if !NETSTANDARD - } - catch (AccessViolationException ace) - { - // LUCENENET TODO: Find a reliable way to reproduce and report the - // AccessViolationException that happens here to the icu-dotnet project team - throw new Exception("Hit AccessViolationException: " + ace.ToString(), ace); - } -#endif - - boundaries = icuBoundaries - .Select(t => new[] { t.Start + start, t.End + start }) - .SelectMany(b => b) - .Distinct() - .ToList(); - } - - /// - /// Returns true if the specified character offset is a text boundary. - /// - /// the character offset to check. - /// true if "offset" is a boundary position, false otherwise. - public override bool IsBoundary(int offset) - { - CheckOffset(offset); - return boundaries.Contains(offset); - } - - private int ReturnCurrent() - { - if (boundaries.Count > 0) - { - return currentBoundaryIndex < boundaries.Count && currentBoundaryIndex > -1 - ? boundaries[currentBoundaryIndex] - : DONE; - } - - // If there are no boundaries, we must return the start offset - return m_start; - } - - /// - /// LUCENENET TODO: This is a temporary workaround for an issue with icu-dotnet - /// where it doesn't correctly break sentences unless they begin with a capital letter. - /// If/when ICU is fixed, this method should be deleted and the IcuBreakIterator - /// code changed to remove calls to this method. - /// - public static string CapitalizeFirst(string s) - { - bool isNewSentence = true; - var result = new StringBuilder(s.Length); - for (int i = 0; i < s.Length; i++) - { - if (isNewSentence && char.IsLetter(s[i])) - { - result.Append(char.ToUpper(s[i])); - isNewSentence = false; - } - else - result.Append(s[i]); - - if (s[i] == '!' || s[i] == '?' || s[i] == '.') - { - isNewSentence = true; - } - } - - return result.ToString(); - } - } -} -#endif http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b1fdcca3/src/Lucene.Net.Analysis.Common/Analysis/Th/ThaiAnalyzer.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Th/ThaiAnalyzer.cs b/src/Lucene.Net.Analysis.Common/Analysis/Th/ThaiAnalyzer.cs index aa6e1d7..0885069 100644 --- a/src/Lucene.Net.Analysis.Common/Analysis/Th/ThaiAnalyzer.cs +++ b/src/Lucene.Net.Analysis.Common/Analysis/Th/ThaiAnalyzer.cs @@ -111,7 +111,7 @@ namespace Lucene.Net.Analysis.Th /// built from a filtered with /// , , , and /// - protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader) + protected internal override TokenStreamComponents CreateComponents(string fieldName, TextReader reader) { if (m_matchVersion.OnOrAfter(LuceneVersion.LUCENE_48)) { http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b1fdcca3/src/Lucene.Net.Analysis.Common/Lucene.Net.Analysis.Common.csproj ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Common/Lucene.Net.Analysis.Common.csproj b/src/Lucene.Net.Analysis.Common/Lucene.Net.Analysis.Common.csproj index fb403aa..02545b2 100644 --- a/src/Lucene.Net.Analysis.Common/Lucene.Net.Analysis.Common.csproj +++ b/src/Lucene.Net.Analysis.Common/Lucene.Net.Analysis.Common.csproj @@ -41,9 +41,6 @@ - - IcuBreakIterator.cs - http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b1fdcca3/src/Lucene.Net.Analysis.Common/project.json ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Common/project.json b/src/Lucene.Net.Analysis.Common/project.json index ca771a3..556a89e 100644 --- a/src/Lucene.Net.Analysis.Common/project.json +++ b/src/Lucene.Net.Analysis.Common/project.json @@ -26,8 +26,7 @@ "define": [ "NETSTANDARD" ], "compile": { "includeFiles": [ - "../CommonAssemblyInfo.cs", - "../IcuBreakIterator.cs" + "../CommonAssemblyInfo.cs" ] }, "embed": { @@ -52,8 +51,7 @@ "define": [ "FEATURE_CLONEABLE", "FEATURE_DTD_PROCESSING", "FEATURE_SERIALIZABLE" ], "compile": { "includeFiles": [ - "../CommonAssemblyInfo.cs", - "../IcuBreakIterator.cs" + "../CommonAssemblyInfo.cs" ] }, "embed": { http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b1fdcca3/src/Lucene.Net.Highlighter/Lucene.Net.Highlighter.csproj ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Highlighter/Lucene.Net.Highlighter.csproj b/src/Lucene.Net.Highlighter/Lucene.Net.Highlighter.csproj index 31ac251..9c885d4 100644 --- a/src/Lucene.Net.Highlighter/Lucene.Net.Highlighter.csproj +++ b/src/Lucene.Net.Highlighter/Lucene.Net.Highlighter.csproj @@ -44,9 +44,6 @@ - - IcuBreakIterator.cs - @@ -101,7 +98,7 @@ - + Properties\CommonAssemblyInfo.cs http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b1fdcca3/src/Lucene.Net.Highlighter/PostingsHighlight/DefaultPassageFormatter.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Highlighter/PostingsHighlight/DefaultPassageFormatter.cs b/src/Lucene.Net.Highlighter/PostingsHighlight/DefaultPassageFormatter.cs index 4538d46..6a38bec 100644 --- a/src/Lucene.Net.Highlighter/PostingsHighlight/DefaultPassageFormatter.cs +++ b/src/Lucene.Net.Highlighter/PostingsHighlight/DefaultPassageFormatter.cs @@ -1,4 +1,5 @@ -using System; +#if FEATURE_BREAKITERATOR +using System; using System.Text; namespace Lucene.Net.Search.PostingsHighlight @@ -161,3 +162,4 @@ namespace Lucene.Net.Search.PostingsHighlight } } } +#endif \ No newline at end of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b1fdcca3/src/Lucene.Net.Highlighter/PostingsHighlight/MultiTermHighlighting.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Highlighter/PostingsHighlight/MultiTermHighlighting.cs b/src/Lucene.Net.Highlighter/PostingsHighlight/MultiTermHighlighting.cs index e5a5bcd..bd79c80 100644 --- a/src/Lucene.Net.Highlighter/PostingsHighlight/MultiTermHighlighting.cs +++ b/src/Lucene.Net.Highlighter/PostingsHighlight/MultiTermHighlighting.cs @@ -1,4 +1,5 @@ -using Lucene.Net.Analysis; +#if FEATURE_BREAKITERATOR +using Lucene.Net.Analysis; using Lucene.Net.Analysis.TokenAttributes; using Lucene.Net.Index; using Lucene.Net.Search.Spans; @@ -344,3 +345,4 @@ namespace Lucene.Net.Search.PostingsHighlight } } } +#endif \ No newline at end of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b1fdcca3/src/Lucene.Net.Highlighter/PostingsHighlight/Passage.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Highlighter/PostingsHighlight/Passage.cs b/src/Lucene.Net.Highlighter/PostingsHighlight/Passage.cs index 54a2446..b9a664f 100644 --- a/src/Lucene.Net.Highlighter/PostingsHighlight/Passage.cs +++ b/src/Lucene.Net.Highlighter/PostingsHighlight/Passage.cs @@ -1,4 +1,5 @@ -using Lucene.Net.Util; +#if FEATURE_BREAKITERATOR +using Lucene.Net.Util; using System.Collections.Generic; using System.Diagnostics; @@ -183,3 +184,4 @@ namespace Lucene.Net.Search.PostingsHighlight } } } +#endif \ No newline at end of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b1fdcca3/src/Lucene.Net.Highlighter/PostingsHighlight/PassageFormatter.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Highlighter/PostingsHighlight/PassageFormatter.cs b/src/Lucene.Net.Highlighter/PostingsHighlight/PassageFormatter.cs index ce367a6..770a6fa 100644 --- a/src/Lucene.Net.Highlighter/PostingsHighlight/PassageFormatter.cs +++ b/src/Lucene.Net.Highlighter/PostingsHighlight/PassageFormatter.cs @@ -1,4 +1,5 @@ -namespace Lucene.Net.Search.PostingsHighlight +#if FEATURE_BREAKITERATOR +namespace Lucene.Net.Search.PostingsHighlight { /* * Licensed to the Apache Software Foundation (ASF) under one or more @@ -42,3 +43,4 @@ public abstract object Format(Passage[] passages, string content); // LUCENENET TODO: Make return type generic? } } +#endif http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b1fdcca3/src/Lucene.Net.Highlighter/PostingsHighlight/PassageScorer.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Highlighter/PostingsHighlight/PassageScorer.cs b/src/Lucene.Net.Highlighter/PostingsHighlight/PassageScorer.cs index af398da..de0fd45 100644 --- a/src/Lucene.Net.Highlighter/PostingsHighlight/PassageScorer.cs +++ b/src/Lucene.Net.Highlighter/PostingsHighlight/PassageScorer.cs @@ -1,4 +1,5 @@ -using System; +#if FEATURE_BREAKITERATOR +using System; namespace Lucene.Net.Search.PostingsHighlight { @@ -110,3 +111,4 @@ namespace Lucene.Net.Search.PostingsHighlight } } } +#endif \ No newline at end of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b1fdcca3/src/Lucene.Net.Highlighter/Properties/AssemblyInfo.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Highlighter/Properties/AssemblyInfo.cs b/src/Lucene.Net.Highlighter/Properties/AssemblyInfo.cs index 6d2eedf..8969ff6 100644 --- a/src/Lucene.Net.Highlighter/Properties/AssemblyInfo.cs +++ b/src/Lucene.Net.Highlighter/Properties/AssemblyInfo.cs @@ -24,7 +24,9 @@ using System.Runtime.InteropServices; // The following GUID is for the ID of the typelib if this project is exposed to COM [assembly: Guid("e9e769ea-8504-44bc-8dc9-ccf958765f8f")] +[assembly: InternalsVisibleTo("Lucene.Net.Icu")] // for testing [assembly: InternalsVisibleTo("Lucene.Net.Tests.Highlighter")] +[assembly: InternalsVisibleTo("Lucene.Net.Tests.Icu")] // NOTE: Version information is in CommonAssemblyInfo.cs http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b1fdcca3/src/Lucene.Net.Highlighter/project.json ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Highlighter/project.json b/src/Lucene.Net.Highlighter/project.json index 5016f93..ce4b726 100644 --- a/src/Lucene.Net.Highlighter/project.json +++ b/src/Lucene.Net.Highlighter/project.json @@ -25,8 +25,7 @@ "define": [ "NETSTANDARD" ], "compile": { "includeFiles": [ - "../CommonAssemblyInfo.cs", - "../IcuBreakIterator.cs" + "../CommonAssemblyInfo.cs" ] } }, @@ -40,8 +39,7 @@ "define": [ "FEATURE_SERIALIZABLE" ], "compile": { "includeFiles": [ - "../CommonAssemblyInfo.cs", - "../IcuBreakIterator.cs" + "../CommonAssemblyInfo.cs" ] } } http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b1fdcca3/src/Lucene.Net.Icu/Analysis/Th/stopwords.txt ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Icu/Analysis/Th/stopwords.txt b/src/Lucene.Net.Icu/Analysis/Th/stopwords.txt new file mode 100644 index 0000000..07f0fab --- /dev/null +++ b/src/Lucene.Net.Icu/Analysis/Th/stopwords.txt @@ -0,0 +1,119 @@ +# Thai stopwords from: +# "Opinion Detection in Thai Political News Columns +# Based on Subjectivity Analysis" +# Khampol Sukhum, Supot Nitsuwat, and Choochart Haruechaiyasak +ไว้ +ไม่ +ไป +ได้ +ให้ +ใน +โดย +แห่ง +แล้ว +และ +แรก +แบบ +แต่ +เอง +เห็น +เลย +เริ่ม +เรา +เมื่อ +เพื่อ +เพราะ +เป็นการ +เป็น +เปิดเผย +เปิด +เนื่องจาก +เดียวกัน +เดียว +เช่น +เฉพาะ +เคย +เข้า +เขา +อีก +อาจ +อะไร +ออก +อย่าง +อยู่ +อยาก +หาก +หลาย +หลังจาก +หลัง +หรือ +หนึ่ง +ส่วน +ส่ง +สุด +สําหรับ +ว่า +วัน +ลง +ร่วม +ราย +รับ +ระหว่าง +รวม +ยัง +มี +มาก +มา +พร้อม +พบ +ผ่าน +ผล +บาง +น่า +นี้ +นํา +นั้น +นัก +นอกจาก +ทุก +ที่สุด +ที่ +ทําให้ +ทํา +ทาง +ทั้งนี้ +ทั้ง +ถ้า +ถูก +ถึง +ต้อง +ต่างๆ +ต่าง +ต่อ +ตาม +ตั้งแต่ +ตั้ง +ด้าน +ด้วย +ดัง +ซึ่ง +ช่วง +จึง +จาก +จัด +จะ +คือ +ความ +ครั้ง +คง +ขึ้น +ของ +ขอ +ขณะ +ก่อน +ก็ +การ +กับ +กัน +กว่า +กล่าว http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b1fdcca3/src/Lucene.Net.Icu/Lucene.Net.Icu.csproj ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Icu/Lucene.Net.Icu.csproj b/src/Lucene.Net.Icu/Lucene.Net.Icu.csproj new file mode 100644 index 0000000..267132e --- /dev/null +++ b/src/Lucene.Net.Icu/Lucene.Net.Icu.csproj @@ -0,0 +1,124 @@ + + + + + Debug + AnyCPU + {349CB7C9-7534-4E1D-9B0A-5521441AF0AE} + Library + Properties + Lucene.Net + Lucene.Net.Icu + v4.5.1 + 512 + + + true + full + false + bin\Debug\ + DEBUG;TRACE + prompt + 4 + + + pdbonly + true + bin\Release\ + TRACE + prompt + 4 + + + $(DefineConstants);FEATURE_BREAKITERATOR;FEATURE_SERIALIZABLE + + + + + + + + + + Analysis\Th\ThaiAnalyzer.cs + + + Analysis\Th\ThaiTokenizer.cs + + + Analysis\Th\ThaiTokenizerFactory.cs + + + Analysis\Th\ThaiWordFilter.cs + + + Analysis\Th\ThaiWordFilterFactory.cs + + + Analysis\Util\CharArrayIterator.cs + + + Analysis\Util\SegmentingTokenizerBase.cs + + + Search\PostingsHighlight\DefaultPassageFormatter.cs + + + Search\PostingsHighlight\MultiTermHighlighting.cs + + + Search\PostingsHighlight\Passage.cs + + + Search\PostingsHighlight\PassageFormatter.cs + + + Search\PostingsHighlight\PassageScorer.cs + + + Search\PostingsHighlight\PostingsHighlighter.cs + + + Search\PostingsHighlight\WholeBreakIterator.cs + + + Search\VectorHighlight\BreakIteratorBoundaryScanner.cs + + + + + + + Properties\CommonAssemblyInfo.cs + + + + + + {4add0bbc-b900-4715-9526-d871de8eea64} + Lucene.Net.Analysis.Common + + + {e9e769ea-8504-44bc-8dc9-ccf958765f8f} + Lucene.Net.Highlighter + + + {5d4ad9be-1ffb-41ab-9943-25737971bf57} + Lucene.Net + + + + + + + + + + + \ No newline at end of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b1fdcca3/src/Lucene.Net.Icu/Lucene.Net.Icu.project.json ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Icu/Lucene.Net.Icu.project.json b/src/Lucene.Net.Icu/Lucene.Net.Icu.project.json new file mode 100644 index 0000000..af28fc8 --- /dev/null +++ b/src/Lucene.Net.Icu/Lucene.Net.Icu.project.json @@ -0,0 +1,11 @@ +{ + "runtimes": { + "win": {} + }, + "dependencies": { + "icu.net": "54.1.1-alpha" + }, + "frameworks": { + "net451": {} + } +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b1fdcca3/src/Lucene.Net.Icu/Lucene.Net.Icu.xproj ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Icu/Lucene.Net.Icu.xproj b/src/Lucene.Net.Icu/Lucene.Net.Icu.xproj new file mode 100644 index 0000000..dd48901 --- /dev/null +++ b/src/Lucene.Net.Icu/Lucene.Net.Icu.xproj @@ -0,0 +1,19 @@ + + + + 14.0 + $(MSBuildExtensionsPath32)\Microsoft\VisualStudio\v$(VisualStudioVersion) + + + + 44a5341b-0f52-429d-977a-c35e10eccadf + Lucene.Net.Search + .\obj + .\bin\ + v4.5.1 + + + 2.0 + + + \ No newline at end of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b1fdcca3/src/Lucene.Net.Icu/Properties/AssemblyInfo.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Icu/Properties/AssemblyInfo.cs b/src/Lucene.Net.Icu/Properties/AssemblyInfo.cs new file mode 100644 index 0000000..3cdd9b2 --- /dev/null +++ b/src/Lucene.Net.Icu/Properties/AssemblyInfo.cs @@ -0,0 +1,31 @@ +using System; +using System.Reflection; +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; + +// General Information about an assembly is controlled through the following +// set of attributes. Change these attribute values to modify the information +// associated with an assembly. +[assembly: AssemblyTitle("Lucene.Net.Icu")] +[assembly: AssemblyDescription( + "International Components for Unicode-based features including Thai analyzer support, " + + "an international postings highlighter, and BreakIterator support for the vector highlighter in Lucene.Net.Highlighter " + + "for the Lucene.Net full-text search engine library from The Apache Software Foundation.")] +[assembly: AssemblyConfiguration("")] +[assembly: AssemblyDefaultAlias("Lucene.Net.Icu")] +[assembly: AssemblyCulture("")] + +[assembly: CLSCompliant(true)] + +// Setting ComVisible to false makes the types in this assembly not visible +// to COM components. If you need to access a type in this assembly from +// COM, set the ComVisible attribute to true on that type. +[assembly: ComVisible(false)] + +// The following GUID is for the ID of the typelib if this project is exposed to COM +[assembly: Guid("349cb7c9-7534-4e1d-9b0a-5521441af0ae")] + +// for testing +[assembly: InternalsVisibleTo("Lucene.Net.Tests.Icu")] + +// NOTE: Version information is in CommonAssemblyInfo.cs http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b1fdcca3/src/Lucene.Net.Icu/Support/BreakIterator.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Icu/Support/BreakIterator.cs b/src/Lucene.Net.Icu/Support/BreakIterator.cs new file mode 100644 index 0000000..ded1c9c --- /dev/null +++ b/src/Lucene.Net.Icu/Support/BreakIterator.cs @@ -0,0 +1,231 @@ +#if FEATURE_BREAKITERATOR +using System; + +namespace Lucene.Net.Support +{ + /// + /// The BreakIterator class implements methods for finding + /// the location of boundaries in text. Instances of BreakIterator + /// maintain a current position and scan over text + /// returning the index of characters where boundaries occur. + /// + public abstract class BreakIterator +#if FEATURE_CLONEABLE + : ICloneable +#endif + { + /// + /// Constructor. BreakIterator is stateless and has no default behavior. + /// + protected BreakIterator() + { + } + + /// + /// Create a copy of this iterator + /// + /// A member-wise copy of this + public object Clone() + { + return MemberwiseClone(); + } + + /// + /// DONE is returned by Previous(), Next(), Next(int), Preceding(int) + /// and Following(int) when either the first or last text boundary has been + /// reached. + /// + public static readonly int DONE = -1; + + /// + /// Returns the first boundary. The iterator's current position is set + /// to the first text boundary. + /// + /// The character index of the first text boundary + public abstract int First(); + + /// + /// Returns the last boundary. The iterator's current position is set + /// to the last text boundary. + /// + /// The character index of the last text boundary. + public abstract int Last(); + + /// + /// Returns the nth boundary from the current boundary. If either + /// the first or last text boundary has been reached, it returns + /// and the current position is set to either + /// the first or last text boundary depending on which one is reached. Otherwise, + /// the iterator's current position is set to the new boundary. + /// For example, if the iterator's current position is the mth text boundary + /// and three more boundaries exist from the current boundary to the last text + /// boundary, the Next(2) call will return m + 2. The new text position is set + /// to the (m + 2)th text boundary. A Next(4) call would return + /// and the last text boundary would become the + /// new text position. + /// + /// + /// which boundary to return. A value of 0 + /// does nothing. Negative values move to previous boundaries + /// and positive values move to later boundaries. + /// + /// + /// The character index of the nth boundary from the current position + /// or if either first or last text boundary + /// has been reached. + /// + public abstract int Next(int n); + + /// + /// Returns the boundary following the current boundary. If the current boundary + /// is the last text boundary, it returns BreakIterator.DONE and + /// the iterator's current position is unchanged. Otherwise, the iterator's + /// current position is set to the boundary following the current boundary. + /// + /// + /// The character index of the next text boundary or + /// if the current boundary is the last text + /// boundary. + /// Equivalent to Next(1). + /// + /// + public abstract int Next(); + + /// + /// Returns the boundary preceding the current boundary. If the current boundary + /// is the first text boundary, it returns BreakIterator.DONE and + /// the iterator's current position is unchanged. Otherwise, the iterator's + /// current position is set to the boundary preceding the current boundary. + /// + /// + /// The character index of the previous text boundary or + /// if the current boundary is the first text + /// boundary. + /// + public abstract int Previous(); + + /// + /// Returns the first boundary following the specified character offset. If the + /// specified offset equals to the last text boundary, it returns + /// and the iterator's current position is unchanged. + /// Otherwise, the iterator's current position is set to the returned boundary. + /// The value returned is always greater than the offset or the value + /// . + /// + /// the character offset to begin scanning. + /// + /// The first boundary after the specified offset or + /// if the last text boundary is passed in + /// as the offset. + /// + /// + /// if the specified offset is less than + /// the first text boundary or greater than the last text boundary. + /// + public abstract int Following(int offset); + + /// + /// Returns the last boundary preceding the specified character offset. If the + /// specified offset equals to the first text boundary, it returns + /// and the iterator's current position is unchanged. + /// Otherwise, the iterator's current position is set to the returned boundary. + /// The value returned is always less than the offset or the value + /// . + /// + /// the character offset to begin scanning. + /// + /// The last boundary before the specified offset or + /// if the first text boundary is passed in + /// as the offset. + /// + public abstract int Preceding(int offset); + //{ + // // NOTE: This implementation is here solely because we can't add new + // // abstract methods to an existing class. There is almost ALWAYS a + // // better, faster way to do this. + // int pos = Following(offset); + // while (pos >= offset && pos != DONE) + // { + // pos = Previous(); + // } + // return pos; + //} + + /// + /// Returns true if the specified character offset is a text boundary. + /// + /// the character offset to check. + /// true if "offset" is a boundary position, false otherwise. + /// + /// if the specified offset is less than + /// the first text boundary or greater than the last text boundary. + /// + public abstract bool IsBoundary(int offset); + //{ + // // NOTE: This implementation probably is wrong for most situations + // // because it fails to take into account the possibility that a + // // CharacterIterator passed to setText() may not have a begin offset + // // of 0. But since the abstract BreakIterator doesn't have that + // // knowledge, it assumes the begin offset is 0. If you subclass + // // BreakIterator, copy the SimpleTextBoundary implementation of this + // // function into your subclass. [This should have been abstract at + // // this level, but it's too late to fix that now.] + // if (offset == 0) + // { + // return true; + // } + // int boundary = Following(offset - 1); + // if (boundary == DONE) + // { + // throw new ArgumentException(); + // } + // return boundary == offset; + //} + + /// + /// Returns character index of the text boundary that was most + /// recently returned by Next(), Next(int), Previous(), First(), Last(), + /// Following(int) or Preceding(int). If any of these methods returns + /// because either first or last text boundary + /// has been reached, it returns the first or last text boundary depending on + /// which one is reached. + /// + /// + /// The text boundary returned from the above methods, first or last + /// text boundary. + /// + /// + /// + /// + /// + /// + /// + /// + public abstract int Current { get; } + + /// + /// Get the text being scanned + /// + /// the text being scanned + //public abstract CharacterIterator GetText(); + public abstract string Text { get; } + + /// + /// Set a new text string to be scanned. The current scan + /// position is reset to First(). + /// + /// new text to scan. + public virtual void SetText(string newText) + { + SetText(new StringCharacterIterator(newText)); + } + + /// + /// Set a new text string to be scanned. The current scan + /// position is reset to First(). + /// + /// new text to scan. + public abstract void SetText(CharacterIterator newText); + } +} +#endif \ No newline at end of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b1fdcca3/src/Lucene.Net.Icu/Support/CharacterIterator.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Icu/Support/CharacterIterator.cs b/src/Lucene.Net.Icu/Support/CharacterIterator.cs new file mode 100644 index 0000000..0c81629 --- /dev/null +++ b/src/Lucene.Net.Icu/Support/CharacterIterator.cs @@ -0,0 +1,50 @@ +#if FEATURE_BREAKITERATOR +using System; + +namespace Lucene.Net.Support +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + public abstract class CharacterIterator + { + public static readonly char DONE = '\uFFFF'; + + public abstract char Current { get; } + + public abstract char First(); + + public abstract char Last(); + + public abstract char Next(); + + public abstract char Previous(); + + public abstract char SetIndex(int position); + + public abstract int BeginIndex { get; } + + public abstract int EndIndex { get; } + + public abstract int Index { get; } + + public abstract object Clone(); + + public abstract string GetTextAsString(); + } +} +#endif \ No newline at end of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b1fdcca3/src/Lucene.Net.Icu/Support/IcuBreakIterator.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Icu/Support/IcuBreakIterator.cs b/src/Lucene.Net.Icu/Support/IcuBreakIterator.cs new file mode 100644 index 0000000..79819ed --- /dev/null +++ b/src/Lucene.Net.Icu/Support/IcuBreakIterator.cs @@ -0,0 +1,394 @@ +#if FEATURE_BREAKITERATOR +using Lucene.Net.Support; +using System; +using System.Collections.Generic; +using System.Globalization; +using System.Linq; +using System.Text; + +namespace Lucene.Net +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + /// + /// A implementation that encapsulates the functionality + /// of icu.net's static class. A + /// provides methods to move forward, reverse, and randomly through a set of text breaks + /// defined by the enumeration. + /// + // LUCENENET specific type + public class IcuBreakIterator : BreakIterator + { + private readonly Icu.Locale locale; + private readonly Icu.BreakIterator.UBreakIteratorType type; + + private List boundaries = new List(); + private int currentBoundaryIndex; // Index (not the value) of the current boundary in boundaries + private string text; + + /// + /// The start offset for the string, if supplied by a + /// + protected int m_start; + + /// + /// The end offset for the string, if supplied by a + /// + protected int m_end; + + private bool enableHacks = false; + + public IcuBreakIterator(Icu.BreakIterator.UBreakIteratorType type) + : this(type, CultureInfo.CurrentCulture) + { + } + + public IcuBreakIterator(Icu.BreakIterator.UBreakIteratorType type, CultureInfo locale) + { + if (locale == null) + throw new ArgumentNullException("locale"); + this.locale = new Icu.Locale(locale.Name); + this.type = type; + } + + + public virtual bool EnableHacks + { + get { return enableHacks; } + set { enableHacks = value; } + } + + /// + /// Sets the current iteration position to the beginning of the text. + /// + /// The offset of the beginning of the text. + public override int First() + { + currentBoundaryIndex = 0; + return ReturnCurrent(); + } + + /// + /// Sets the current iteration position to the end of the text. + /// + /// The text's past-the-end offset. + public override int Last() + { + currentBoundaryIndex = boundaries.Count - 1; + return ReturnCurrent(); + } + + /// + /// Advances the iterator either forward or backward the specified number of steps. + /// Negative values move backward, and positive values move forward. This is + /// equivalent to repeatedly calling or . + /// + /// The number of steps to move. The sign indicates the direction + /// (negative is backwards, and positive is forwards). + /// The character offset of the boundary position n boundaries away from + /// the current one. + public override int Next(int n) + { + int result = Current; + while (n > 0) + { + result = Next(); + --n; + } + while (n < 0) + { + result = Previous(); + ++n; + } + return result; + } + + /// + /// Advances the iterator to the next boundary position. + /// + /// The position of the first boundary after this one. + public override int Next() + { + if (currentBoundaryIndex >= boundaries.Count - 1 || boundaries.Count == 0) + { + return DONE; + } + currentBoundaryIndex++; + return ReturnCurrent(); + } + + /// + /// Advances the iterator backwards, to the last boundary preceding this one. + /// + /// The position of the last boundary position preceding this one. + public override int Previous() + { + if (currentBoundaryIndex == 0 || boundaries.Count == 0) + { + return DONE; + } + currentBoundaryIndex--; + return ReturnCurrent(); + } + + /// + /// Throw unless begin <= offset < end. + /// + /// + private void CheckOffset(int offset) + { + if (offset < m_start || offset > m_end) + { + throw new ArgumentException("offset out of bounds"); + } + } + + /// + /// Sets the iterator to refer to the first boundary position following + /// the specified position. + /// + /// The position from which to begin searching for a break position. + /// The position of the first break after the current position. + public override int Following(int offset) + { + CheckOffset(offset); + + if (boundaries.Count == 0) + { + return DONE; + } + + int following = GetLowestIndexGreaterThan(offset); + if (following == -1) + { + currentBoundaryIndex = boundaries.Count - 1; + return DONE; + } + else + { + currentBoundaryIndex = following; + } + return ReturnCurrent(); + } + + private int GetLowestIndexGreaterThan(int offset) + { + int index = boundaries.BinarySearch(offset); + if (index < 0) + { + return ~index; + } + else if (index + 1 < boundaries.Count) + { + return index + 1; + } + + return -1; + } + + /// + /// Sets the iterator to refer to the last boundary position before the + /// specified position. + /// + /// The position to begin searching for a break from. + /// The position of the last boundary before the starting position. + public override int Preceding(int offset) + { + CheckOffset(offset); + + if (boundaries.Count == 0) + { + return DONE; + } + + int preceeding = GetHighestIndexLessThan(offset); + if (preceeding == -1) + { + currentBoundaryIndex = 0; + return DONE; + } + else + { + currentBoundaryIndex = preceeding; + } + return ReturnCurrent(); + } + + private int GetHighestIndexLessThan(int offset) + { + int index = boundaries.BinarySearch(offset); + if (index < 0) + { + return ~index - 1; + } + else + { + // NOTE: This is intentionally allowed to return -1 in the case + // where index == 0. This state indicates we are before the first boundary. + return index - 1; + } + } + + /// + /// Returns the current iteration position. + /// + public override int Current + { + get { return ReturnCurrent(); } + } + + /// + /// Gets the text being analyzed. + /// + public override string Text + { + get + { + return text; + } + } + + /// + /// Set the iterator to analyze a new piece of text. This function resets + /// the current iteration position to the beginning of the text. + /// + /// The text to analyze. + public override void SetText(string newText) + { + text = newText; + currentBoundaryIndex = 0; + m_start = 0; + m_end = newText.Length; + + LoadBoundaries(m_start, m_end); + } + + public override void SetText(CharacterIterator newText) + { + text = newText.GetTextAsString(); + currentBoundaryIndex = 0; + m_start = newText.BeginIndex; + m_end = newText.EndIndex; + + LoadBoundaries(m_start, m_end); + } + + private void LoadBoundaries(int start, int end) + { + IEnumerable icuBoundaries; + string offsetText = text.Substring(start, end - start); + +#if !NETSTANDARD + try + { +#endif + if (type == Icu.BreakIterator.UBreakIteratorType.WORD) + { + if (enableHacks) + { + // LUCENENET TODO: HACK - replacing hyphen with "a" so hyphenated words aren't broken + offsetText = offsetText.Replace("-", "a"); + } + + icuBoundaries = Icu.BreakIterator.GetWordBoundaries(locale, offsetText, true); + } + else + { + if (enableHacks && type == Icu.BreakIterator.UBreakIteratorType.SENTENCE) + { + // LUCENENET TODO: HACK - newline character causes incorrect sentence breaking. + offsetText = offsetText.Replace("\n", " "); + // LUCENENET TODO: HACK - the ICU sentence logic doesn't work (in English anyway) when sentences don't + // begin with capital letters. + offsetText = CapitalizeFirst(offsetText); + } + + icuBoundaries = Icu.BreakIterator.GetBoundaries(type, locale, offsetText); + } +#if !NETSTANDARD + } + catch (AccessViolationException ace) + { + // LUCENENET TODO: Find a reliable way to reproduce and report the + // AccessViolationException that happens here to the icu-dotnet project team + throw new Exception("Hit AccessViolationException: " + ace.ToString(), ace); + } +#endif + + boundaries = icuBoundaries + .Select(t => new[] { t.Start + start, t.End + start }) + .SelectMany(b => b) + .Distinct() + .ToList(); + } + + /// + /// Returns true if the specified character offset is a text boundary. + /// + /// the character offset to check. + /// true if "offset" is a boundary position, false otherwise. + public override bool IsBoundary(int offset) + { + CheckOffset(offset); + return boundaries.Contains(offset); + } + + private int ReturnCurrent() + { + if (boundaries.Count > 0) + { + return currentBoundaryIndex < boundaries.Count && currentBoundaryIndex > -1 + ? boundaries[currentBoundaryIndex] + : DONE; + } + + // If there are no boundaries, we must return the start offset + return m_start; + } + + /// + /// LUCENENET TODO: This is a temporary workaround for an issue with icu-dotnet + /// where it doesn't correctly break sentences unless they begin with a capital letter. + /// If/when ICU is fixed, this method should be deleted and the IcuBreakIterator + /// code changed to remove calls to this method. + /// + public static string CapitalizeFirst(string s) + { + bool isNewSentence = true; + var result = new StringBuilder(s.Length); + for (int i = 0; i < s.Length; i++) + { + if (isNewSentence && char.IsLetter(s[i])) + { + result.Append(char.ToUpper(s[i])); + isNewSentence = false; + } + else + result.Append(s[i]); + + if (s[i] == '!' || s[i] == '?' || s[i] == '.') + { + isNewSentence = true; + } + } + + return result.ToString(); + } + } +} +#endif http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b1fdcca3/src/Lucene.Net.Icu/Support/StringCharacterIterator.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Icu/Support/StringCharacterIterator.cs b/src/Lucene.Net.Icu/Support/StringCharacterIterator.cs new file mode 100644 index 0000000..a91e49a --- /dev/null +++ b/src/Lucene.Net.Icu/Support/StringCharacterIterator.cs @@ -0,0 +1,232 @@ +#if FEATURE_BREAKITERATOR +/* + * Copyright (c) 1996, 2013, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. Oracle designates this + * particular file as subject to the "Classpath" exception as provided + * by Oracle in the LICENSE file that accompanied this code. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +/* + * (C) Copyright Taligent, Inc. 1996, 1997 - All Rights Reserved + * (C) Copyright IBM Corp. 1996 - 1998 - All Rights Reserved + * + * The original version of this source code and documentation + * is copyrighted and owned by Taligent, Inc., a wholly-owned + * subsidiary of IBM. These materials are provided under terms + * of a License Agreement between Taligent and Sun. This technology + * is protected by multiple US and International patents. + * + * This notice and attribution to Taligent may not be removed. + * Taligent is a registered trademark of Taligent, Inc. + * + */ + +using System; +using System.Collections.Generic; +using System.Globalization; +using System.Linq; +using System.Text; +using System.Threading.Tasks; + +namespace Lucene.Net.Support +{ + /// + /// implements the + /// protocol for a . + /// The class iterates over the + /// entire . + /// + /// + public class StringCharacterIterator : CharacterIterator + { + private string text; + private int begin; + private int end; + // invariant: begin <= pos <= end + private int pos; + + + public StringCharacterIterator(string text) + : this(text, 0) + { + } + + public StringCharacterIterator(string text, int pos) + : this(text, 0, text.Length, pos) + { + } + + public StringCharacterIterator(string text, int begin, int end, int pos) + { + if (text == null) + throw new ArgumentNullException("text"); + this.text = text; + + if (begin < 0 || begin > end || end > text.Length) + throw new ArgumentException("Invalid substring range"); + + if (pos < begin || pos > end) + throw new ArgumentException("Invalid position"); + + this.begin = begin; + this.end = end; + this.pos = pos; + } + + public void SetText(string text) + { + if (text == null) + throw new ArgumentNullException("text"); + this.text = text; + this.begin = 0; + this.end = text.Length; + this.pos = 0; + } + + public override char First() + { + pos = begin; + return Current; + } + + public override char Last() + { + if (end != begin) + { + pos = end - 1; + } + else + { + pos = end; + } + return Current; + } + + public override char SetIndex(int position) + { + if (position < begin || position > end) + throw new ArgumentException("Invalid index"); + pos = position; + return Current; + } + + public override char Current + { + get + { + if (pos >= begin && pos < end) + { + return text[pos]; + } + else + { + return DONE; + } + } + } + + public override char Next() + { + if (pos < end - 1) + { + pos++; + return text[pos]; + } + else + { + pos = end; + return DONE; + } + } + + public override char Previous() + { + if (pos > begin) + { + pos--; + return text[pos]; + } + else + { + return DONE; + } + } + + + public override int BeginIndex + { + get + { + return begin; + } + } + + public override int EndIndex + { + get + { + return end; + } + } + + public override int Index + { + get + { + return pos; + } + } + + public override string GetTextAsString() + { + return text; + } + + public override bool Equals(object obj) + { + if (this == obj) + return true; + if (!(obj is StringCharacterIterator)) + return false; + + StringCharacterIterator that = (StringCharacterIterator)obj; + + if (GetHashCode() != that.GetHashCode()) + return false; + if (!text.Equals(that.text, StringComparison.Ordinal)) + return false; + if (pos != that.pos || begin != that.begin || end != that.end) + return false; + return true; + } + + public override int GetHashCode() + { + return base.GetHashCode() ^ pos ^ begin ^ end; + } + + public override object Clone() + { + return MemberwiseClone(); + } + } +} +#endif http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b1fdcca3/src/Lucene.Net.Icu/project.json ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Icu/project.json b/src/Lucene.Net.Icu/project.json new file mode 100644 index 0000000..2e8f212 --- /dev/null +++ b/src/Lucene.Net.Icu/project.json @@ -0,0 +1,63 @@ +{ + "version": "4.8.0", + "dependencies": { + "icu.net": "54.1.1-alpha", + "Lucene.Net": "4.8.0", + "Lucene.Net.Analysis.Common": "4.8.0", + "Lucene.Net.Highlighter": "4.8.0" + }, + "buildOptions": { + "debugType": "portable", + "compile": { + "includeFiles": [ + "../CommonAssemblyInfo.cs", + "../Lucene.Net.Analysis.Common/Analysis/Th/ThaiAnalyzer.cs", + "../Lucene.Net.Analysis.Common/Analysis/Th/ThaiTokenizer.cs", + "../Lucene.Net.Analysis.Common/Analysis/Th/ThaiTokenizerFactory.cs", + "../Lucene.Net.Analysis.Common/Analysis/Th/ThaiWordFilter.cs", + "../Lucene.Net.Analysis.Common/Analysis/Th/ThaiWordFilterFactory.cs", + "../Lucene.Net.Analysis.Common/Analysis/Util/CharArrayIterator.cs", + "../Lucene.Net.Analysis.Common/Analysis/Util/SegmentingTokenizerBase.cs", + "../Lucene.Net.Highlighter/PostingsHighlight/DefaultPassageFormatter.cs", + "../Lucene.Net.Highlighter/PostingsHighlight/MultiTermHighlighting.cs", + "../Lucene.Net.Highlighter/PostingsHighlight/Passage.cs", + "../Lucene.Net.Highlighter/PostingsHighlight/PassageFormatter.cs", + "../Lucene.Net.Highlighter/PostingsHighlight/PassageScorer.cs", + "../Lucene.Net.Highlighter/PostingsHighlight/PostingsHighlighter.cs", + "../Lucene.Net.Highlighter/PostingsHighlight/WholeBreakIterator.cs", + "../Lucene.Net.Highlighter/VectorHighlight/BreakIteratorBoundaryScanner.cs" + ] + }, + "embed": { + "includeFiles": [ "Analysis/Th/stopwords.txt" ] + } + }, + "packOptions": { + "summary": "", + "licenseUrl": "https://github.com/apache/lucenenet/blob/master/LICENSE.txt", + "iconUrl": "https://github.com/apache/lucenenet/blob/master/branding/logo/lucene-net-icon-128x128.png?raw=true", + "owners": [ + "The Apache Software Foundation" + ], + "repository": { + "url": "https://github.com/apache/lucenenet" + }, + "tags": [ "lucene.net", "core", "text", "search", "information", "retrieval", "lucene", "apache", "analysis", "index", "query" ] + }, + "frameworks": { + "netstandard1.5": { + "imports": "dnxcore50", + "buildOptions": { + "define": [ "NETSTANDARD", "FEATURE_BREAKITERATOR" ] + }, + "dependencies": { + "NETStandard.Library": "1.6.0" + } + }, + "net451": { + "buildOptions": { + "define": [ "FEATURE_BREAKITERATOR", "FEATURE_SERIALIZABLE" ] + } + } + } +} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b1fdcca3/src/Lucene.Net.Tests.Highlighter/Lucene.Net.Tests.Highlighter.csproj ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Tests.Highlighter/Lucene.Net.Tests.Highlighter.csproj b/src/Lucene.Net.Tests.Highlighter/Lucene.Net.Tests.Highlighter.csproj index 3ed7239..d87e43d 100644 --- a/src/Lucene.Net.Tests.Highlighter/Lucene.Net.Tests.Highlighter.csproj +++ b/src/Lucene.Net.Tests.Highlighter/Lucene.Net.Tests.Highlighter.csproj @@ -56,7 +56,6 @@ - @@ -70,7 +69,7 @@ - + Properties\CommonAssemblyInfo.cs