Return-Path: X-Original-To: archive-asf-public-internal@cust-asf2.ponee.io Delivered-To: archive-asf-public-internal@cust-asf2.ponee.io Received: from cust-asf.ponee.io (cust-asf.ponee.io [163.172.22.183]) by cust-asf2.ponee.io (Postfix) with ESMTP id A9232200BB9 for ; Sun, 23 Oct 2016 15:02:00 +0200 (CEST) Received: by cust-asf.ponee.io (Postfix) id A7D56160AD8; Sun, 23 Oct 2016 13:02:00 +0000 (UTC) Delivered-To: archive-asf-public@cust-asf.ponee.io Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by cust-asf.ponee.io (Postfix) with SMTP id 1D589160B03 for ; Sun, 23 Oct 2016 15:01:57 +0200 (CEST) Received: (qmail 6400 invoked by uid 500); 23 Oct 2016 13:01:55 -0000 Mailing-List: contact commits-help@lucenenet.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: lucene-net-dev@lucenenet.apache.org Delivered-To: mailing list commits@lucenenet.apache.org Received: (qmail 3652 invoked by uid 99); 23 Oct 2016 13:01:49 -0000 Received: from git1-us-west.apache.org (HELO git1-us-west.apache.org) (140.211.11.23) by apache.org (qpsmtpd/0.29) with ESMTP; Sun, 23 Oct 2016 13:01:49 +0000 Received: by git1-us-west.apache.org (ASF Mail Server at git1-us-west.apache.org, from userid 33) id 9F041F16CD; Sun, 23 Oct 2016 13:01:48 +0000 (UTC) Content-Type: text/plain; charset="us-ascii" MIME-Version: 1.0 Content-Transfer-Encoding: 8bit From: nightowl888@apache.org To: commits@lucenenet.apache.org Date: Sun, 23 Oct 2016 13:02:34 -0000 Message-Id: <197c474cbc314ee38bb59af5b054dfae@git.apache.org> In-Reply-To: <8416623a27914fd195213e3a8d36fe03@git.apache.org> References: <8416623a27914fd195213e3a8d36fe03@git.apache.org> X-Mailer: ASF-Git Admin Mailer Subject: [48/50] [abbrv] lucenenet git commit: Ported Analysis.Stempel + tests (closes #190) archived-at: Sun, 23 Oct 2016 13:02:00 -0000 http://git-wip-us.apache.org/repos/asf/lucenenet/blob/29525086/src/Lucene.Net.Analysis.Stempel/Egothor.Stemmer/Optimizer2.cs 
---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Stempel/Egothor.Stemmer/Optimizer2.cs b/src/Lucene.Net.Analysis.Stempel/Egothor.Stemmer/Optimizer2.cs new file mode 100644 index 0000000..d1b25a6 --- /dev/null +++ b/src/Lucene.Net.Analysis.Stempel/Egothor.Stemmer/Optimizer2.cs @@ -0,0 +1,92 @@ +/* + Egothor Software License version 1.00 + Copyright (C) 1997-2004 Leo Galambos. + Copyright (C) 2002-2004 "Egothor developers" + on behalf of the Egothor Project. + All rights reserved. + + This software is copyrighted by the "Egothor developers". If this + license applies to a single file or document, the "Egothor developers" + are the people or entities mentioned as copyright holders in that file + or document. If this license applies to the Egothor project as a + whole, the copyright holders are the people or entities mentioned in + the file CREDITS. This file can be found in the same location as this + license in the distribution. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + 1. Redistributions of source code must retain the above copyright + notice, the list of contributors, this list of conditions, and the + following disclaimer. + 2. Redistributions in binary form must reproduce the above copyright + notice, the list of contributors, this list of conditions, and the + disclaimer that follows these conditions in the documentation + and/or other materials provided with the distribution. + 3. The name "Egothor" must not be used to endorse or promote products + derived from this software without prior written permission. For + written permission, please contact Leo.G@seznam.cz + 4. Products derived from this software may not be called "Egothor", + nor may "Egothor" appear in their name, without prior written + permission from Leo.G@seznam.cz. 
+ + In addition, we request that you include in the end-user documentation + provided with the redistribution and/or in the software itself an + acknowledgement equivalent to the following: + "This product includes software developed by the Egothor Project. + http://egothor.sf.net/" + + THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED + WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + IN NO EVENT SHALL THE EGOTHOR PROJECT OR ITS CONTRIBUTORS BE LIABLE + FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE + OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN + IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + This software consists of voluntary contributions made by many + individuals on behalf of the Egothor Project and was originally + created by Leo Galambos (Leo.G@seznam.cz). + */ + +
namespace Egothor.Stemmer
{
    /// <summary>
    /// The Optimizer class is a <see cref="Trie"/> that will be reduced (have empty rows removed).
    /// <para>
    /// This is the result of allowing a joining of rows when there is no collision
    /// between non-null values in the rows. Information loss, resulting in
    /// the stemmer not being able to recognize words (as in <see cref="Optimizer"/>), is
    /// curtailed, allowing the stemmer to recognize words for which the original
    /// trie was built. Use of this class allows the stemmer to be self-teaching.
    /// </para>
    /// </summary>
    public class Optimizer2 : Optimizer
    {
        /// <summary>
        /// Constructor for the <see cref="Optimizer2"/> object.
        /// </summary>
        public Optimizer2() { }

        /// <summary>
        /// Merge the given <see cref="Cell"/>s and return the resulting <see cref="Cell"/>.
        /// The cells are only merged when their command, reference and skip values
        /// are all equal; the result is a copy of <paramref name="m"/> whose count is
        /// increased by the count of <paramref name="e"/>.
        /// </summary>
        /// <param name="m">the master <see cref="Cell"/></param>
        /// <param name="e">the existing <see cref="Cell"/></param>
        /// <returns>the resulting <see cref="Cell"/>, or <c>null</c> if the operation cannot be realized</returns>
        public override Cell Merge(Cell m, Cell e)
        {
            bool sameTarget = m.cmd == e.cmd && m.@ref == e.@ref && m.skip == e.skip;
            if (!sameTarget)
            {
                // Any difference is a collision: refuse to merge.
                return null;
            }

            Cell merged = new Cell(m);
            merged.cnt += e.cnt;
            return merged;
        }
    }
}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/29525086/src/Lucene.Net.Analysis.Stempel/Egothor.Stemmer/Reduce.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Stempel/Egothor.Stemmer/Reduce.cs b/src/Lucene.Net.Analysis.Stempel/Egothor.Stemmer/Reduce.cs new file mode 100644 index 0000000..61a1c3f --- /dev/null +++ b/src/Lucene.Net.Analysis.Stempel/Egothor.Stemmer/Reduce.cs @@ -0,0 +1,143 @@ +using Lucene.Net.Support; +using System.Collections.Generic; + +/* + Egothor Software License version 1.00 + Copyright (C) 1997-2004 Leo Galambos. + Copyright (C) 2002-2004 "Egothor developers" + on behalf of the Egothor Project. + All rights reserved. + + This software is copyrighted by the "Egothor developers". If this + license applies to a single file or document, the "Egothor developers" + are the people or entities mentioned as copyright holders in that file + or document. If this license applies to the Egothor project as a + whole, the copyright holders are the people or entities mentioned in + the file CREDITS. This file can be found in the same location as this + license in the distribution. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + 1. Redistributions of source code must retain the above copyright + notice, the list of contributors, this list of conditions, and the + following disclaimer. + 2. 
Redistributions in binary form must reproduce the above copyright + notice, the list of contributors, this list of conditions, and the + disclaimer that follows these conditions in the documentation + and/or other materials provided with the distribution. + 3. The name "Egothor" must not be used to endorse or promote products + derived from this software without prior written permission. For + written permission, please contact Leo.G@seznam.cz + 4. Products derived from this software may not be called "Egothor", + nor may "Egothor" appear in their name, without prior written + permission from Leo.G@seznam.cz. + + In addition, we request that you include in the end-user documentation + provided with the redistribution and/or in the software itself an + acknowledgement equivalent to the following: + "This product includes software developed by the Egothor Project. + http://egothor.sf.net/" + + THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED + WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + IN NO EVENT SHALL THE EGOTHOR PROJECT OR ITS CONTRIBUTORS BE LIABLE + FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE + OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN + IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + This software consists of voluntary contributions made by many + individuals on behalf of the Egothor Project and was originally + created by Leo Galambos (Leo.G@seznam.cz). + */ + +
namespace Egothor.Stemmer
{
    /// <summary>
    /// The <see cref="Reduce"/> object is used to remove gaps in a <see cref="Trie"/> which stores a dictionary.
    /// </summary>
    public class Reduce
    {
        /// <summary>
        /// Constructor for the <see cref="Reduce"/> object.
        /// </summary>
        public Reduce() { }

        /// <summary>
        /// Optimize (remove holes in the rows) the given <see cref="Trie"/> and return the
        /// restructured <see cref="Trie"/>. Only rows reachable from the root of
        /// <paramref name="orig"/> are carried over, and references are renumbered
        /// to the rows' new positions.
        /// </summary>
        /// <param name="orig">the <see cref="Trie"/> to optimize</param>
        /// <returns>the restructured <see cref="Trie"/></returns>
        public virtual Trie Optimize(Trie orig)
        {
            IList<string> cmds = orig.cmds;
            IList<Row> orows = orig.rows;

            // remap[oldIndex] == newIndex, or -1 while the row has not been visited.
            int[] remap = new int[orows.Count];
            Arrays.Fill(remap, -1);

            // The walk must read from the ORIGINAL rows. Passing a fresh empty list
            // as the source (as this method previously did) makes the very first
            // old[ind] lookup in RemoveGaps fail for every trie.
            IList<Row> rows = RemoveGaps(orig.root, orows, new List<Row>(), remap);

            return new Trie(orig.forward, remap[orig.root], cmds, rows);
        }

        /// <summary>
        /// Depth-first copy of the row at <paramref name="ind"/> and every row reachable
        /// from it that has not been visited yet. Each visited row is appended to
        /// <paramref name="to"/> and finally replaced by a <see cref="Remap"/> copy whose
        /// references use the new indices.
        /// </summary>
        /// <param name="ind">index (in <paramref name="old"/>) of the row to start from</param>
        /// <param name="old">the source rows</param>
        /// <param name="to">the list receiving the compacted rows</param>
        /// <param name="remap">old-index to new-index table; negative means "not visited"</param>
        /// <returns><paramref name="to"/></returns>
        internal virtual IList<Row> RemoveGaps(int ind, IList<Row> old, IList<Row> to, int[] remap)
        {
            // Reserve this row's new slot before recursing so that cycles and
            // shared children see it as already visited.
            remap[ind] = to.Count;

            Row now = old[ind];
            to.Add(now);
            foreach (Cell c in now.cells.Values)
            {
                if (c.@ref >= 0 && remap[c.@ref] < 0)
                {
                    RemoveGaps(c.@ref, old, to, remap);
                }
            }

            // All children now have their final index, so the references can be rewritten.
            to[remap[ind]] = new Remap(now, remap);
            return to;
        }

        /// <summary>
        /// This class is part of the Egothor Project.
        /// A <see cref="Row"/> that is a cell-by-cell copy of another row with every
        /// non-negative reference translated through a remap table.
        /// </summary>
        internal class Remap : Row
        {
            /// <summary>
            /// Constructor for the <see cref="Remap"/> object.
            /// </summary>
            /// <param name="old">the row whose cells are copied</param>
            /// <param name="remap">table translating old row indices to new row indices</param>
            public Remap(Row old, int[] remap)
                : base()
            {
                foreach (KeyValuePair<char, Cell> entry in old.cells)
                {
                    Cell source = entry.Value;
                    Cell nc = new Cell(source);
                    if (source.@ref >= 0)
                    {
                        nc.@ref = remap[nc.@ref];
                    }
                    cells[entry.Key] = nc;
                }
            }
        }
    }
}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/29525086/src/Lucene.Net.Analysis.Stempel/Egothor.Stemmer/Row.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Stempel/Egothor.Stemmer/Row.cs b/src/Lucene.Net.Analysis.Stempel/Egothor.Stemmer/Row.cs new file mode 100644 index 0000000..6fdad76 --- /dev/null +++ b/src/Lucene.Net.Analysis.Stempel/Egothor.Stemmer/Row.cs @@ -0,0 +1,342 @@ +using Lucene.Net.Support; 
+using System.Collections.Generic; +using System.IO; + +/* + Egothor Software License version 1.00 + Copyright (C) 1997-2004 Leo Galambos. + Copyright (C) 2002-2004 "Egothor developers" + on behalf of the Egothor Project. + All rights reserved. + + This software is copyrighted by the "Egothor developers". If this + license applies to a single file or document, the "Egothor developers" + are the people or entities mentioned as copyright holders in that file + or document. If this license applies to the Egothor project as a + whole, the copyright holders are the people or entities mentioned in + the file CREDITS. This file can be found in the same location as this + license in the distribution. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + 1. Redistributions of source code must retain the above copyright + notice, the list of contributors, this list of conditions, and the + following disclaimer. + 2. Redistributions in binary form must reproduce the above copyright + notice, the list of contributors, this list of conditions, and the + disclaimer that follows these conditions in the documentation + and/or other materials provided with the distribution. + 3. The name "Egothor" must not be used to endorse or promote products + derived from this software without prior written permission. For + written permission, please contact Leo.G@seznam.cz + 4. Products derived from this software may not be called "Egothor", + nor may "Egothor" appear in their name, without prior written + permission from Leo.G@seznam.cz. + + In addition, we request that you include in the end-user documentation + provided with the redistribution and/or in the software itself an + acknowledgement equivalent to the following: + "This product includes software developed by the Egothor Project. 
+ http://egothor.sf.net/" + + THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED + WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + IN NO EVENT SHALL THE EGOTHOR PROJECT OR ITS CONTRIBUTORS BE LIABLE + FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE + OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN + IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + This software consists of voluntary contributions made by many + individuals on behalf of the Egothor Project and was originally + created by Leo Galambos (Leo.G@seznam.cz). + */ + +
namespace Egothor.Stemmer
{
    /// <summary>
    /// The <see cref="Row"/> class represents a row in a matrix representation of a <see cref="Trie"/>.
    /// Each row maps a character to a <see cref="Cell"/>.
    /// </summary>
    public class Row
    {
        // LUCENENET NOTE: This was TreeMap in Java. Neither TreeMap nor SortedDictionary
        // holds duplicate keys; every write in this class goes through the indexer,
        // which replaces an existing entry the same way TreeMap.put() does.
        internal SortedDictionary<char, Cell> cells = new SortedDictionary<char, Cell>();
        internal int uniformCnt = 0;
        internal int uniformSkip = 0;

        /// <summary>
        /// Construct a <see cref="Row"/> object from input carried in via the given input stream.
        /// The stream is expected to hold a record count followed by that many
        /// (char, cmd, cnt, ref, skip) records, i.e. what <see cref="Store(IDataOutput)"/> writes.
        /// </summary>
        /// <param name="is">the input stream</param>
        /// <exception cref="IOException">if an I/O error occurs</exception>
        public Row(IDataInput @is)
        {
            for (int remaining = @is.ReadInt(); remaining > 0; remaining--)
            {
                char ch = @is.ReadChar();
                Cell c = new Cell();
                c.cmd = @is.ReadInt();
                c.cnt = @is.ReadInt();
                c.@ref = @is.ReadInt();
                c.skip = @is.ReadInt();
                cells[ch] = c;
            }
        }

        /// <summary>
        /// The default constructor for the <see cref="Row"/> object.
        /// </summary>
        public Row() { }

        /// <summary>
        /// Construct a <see cref="Row"/> using the cells of the given <see cref="Row"/>.
        /// The cell dictionary is shared with <paramref name="old"/>, not copied, so a
        /// change made through either row is visible through the other.
        /// </summary>
        /// <param name="old">the <see cref="Row"/> to copy</param>
        public Row(Row old)
        {
            cells = old.cells;
        }

        /// <summary>
        /// Set the command in the <see cref="Cell"/> of the given character to the given integer,
        /// creating the cell if necessary. The cell's count is reset to 1 for a
        /// non-negative command and to 0 otherwise.
        /// </summary>
        /// <param name="way">the character defining the <see cref="Cell"/></param>
        /// <param name="cmd">the new command</param>
        public void SetCmd(char way, int cmd)
        {
            Cell c = At(way);
            if (c == null)
            {
                c = new Cell();
                cells[way] = c;
            }
            c.cmd = cmd;
            c.cnt = (cmd >= 0) ? 1 : 0;
        }

        /// <summary>
        /// Set the reference to the next row in the <see cref="Cell"/> of the given character to the
        /// given integer, creating the cell if necessary.
        /// </summary>
        /// <param name="way">the character defining the <see cref="Cell"/></param>
        /// <param name="ref">The new ref value</param>
        public void SetRef(char way, int @ref)
        {
            Cell c = At(way);
            if (c == null)
            {
                c = new Cell();
                cells[way] = c;
            }
            c.@ref = @ref;
        }

        /// <summary>
        /// Return the number of cells in use, i.e. cells that hold a command or a reference.
        /// </summary>
        /// <returns>the number of cells in use</returns>
        public int GetCells()
        {
            int size = 0;
            foreach (Cell e in cells.Values)
            {
                if (e.cmd >= 0 || e.@ref >= 0)
                {
                    size++;
                }
            }
            return size;
        }

        /// <summary>
        /// Return the number of references (how many transitions) to other rows.
        /// </summary>
        /// <returns>the number of references</returns>
        public int GetCellsPnt()
        {
            int size = 0;
            foreach (Cell e in cells.Values)
            {
                if (e.@ref >= 0)
                {
                    size++;
                }
            }
            return size;
        }

        /// <summary>
        /// Return the number of patch commands saved in this Row.
        /// </summary>
        /// <returns>the number of patch commands</returns>
        public int GetCellsVal()
        {
            int size = 0;
            foreach (Cell e in cells.Values)
            {
                if (e.cmd >= 0)
                {
                    size++;
                }
            }
            return size;
        }

        /// <summary>
        /// Return the command in the <see cref="Cell"/> associated with the given character.
        /// </summary>
        /// <param name="way">the character associated with the <see cref="Cell"/> holding the desired command</param>
        /// <returns>the command, or -1 if there is no <see cref="Cell"/> for the character</returns>
        public int GetCmd(char way)
        {
            Cell c = At(way);
            return (c == null) ? -1 : c.cmd;
        }

        /// <summary>
        /// Return the number of patch commands that were in the <see cref="Cell"/> associated with the
        /// given character before the <see cref="Trie"/> containing this <see cref="Row"/> was reduced.
        /// </summary>
        /// <param name="way">the character associated with the desired <see cref="Cell"/></param>
        /// <returns>the number of patch commands before reduction, or -1 if there is no <see cref="Cell"/> for the character</returns>
        public int GetCnt(char way)
        {
            Cell c = At(way);
            return (c == null) ? -1 : c.cnt;
        }

        /// <summary>
        /// Return the reference to the next <see cref="Row"/> in the <see cref="Cell"/> associated with the given
        /// character.
        /// </summary>
        /// <param name="way">the character associated with the desired <see cref="Cell"/></param>
        /// <returns>the reference, or -1 if the <see cref="Cell"/> is <c>null</c></returns>
        public int GetRef(char way)
        {
            Cell c = At(way);
            return (c == null) ? -1 : c.@ref;
        }

        /// <summary>
        /// Write the contents of this <see cref="Row"/> to the given output stream.
        /// Cells that hold neither a command nor a reference are not written.
        /// </summary>
        /// <param name="os">the output stream</param>
        /// <exception cref="IOException">if an I/O error occurs</exception>
        public virtual void Store(IDataOutput os)
        {
            // The header must match the number of records that follow, because
            // Row(IDataInput) reads exactly that many. Writing cells.Count here while
            // skipping empty cells below would desynchronise the reader whenever the
            // row contains an empty cell. GetCells() uses the same "in use" test as
            // the skip below, and equals cells.Count when no cell is empty.
            os.WriteInt(GetCells());
            foreach (KeyValuePair<char, Cell> entry in cells)
            {
                Cell e = entry.Value;
                if (e.cmd < 0 && e.@ref < 0)
                {
                    continue;
                }

                os.WriteChar(entry.Key);
                os.WriteInt(e.cmd);
                os.WriteInt(e.cnt);
                os.WriteInt(e.@ref);
                os.WriteInt(e.skip);
            }
        }

        /// <summary>
        /// Determine whether every patch command stored in this Row is the same command.
        /// As a side effect <c>uniformCnt</c> is set to the number of cells carrying that
        /// command and <c>uniformSkip</c> to the skip value of the first such cell.
        /// </summary>
        /// <param name="eqSkip">when <c>true</c> the cells must also agree on their skip value;
        /// when <c>false</c> the skip values are not compared</param>
        /// <returns>the shared command, or -1 if the row contains a reference to another row,
        /// contains (at least) two different commands (or skips, when <paramref name="eqSkip"/>
        /// is set), or contains no command at all</returns>
        public int UniformCmd(bool eqSkip)
        {
            int ret = -1;
            uniformCnt = 1;
            uniformSkip = 0;
            foreach (Cell c in cells.Values)
            {
                if (c.@ref >= 0)
                {
                    return -1;
                }
                if (c.cmd < 0)
                {
                    continue;
                }

                if (ret < 0)
                {
                    // First command seen: it becomes the reference value.
                    ret = c.cmd;
                    uniformSkip = c.skip;
                }
                else if (ret != c.cmd || (eqSkip && uniformSkip != c.skip))
                {
                    return -1;
                }
                else
                {
                    uniformCnt++;
                }
            }
            return ret;
        }

        /// <summary>
        /// Write the contents of this <see cref="Row"/> to the <see cref="TextWriter"/>,
        /// one <c>[char:cell]</c> group per cell followed by a line terminator.
        /// </summary>
        /// <param name="out">the writer to print to</param>
        public virtual void Print(TextWriter @out)
        {
            foreach (KeyValuePair<char, Cell> entry in cells)
            {
                @out.Write("[" + entry.Key + ":" + entry.Value + "]");
            }
            @out.WriteLine();
        }

        /// <summary>
        /// Return the <see cref="Cell"/> stored for the given character, or <c>null</c> if there is none.
        /// </summary>
        internal Cell At(char index)
        {
            // Single lookup instead of ContainsKey followed by the indexer.
            Cell cell;
            return cells.TryGetValue(index, out cell) ? cell : null;
        }
    }
}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/29525086/src/Lucene.Net.Analysis.Stempel/Egothor.Stemmer/Trie.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Stempel/Egothor.Stemmer/Trie.cs b/src/Lucene.Net.Analysis.Stempel/Egothor.Stemmer/Trie.cs new file mode 100644 index 0000000..905f213 --- /dev/null +++ b/src/Lucene.Net.Analysis.Stempel/Egothor.Stemmer/Trie.cs @@ -0,0 +1,472 @@ +using Lucene.Net.Support; +using System.Collections.Generic; +using System.IO; + +/* + Egothor Software License version 1.00 + Copyright (C) 1997-2004 Leo Galambos. + Copyright (C) 2002-2004 "Egothor developers" + on behalf of the Egothor Project. + All rights reserved. + + This software is copyrighted by the "Egothor developers". If this + license applies to a single file or document, the "Egothor developers" + are the people or entities mentioned as copyright holders in that file + or document. If this license applies to the Egothor project as a + whole, the copyright holders are the people or entities mentioned in + the file CREDITS. This file can be found in the same location as this + license in the distribution. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + 1. Redistributions of source code must retain the above copyright + notice, the list of contributors, this list of conditions, and the + following disclaimer. + 2. 
Redistributions in binary form must reproduce the above copyright + notice, the list of contributors, this list of conditions, and the + disclaimer that follows these conditions in the documentation + and/or other materials provided with the distribution. + 3. The name "Egothor" must not be used to endorse or promote products + derived from this software without prior written permission. For + written permission, please contact Leo.G@seznam.cz + 4. Products derived from this software may not be called "Egothor", + nor may "Egothor" appear in their name, without prior written + permission from Leo.G@seznam.cz. + + In addition, we request that you include in the end-user documentation + provided with the redistribution and/or in the software itself an + acknowledgement equivalent to the following: + "This product includes software developed by the Egothor Project. + http://egothor.sf.net/" + + THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED + WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + IN NO EVENT SHALL THE EGOTHOR PROJECT OR ITS CONTRIBUTORS BE LIABLE + FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE + OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN + IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + This software consists of voluntary contributions made by many + individuals on behalf of the Egothor Project and was originally + created by Leo Galambos (Leo.G@seznam.cz). + */ + +
namespace Egothor.Stemmer
{
    /// <summary>
    /// A <see cref="Trie"/> is used to store a dictionary of words and their stems.
    /// <para>
    /// Actually, what is stored are words with their respective patch commands. A
    /// trie can be termed forward (keys read from left to right) or backward (keys
    /// read from right to left). This property will vary depending on the language
    /// for which a <see cref="Trie"/> is constructed.
    /// </para>
    /// </summary>
    public class Trie
    {
        internal IList<Row> rows = new List<Row>();
        internal IList<string> cmds = new List<string>();
        internal int root;

        internal bool forward = false;

        /// <summary>
        /// Constructor for the <see cref="Trie"/> object. Reads, in order: the forward flag,
        /// the root index, the patch commands and the rows, i.e. what
        /// <see cref="Store(IDataOutput)"/> writes.
        /// </summary>
        /// <param name="is">the input stream</param>
        /// <exception cref="IOException">if an I/O error occurs</exception>
        public Trie(IDataInput @is)
        {
            forward = @is.ReadBoolean();
            root = @is.ReadInt();
            for (int i = @is.ReadInt(); i > 0; i--)
            {
                cmds.Add(@is.ReadUTF());
            }
            for (int i = @is.ReadInt(); i > 0; i--)
            {
                rows.Add(new Row(@is));
            }
        }

        /// <summary>
        /// Constructor for the <see cref="Trie"/> object. Creates a trie holding a single
        /// empty root row.
        /// </summary>
        /// <param name="forward"><c>true</c> if keys are read left to right, <c>false</c> if read right to left</param>
        public Trie(bool forward)
        {
            rows.Add(new Row());
            root = 0;
            this.forward = forward;
        }

        /// <summary>
        /// Constructor for the <see cref="Trie"/> object. The given lists are used as-is (not copied).
        /// </summary>
        /// <param name="forward"><c>true</c> if read left to right, <c>false</c> if read right to left</param>
        /// <param name="root">index of the row that is the root node</param>
        /// <param name="cmds">the patch commands to store</param>
        /// <param name="rows">the rows of the trie; each <see cref="Row"/> is a node of this <see cref="Trie"/></param>
        public Trie(bool forward, int root, IList<string> cmds, IList<Row> rows)
        {
            this.rows = rows;
            this.cmds = cmds;
            this.root = root;
            this.forward = forward;
        }

        /// <summary>
        /// Gets all distinct patch commands found along the path of the given key,
        /// in the order they are first encountered.
        /// </summary>
        /// <param name="key">the key to look up</param>
        /// <returns>the patch commands, or <c>null</c> if none was found or the key is empty</returns>
        public virtual string[] GetAll(string key)
        {
            // An empty key has no first character to look up; without this guard the
            // unconditional e.Next() below would index outside the string.
            if (key.Length == 0)
            {
                return null;
            }

            // At most one command is recorded per key character.
            int[] res = new int[key.Length];
            int resc = 0;
            Row now = GetRow(root);
            StrEnum e = new StrEnum(key, forward);
            bool br = false;

            for (int i = 0; i < key.Length - 1; i++)
            {
                char ch = e.Next();
                resc = AppendIfNew(res, resc, now.GetCmd(ch));

                int w = now.GetRef(ch);
                if (w >= 0)
                {
                    now = GetRow(w);
                }
                else
                {
                    br = true;
                    break;
                }
            }
            if (!br)
            {
                // The last character only contributes a command, never a transition.
                resc = AppendIfNew(res, resc, now.GetCmd(e.Next()));
            }

            if (resc < 1)
            {
                return null;
            }
            string[] R = new string[resc];
            for (int j = 0; j < resc; j++)
            {
                R[j] = cmds[res[j]];
            }
            return R;
        }

        /// <summary>
        /// Appends <paramref name="cmd"/> to the first <paramref name="count"/> entries of
        /// <paramref name="seen"/> unless it is negative or already present; returns the new count.
        /// </summary>
        private static int AppendIfNew(int[] seen, int count, int cmd)
        {
            if (cmd < 0)
            {
                return count;
            }
            for (int j = 0; j < count; j++)
            {
                if (seen[j] == cmd)
                {
                    return count;
                }
            }
            seen[count] = cmd;
            return count + 1;
        }

        /// <summary>
        /// Return the number of cells in this <see cref="Trie"/> object.
        /// </summary>
        /// <returns>the number of cells</returns>
        public virtual int GetCells()
        {
            int size = 0;
            foreach (Row row in rows)
            {
                size += row.GetCells();
            }
            return size;
        }

        /// <summary>
        /// Gets the total number of cells, over all rows, that reference another row.
        /// </summary>
        /// <returns>The cellsPnt value</returns>
        public virtual int GetCellsPnt()
        {
            int size = 0;
            foreach (Row row in rows)
            {
                size += row.GetCellsPnt();
            }
            return size;
        }

        /// <summary>
        /// Gets the total number of cells, over all rows, that hold a patch command.
        /// </summary>
        /// <returns>The cellsVal value</returns>
        public virtual int GetCellsVal()
        {
            int size = 0;
            foreach (Row row in rows)
            {
                size += row.GetCellsVal();
            }
            return size;
        }

        /// <summary>
        /// Return the element that is stored in a cell associated with the given key.
        /// The whole key must be consumed by the path (including any characters
        /// skipped by a cell's skip count) for a result to be returned.
        /// </summary>
        /// <param name="key">the key</param>
        /// <returns>the associated element, or <c>null</c> if there is none</returns>
        public virtual string GetFully(string key)
        {
            Row now = GetRow(root);
            int cmd = -1;
            StrEnum e = new StrEnum(key, forward);

            for (int i = 0; i < key.Length;)
            {
                char ch = e.Next();
                i++;

                Cell c = now.At(ch);
                if (c == null)
                {
                    return null;
                }

                cmd = c.cmd;

                // Consume the characters this cell says to skip; running out of key
                // while skipping means the key is not stored.
                for (int skip = c.skip; skip > 0; skip--)
                {
                    if (i >= key.Length)
                    {
                        return null;
                    }
                    e.Next();
                    i++;
                }

                int w = now.GetRef(ch);
                if (w >= 0)
                {
                    now = GetRow(w);
                }
                else if (i < key.Length)
                {
                    return null;
                }
            }
            return (cmd == -1) ? null : cmds[cmd];
        }

        /// <summary>
        /// Return the element that is stored as last on a path associated with the
        /// given key.
        /// </summary>
        /// <param name="key">the key associated with the desired element</param>
        /// <returns>the last on path element, or <c>null</c> if there is none or the key is empty</returns>
        public virtual string GetLastOnPath(string key)
        {
            // Same reasoning as in GetAll: the final e.Next() needs at least one character.
            if (key.Length == 0)
            {
                return null;
            }

            Row now = GetRow(root);
            int w;
            string last = null;
            StrEnum e = new StrEnum(key, forward);

            for (int i = 0; i < key.Length - 1; i++)
            {
                char ch = e.Next();
                w = now.GetCmd(ch);
                if (w >= 0)
                {
                    last = cmds[w];
                }
                w = now.GetRef(ch);
                if (w >= 0)
                {
                    now = GetRow(w);
                }
                else
                {
                    return last;
                }
            }
            w = now.GetCmd(e.Next());
            return (w >= 0) ? cmds[w] : last;
        }

        /// <summary>
        /// Return the <see cref="Row"/> at the given index.
        /// </summary>
        /// <param name="index">the index containing the desired <see cref="Row"/></param>
        /// <returns>the <see cref="Row"/>, or <c>null</c> if the index is out of range</returns>
        private Row GetRow(int index)
        {
            if (index < 0 || index >= rows.Count)
            {
                return null;
            }
            return rows[index];
        }

        /// <summary>
        /// Write this <see cref="Trie"/> to the given output stream.
        /// </summary>
        /// <param name="os">the output stream</param>
        /// <exception cref="IOException">if an I/O error occurs</exception>
        public virtual void Store(IDataOutput os)
        {
            os.WriteBoolean(forward);
            os.WriteInt(root);
            os.WriteInt(cmds.Count);
            foreach (string cmd in cmds)
            {
                os.WriteUTF(cmd);
            }

            os.WriteInt(rows.Count);
            foreach (Row row in rows)
            {
                row.Store(os);
            }
        }

        /// <summary>
        /// Add the given key associated with the given patch command. If either
        /// parameter is <c>null</c> or empty this method will return without executing.
        /// </summary>
        /// <param name="key">the key</param>
        /// <param name="cmd">the patch command</param>
        public virtual void Add(string key, string cmd)
        {
            if (key == null || cmd == null)
            {
                return;
            }
            // An empty key has no cell to attach the command to. Bail out before the
            // command is registered, so that a rejected call leaves no trace in cmds.
            if (key.Length == 0 || cmd.Length == 0)
            {
                return;
            }
            int id_cmd = cmds.IndexOf(cmd);
            if (id_cmd == -1)
            {
                id_cmd = cmds.Count;
                cmds.Add(cmd);
            }

            int node = root;
            Row r = GetRow(node);

            StrEnum e = new StrEnum(key, forward);

            // Follow (or create) one row per character except the last one ...
            for (int i = 0; i < e.Length - 1; i++)
            {
                char ch = e.Next();
                node = r.GetRef(ch);
                if (node >= 0)
                {
                    r = GetRow(node);
                }
                else
                {
                    node = rows.Count;
                    Row n = new Row();
                    rows.Add(n);
                    r.SetRef(ch, node);
                    r = n;
                }
            }
            // ... and store the command in the cell of the last character.
            r.SetCmd(e.Next(), id_cmd);
        }

        /// <summary>
        /// Remove empty rows from the given <see cref="Trie"/> and return the newly reduced <see cref="Trie"/>.
        /// </summary>
        /// <param name="by">the <see cref="Reduce"/> strategy to apply to this <see cref="Trie"/></param>
        /// <returns>newly reduced <see cref="Trie"/></returns>
        public virtual Trie Reduce(Reduce by)
        {
            return by.Optimize(this);
        }

        /// <summary>
        /// writes debugging info to the printstream
        /// </summary>
        /// <param name="out">the writer to print to</param>
        /// <param name="prefix">text written in front of the statistics</param>
        public virtual void PrintInfo(TextWriter @out, string prefix)
        {
            @out.WriteLine(prefix + "nds " + rows.Count + " cmds " + cmds.Count
                + " cells " + GetCells() + " valcells " + GetCellsVal() + " pntcells "
                + GetCellsPnt());
        }

        /// <summary>
        /// This class is part of the Egothor Project.
        /// Hands out the characters of a string one at a time, either from the
        /// first character forwards or from the last character backwards.
        /// </summary>
        internal class StrEnum
        {
            private string s;
            private int from;
            private int by;

            /// <summary>
            /// Constructor for the <see cref="StrEnum"/> object
            /// </summary>
            /// <param name="s">the string to enumerate</param>
            /// <param name="up"><c>true</c> to start at index 0 and move forwards,
            /// <c>false</c> to start at the last index and move backwards</param>
            internal StrEnum(string s, bool up)
            {
                this.s = s;
                if (up)
                {
                    from = 0;
                    by = 1;
                }
                else
                {
                    from = s.Length - 1;
                    by = -1;
                }
            }

            /// <summary>
            /// Length of the underlying string.
            /// </summary>
            internal int Length
            {
                get
                {
                    return s.Length;
                }
            }

            /// <summary>
            /// Return the character at the current position and advance by one step.
            /// No bounds check is performed; callers must not call this more than
            /// <see cref="Length"/> times.
            /// </summary>
            internal char Next()
            {
                char ch = s[from];
                from += by;
                return ch;
            }
        }
    }
}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/29525086/src/Lucene.Net.Analysis.Stempel/Lucene.Net.Analysis.Stempel.csproj 
---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Stempel/Lucene.Net.Analysis.Stempel.csproj b/src/Lucene.Net.Analysis.Stempel/Lucene.Net.Analysis.Stempel.csproj new file mode 100644 index 0000000..0f82c21 --- /dev/null +++ b/src/Lucene.Net.Analysis.Stempel/Lucene.Net.Analysis.Stempel.csproj @@ -0,0 +1,87 @@ + + + + + Debug + AnyCPU + {A76DAD88-E3A5-40F9-9114-FACD77BD8265} + Library + Properties + Lucene.Net.Analysis + Lucene.Net.Analysis.Stempel + v4.5.1 + 512 + + + true + full + false + bin\Debug\ + DEBUG;TRACE + prompt + 4 + + + pdbonly + true + bin\Release\ + TRACE + prompt + 4 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + {4add0bbc-b900-4715-9526-d871de8eea64} + Lucene.Net.Analysis.Common + + + {5d4ad9be-1ffb-41ab-9943-25737971bf57} + Lucene.Net + + + + + + + + \ No newline at end of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/29525086/src/Lucene.Net.Analysis.Stempel/Pl/PolishAnalyzer.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Stempel/Pl/PolishAnalyzer.cs b/src/Lucene.Net.Analysis.Stempel/Pl/PolishAnalyzer.cs new file mode 100644 index 0000000..7cc5773 --- /dev/null +++ b/src/Lucene.Net.Analysis.Stempel/Pl/PolishAnalyzer.cs @@ -0,0 +1,164 @@ +using Egothor.Stemmer; +using Lucene.Net.Analysis.Core; +using Lucene.Net.Analysis.Miscellaneous; +using Lucene.Net.Analysis.Standard; +using Lucene.Net.Analysis.Stempel; +using Lucene.Net.Analysis.Util; +using Lucene.Net.Util; +using System; +using System.IO; +using System.Linq; +using System.Text; + +namespace Lucene.Net.Analysis.Pl +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +
    /// <summary>
    /// <see cref="Analyzer"/> for Polish.
    /// </summary>
    public sealed class PolishAnalyzer : StopwordAnalyzerBase
    {
        private readonly CharArraySet stemExclusionSet;
        private readonly Trie stemTable;

        /// <summary>
        /// File containing default Polish stopwords.
        /// </summary>
        public static readonly string DEFAULT_STOPWORD_FILE = "stopwords.txt";

        /// <summary>
        /// File containing default Polish stemmer table.
        /// </summary>
        public static readonly string DEFAULT_STEMMER_FILE = "stemmer_20000.tbl";

        /// <summary>
        /// Returns an unmodifiable instance of the default stop words set.
        /// </summary>
        /// <returns>default stop words set.</returns>
        public static CharArraySet GetDefaultStopSet()
        {
            return DefaultsHolder.DEFAULT_STOP_SET;
        }

        /// <summary>
        /// Returns an unmodifiable instance of the default stemmer table.
        /// </summary>
        public static Trie GetDefaultTable()
        {
            return DefaultsHolder.DEFAULT_TABLE;
        }

        /// <summary>
        /// Loads the default stop word set and stemmer table in a lazy fashion, the
        /// first time the outer class accesses one of the static fields.
        /// </summary>
        private static class DefaultsHolder
        {
            internal static readonly CharArraySet DEFAULT_STOP_SET;
            internal static readonly Trie DEFAULT_TABLE;

            static DefaultsHolder()
            {
                try
                {
                    DEFAULT_STOP_SET = WordlistLoader.GetWordSet(IOUtils.GetDecodingReader(typeof(PolishAnalyzer),
                        typeof(PolishAnalyzer).Namespace + "." + DEFAULT_STOPWORD_FILE, Encoding.UTF8), "#",
#pragma warning disable 612, 618
                        LuceneVersion.LUCENE_CURRENT);
#pragma warning restore 612, 618
                }
                catch (IOException ex)
                {
                    // default set should always be present as it is part of the
                    // distribution (embedded resource)
                    // InvalidOperationException derives from SystemException, so callers
                    // that caught the previously thrown SystemException still match.
                    throw new InvalidOperationException("Unable to load default stopword set", ex);
                }

                try
                {
                    string tableResource = typeof(PolishAnalyzer).Namespace + "." + DEFAULT_STEMMER_FILE;

                    // NOTE(review): assumes StempelStemmer.Load reads the table eagerly, so the
                    // stream can be released as soon as it returns — confirm against Load.
                    using (Stream table = typeof(PolishAnalyzer).Assembly.GetManifestResourceStream(tableResource))
                    {
                        // GetManifestResourceStream reports a missing resource by returning
                        // null rather than throwing; fail with the intended message.
                        if (table == null)
                        {
                            throw new InvalidOperationException("Unable to load default stemming tables");
                        }
                        DEFAULT_TABLE = StempelStemmer.Load(table);
                    }
                }
                catch (IOException ex)
                {
                    // default set should always be present as it is part of the
                    // distribution (embedded resource)
                    throw new InvalidOperationException("Unable to load default stemming tables", ex);
                }
            }
        }

        /// <summary>
        /// Builds an analyzer with the default stop words: <see cref="DEFAULT_STOPWORD_FILE"/>.
        /// </summary>
        /// <param name="matchVersion">lucene compatibility version</param>
        public PolishAnalyzer(LuceneVersion matchVersion)
            : this(matchVersion, DefaultsHolder.DEFAULT_STOP_SET)
        {
        }

        /// <summary>
        /// Builds an analyzer with the given stop words.
        /// </summary>
        /// <param name="matchVersion">lucene compatibility version</param>
        /// <param name="stopwords">a stopword set</param>
        public PolishAnalyzer(LuceneVersion matchVersion, CharArraySet stopwords)
            : this(matchVersion, stopwords, CharArraySet.EMPTY_SET)
        {
        }

        /// <summary>
        /// Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
        /// provided this analyzer will add a <see cref="SetKeywordMarkerFilter"/> before
        /// stemming.
        /// </summary>
        /// <param name="matchVersion">lucene compatibility version</param>
        /// <param name="stopwords">a stopword set</param>
        /// <param name="stemExclusionSet">a set of terms not to be stemmed</param>
        public PolishAnalyzer(LuceneVersion matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet)
            : base(matchVersion, stopwords)
        {
            this.stemTable = DefaultsHolder.DEFAULT_TABLE;
            this.stemExclusionSet = CharArraySet.UnmodifiableSet(CharArraySet.Copy(
                matchVersion, stemExclusionSet));
        }

        /// <summary>
        /// Creates a <see cref="TokenStreamComponents"/>
        /// which tokenizes all the text in the provided <see cref="TextReader"/>.
        /// </summary>
        /// <returns>
        /// A <see cref="TokenStreamComponents"/> built from a <see cref="StandardTokenizer"/>
        /// filtered with <see cref="StandardFilter"/>, <see cref="LowerCaseFilter"/>, <see cref="StopFilter"/>,
        /// <see cref="SetKeywordMarkerFilter"/> if a stem exclusion set is provided and <see cref="StempelFilter"/>.
        /// </returns>
        public override TokenStreamComponents CreateComponents(string fieldName,
            TextReader reader)
        {
            Tokenizer source = new StandardTokenizer(matchVersion, reader);
            TokenStream result = new StandardFilter(matchVersion, source);
            result = new LowerCaseFilter(matchVersion, result);
            result = new StopFilter(matchVersion, result, stopwords);
            if (stemExclusionSet.Any())
            {
                // Mark excluded terms as keywords before the stemmer runs.
                result = new SetKeywordMarkerFilter(result, stemExclusionSet);
            }
            result = new StempelFilter(result, new StempelStemmer(stemTable));
            return new TokenStreamComponents(source, result);
        }
    }
}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/29525086/src/Lucene.Net.Analysis.Stempel/Pl/stemmer_20000.tbl ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Stempel/Pl/stemmer_20000.tbl b/src/Lucene.Net.Analysis.Stempel/Pl/stemmer_20000.tbl new file mode 100644 index 0000000..64c89a9 Binary files /dev/null and b/src/Lucene.Net.Analysis.Stempel/Pl/stemmer_20000.tbl differ http://git-wip-us.apache.org/repos/asf/lucenenet/blob/29525086/src/Lucene.Net.Analysis.Stempel/Pl/stopwords.txt ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Stempel/Pl/stopwords.txt b/src/Lucene.Net.Analysis.Stempel/Pl/stopwords.txt new file mode 100644 index 0000000..167e9e0 --- /dev/null +++ b/src/Lucene.Net.Analysis.Stempel/Pl/stopwords.txt @@ -0,0 +1,186 @@ +# This file was created from the carrot2 project and is distributed under the BSD license. +# See http://project.carrot2.org/license.html +# Also see http://www.opensource.org/licenses/bsd-license.html +# From trunk/core/carrot2-util-text/src-resources/stopwords.pl +vol +o.o.
+mgr +godz +zł +www +pl +ul +tel +hab +prof +inż +dr +i +u +aby +albo +ale +ani +aż +bardzo +bez +bo +bowiem +by +byli +bym +był +była +było +były +być +będzie +będą +chce +choć +co +coraz +coś +czy +czyli +często +dla +do +gdy +gdyby +gdyż +gdzie +go +ich +im +inne +iż +ja +jak +jakie +jako +je +jednak +jednym +jedynie +jego +jej +jest +jeszcze +jeśli +jeżeli +już +ją +kiedy +kilku +kto +która +które +którego +której +który +których +którym +którzy +lat +lecz +lub +ma +mają +mamy +mi +miał +mimo +mnie +mogą +może +można +mu +musi +na +nad +nam +nas +nawet +nic +nich +nie +niej +nim +niż +no +nowe +np +nr +o +od +ok +on +one +oraz +pan +po +pod +ponad +ponieważ +poza +przed +przede +przez +przy +raz +razie +roku +również +się +sobie +sposób +swoje +są +ta +tak +takich +takie +także +tam +te +tego +tej +temu +ten +teraz +też +to +trzeba +tu +tych +tylko +tym +tys +tzw +tę +w +we +wie +więc +wszystko +wśród +właśnie +z +za +zaś +ze +że +żeby +ii +iii +iv +vi +vii +viii +ix +xi +xii +xiii +xiv +xv http://git-wip-us.apache.org/repos/asf/lucenenet/blob/29525086/src/Lucene.Net.Analysis.Stempel/Properties/AssemblyInfo.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Stempel/Properties/AssemblyInfo.cs b/src/Lucene.Net.Analysis.Stempel/Properties/AssemblyInfo.cs new file mode 100644 index 0000000..56385ee --- /dev/null +++ b/src/Lucene.Net.Analysis.Stempel/Properties/AssemblyInfo.cs @@ -0,0 +1,39 @@ +using System.Reflection; +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; + +// General Information about an assembly is controlled through the following +// set of attributes. Change these attribute values to modify the information +// associated with an assembly. 
+[assembly: AssemblyTitle("Lucene.Net.Analysis.Stempel")] +[assembly: AssemblyDescription("")] +[assembly: AssemblyConfiguration("")] +[assembly: AssemblyCompany("")] +[assembly: AssemblyProduct("Lucene.Net.Analysis.Stempel")] +[assembly: AssemblyCopyright("Copyright © 2016")] +[assembly: AssemblyTrademark("")] +[assembly: AssemblyCulture("")] + +// Setting ComVisible to false makes the types in this assembly not visible +// to COM components. If you need to access a type in this assembly from +// COM, set the ComVisible attribute to true on that type. +[assembly: ComVisible(false)] + +// The following GUID is for the ID of the typelib if this project is exposed to COM +[assembly: Guid("a76dad88-e3a5-40f9-9114-facd77bd8265")] + +// for testing +[assembly: InternalsVisibleTo("Lucene.Net.Tests.Analysis.Stempel")] + +// Version information for an assembly consists of the following four values: +// +// Major Version +// Minor Version +// Build Number +// Revision +// +// You can specify all the values or you can default the Build and Revision Numbers +// by using the '*' as shown below: +// [assembly: AssemblyVersion("1.0.*")] +[assembly: AssemblyVersion("1.0.0.0")] +[assembly: AssemblyFileVersion("1.0.0.0")] http://git-wip-us.apache.org/repos/asf/lucenenet/blob/29525086/src/Lucene.Net.Analysis.Stempel/RectangularArrays.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Stempel/RectangularArrays.cs b/src/Lucene.Net.Analysis.Stempel/RectangularArrays.cs new file mode 100644 index 0000000..25acdf2 --- /dev/null +++ b/src/Lucene.Net.Analysis.Stempel/RectangularArrays.cs @@ -0,0 +1,52 @@ +//---------------------------------------------------------------------------------------- +// Copyright © 2007 - 2013 Tangible Software Solutions Inc. +// this class can be used by anyone provided that the copyright notice remains intact. 
+// +// this class provides the logic to simulate Java rectangular arrays, which are jagged +// arrays with inner arrays of the same length. A size of -1 indicates unknown length. +//---------------------------------------------------------------------------------------- + +using Lucene.Net.Util; + +internal static partial class RectangularArrays +{ + internal static int[][] ReturnRectangularIntArray(int Size1, int Size2) + { + int[][] Array; + if (Size1 > -1) + { + Array = new int[Size1][]; + if (Size2 > -1) + { + for (int Array1 = 0; Array1 < Size1; Array1++) + { + Array[Array1] = new int[Size2]; + } + } + } + else + Array = null; + + return Array; + } + + internal static BytesRef[][] ReturnRectangularBytesRefArray(int Size1, int Size2) + { + BytesRef[][] Array; + if (Size1 > -1) + { + Array = new BytesRef[Size1][]; + if (Size2 > -1) + { + for (int Array1 = 0; Array1 < Size1; Array1++) + { + Array[Array1] = new BytesRef[Size2]; + } + } + } + else + Array = null; + + return Array; + } +} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/29525086/src/Lucene.Net.Analysis.Stempel/Stempel/StempelFilter.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Stempel/Stempel/StempelFilter.cs b/src/Lucene.Net.Analysis.Stempel/Stempel/StempelFilter.cs new file mode 100644 index 0000000..f2964ea --- /dev/null +++ b/src/Lucene.Net.Analysis.Stempel/Stempel/StempelFilter.cs @@ -0,0 +1,91 @@ +using Lucene.Net.Analysis.Tokenattributes; +using System.Text; + +namespace Lucene.Net.Analysis.Stempel +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + /// + /// Transforms the token stream as per the stemming algorithm. + /// + /// Note: the input to the stemming filter must already be in lower case, so you + /// will need to use or farther down the + /// chain in order for this to work properly! + /// + /// + public sealed class StempelFilter : TokenFilter + { + private readonly ICharTermAttribute termAtt; + private readonly IKeywordAttribute keywordAtt; + private readonly StempelStemmer stemmer; + private readonly int minLength; + + /// + /// Minimum length of input words to be processed. Shorter words are returned + /// unchanged. + /// + public static readonly int DEFAULT_MIN_LENGTH = 3; + + /// + /// Create filter using the supplied stemming table. + /// + /// input token stream + /// stemmer + public StempelFilter(TokenStream @in, StempelStemmer stemmer) + : this(@in, stemmer, DEFAULT_MIN_LENGTH) + { + } + + /// + /// Create filter using the supplied stemming table. + /// + /// input token stream + /// stemmer + /// For performance reasons words shorter than minLength + /// characters are not processed, but simply returned. 
+ public StempelFilter(TokenStream @in, StempelStemmer stemmer, int minLength) + : base(@in) + { + this.stemmer = stemmer; + this.minLength = minLength; + this.termAtt = AddAttribute(); + this.keywordAtt = AddAttribute(); + } + + /// + /// Returns the next input , after being stemmed + /// + public override bool IncrementToken() + { + if (input.IncrementToken()) + { + if (!keywordAtt.Keyword && termAtt.Length > minLength) + { + StringBuilder sb = stemmer.Stem(termAtt.ToString()); + if (sb != null) // if we can't stem it, return unchanged + termAtt.SetEmpty().Append(sb); + } + return true; + } + else + { + return false; + } + } + } +} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/29525086/src/Lucene.Net.Analysis.Stempel/Stempel/StempelPolishStemFilterFactory.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Stempel/Stempel/StempelPolishStemFilterFactory.cs b/src/Lucene.Net.Analysis.Stempel/Stempel/StempelPolishStemFilterFactory.cs new file mode 100644 index 0000000..759f403 --- /dev/null +++ b/src/Lucene.Net.Analysis.Stempel/Stempel/StempelPolishStemFilterFactory.cs @@ -0,0 +1,48 @@ +using Lucene.Net.Analysis.Pl; +using Lucene.Net.Analysis.Util; +using System; +using System.Collections.Generic; +using System.Linq; + +namespace Lucene.Net.Analysis.Stempel +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + /// + /// Factory for using a Polish stemming table. + /// + public class StempelPolishStemFilterFactory : TokenFilterFactory + { + /// + /// Creates a new + /// + public StempelPolishStemFilterFactory(IDictionary args) + : base(args) + { + if (args.Any()) + { + throw new ArgumentException("Unknown parameters: " + args); + } + } + + public override TokenStream Create(TokenStream input) + { + return new StempelFilter(input, new StempelStemmer(PolishAnalyzer.GetDefaultTable())); + } + } +} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/29525086/src/Lucene.Net.Analysis.Stempel/Stempel/StempelStemmer.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Stempel/Stempel/StempelStemmer.cs b/src/Lucene.Net.Analysis.Stempel/Stempel/StempelStemmer.cs new file mode 100644 index 0000000..43e544a --- /dev/null +++ b/src/Lucene.Net.Analysis.Stempel/Stempel/StempelStemmer.cs @@ -0,0 +1,105 @@ +using Egothor.Stemmer; +using Lucene.Net.Support; +using System.IO; +using System.Text; + +namespace Lucene.Net.Analysis.Stempel +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + /// + /// Stemmer class is a convenient facade for other stemmer-related classes. The + /// core stemming algorithm and its implementation is taken verbatim from the + /// Egothor project ( www.egothor.org ). + /// + /// Even though the stemmer tables supplied in the distribution package are built + /// for Polish language, there is nothing language-specific here. + /// + /// + public class StempelStemmer + { + private Trie stemmer = null; + private StringBuilder buffer = new StringBuilder(); + + /// + /// Create a Stemmer using selected stemmer table + /// + /// stemmer table. + public StempelStemmer(Stream stemmerTable) + : this(Load(stemmerTable)) + { + } + + /// + /// Create a Stemmer using pre-loaded stemmer table + /// + /// pre-loaded stemmer table + public StempelStemmer(Trie stemmer) + { + this.stemmer = stemmer; + } + + /// + /// Load a stemmer table from an inputstream. + /// + public static Trie Load(Stream stemmerTable) + { + DataInputStream @in = null; + try + { + @in = new DataInputStream(stemmerTable); + string method = @in.ReadUTF().ToUpperInvariant(); + if (method.IndexOf('M') < 0) + { + return new Trie(@in); + } + else + { + return new MultiTrie2(@in); + } + } + finally + { + @in.Dispose(); + } + } + + /// + /// Stem a word. + /// + /// input word to be stemmed. + /// stemmed word, or null if the stem could not be generated. 
+ public StringBuilder Stem(string word) + { + string cmd = stemmer.GetLastOnPath(word); + + if (cmd == null) + return null; + + buffer.Length = 0; + buffer.Append(word); + + Diff.Apply(buffer, cmd); + + if (buffer.Length > 0) + return buffer; + else + return null; + } + } +} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/29525086/src/Lucene.Net.Core/Lucene.Net.csproj ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Core/Lucene.Net.csproj b/src/Lucene.Net.Core/Lucene.Net.csproj index 75817c5..39bf69e 100644 --- a/src/Lucene.Net.Core/Lucene.Net.csproj +++ b/src/Lucene.Net.Core/Lucene.Net.csproj @@ -626,12 +626,16 @@ + + + + http://git-wip-us.apache.org/repos/asf/lucenenet/blob/29525086/src/Lucene.Net.Core/Support/DataInputStream.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Core/Support/DataInputStream.cs b/src/Lucene.Net.Core/Support/DataInputStream.cs new file mode 100644 index 0000000..30dc6a6 --- /dev/null +++ b/src/Lucene.Net.Core/Support/DataInputStream.cs @@ -0,0 +1,323 @@ +using System; +using System.IO; + +namespace Lucene.Net.Support +{ + /// + /// Java's DataInputStream is similar to .NET's BinaryReader. However, it reads + /// using a modified UTF-8 format that cannot be read using BinaryReader. + /// This is a port of DataInputStream that is fully compatible with Java's DataOutputStream. + /// + public class DataInputStream : IDataInput, IDisposable + { + private readonly Stream @in; + + /// + /// Creates a DataInputStream that uses the specified + /// underlying InputStream. 
+ /// + /// the specified input stream + public DataInputStream(Stream @in) + { + this.@in = @in; + } + + /// + /// working arrays initialized on demand by readUTF + /// + private byte[] bytearr = new byte[80]; + private char[] chararr = new char[80]; + + public int Read(byte[] b) + { + return @in.Read(b, 0, b.Length); + } + + public int Read(byte[] b, int off, int len) + { + return @in.Read(b, off, len); + } + + public void ReadFully(byte[] b) + { + ReadFully(b, 0, b.Length); + } + + public void ReadFully(byte[] b, int off, int len) + { + if (len < 0) + throw new IndexOutOfRangeException(); + int n = 0; + while (n < len) + { + int count = @in.Read(b, off + n, len - n); + if (count == 0) + throw new EndOfStreamException(); + n += count; + } + } + + public int SkipBytes(int n) + { + int total = 0; + int cur = 0; + + while ((total < n) && ((cur = (int)@in.Seek(n - total, SeekOrigin.Begin)) > 0)) + { + total += cur; + } + + return total; + } + + public bool ReadBoolean() + { + int ch = @in.ReadByte(); + if (ch < 0) + throw new EndOfStreamException(); + return (ch != 0); + } + + public byte ReadByte() + { + int ch = @in.ReadByte(); + if (ch < 0) + throw new EndOfStreamException(); + return (byte)(ch); + } + + public int ReadUnsignedByte() + { + int ch = @in.ReadByte(); + if (ch < 0) + throw new EndOfStreamException(); + return ch; + } + + public short ReadShort() + { + int ch1 = @in.ReadByte(); + int ch2 = @in.ReadByte(); + if ((ch1 | ch2) < 0) + throw new EndOfStreamException(); + return (short)((ch1 << 8) + (ch2 << 0)); + } + + public int ReadUnsignedShort() + { + int ch1 = @in.ReadByte(); + int ch2 = @in.ReadByte(); + if ((ch1 | ch2) < 0) + throw new EndOfStreamException(); + return (ch1 << 8) + (ch2 << 0); + } + + public char ReadChar() + { + int ch1 = @in.ReadByte(); + int ch2 = @in.ReadByte(); + if ((ch1 | ch2) < 0) + throw new EndOfStreamException(); + return (char)((ch1 << 8) + (ch2 << 0)); + } + + public int ReadInt() + { + int ch1 = @in.ReadByte(); + int ch2 
= @in.ReadByte(); + int ch3 = @in.ReadByte(); + int ch4 = @in.ReadByte(); + if ((ch1 | ch2 | ch3 | ch4) < 0) + throw new EndOfStreamException(); + return ((ch1 << 24) + (ch2 << 16) + (ch3 << 8) + (ch4 << 0)); + } + + private byte[] readBuffer = new byte[8]; + + public long ReadLong() + { + ReadFully(readBuffer, 0, 8); + return (((long)readBuffer[0] << 56) + + ((long)(readBuffer[1] & 255) << 48) + + ((long)(readBuffer[2] & 255) << 40) + + ((long)(readBuffer[3] & 255) << 32) + + ((long)(readBuffer[4] & 255) << 24) + + ((readBuffer[5] & 255) << 16) + + ((readBuffer[6] & 255) << 8) + + ((readBuffer[7] & 255) << 0)); + } + + public float ReadFloat() + { + return Number.IntBitsToFloat(ReadInt()); + } + + public double ReadDouble() + { + throw new NotImplementedException(); + //return Number.LongBitsToDouble(ReadLong()); + } + + private char[] lineBuffer; + + [Obsolete] + public string ReadLine() + { + char[] buf = lineBuffer; + + if (buf == null) + { + buf = lineBuffer = new char[128]; + } + + int room = buf.Length; + int offset = 0; + int c; + + while (true) + { + switch (c = @in.ReadByte()) + { + case -1: + case '\n': + goto loop; + + case '\r': + int c2 = @in.ReadByte(); + if ((c2 != '\n') && (c2 != -1)) + { + using (StreamReader reader = new StreamReader(@in)) + { + c2 = reader.Peek(); + } + // http://stackoverflow.com/a/8021738/181087 + //if (!(in is PushbackInputStream)) { + // this.in = new PushbackInputStream(in); + //} + //((PushbackInputStream)in).unread(c2); + } + goto loop; + + default: + if (--room < 0) + { + buf = new char[offset + 128]; + room = buf.Length - offset - 1; + System.Array.Copy(lineBuffer, 0, buf, 0, offset); + lineBuffer = buf; + } + buf[offset++] = (char)c; + break; + } + } + loop: + if ((c == -1) && (offset == 0)) + { + return null; + } + return new string(buf, 0, offset); + } + + public string ReadUTF() + { + return ReadUTF(this); + } + + public static string ReadUTF(IDataInput @in) + { + int utflen = @in.ReadUnsignedShort(); + byte[] 
bytearr = null; + char[] chararr = null; + if (@in is DataInputStream) + { + DataInputStream dis = (DataInputStream)@in; + if (dis.bytearr.Length < utflen) + { + dis.bytearr = new byte[utflen * 2]; + dis.chararr = new char[utflen * 2]; + } + chararr = dis.chararr; + bytearr = dis.bytearr; + } + else + { + bytearr = new byte[utflen]; + chararr = new char[utflen]; + } + + int c, char2, char3; + int count = 0; + int chararr_count = 0; + + @in.ReadFully(bytearr, 0, utflen); + + while (count < utflen) + { + c = (int)bytearr[count] & 0xff; + if (c > 127) break; + count++; + chararr[chararr_count++] = (char)c; + } + + while (count < utflen) + { + c = (int)bytearr[count] & 0xff; + switch (c >> 4) + { + case 0: + case 1: + case 2: + case 3: + case 4: + case 5: + case 6: + case 7: + /* 0xxxxxxx*/ + count++; + chararr[chararr_count++] = (char)c; + break; + case 12: + case 13: + /* 110x xxxx 10xx xxxx*/ + count += 2; + if (count > utflen) + throw new FormatException( + "malformed input: partial character at end"); + char2 = (int)bytearr[count - 1]; + if ((char2 & 0xC0) != 0x80) + throw new FormatException( + "malformed input around byte " + count); + chararr[chararr_count++] = (char)(((c & 0x1F) << 6) | + (char2 & 0x3F)); + break; + case 14: + /* 1110 xxxx 10xx xxxx 10xx xxxx */ + count += 3; + if (count > utflen) + throw new FormatException( + "malformed input: partial character at end"); + char2 = (int)bytearr[count - 2]; + char3 = (int)bytearr[count - 1]; + if (((char2 & 0xC0) != 0x80) || ((char3 & 0xC0) != 0x80)) + throw new FormatException( + "malformed input around byte " + (count - 1)); + chararr[chararr_count++] = (char)(((c & 0x0F) << 12) | + ((char2 & 0x3F) << 6) | + ((char3 & 0x3F) << 0)); + break; + default: + /* 10xx xxxx, 1111 xxxx */ + throw new FormatException( + "malformed input around byte " + count); + } + } + // The number of chars produced may be less than utflen + return new string(chararr, 0, chararr_count); + } + + public void Dispose() + { + 
@in.Dispose(); + } + } +}