Return-Path: X-Original-To: archive-asf-public-internal@cust-asf2.ponee.io Delivered-To: archive-asf-public-internal@cust-asf2.ponee.io Received: from cust-asf.ponee.io (cust-asf.ponee.io [163.172.22.183]) by cust-asf2.ponee.io (Postfix) with ESMTP id 332DF200BDC for ; Wed, 14 Dec 2016 21:00:51 +0100 (CET) Received: by cust-asf.ponee.io (Postfix) id 319B0160B0D; Wed, 14 Dec 2016 20:00:51 +0000 (UTC) Delivered-To: archive-asf-public@cust-asf.ponee.io Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by cust-asf.ponee.io (Postfix) with SMTP id A675B160B2E for ; Wed, 14 Dec 2016 21:00:48 +0100 (CET) Received: (qmail 95442 invoked by uid 500); 14 Dec 2016 20:00:47 -0000 Mailing-List: contact commits-help@lucenenet.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: lucene-net-dev@lucenenet.apache.org Delivered-To: mailing list commits@lucenenet.apache.org Received: (qmail 95326 invoked by uid 99); 14 Dec 2016 20:00:47 -0000 Received: from git1-us-west.apache.org (HELO git1-us-west.apache.org) (140.211.11.23) by apache.org (qpsmtpd/0.29) with ESMTP; Wed, 14 Dec 2016 20:00:47 +0000 Received: by git1-us-west.apache.org (ASF Mail Server at git1-us-west.apache.org, from userid 33) id 6A6BFF1710; Wed, 14 Dec 2016 20:00:47 +0000 (UTC) Content-Type: text/plain; charset="us-ascii" MIME-Version: 1.0 Content-Transfer-Encoding: 8bit From: synhershko@apache.org To: commits@lucenenet.apache.org Date: Wed, 14 Dec 2016 20:00:49 -0000 Message-Id: <2882626a8a0145a88db48b14436e73ce@git.apache.org> In-Reply-To: References: X-Mailer: ASF-Git Admin Mailer Subject: [3/4] lucenenet git commit: Renamed hyphenation to Hyphenation to fix build and run on case sensitive file systems archived-at: Wed, 14 Dec 2016 20:00:51 -0000 Renamed hyphenation to Hyphenation to fix build and run on case sensitive file systems Project: http://git-wip-us.apache.org/repos/asf/lucenenet/repo Commit: http://git-wip-us.apache.org/repos/asf/lucenenet/commit/7ecb7529 Tree: http://git-wip-us.apache.org/repos/asf/lucenenet/tree/7ecb7529 Diff: http://git-wip-us.apache.org/repos/asf/lucenenet/diff/7ecb7529 Branch: refs/heads/master Commit: 7ecb7529066c1d95dce83b175dac3322fc4068ab Parents: 96d38ef Author: Yaroslav Authored: Mon Nov 28 21:23:25 2016 +0200 Committer: Yaroslav Committed: Mon Nov 28 21:23:25 2016 +0200 ---------------------------------------------------------------------- .../Analysis/Compound/Hyphenation/ByteVector.cs | 156 ++++ .../Analysis/Compound/Hyphenation/CharVector.cs | 171 ++++ .../Analysis/Compound/Hyphenation/Hyphen.cs | 72 ++ .../Compound/Hyphenation/Hyphenation.cs | 53 ++ .../Compound/Hyphenation/HyphenationTree.cs | 581 +++++++++++++ .../Compound/Hyphenation/PatternConsumer.cs | 54 ++ .../Compound/Hyphenation/PatternParser.cs | 483 +++++++++++ .../Compound/Hyphenation/TernaryTree.cs | 816 +++++++++++++++++++ .../Compound/Hyphenation/hyphenation.dtd | 68 ++ .../Analysis/Compound/hyphenation/ByteVector.cs | 156 ---- .../Analysis/Compound/hyphenation/CharVector.cs | 171 ---- .../Analysis/Compound/hyphenation/Hyphen.cs | 72 -- .../Compound/hyphenation/Hyphenation.cs | 53 -- .../Compound/hyphenation/HyphenationTree.cs | 581 ------------- .../Compound/hyphenation/PatternConsumer.cs | 54 -- .../Compound/hyphenation/PatternParser.cs | 483 ----------- .../Compound/hyphenation/TernaryTree.cs | 816 ------------------- .../Compound/hyphenation/hyphenation.dtd | 68 -- 18 files changed, 2454 insertions(+), 2454 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/lucenenet/blob/7ecb7529/src/Lucene.Net.Analysis.Common/Analysis/Compound/Hyphenation/ByteVector.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Compound/Hyphenation/ByteVector.cs b/src/Lucene.Net.Analysis.Common/Analysis/Compound/Hyphenation/ByteVector.cs new file mode 100644 index 0000000..6442d11 --- /dev/null +++ b/src/Lucene.Net.Analysis.Common/Analysis/Compound/Hyphenation/ByteVector.cs @@ -0,0 +1,156 @@ +namespace Lucene.Net.Analysis.Compound.Hyphenation +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + /// + /// This class implements a simple byte vector with access to the underlying + /// array. + /// This class has been taken from the Apache FOP project (http://xmlgraphics.apache.org/fop/). They have been slightly modified. + /// + public class ByteVector + { + + /// + /// Capacity increment size + /// + private const int DEFAULT_BLOCK_SIZE = 2048; + + private int blockSize; + + /// + /// The encapsulated array + /// + private sbyte[] array; + + /// + /// Points to next free item + /// + private int n; + + public ByteVector() : this(DEFAULT_BLOCK_SIZE) + { + } + + public ByteVector(int capacity) + { + if (capacity > 0) + { + blockSize = capacity; + } + else + { + blockSize = DEFAULT_BLOCK_SIZE; + } + array = new sbyte[blockSize]; + n = 0; + } + + public ByteVector(sbyte[] a) + { + blockSize = DEFAULT_BLOCK_SIZE; + array = a; + n = 0; + } + + public ByteVector(sbyte[] a, int capacity) + { + if (capacity > 0) + { + blockSize = capacity; + } + else + { + blockSize = DEFAULT_BLOCK_SIZE; + } + array = a; + n = 0; + } + + public virtual sbyte[] Array + { + get + { + return array; + } + } + + /// + /// LUCENENET indexer for .NET + /// + /// + /// + public virtual sbyte this[int index] + { + get { return array[index]; } + set { array[index] = value; } + } + + /// + /// return number of items in array + /// + public virtual int Length + { + get { return n; } + } + + /// + /// returns current capacity of array + /// + public virtual int Capacity + { + get { return array.Length; } + } + + //public virtual void Put(int index, sbyte val) + //{ + // array[index] = val; + //} + + //public virtual sbyte Get(int index) + //{ + // return array[index]; + //} + + /// + /// This is to implement memory allocation in the array. Like malloc(). + /// + public virtual int Alloc(int size) + { + int index = n; + int len = array.Length; + if (n + size >= len) + { + sbyte[] aux = new sbyte[len + blockSize]; + System.Array.Copy(array, 0, aux, 0, len); + array = aux; + } + n += size; + return index; + } + + public virtual void TrimToSize() + { + if (n < array.Length) + { + sbyte[] aux = new sbyte[n]; + System.Array.Copy(array, 0, aux, 0, n); + array = aux; + } + } + } +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/7ecb7529/src/Lucene.Net.Analysis.Common/Analysis/Compound/Hyphenation/CharVector.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Compound/Hyphenation/CharVector.cs b/src/Lucene.Net.Analysis.Common/Analysis/Compound/Hyphenation/CharVector.cs new file mode 100644 index 0000000..26fcea5 --- /dev/null +++ b/src/Lucene.Net.Analysis.Common/Analysis/Compound/Hyphenation/CharVector.cs @@ -0,0 +1,171 @@ +using System; + +namespace Lucene.Net.Analysis.Compound.Hyphenation +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + /// + /// This class implements a simple char vector with access to the underlying + /// array. + /// + /// This class has been taken from the Apache FOP project (http://xmlgraphics.apache.org/fop/). They have been slightly modified. + /// + public class CharVector : ICloneable + { + + /// + /// Capacity increment size + /// + private const int DEFAULT_BLOCK_SIZE = 2048; + + private int blockSize; + + /// + /// The encapsulated array + /// + private char[] array; + + /// + /// Points to next free item + /// + private int n; + + public CharVector() : this(DEFAULT_BLOCK_SIZE) + { + } + + public CharVector(int capacity) + { + if (capacity > 0) + { + blockSize = capacity; + } + else + { + blockSize = DEFAULT_BLOCK_SIZE; + } + array = new char[blockSize]; + n = 0; + } + + public CharVector(char[] a) + { + blockSize = DEFAULT_BLOCK_SIZE; + array = a; + n = a.Length; + } + + public CharVector(char[] a, int capacity) + { + if (capacity > 0) + { + blockSize = capacity; + } + else + { + blockSize = DEFAULT_BLOCK_SIZE; + } + array = a; + n = a.Length; + } + + /// + /// Reset Vector but don't resize or clear elements + /// + public virtual void Clear() + { + n = 0; + } + + public virtual object Clone() + { + CharVector cv = new CharVector(array, blockSize); + cv.n = this.n; + return cv; + } + + public virtual char[] Array + { + get + { + return array; + } + } + + /// + /// LUCENENET indexer for .NET + /// + /// + /// + public virtual char this[int index] + { + get { return array[index]; } + set { array[index] = value; } + } + + /// + /// return number of items in array + /// + public virtual int Length() + { + return n; + } + + /// + /// returns current capacity of array + /// + public virtual int Capacity + { + get { return array.Length; } + } + + //public virtual void Put(int index, char val) + //{ + // array[index] = val; + //} + + //public virtual char get(int index) + //{ + // return array[index]; + //} + + public virtual int Alloc(int size) + { + int index = n; + int len = array.Length; + if (n + size >= len) + { + char[] aux = new char[len + blockSize]; + System.Array.Copy(array, 0, aux, 0, len); + array = aux; + } + n += size; + return index; + } + + public virtual void TrimToSize() + { + if (n < array.Length) + { + char[] aux = new char[n]; + System.Array.Copy(array, 0, aux, 0, n); + array = aux; + } + } + } +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/7ecb7529/src/Lucene.Net.Analysis.Common/Analysis/Compound/Hyphenation/Hyphen.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Compound/Hyphenation/Hyphen.cs b/src/Lucene.Net.Analysis.Common/Analysis/Compound/Hyphenation/Hyphen.cs new file mode 100644 index 0000000..91009b1 --- /dev/null +++ b/src/Lucene.Net.Analysis.Common/Analysis/Compound/Hyphenation/Hyphen.cs @@ -0,0 +1,72 @@ +using System.Text; + +namespace Lucene.Net.Analysis.Compound.Hyphenation +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + /// + /// This class represents a hyphen. A 'full' hyphen is made of 3 parts: the + /// pre-break text, post-break text and no-break. If no line-break is generated + /// at this position, the no-break text is used, otherwise, pre-break and + /// post-break are used. Typically, pre-break is equal to the hyphen character + /// and the others are empty. However, this general scheme allows support for + /// cases in some languages where words change spelling if they're split across + /// lines, like german's 'backen' which hyphenates 'bak-ken'. BTW, this comes + /// from TeX. + /// + /// This class has been taken from the Apache FOP project (http://xmlgraphics.apache.org/fop/). They have been slightly modified. + /// + public class Hyphen + { + public string preBreak; + + public string noBreak; + + public string postBreak; + + internal Hyphen(string pre, string no, string post) + { + preBreak = pre; + noBreak = no; + postBreak = post; + } + + internal Hyphen(string pre) + { + preBreak = pre; + noBreak = null; + postBreak = null; + } + + public override string ToString() + { + if (noBreak == null && postBreak == null && preBreak != null && preBreak.Equals("-")) + { + return "-"; + } + StringBuilder res = new StringBuilder("{"); + res.Append(preBreak); + res.Append("}{"); + res.Append(postBreak); + res.Append("}{"); + res.Append(noBreak); + res.Append('}'); + return res.ToString(); + } + } +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/7ecb7529/src/Lucene.Net.Analysis.Common/Analysis/Compound/Hyphenation/Hyphenation.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Compound/Hyphenation/Hyphenation.cs b/src/Lucene.Net.Analysis.Common/Analysis/Compound/Hyphenation/Hyphenation.cs new file mode 100644 index 0000000..fdbac29 --- /dev/null +++ b/src/Lucene.Net.Analysis.Common/Analysis/Compound/Hyphenation/Hyphenation.cs @@ -0,0 +1,53 @@ +namespace Lucene.Net.Analysis.Compound.Hyphenation +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + /// + /// This class represents a hyphenated word. + /// + /// This class has been taken from the Apache FOP project (http://xmlgraphics.apache.org/fop/). They have been slightly modified. + /// + public class Hyphenation + { + + private readonly int[] hyphenPoints; + + /// + /// rawWord as made of alternating strings and instances + /// + internal Hyphenation(int[] points) + { + hyphenPoints = points; + } + + /// the number of hyphenation points in the word + public virtual int Length + { + get { return hyphenPoints.Length; } + } + + /// the hyphenation points + public virtual int[] HyphenationPoints + { + get + { + return hyphenPoints; + } + } + } +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/7ecb7529/src/Lucene.Net.Analysis.Common/Analysis/Compound/Hyphenation/HyphenationTree.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Compound/Hyphenation/HyphenationTree.cs b/src/Lucene.Net.Analysis.Common/Analysis/Compound/Hyphenation/HyphenationTree.cs new file mode 100644 index 0000000..287f6f3 --- /dev/null +++ b/src/Lucene.Net.Analysis.Common/Analysis/Compound/Hyphenation/HyphenationTree.cs @@ -0,0 +1,581 @@ +using Lucene.Net.Support; +using System; +using System.Collections.Generic; +using System.IO; +using System.Text; +using System.Xml; + +namespace Lucene.Net.Analysis.Compound.Hyphenation +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + /// + /// This tree structure stores the hyphenation patterns in an efficient way for + /// fast lookup. It provides the provides the method to hyphenate a word. + /// + /// This class has been taken from the Apache FOP project (http://xmlgraphics.apache.org/fop/). They have been slightly modified. + /// + public class HyphenationTree : TernaryTree, IPatternConsumer + { + + /// + /// value space: stores the interletter values + /// + protected internal ByteVector vspace; + + /// + /// This map stores hyphenation exceptions + /// + protected internal IDictionary> stoplist; + + /// + /// This map stores the character classes + /// + protected internal TernaryTree classmap; + + /// + /// Temporary map to store interletter values on pattern loading. + /// + [NonSerialized] + private TernaryTree ivalues; + + public HyphenationTree() + { + stoplist = new HashMap>(23); // usually a small table + classmap = new TernaryTree(); + vspace = new ByteVector(); + vspace.Alloc(1); // this reserves index 0, which we don't use + } + + /// + /// Packs the values by storing them in 4 bits, two values into a byte Values + /// range is from 0 to 9. We use zero as terminator, so we'll add 1 to the + /// value. + /// + /// a string of digits from '0' to '9' representing the + /// interletter values. + /// the index into the vspace array where the packed values are stored. + protected internal virtual int PackValues(string values) + { + int i, n = values.Length; + int m = (n & 1) == 1 ? (n >> 1) + 2 : (n >> 1) + 1; + int offset = vspace.Alloc(m); + sbyte[] va = vspace.Array; + for (i = 0; i < n; i++) + { + int j = i >> 1; + sbyte v = (sbyte)((values[i] - '0' + 1) & 0x0f); + if ((i & 1) == 1) + { + va[j + offset] = (sbyte)(va[j + offset] | v); + } + else + { + va[j + offset] = (sbyte)(v << 4); // big endian + } + } + va[m - 1 + offset] = 0; // terminator + return offset; + } + + protected internal virtual string UnpackValues(int k) + { + StringBuilder buf = new StringBuilder(); + sbyte v = vspace[k++]; + while (v != 0) + { + char c = (char)(((int)((uint)v >> 4)) - 1 + '0'); + buf.Append(c); + c = (char)(v & 0x0f); + if (c == 0) + { + break; + } + c = (char)(c - 1 + '0'); + buf.Append(c); + v = vspace[k++]; + } + return buf.ToString(); + } + + /// + /// Read hyphenation patterns from an XML file. + /// + /// the filename + /// In case the parsing fails + public virtual void LoadPatterns(string filename) + { + LoadPatterns(filename, Encoding.UTF8); + } + + /// + /// Read hyphenation patterns from an XML file. + /// + /// the filename + /// In case the parsing fails + public virtual void LoadPatterns(string filename, Encoding encoding) + { + var src = new FileStream(filename, FileMode.Open, FileAccess.Read); + LoadPatterns(src, encoding); + } + + /// + /// Read hyphenation patterns from an XML file. + /// + /// the filename + /// In case the parsing fails + public virtual void LoadPatterns(FileInfo f) + { + LoadPatterns(f, Encoding.UTF8); + } + + /// + /// Read hyphenation patterns from an XML file. + /// + /// the filename + /// In case the parsing fails + public virtual void LoadPatterns(FileInfo f, Encoding encoding) + { + var src = new FileStream(f.FullName, FileMode.Open, FileAccess.Read); + LoadPatterns(src, encoding); + } + + /// + /// Read hyphenation patterns from an XML file. + /// + /// the InputSource for the file + /// In case the parsing fails + public virtual void LoadPatterns(Stream source) + { + LoadPatterns(source, Encoding.UTF8); + } + + /// + /// Read hyphenation patterns from an XML file. + /// + /// the InputSource for the file + /// In case the parsing fails + public virtual void LoadPatterns(Stream source, Encoding encoding) + { + // LUCENENET TODO: Create overloads that allow XmlReaderSettings to be passed in. + using (var reader = XmlReader.Create(new StreamReader(source, encoding), new XmlReaderSettings + { + DtdProcessing = DtdProcessing.Parse, + XmlResolver = new PatternParser.DtdResolver() + })) + { + LoadPatterns(reader); + } + } + + public virtual void LoadPatterns(XmlReader source) + { + PatternParser pp = new PatternParser(this); + ivalues = new TernaryTree(); + + pp.Parse(source); + + // patterns/values should be now in the tree + // let's optimize a bit + TrimToSize(); + vspace.TrimToSize(); + classmap.TrimToSize(); + + // get rid of the auxiliary map + ivalues = null; + } + + public virtual string FindPattern(string pat) + { + int k = base.Find(pat); + if (k >= 0) + { + return UnpackValues(k); + } + return ""; + } + + /// + /// String compare, returns 0 if equal or t is a substring of s + /// + protected internal virtual int HStrCmp(char[] s, int si, char[] t, int ti) + { + for (; s[si] == t[ti]; si++, ti++) + { + if (s[si] == 0) + { + return 0; + } + } + if (t[ti] == 0) + { + return 0; + } + return s[si] - t[ti]; + } + + protected internal virtual sbyte[] GetValues(int k) + { + StringBuilder buf = new StringBuilder(); + sbyte v = vspace[k++]; + while (v != 0) + { + char c = (char)((((int)((uint)v >> 4))) - 1); + buf.Append(c); + c = (char)(v & 0x0f); + if (c == 0) + { + break; + } + c = (char)(c - 1); + buf.Append(c); + v = vspace[k++]; + } + sbyte[] res = new sbyte[buf.Length]; + for (int i = 0; i < res.Length; i++) + { + res[i] = (sbyte)buf[i]; + } + return res; + } + + /// + /// + /// Search for all possible partial matches of word starting at index an update + /// interletter values. In other words, it does something like: + /// + /// + /// for(i=0; i<patterns.length; i++) { + /// if ( word.substring(index).startsWidth(patterns[i]) ) + /// update_interletter_values(patterns[i]); + /// } + /// + /// + /// But it is done in an efficient way since the patterns are stored in a + /// ternary tree. In fact, this is the whole purpose of having the tree: doing + /// this search without having to test every single pattern. The number of + /// patterns for languages such as English range from 4000 to 10000. Thus, + /// doing thousands of string comparisons for each word to hyphenate would be + /// really slow without the tree. The tradeoff is memory, but using a ternary + /// tree instead of a trie, almost halves the the memory used by Lout or TeX. + /// It's also faster than using a hash table + /// + /// + /// null terminated word to match + /// start index from word + /// interletter values array to update + protected internal virtual void SearchPatterns(char[] word, int index, sbyte[] il) + { + sbyte[] values; + int i = index; + char p, q; + char sp = word[i]; + p = root; + + while (p > 0 && p < sc.Length) + { + if (sc[p] == 0xFFFF) + { + if (HStrCmp(word, i, kv.Array, lo[p]) == 0) + { + values = GetValues(eq[p]); // data pointer is in eq[] + int j = index; + for (int k = 0; k < values.Length; k++) + { + if (j < il.Length && values[k] > il[j]) + { + il[j] = values[k]; + } + j++; + } + } + return; + } + int d = sp - sc[p]; + if (d == 0) + { + if (sp == 0) + { + break; + } + sp = word[++i]; + p = eq[p]; + q = p; + + // look for a pattern ending at this position by searching for + // the null char ( splitchar == 0 ) + while (q > 0 && q < sc.Length) + { + if (sc[q] == 0xFFFF) // stop at compressed branch + { + break; + } + if (sc[q] == 0) + { + values = GetValues(eq[q]); + int j = index; + for (int k = 0; k < values.Length; k++) + { + if (j < il.Length && values[k] > il[j]) + { + il[j] = values[k]; + } + j++; + } + break; + } + else + { + q = lo[q]; + + /// + /// actually the code should be: q = sc[q] < 0 ? hi[q] : lo[q]; but + /// java chars are unsigned + /// + } + } + } + else + { + p = d < 0 ? lo[p] : hi[p]; + } + } + } + + /// + /// Hyphenate word and return a Hyphenation object. + /// + /// the word to be hyphenated + /// Minimum number of characters allowed before the + /// hyphenation point. + /// Minimum number of characters allowed after the + /// hyphenation point. + /// a object representing the + /// hyphenated word or null if word is not hyphenated. + public virtual Hyphenation Hyphenate(string word, int remainCharCount, int pushCharCount) + { + char[] w = word.ToCharArray(); + return Hyphenate(w, 0, w.Length, remainCharCount, pushCharCount); + } + + /// + /// w = "****nnllllllnnn*****", where n is a non-letter, l is a letter, all n + /// may be absent, the first n is at offset, the first l is at offset + + /// iIgnoreAtBeginning; word = ".llllll.'\0'***", where all l in w are copied + /// into word. In the first part of the routine len = w.length, in the second + /// part of the routine len = word.length. Three indices are used: index(w), + /// the index in w, index(word), the index in word, letterindex(word), the + /// index in the letter part of word. The following relations exist: index(w) = + /// offset + i - 1 index(word) = i - iIgnoreAtBeginning letterindex(word) = + /// index(word) - 1 (see first loop). It follows that: index(w) - index(word) = + /// offset - 1 + iIgnoreAtBeginning index(w) = letterindex(word) + offset + + /// iIgnoreAtBeginning + /// + + /// + /// Hyphenate word and return an array of hyphenation points. + /// + /// char array that contains the word + /// Offset to first character in word + /// Length of word + /// Minimum number of characters allowed before the + /// hyphenation point. + /// Minimum number of characters allowed after the + /// hyphenation point. + /// a object representing the + /// hyphenated word or null if word is not hyphenated. + public virtual Hyphenation Hyphenate(char[] w, int offset, int len, int remainCharCount, int pushCharCount) + { + int i; + char[] word = new char[len + 3]; + + // normalize word + char[] c = new char[2]; + int iIgnoreAtBeginning = 0; + int iLength = len; + bool bEndOfLetters = false; + for (i = 1; i <= len; i++) + { + c[0] = w[offset + i - 1]; + int nc = classmap.Find(c, 0); + if (nc < 0) // found a non-letter character ... + { + if (i == (1 + iIgnoreAtBeginning)) + { + // ... before any letter character + iIgnoreAtBeginning++; + } + else + { + // ... after a letter character + bEndOfLetters = true; + } + iLength--; + } + else + { + if (!bEndOfLetters) + { + word[i - iIgnoreAtBeginning] = (char)nc; + } + else + { + return null; + } + } + } + len = iLength; + if (len < (remainCharCount + pushCharCount)) + { + // word is too short to be hyphenated + return null; + } + int[] result = new int[len + 1]; + int k = 0; + + // check exception list first + string sw = new string(word, 1, len); + if (stoplist.ContainsKey(sw)) + { + // assume only simple hyphens (Hyphen.pre="-", Hyphen.post = Hyphen.no = + // null) + IList hw = stoplist[sw]; + int j = 0; + for (i = 0; i < hw.Count; i++) + { + object o = hw[i]; + // j = index(sw) = letterindex(word)? + // result[k] = corresponding index(w) + if (o is string) + { + j += ((string)o).Length; + if (j >= remainCharCount && j < (len - pushCharCount)) + { + result[k++] = j + iIgnoreAtBeginning; + } + } + } + } + else + { + // use algorithm to get hyphenation points + word[0] = '.'; // word start marker + word[len + 1] = '.'; // word end marker + word[len + 2] = (char)0; // null terminated + sbyte[] il = new sbyte[len + 3]; // initialized to zero + for (i = 0; i < len + 1; i++) + { + SearchPatterns(word, i, il); + } + + // hyphenation points are located where interletter value is odd + // i is letterindex(word), + // i + 1 is index(word), + // result[k] = corresponding index(w) + for (i = 0; i < len; i++) + { + if (((il[i + 1] & 1) == 1) && i >= remainCharCount && i <= (len - pushCharCount)) + { + result[k++] = i + iIgnoreAtBeginning; + } + } + } + + if (k > 0) + { + // trim result array + int[] res = new int[k + 2]; + Array.Copy(result, 0, res, 1, k); + // We add the synthetical hyphenation points + // at the beginning and end of the word + res[0] = 0; + res[k + 1] = len; + return new Hyphenation(res); + } + else + { + return null; + } + } + + /// + /// Add a character class to the tree. It is used by + /// as callback to add character classes. + /// Character classes define the valid word characters for hyphenation. If a + /// word contains a character not defined in any of the classes, it is not + /// hyphenated. It also defines a way to normalize the characters in order to + /// compare them with the stored patterns. Usually pattern files use only lower + /// case characters, in this case a class for letter 'a', for example, should + /// be defined as "aA", the first character being the normalization char. + /// + public virtual void AddClass(string chargroup) + { + if (chargroup.Length > 0) + { + char equivChar = chargroup[0]; + char[] key = new char[2]; + key[1] = (char)0; + for (int i = 0; i < chargroup.Length; i++) + { + key[0] = chargroup[i]; + classmap.Insert(key, 0, equivChar); + } + } + } + + /// + /// Add an exception to the tree. It is used by + /// class as callback to store the + /// hyphenation exceptions. + /// + /// normalized word + /// a vector of alternating strings and + /// objects. + public virtual void AddException(string word, List hyphenatedword) + { + stoplist[word] = hyphenatedword; + } + + /// + /// Add a pattern to the tree. Mainly, to be used by + /// class as callback to add a pattern to + /// the tree. + /// + /// the hyphenation pattern + /// interletter weight values indicating the desirability and + /// priority of hyphenating at a given point within the pattern. It + /// should contain only digit characters. (i.e. '0' to '9'). + public virtual void AddPattern(string pattern, string ivalue) + { + int k = ivalues.Find(ivalue); + if (k <= 0) + { + k = PackValues(ivalue); + ivalues.Insert(ivalue, (char)k); + } + Insert(pattern, (char)k); + } + + // public override void printStats(PrintStream @out) + // { + //@out.println("Value space size = " + Convert.ToString(vspace.length())); + //base.printStats(@out); + + // } + } +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/7ecb7529/src/Lucene.Net.Analysis.Common/Analysis/Compound/Hyphenation/PatternConsumer.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Compound/Hyphenation/PatternConsumer.cs b/src/Lucene.Net.Analysis.Common/Analysis/Compound/Hyphenation/PatternConsumer.cs new file mode 100644 index 0000000..069badd --- /dev/null +++ b/src/Lucene.Net.Analysis.Common/Analysis/Compound/Hyphenation/PatternConsumer.cs @@ -0,0 +1,54 @@ +using System.Collections.Generic; + +namespace Lucene.Net.Analysis.Compound.Hyphenation +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + /// + /// This interface is used to connect the XML pattern file parser to the + /// hyphenation tree. + /// + /// This class has been taken from the Apache FOP project (http://xmlgraphics.apache.org/fop/). They have been slightly modified. + /// + public interface IPatternConsumer + { + + /// + /// Add a character class. A character class defines characters that are + /// considered equivalent for the purpose of hyphenation (e.g. "aA"). It + /// usually means to ignore case. + /// + /// character group + void AddClass(string chargroup); + + /// + /// Add a hyphenation exception. An exception replaces the result obtained by + /// the algorithm for cases for which this fails or the user wants to provide + /// his own hyphenation. A hyphenatedword is a vector of alternating String's + /// and instances + /// + void AddException(string word, List hyphenatedword); + + /// + /// Add hyphenation patterns. + /// + /// the pattern + /// interletter values expressed as a string of digit characters. + void AddPattern(string pattern, string values); + } +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/7ecb7529/src/Lucene.Net.Analysis.Common/Analysis/Compound/Hyphenation/PatternParser.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Compound/Hyphenation/PatternParser.cs b/src/Lucene.Net.Analysis.Common/Analysis/Compound/Hyphenation/PatternParser.cs new file mode 100644 index 0000000..8c00d19 --- /dev/null +++ b/src/Lucene.Net.Analysis.Common/Analysis/Compound/Hyphenation/PatternParser.cs @@ -0,0 +1,483 @@ +using System; +using System.Collections.Generic; +using System.IO; +using System.Linq; +using System.Text; +using System.Xml; + +namespace Lucene.Net.Analysis.Compound.Hyphenation +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + /// + /// A XMLReader document handler to read and parse hyphenation patterns from a XML + /// file. + /// + /// LUCENENET: This class has been refactored from its Java counterpart to use XmlReader rather + /// than a SAX parser. + /// + public class PatternParser + { + internal int currElement; + + internal IPatternConsumer consumer; + + internal StringBuilder token; + + internal List exception; + + internal char hyphenChar; + + internal string errMsg; + + internal const int ELEM_CLASSES = 1; + + internal const int ELEM_EXCEPTIONS = 2; + + internal const int ELEM_PATTERNS = 3; + + internal const int ELEM_HYPHEN = 4; + + public PatternParser() + { + token = new StringBuilder(); + hyphenChar = '-'; // default + } + + public PatternParser(IPatternConsumer consumer) : this() + { + this.consumer = consumer; + } + + public virtual IPatternConsumer Consumer + { + set + { + this.consumer = value; + } + } + + /// + /// Parses a hyphenation pattern file. + /// + /// the filename + /// In case of an exception while parsing + public virtual void Parse(string filename) + { + // LUCENENET TODO: Create overloads that allow XmlReaderSettings to be passed in. + using (var src = XmlReader.Create(filename, new XmlReaderSettings + { + DtdProcessing = DtdProcessing.Parse, + XmlResolver = new DtdResolver() + })) + { + Parse(src); + } + } + + /// + /// Parses a hyphenation pattern file. + /// + /// the pattern file + public virtual void Parse(FileInfo file) + { + Parse(file, Encoding.UTF8); + } + + /// + /// Parses a hyphenation pattern file. + /// + /// the pattern file + public virtual void Parse(FileInfo file, Encoding encoding) + { + using (var src = XmlReader.Create(new StreamReader(file.FullName, encoding), new XmlReaderSettings + { + DtdProcessing = DtdProcessing.Parse, + XmlResolver = new DtdResolver() + })) + { + + Parse(src); + } + } + + /// + /// Parses a hyphenation pattern file. + /// + /// the pattern file + public virtual void Parse(Stream xmlStream) + { + using (var src = XmlReader.Create(xmlStream, new XmlReaderSettings + { + DtdProcessing = DtdProcessing.Parse, + XmlResolver = new DtdResolver() + })) + { + Parse(src); + } + } + + /// + /// Parses a hyphenation pattern file. + /// + /// the InputSource for the file + /// In case of an exception while parsing + public virtual void Parse(XmlReader source) + { + source.MoveToContent(); + while (source.Read()) + { + ParseNode(source); + } + } + + private void ParseNode(XmlReader node) + { + string uri, name, raw; + switch (node.NodeType) + { + case XmlNodeType.Element: + + // Element start + uri = node.NamespaceURI; + name = node.Name; + bool isEmptyElement = node.IsEmptyElement; + var attributes = GetAttributes(node); + raw = string.Empty; // node.ReadOuterXml(); - not used, but was messing with the node pointer + + this.StartElement(uri, name, raw, attributes); + if (isEmptyElement) + { + this.EndElement(uri, name, raw); + } + break; + + case XmlNodeType.Text: + + this.Characters(node.Value.ToCharArray(), 0, node.Value.Length); + break; + + case XmlNodeType.EndElement: + uri = node.NamespaceURI; + name = node.Name; + raw = string.Empty; // node.ReadOuterXml(); - not used, but was messing with the node pointer + + // Element end + this.EndElement(uri, name, raw); + break; + } + } + + private IDictionary GetAttributes(XmlReader node) + { + var result = new Dictionary(); + if (node.HasAttributes) + { + for (int i = 0; i < node.AttributeCount; i++) + { + node.MoveToAttribute(i); + result.Add(node.Name, node.Value); + } + } + + return result; + } + + protected internal virtual string ReadToken(StringBuilder chars) + { + string word; + bool space = false; + int i; + for (i = 0; i < chars.Length; i++) + { + if (char.IsWhiteSpace(chars[i])) + { + space = true; + } + else + { + break; + } + } + if (space) + { + // chars.delete(0,i); + for (int countr = i; countr < chars.Length; countr++) + { + chars[countr - i] = chars[countr]; + } + chars.Length = chars.Length - i; + if (token.Length > 0) + { + word = token.ToString(); + token.Length = 0; + return word; + } + } + space = false; + for (i = 0; i < chars.Length; i++) + { + if (char.IsWhiteSpace(chars[i])) + { + space = true; + break; + } + } + token.Append(chars.ToString(0, i - 0)); + // chars.delete(0,i); + for (int countr = i; countr < chars.Length; countr++) + { + chars[countr - i] = chars[countr]; + } + chars.Length = chars.Length - i; + if (space) + { + word = token.ToString(); + token.Length = 0; + return word; + } + token.Append(chars.ToString()); + return null; + } + + protected internal static string GetPattern(string word) + { + StringBuilder pat = new StringBuilder(); + int len = word.Length; + for (int i = 0; i < len; i++) + { + if (!char.IsDigit(word[i])) + { + pat.Append(word[i]); + } + } + return pat.ToString(); + } + + protected internal virtual List NormalizeException(List ex) + { + List res = new List(); + for (int i = 0; i < ex.Count; i++) + { + object item = ex[i]; + if (item is string) + { + string str = (string)item; + StringBuilder buf = new StringBuilder(); + for (int j = 0; j < str.Length; j++) + { + char c = str[j]; + if (c != hyphenChar) + { + buf.Append(c); + } + else + { + res.Add(buf.ToString()); + buf.Length = 0; + char[] h = new char[1]; + h[0] = hyphenChar; + // we use here hyphenChar which is not necessarily + // the one to be printed + res.Add(new Hyphen(new string(h), null, null)); + } + } + if (buf.Length > 0) + { + res.Add(buf.ToString()); + } + } + else + { + res.Add(item); + } + } + return res; + } + + protected internal virtual string GetExceptionWord(List ex) + { + StringBuilder res = new StringBuilder(); + for (int i = 0; i < ex.Count; i++) + { + object item = ex[i]; + if (item is string) + { + res.Append((string)item); + } + else + { + if (((Hyphen)item).noBreak != null) + { + res.Append(((Hyphen)item).noBreak); + } + } + } + return res.ToString(); + } + + protected internal static string GetInterletterValues(string pat) + { + StringBuilder il = new StringBuilder(); + string word = pat + "a"; // add dummy letter to serve as sentinel + int len = word.Length; + for (int i = 0; i < len; i++) + { + char c = word[i]; + if (char.IsDigit(c)) + { + il.Append(c); + i++; + } + else + { + il.Append('0'); + } + } + return il.ToString(); + } + + /// + /// LUCENENET specific helper class to force the DTD file to be read from the embedded resource + /// rather than from the file system. + /// + internal class DtdResolver : XmlUrlResolver + { + public override object GetEntity(Uri absoluteUri, string role, Type ofObjectToReturn) + { + string dtdFilename = "hyphenation.dtd"; + if (dtdFilename.Equals(absoluteUri.Segments.LastOrDefault(), StringComparison.OrdinalIgnoreCase)) + { + var qualifedDtdFilename = string.Concat(GetType().Namespace, ".", dtdFilename); + return GetType().Assembly.GetManifestResourceStream(qualifedDtdFilename); + } + + return base.GetEntity(absoluteUri, role, ofObjectToReturn); + } + } + + // + // ContentHandler methods + // + + /// + public void StartElement(string uri, string local, string raw, IDictionary attrs) + { + if (local.Equals("hyphen-char")) + { + string h = attrs.ContainsKey("value") ? attrs["value"] : null; + if (h != null && h.Length == 1) + { + hyphenChar = h[0]; + } + } + else if (local.Equals("classes")) + { + currElement = ELEM_CLASSES; + } + else if (local.Equals("patterns")) + { + currElement = ELEM_PATTERNS; + } + else if (local.Equals("exceptions")) + { + currElement = ELEM_EXCEPTIONS; + exception = new List(); + } + else if (local.Equals("hyphen")) + { + if (token.Length > 0) + { + exception.Add(token.ToString()); + } + exception.Add(new Hyphen(attrs["pre"], attrs["no"], attrs["post"])); + currElement = ELEM_HYPHEN; + } + token.Length = 0; + } + + /// + public void EndElement(string uri, string local, string raw) + { + if (token.Length > 0) + { + string word = token.ToString(); + switch (currElement) + { + case ELEM_CLASSES: + consumer.AddClass(word); + break; + case ELEM_EXCEPTIONS: + exception.Add(word); + exception = NormalizeException(exception); + consumer.AddException(GetExceptionWord(exception), new List(exception)); + break; + case ELEM_PATTERNS: + consumer.AddPattern(GetPattern(word), GetInterletterValues(word)); + break; + case ELEM_HYPHEN: + // nothing to do + break; + } + if (currElement != ELEM_HYPHEN) + { + token.Length = 0; + } + } + if (currElement == ELEM_HYPHEN) + { + currElement = ELEM_EXCEPTIONS; + } + else + { + currElement = 0; + } + } + + /// + public void Characters(char[] ch, int start, int length) + { + StringBuilder chars = new StringBuilder(length); + chars.Append(ch, start, length); + string word = ReadToken(chars); + while (word != null) + { + // System.out.println("\"" + word + "\""); + switch (currElement) + { + case ELEM_CLASSES: + consumer.AddClass(word); + break; + case ELEM_EXCEPTIONS: + exception.Add(word); + exception = NormalizeException(exception); + consumer.AddException(GetExceptionWord(exception), new List(exception)); + exception.Clear(); + break; + case ELEM_PATTERNS: + consumer.AddPattern(GetPattern(word), GetInterletterValues(word)); + break; + } + word = ReadToken(chars); + } + + } + } +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/7ecb7529/src/Lucene.Net.Analysis.Common/Analysis/Compound/Hyphenation/TernaryTree.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Compound/Hyphenation/TernaryTree.cs b/src/Lucene.Net.Analysis.Common/Analysis/Compound/Hyphenation/TernaryTree.cs new file mode 100644 index 0000000..88cfd01 --- /dev/null +++ b/src/Lucene.Net.Analysis.Common/Analysis/Compound/Hyphenation/TernaryTree.cs @@ -0,0 +1,816 @@ +using System; +using System.Collections; +using System.Collections.Generic; +using System.IO; +using System.Text; + +namespace Lucene.Net.Analysis.Compound.Hyphenation +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + /// + ///

Ternary Search Tree.

+ /// + /// + /// A ternary search tree is a hybrid between a binary tree and a digital search + /// tree (trie). Keys are limited to strings. A data value of type char is stored + /// in each leaf node. It can be used as an index (or pointer) to the data. + /// Branches that only contain one key are compressed to one node by storing a + /// pointer to the trailer substring of the key. This class is intended to serve + /// as base class or helper class to implement Dictionary collections or the + /// like. Ternary trees have some nice properties as the following: the tree can + /// be traversed in sorted order, partial matches (wildcard) can be implemented, + /// retrieval of all keys within a given distance from the target, etc. The + /// storage requirements are higher than a binary tree but a lot less than a + /// trie. Performance is comparable with a hash table, sometimes it outperforms a + /// hash function (most of the time can determine a miss faster than a hash). + /// + /// + /// + /// The main purpose of this java port is to serve as a base for implementing + /// TeX's hyphenation algorithm (see The TeXBook, appendix H). Each language + /// requires from 5000 to 15000 hyphenation patterns which will be keys in this + /// tree. The strings patterns are usually small (from 2 to 5 characters), but + /// each char in the tree is stored in a node. Thus memory usage is the main + /// concern. We will sacrifice 'elegance' to keep memory requirements to the + /// minimum. Using java's char type as pointer (yes, I know pointer it is a + /// forbidden word in java) we can keep the size of the node to be just 8 bytes + /// (3 pointers and the data char). This gives room for about 65000 nodes. In my + /// tests the english patterns took 7694 nodes and the german patterns 10055 + /// nodes, so I think we are safe. + /// + /// + /// + /// All said, this is a map with strings as keys and char as value. Pretty + /// limited!. It can be extended to a general map by using the string + /// representation of an object and using the char value as an index to an array + /// that contains the object values. + /// + /// + /// This class has been taken from the Apache FOP project (http://xmlgraphics.apache.org/fop/). They have been slightly modified. + ///
+ + public class TernaryTree : ICloneable + { + /// + /// We use 4 arrays to represent a node. I guess I should have created a proper + /// node class, but somehow Knuth's pascal code made me forget we now have a + /// portable language with virtual memory management and automatic garbage + /// collection! And now is kind of late, furthermore, if it ain't broken, don't + /// fix it. + /// + + /// + /// Pointer to low branch and to rest of the key when it is stored directly in + /// this node, we don't have unions in java! + /// + protected internal char[] lo; + + /// + /// Pointer to high branch. + /// + protected internal char[] hi; + + /// + /// Pointer to equal branch and to data when this node is a string terminator. + /// + protected internal char[] eq; + + /// + ///

+ /// The character stored in this node: splitchar. Two special values are + /// reserved: + ///

+ ///
    + ///
  • 0x0000 as string terminator
  • + ///
  • 0xFFFF to indicate that the branch starting at this node is compressed
  • + ///
+ /// + /// This shouldn't be a problem if we give the usual semantics to strings since + /// 0xFFFF is guaranteed not to be an Unicode character. + /// + ///
+ protected internal char[] sc; + + /// + /// This vector holds the trailing of the keys when the branch is compressed. + /// + protected internal CharVector kv; + + protected internal char root; + + protected internal char freenode; + + protected internal int length; // number of items in tree + + protected internal const int BLOCK_SIZE = 2048; // allocation size for arrays + + internal TernaryTree() + { + Init(); + } + + protected internal virtual void Init() + { + root = (char)0; + freenode = (char)1; + length = 0; + lo = new char[BLOCK_SIZE]; + hi = new char[BLOCK_SIZE]; + eq = new char[BLOCK_SIZE]; + sc = new char[BLOCK_SIZE]; + kv = new CharVector(); + } + + /// + /// Branches are initially compressed, needing one node per key plus the size + /// of the string key. They are decompressed as needed when another key with + /// same prefix is inserted. This saves a lot of space, specially for long + /// keys. + /// + public virtual void Insert(string key, char val) + { + // make sure we have enough room in the arrays + int len = key.Length + 1; // maximum number of nodes that may be generated + if (freenode + len > eq.Length) + { + RedimNodeArrays(eq.Length + BLOCK_SIZE); + } + char[] strkey = new char[len--]; + key.CopyTo(0, strkey, 0, len - 0); + strkey[len] = (char)0; + root = Insert(root, strkey, 0, val); + } + + public virtual void Insert(char[] key, int start, char val) + { + int len = StrLen(key) + 1; + if (freenode + len > eq.Length) + { + RedimNodeArrays(eq.Length + BLOCK_SIZE); + } + root = Insert(root, key, start, val); + } + + /// + /// The actual insertion function, recursive version. + /// + private char Insert(char p, char[] key, int start, char val) + { + int len = StrLen(key, start); + if (p == 0) + { + // this means there is no branch, this node will start a new branch. + // Instead of doing that, we store the key somewhere else and create + // only one node with a pointer to the key + p = freenode++; + eq[p] = val; // holds data + length++; + hi[p] = (char)0; + if (len > 0) + { + sc[p] = (char)0xFFFF; // indicates branch is compressed + lo[p] = (char)kv.Alloc(len + 1); // use 'lo' to hold pointer to key + StrCpy(kv.Array, lo[p], key, start); + } + else + { + sc[p] = (char)0; + lo[p] = (char)0; + } + return p; + } + + if (sc[p] == 0xFFFF) + { + // branch is compressed: need to decompress + // this will generate garbage in the external key array + // but we can do some garbage collection later + char pp = freenode++; + lo[pp] = lo[p]; // previous pointer to key + eq[pp] = eq[p]; // previous pointer to data + lo[p] = (char)0; + if (len > 0) + { + sc[p] = kv[lo[pp]]; + eq[p] = pp; + lo[pp]++; + if (kv[lo[pp]] == 0) + { + // key completly decompressed leaving garbage in key array + lo[pp] = (char)0; + sc[pp] = (char)0; + hi[pp] = (char)0; + } + else + { + // we only got first char of key, rest is still there + sc[pp] = (char)0xFFFF; + } + } + else + { + // In this case we can save a node by swapping the new node + // with the compressed node + sc[pp] = (char)0xFFFF; + hi[p] = pp; + sc[p] = (char)0; + eq[p] = val; + length++; + return p; + } + } + char s = key[start]; + if (s < sc[p]) + { + lo[p] = Insert(lo[p], key, start, val); + } + else if (s == sc[p]) + { + if (s != 0) + { + eq[p] = Insert(eq[p], key, start + 1, val); + } + else + { + // key already in tree, overwrite data + eq[p] = val; + } + } + else + { + hi[p] = Insert(hi[p], key, start, val); + } + return p; + } + + /// + /// Compares 2 null terminated char arrays + /// + public static int StrCmp(char[] a, int startA, char[] b, int startB) + { + for (; a[startA] == b[startB]; startA++, startB++) + { + if (a[startA] == 0) + { + return 0; + } + } + return a[startA] - b[startB]; + } + + /// + /// Compares a string with null terminated char array + /// + public static int StrCmp(string str, char[] a, int start) + { + int i, d, len = str.Length; + for (i = 0; i < len; i++) + { + d = (int)str[i] - a[start + i]; + if (d != 0) + { + return d; + } + if (a[start + i] == 0) + { + return d; + } + } + if (a[start + i] != 0) + { + return -a[start + i]; + } + return 0; + + } + + public static void StrCpy(char[] dst, int di, char[] src, int si) + { + while (src[si] != 0) + { + dst[di++] = src[si++]; + } + dst[di] = (char)0; + } + + public static int StrLen(char[] a, int start) + { + int len = 0; + for (int i = start; i < a.Length && a[i] != 0; i++) + { + len++; + } + return len; + } + + public static int StrLen(char[] a) + { + return StrLen(a, 0); + } + + public virtual int Find(string key) + { + int len = key.Length; + char[] strkey = new char[len + 1]; + key.CopyTo(0, strkey, 0, len - 0); + strkey[len] = (char)0; + + return Find(strkey, 0); + } + + public virtual int Find(char[] key, int start) + { + int d; + char p = root; + int i = start; + char c; + + while (p != 0) + { + if (sc[p] == 0xFFFF) + { + if (StrCmp(key, i, kv.Array, lo[p]) == 0) + { + return eq[p]; + } + else + { + return -1; + } + } + c = key[i]; + d = c - sc[p]; + if (d == 0) + { + if (c == 0) + { + return eq[p]; + } + i++; + p = eq[p]; + } + else if (d < 0) + { + p = lo[p]; + } + else + { + p = hi[p]; + } + } + return -1; + } + + public virtual bool Knows(string key) + { + return (Find(key) >= 0); + } + + // redimension the arrays + private void RedimNodeArrays(int newsize) + { + int len = newsize < lo.Length ? newsize : lo.Length; + char[] na = new char[newsize]; + Array.Copy(lo, 0, na, 0, len); + lo = na; + na = new char[newsize]; + Array.Copy(hi, 0, na, 0, len); + hi = na; + na = new char[newsize]; + Array.Copy(eq, 0, na, 0, len); + eq = na; + na = new char[newsize]; + Array.Copy(sc, 0, na, 0, len); + sc = na; + } + + public virtual int Length + { + get { return length; } + } + + public object Clone() + { + TernaryTree t = new TernaryTree(); + t.lo = (char[])this.lo.Clone(); + t.hi = (char[])this.hi.Clone(); + t.eq = (char[])this.eq.Clone(); + t.sc = (char[])this.sc.Clone(); + t.kv = (CharVector)this.kv.Clone(); + t.root = this.root; + t.freenode = this.freenode; + t.length = this.length; + + return t; + } + + /// + /// Recursively insert the median first and then the median of the lower and + /// upper halves, and so on in order to get a balanced tree. The array of keys + /// is assumed to be sorted in ascending order. + /// + protected internal virtual void InsertBalanced(string[] k, char[] v, int offset, int n) + { + int m; + if (n < 1) + { + return; + } + m = n >> 1; + + Insert(k[m + offset], v[m + offset]); + InsertBalanced(k, v, offset, m); + + InsertBalanced(k, v, offset + m + 1, n - m - 1); + } + + /// + /// Balance the tree for best search performance + /// + public virtual void Balance() + { + // System.out.print("Before root splitchar = "); + // System.out.println(sc[root]); + + int i = 0, n = length; + string[] k = new string[n]; + char[] v = new char[n]; + Iterator iter = new Iterator(this); + while (iter.MoveNext()) + { + v[i] = iter.Value; + k[i++] = iter.Current; + } + Init(); + InsertBalanced(k, v, 0, n); + + // With uniform letter distribution sc[root] should be around 'm' + // System.out.print("After root splitchar = "); + // System.out.println(sc[root]); + } + + /// + /// Each node stores a character (splitchar) which is part of some key(s). In a + /// compressed branch (one that only contain a single string key) the trailer + /// of the key which is not already in nodes is stored externally in the kv + /// array. As items are inserted, key substrings decrease. Some substrings may + /// completely disappear when the whole branch is totally decompressed. The + /// tree is traversed to find the key substrings actually used. In addition, + /// duplicate substrings are removed using a map (implemented with a + /// TernaryTree!). + /// + /// + public virtual void TrimToSize() + { + // first balance the tree for best performance + Balance(); + + // redimension the node arrays + RedimNodeArrays(freenode); + + // ok, compact kv array + CharVector kx = new CharVector(); + kx.Alloc(1); + TernaryTree map = new TernaryTree(); + Compact(kx, map, root); + kv = kx; + kv.TrimToSize(); + } + + private void Compact(CharVector kx, TernaryTree map, char p) + { + int k; + if (p == 0) + { + return; + } + if (sc[p] == 0xFFFF) + { + k = map.Find(kv.Array, lo[p]); + if (k < 0) + { + k = kx.Alloc(StrLen(kv.Array, lo[p]) + 1); + StrCpy(kx.Array, k, kv.Array, lo[p]); + map.Insert(kx.Array, k, (char)k); + } + lo[p] = (char)k; + } + else + { + Compact(kx, map, lo[p]); + if (sc[p] != 0) + { + Compact(kx, map, eq[p]); + } + Compact(kx, map, hi[p]); + } + } + + public virtual IEnumerator Keys() + { + return new Iterator(this); + } + + /// + /// Enumerator for TernaryTree + /// + /// LUCENENET NOTE: This differs a bit from its Java counterpart to adhere to + /// .NET IEnumerator semantics. In Java, when the is + /// instantiated, it is already positioned at the first element. However, + /// to act like a .NET IEnumerator, the initial state is undefined and considered + /// to be before the first element until is called, and + /// if a move took place it will return true; + /// + public class Iterator : IEnumerator + { + private readonly TernaryTree outerInstance; + + + /// + /// current node index + /// + private int cur; + + /// + /// current key + /// + private string curkey; + + internal class Item : ICloneable + { + internal char parent; + internal char child; + + public Item() + { + parent = (char)0; + child = (char)0; + } + + public Item(char p, char c) + { + parent = p; + child = c; + } + + public object Clone() + { + return new Item(parent, child); + } + + } + + /// + /// Node stack + /// + internal Stack ns; + + /// + /// key stack implemented with a StringBuilder + /// + internal StringBuilder ks; + + private bool isInitialized = false; + + public Iterator(TernaryTree outerInstance) + { + this.outerInstance = outerInstance; + cur = -1; + ns = new Stack(); + ks = new StringBuilder(); + isInitialized = false; + } + + public virtual void Rewind() + { + ns.Clear(); + ks.Length = 0; + cur = outerInstance.root; + Run(); + } + + public virtual char Value + { + get + { + if (cur >= 0) + { + return outerInstance.eq[cur]; + } + return (char)0; + } + } + + /// + /// traverse upwards + /// + internal virtual int Up() + { + Item i = new Item(); + int res = 0; + + if (ns.Count == 0) + { + return -1; + } + + if (cur != 0 && outerInstance.sc[cur] == 0) + { + return outerInstance.lo[cur]; + } + + bool climb = true; + + while (climb) + { + i = ns.Pop(); + i.child++; + switch ((int)i.child) + { + case 1: + if (outerInstance.sc[i.parent] != 0) + { + res = outerInstance.eq[i.parent]; + ns.Push((Item)i.Clone()); + ks.Append(outerInstance.sc[i.parent]); + } + else + { + i.child++; + ns.Push((Item)i.Clone()); + res = outerInstance.hi[i.parent]; + } + climb = false; + break; + + case 2: + res = outerInstance.hi[i.parent]; + ns.Push((Item)i.Clone()); + if (ks.Length > 0) + { + ks.Length = ks.Length - 1; // pop + } + climb = false; + break; + + default: + if (ns.Count == 0) + { + return -1; + } + climb = true; + break; + } + } + return res; + } + + /// + /// traverse the tree to find next key + /// + internal virtual int Run() + { + if (cur == -1) + { + return -1; + } + + bool leaf = false; + while (true) + { + // first go down on low branch until leaf or compressed branch + while (cur != 0) + { + if (outerInstance.sc[cur] == 0xFFFF) + { + leaf = true; + break; + } + ns.Push(new Item((char)cur, '\u0000')); + if (outerInstance.sc[cur] == 0) + { + leaf = true; + break; + } + cur = outerInstance.lo[cur]; + } + if (leaf) + { + break; + } + // nothing found, go up one node and try again + cur = Up(); + if (cur == -1) + { + return -1; + } + } + // The current node should be a data node and + // the key should be in the key stack (at least partially) + StringBuilder buf = new StringBuilder(ks.ToString()); + if (outerInstance.sc[cur] == 0xFFFF) + { + int p = outerInstance.lo[cur]; + while (outerInstance.kv[p] != 0) + { + buf.Append(outerInstance.kv[p++]); + } + } + curkey = buf.ToString(); + return 0; + } + + #region Added for better .NET support + public string Current + { + get + { + return curkey; + } + } + + object IEnumerator.Current + { + get + { + return Current; + } + } + + public void Dispose() + { + // nothing to do + } + + public bool MoveNext() + { + if (!isInitialized) + { + Rewind(); + isInitialized = true; + return cur != -1; + } + if (cur == -1) + { + return false; + } + cur = Up(); + Run(); + return cur != -1; + } + + public void Reset() + { + throw new NotSupportedException(); + } + + #endregion + } + + public virtual void PrintStats(TextWriter @out) + { + @out.WriteLine("Number of keys = " + Convert.ToString(length)); + @out.WriteLine("Node count = " + Convert.ToString(freenode)); + // System.out.println("Array length = " + Integer.toString(eq.length)); + @out.WriteLine("Key Array length = " + Convert.ToString(kv.Length())); + + /* + * for(int i=0; i