lucenenet-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From paulir...@apache.org
Subject [09/53] [abbrv] Finish Contrib.QueryParsers and Sandbox
Date Thu, 07 Nov 2013 13:53:24 GMT
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b43f529b/src/contrib/Sandbox/Queries/SlowCollatedStringComparer.cs
----------------------------------------------------------------------
diff --git a/src/contrib/Sandbox/Queries/SlowCollatedStringComparer.cs b/src/contrib/Sandbox/Queries/SlowCollatedStringComparer.cs
new file mode 100644
index 0000000..fd380d9
--- /dev/null
+++ b/src/contrib/Sandbox/Queries/SlowCollatedStringComparer.cs
@@ -0,0 +1,135 @@
+using Lucene.Net.Index;
+using Lucene.Net.Search;
+using Lucene.Net.Util;
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+
+namespace Lucene.Net.Sandbox.Queries
+{
+    /// <summary>
+    /// A <see cref="FieldComparator{T}"/> that orders hits by a string field using a
+    /// caller-supplied <see cref="StringComparer"/>. Values are obtained from the field
+    /// cache and decoded to <see cref="string"/> before every comparison.
+    /// </summary>
+    /// <remarks>
+    /// A document without a value is represented as <c>null</c>; <c>null</c> sorts
+    /// before any non-null value and compares equal to another <c>null</c>.
+    /// </remarks>
+    public sealed class SlowCollatedStringComparator : FieldComparator<String>
+    {
+        // One decoded value per competitive slot; null means "no value".
+        private readonly String[] values;
+        // Terms source for the segment currently being collected; set by SetNextReader.
+        private BinaryDocValues currentDocTerms;
+        private readonly string field;
+        private readonly StringComparer collator;
+        // Decoded value of the current bottom slot; set by SetBottom.
+        private string bottom;
+        // Scratch buffer reused for every field-cache lookup.
+        private readonly BytesRef tempBR = new BytesRef();
+
+        /// <summary>Creates a comparator for <paramref name="field"/>.</summary>
+        /// <param name="numHits">Number of slots to allocate for competitive values.</param>
+        /// <param name="field">Name of the field whose values are compared.</param>
+        /// <param name="collator">Comparer that defines the ordering of non-null values.</param>
+        public SlowCollatedStringComparator(int numHits, string field, StringComparer collator)
+        {
+            values = new string[numHits];
+            this.field = field;
+            this.collator = collator;
+        }
+
+        /// <summary>Compares the values stored in two slots.</summary>
+        public override int Compare(int slot1, int slot2)
+        {
+            return CompareValues(values[slot1], values[slot2]);
+        }
+
+        /// <summary>Compares the current bottom value with the value of <paramref name="doc"/>.</summary>
+        public override int CompareBottom(int doc)
+        {
+            return CompareValues(bottom, ReadDocValue(doc));
+        }
+
+        /// <summary>Stores the value of <paramref name="doc"/> into <paramref name="slot"/>.</summary>
+        public override void Copy(int slot, int doc)
+        {
+            values[slot] = ReadDocValue(doc);
+        }
+
+        /// <summary>
+        /// Switches to the terms of the next segment, fetched from the default field cache.
+        /// </summary>
+        /// <returns>This comparator.</returns>
+        public override FieldComparator SetNextReader(AtomicReaderContext context)
+        {
+            currentDocTerms = FieldCache.DEFAULT.GetTerms(context.AtomicReader, field);
+            return this;
+        }
+
+        /// <summary>Remembers the value held in slot <paramref name="bottom"/> as the bottom value.</summary>
+        public override void SetBottom(int bottom)
+        {
+            this.bottom = values[bottom];
+        }
+
+        /// <summary>Returns the value stored in <paramref name="slot"/> (possibly <c>null</c>).</summary>
+        public override object Value(int slot)
+        {
+            return values[slot];
+        }
+
+        /// <summary>
+        /// Compares two values with the collator, ordering <c>null</c> before any non-null value.
+        /// </summary>
+        /// <returns>A negative number, zero, or a positive number.</returns>
+        public override int CompareValues(string first, string second)
+        {
+            if (first == null)
+            {
+                return second == null ? 0 : -1;
+            }
+
+            if (second == null)
+            {
+                return 1;
+            }
+
+            return collator.Compare(first, second);
+        }
+
+        /// <summary>Compares the value of <paramref name="doc"/> with <paramref name="value"/>.</summary>
+        public override int CompareDocToValue(int doc, string value)
+        {
+            return CompareValues(ReadDocValue(doc), value);
+        }
+
+        /// <summary>
+        /// Reads the value of <paramref name="doc"/> from the current segment into the scratch
+        /// buffer and decodes it; yields <c>null</c> when the buffer holds
+        /// <c>BinaryDocValues.MISSING</c>.
+        /// </summary>
+        private string ReadDocValue(int doc)
+        {
+            currentDocTerms.Get(doc, tempBR);
+            return tempBR.bytes == BinaryDocValues.MISSING ? null : tempBR.Utf8ToString();
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b43f529b/src/contrib/Sandbox/Queries/SlowCollatedTermRangeFilter.cs
----------------------------------------------------------------------
diff --git a/src/contrib/Sandbox/Queries/SlowCollatedTermRangeFilter.cs b/src/contrib/Sandbox/Queries/SlowCollatedTermRangeFilter.cs
new file mode 100644
index 0000000..45b97d8
--- /dev/null
+++ b/src/contrib/Sandbox/Queries/SlowCollatedTermRangeFilter.cs
@@ -0,0 +1,41 @@
+using Lucene.Net.Search;
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+
+namespace Lucene.Net.Sandbox.Queries
+{
+    /// <summary>
+    /// Filter counterpart of <see cref="SlowCollatedTermRangeQuery"/>: it wraps a query built
+    /// from the constructor arguments and exposes that query's settings.
+    /// </summary>
+    public class SlowCollatedTermRangeFilter : MultiTermQueryWrapperFilter<SlowCollatedTermRangeQuery>
+    {
+        /// <summary>Builds the wrapped <see cref="SlowCollatedTermRangeQuery"/>.</summary>
+        /// <param name="fieldName">Field the range applies to.</param>
+        /// <param name="lowerTerm">Lower bound of the range.</param>
+        /// <param name="upperTerm">Upper bound of the range.</param>
+        /// <param name="includeLower">Whether the lower bound itself is part of the range.</param>
+        /// <param name="includeUpper">Whether the upper bound itself is part of the range.</param>
+        /// <param name="collator">Comparer handed to the wrapped query.</param>
+        public SlowCollatedTermRangeFilter(
+            string fieldName,
+            string lowerTerm,
+            string upperTerm,
+            bool includeLower,
+            bool includeUpper,
+            StringComparer collator)
+            : base(new SlowCollatedTermRangeQuery(fieldName, lowerTerm, upperTerm, includeLower, includeUpper, collator))
+        {
+        }
+
+        /// <summary>Returns the lower bound reported by the wrapped query.</summary>
+        public virtual string GetLowerTerm()
+        {
+            return this.query.GetLowerTerm();
+        }
+
+        /// <summary>Returns the upper bound reported by the wrapped query.</summary>
+        public virtual string GetUpperTerm()
+        {
+            return this.query.GetUpperTerm();
+        }
+
+        /// <summary>Returns whether the wrapped query includes its lower bound.</summary>
+        public virtual bool IncludesLower()
+        {
+            return this.query.IncludesLower();
+        }
+
+        /// <summary>Returns whether the wrapped query includes its upper bound.</summary>
+        public virtual bool IncludesUpper()
+        {
+            return this.query.IncludesUpper();
+        }
+
+        /// <summary>Returns the comparer used by the wrapped query.</summary>
+        public virtual StringComparer GetCollator()
+        {
+            return this.query.GetCollator();
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b43f529b/src/contrib/Sandbox/Queries/SlowCollatedTermRangeQuery.cs
----------------------------------------------------------------------
diff --git a/src/contrib/Sandbox/Queries/SlowCollatedTermRangeQuery.cs b/src/contrib/Sandbox/Queries/SlowCollatedTermRangeQuery.cs
new file mode 100644
index 0000000..ff7b2de
--- /dev/null
+++ b/src/contrib/Sandbox/Queries/SlowCollatedTermRangeQuery.cs
@@ -0,0 +1,145 @@
+using Lucene.Net.Index;
+using Lucene.Net.Search;
+using Lucene.Net.Util;
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+
+namespace Lucene.Net.Sandbox.Queries
+{
+    /// <summary>
+    /// A term range query whose bounds are compared with a caller-supplied
+    /// <see cref="StringComparer"/> instead of the index term order.
+    /// </summary>
+    /// <remarks>
+    /// A <c>null</c> bound leaves that side of the range open. When both bounds are
+    /// <c>null</c> the query enumerates every term of the field.
+    /// </remarks>
+    public class SlowCollatedTermRangeQuery : MultiTermQuery
+    {
+        private string lowerTerm;
+        private string upperTerm;
+        private bool includeLower;
+        private bool includeUpper;
+        private StringComparer collator;
+
+        /// <summary>Creates a range query over <paramref name="field"/>.</summary>
+        /// <param name="field">Field the range applies to.</param>
+        /// <param name="lowerTerm">Lower bound, or <c>null</c> for none.</param>
+        /// <param name="upperTerm">Upper bound, or <c>null</c> for none.</param>
+        /// <param name="includeLower">Whether a term equal to the lower bound matches.</param>
+        /// <param name="includeUpper">Whether a term equal to the upper bound matches.</param>
+        /// <param name="collator">Comparer used to order terms against the bounds.</param>
+        public SlowCollatedTermRangeQuery(string field, string lowerTerm, string upperTerm, bool includeLower, bool includeUpper, StringComparer collator)
+            : base(field)
+        {
+            this.lowerTerm = lowerTerm;
+            this.upperTerm = upperTerm;
+            this.includeLower = includeLower;
+            this.includeUpper = includeUpper;
+            this.collator = collator;
+        }
+
+        /// <summary>Returns the lower bound (may be <c>null</c>).</summary>
+        public virtual string GetLowerTerm()
+        {
+            return lowerTerm;
+        }
+
+        /// <summary>Returns the upper bound (may be <c>null</c>).</summary>
+        public virtual string GetUpperTerm()
+        {
+            return upperTerm;
+        }
+
+        /// <summary>Returns <c>true</c> when the lower bound is inclusive.</summary>
+        public virtual bool IncludesLower()
+        {
+            return includeLower;
+        }
+
+        /// <summary>Returns <c>true</c> when the upper bound is inclusive.</summary>
+        public virtual bool IncludesUpper()
+        {
+            return includeUpper;
+        }
+
+        /// <summary>Returns the comparer used to order terms against the bounds.</summary>
+        public virtual StringComparer GetCollator()
+        {
+            return collator;
+        }
+
+        /// <summary>
+        /// Chooses the terms enumeration for this range: empty when the bounds are inverted,
+        /// the unfiltered iterator when there are no bounds, otherwise a
+        /// <see cref="SlowCollatedTermRangeTermsEnum"/>.
+        /// </summary>
+        protected override TermsEnum GetTermsEnum(Terms terms, AttributeSource atts)
+        {
+            // A lower bound that collates after the upper bound cannot match anything.
+            if (lowerTerm != null && upperTerm != null && collator.Compare(lowerTerm, upperTerm) > 0)
+            {
+                return TermsEnum.EMPTY;
+            }
+
+            TermsEnum tenum = terms.Iterator(null);
+            if (lowerTerm == null && upperTerm == null)
+            {
+                // No bounds: every term is in range, so no filtering is needed.
+                return tenum;
+            }
+
+            return new SlowCollatedTermRangeTermsEnum(tenum, lowerTerm, upperTerm, includeLower, includeUpper, collator);
+        }
+
+        /// <summary>The field this query applies to, as reported by the base class.</summary>
+        public override string Field
+        {
+            get
+            {
+                return base.Field;
+            }
+        }
+
+        /// <summary>
+        /// Renders the range as <c>[lower TO upper]</c>, using <c>{</c> / <c>}</c> for an
+        /// exclusive bound and <c>*</c> for a missing bound. The field name is prepended
+        /// unless it equals <paramref name="field"/>, and the boost suffix is appended.
+        /// </summary>
+        public override string ToString(string field)
+        {
+            StringBuilder buffer = new StringBuilder();
+            if (!Field.Equals(field))
+            {
+                buffer.Append(Field);
+                buffer.Append(@":");
+            }
+
+            buffer.Append(includeLower ? '[' : '{');
+            buffer.Append(lowerTerm != null ? lowerTerm : @"*");
+            buffer.Append(@" TO ");
+            buffer.Append(upperTerm != null ? upperTerm : @"*");
+            buffer.Append(includeUpper ? ']' : '}');
+            buffer.Append(ToStringUtils.Boost(Boost));
+            return buffer.ToString();
+        }
+
+        /// <summary>Combines the base hash with the collator, both flags and both bounds.</summary>
+        public override int GetHashCode()
+        {
+            const int prime = 31;
+
+            // Hash mixing is expected to wrap around; without "unchecked" this would
+            // throw OverflowException in a build compiled with checked arithmetic.
+            unchecked
+            {
+                int result = base.GetHashCode();
+                result = prime * result + ((collator == null) ? 0 : collator.GetHashCode());
+                result = prime * result + (includeLower ? 1231 : 1237);
+                result = prime * result + (includeUpper ? 1231 : 1237);
+                result = prime * result + ((lowerTerm == null) ? 0 : lowerTerm.GetHashCode());
+                result = prime * result + ((upperTerm == null) ? 0 : upperTerm.GetHashCode());
+                return result;
+            }
+        }
+
+        /// <summary>
+        /// Two queries are equal when the base class considers them equal, they have the same
+        /// runtime type, and their collator, inclusiveness flags and bounds all match.
+        /// </summary>
+        public override bool Equals(Object obj)
+        {
+            if (ReferenceEquals(this, obj))
+            {
+                return true;
+            }
+
+            // Guard explicitly so obj.GetType() below can never see null,
+            // whatever the base implementation does with a null argument.
+            if (obj == null || !base.Equals(obj))
+            {
+                return false;
+            }
+
+            if (GetType() != obj.GetType())
+            {
+                return false;
+            }
+
+            SlowCollatedTermRangeQuery other = (SlowCollatedTermRangeQuery)obj;
+
+            // The static Equals helpers treat two nulls as equal and null vs non-null as different.
+            return object.Equals(collator, other.collator)
+                && includeLower == other.includeLower
+                && includeUpper == other.includeUpper
+                && string.Equals(lowerTerm, other.lowerTerm)
+                && string.Equals(upperTerm, other.upperTerm);
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b43f529b/src/contrib/Sandbox/Queries/SlowCollatedTermRangeTermsEnum.cs
----------------------------------------------------------------------
diff --git a/src/contrib/Sandbox/Queries/SlowCollatedTermRangeTermsEnum.cs b/src/contrib/Sandbox/Queries/SlowCollatedTermRangeTermsEnum.cs
new file mode 100644
index 0000000..a4af5cd
--- /dev/null
+++ b/src/contrib/Sandbox/Queries/SlowCollatedTermRangeTermsEnum.cs
@@ -0,0 +1,46 @@
+using Lucene.Net.Index;
+using Lucene.Net.Util;
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+
+namespace Lucene.Net.Sandbox.Queries
+{
+    /// <summary>
+    /// Filters a <see cref="TermsEnum"/> down to the terms that fall inside a range, where
+    /// "inside" is decided by a caller-supplied <see cref="StringComparer"/>.
+    /// </summary>
+    /// <remarks>
+    /// <see cref="Accept"/> never returns <c>AcceptStatus.END</c>, and the initial seek term
+    /// is the empty term, so no early termination is requested by this class.
+    /// </remarks>
+    public class SlowCollatedTermRangeTermsEnum : FilteredTermsEnum
+    {
+        private readonly StringComparer collator;
+        private readonly string upperTermText;
+        private readonly string lowerTermText;
+        private readonly bool includeLower;
+        private readonly bool includeUpper;
+
+        /// <summary>Creates the filtered enumeration.</summary>
+        /// <param name="tenum">Source of terms to filter.</param>
+        /// <param name="lowerTermText">
+        /// Lower bound; <c>null</c> is replaced by the empty string and made inclusive.
+        /// </param>
+        /// <param name="upperTermText">Upper bound, or <c>null</c> for no upper bound.</param>
+        /// <param name="includeLower">Whether a term equal to the lower bound is accepted.</param>
+        /// <param name="includeUpper">Whether a term equal to the upper bound is accepted.</param>
+        /// <param name="collator">Comparer used to order terms against the bounds.</param>
+        public SlowCollatedTermRangeTermsEnum(TermsEnum tenum, string lowerTermText, string upperTermText, bool includeLower, bool includeUpper, StringComparer collator)
+            : base(tenum)
+        {
+            this.collator = collator;
+            this.upperTermText = upperTermText;
+            this.includeUpper = includeUpper;
+
+            if (lowerTermText == null)
+            {
+                // No lower bound: substitute an inclusive empty-string bound.
+                this.lowerTermText = @"";
+                this.includeLower = true;
+            }
+            else
+            {
+                this.lowerTermText = lowerTermText;
+                this.includeLower = includeLower;
+            }
+
+            InitialSeekTerm = new BytesRef("");
+        }
+
+        /// <summary>
+        /// Accepts <paramref name="term"/> when it lies within the configured bounds
+        /// according to the collator.
+        /// </summary>
+        /// <returns><c>AcceptStatus.YES</c> when in range, otherwise <c>AcceptStatus.NO</c>.</returns>
+        protected override AcceptStatus Accept(BytesRef term)
+        {
+            // Decode once and reuse the string for both bound checks.
+            string text = term.Utf8ToString();
+
+            int lowerCmp = collator.Compare(text, lowerTermText);
+            bool aboveLower = includeLower ? lowerCmp >= 0 : lowerCmp > 0;
+            if (!aboveLower)
+            {
+                return AcceptStatus.NO;
+            }
+
+            if (upperTermText != null)
+            {
+                int upperCmp = collator.Compare(text, upperTermText);
+                bool belowUpper = includeUpper ? upperCmp <= 0 : upperCmp < 0;
+                if (!belowUpper)
+                {
+                    return AcceptStatus.NO;
+                }
+            }
+
+            return AcceptStatus.YES;
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b43f529b/src/contrib/Sandbox/Queries/SlowFuzzyQuery.cs
----------------------------------------------------------------------
diff --git a/src/contrib/Sandbox/Queries/SlowFuzzyQuery.cs b/src/contrib/Sandbox/Queries/SlowFuzzyQuery.cs
new file mode 100644
index 0000000..d68ac25
--- /dev/null
+++ b/src/contrib/Sandbox/Queries/SlowFuzzyQuery.cs
@@ -0,0 +1,136 @@
+using Lucene.Net.Index;
+using Lucene.Net.Search;
+using Lucene.Net.Support;
+using Lucene.Net.Util;
+using Lucene.Net.Util.Automaton;
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+
+namespace Lucene.Net.Sandbox.Queries
+{
+    /// <summary>
+    /// A fuzzy query for a single <see cref="Term"/>, parameterised by a minimum similarity
+    /// and a prefix length. Terms too short for the requested similarity are looked up
+    /// exactly; all others are enumerated through <see cref="SlowFuzzyTermsEnum"/>.
+    /// </summary>
+    public class SlowFuzzyQuery : MultiTermQuery
+    {
+        public static readonly float defaultMinSimilarity = LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE;
+        public static readonly int defaultPrefixLength = 0;
+        public static readonly int defaultMaxExpansions = 50;
+        private float minimumSimilarity;
+        private int prefixLength;
+        // False when the term is empty or too short for minimumSimilarity; see the constructor.
+        private bool termLongEnough = false;
+        protected Term term;
+
+        /// <summary>Creates a fuzzy query for <paramref name="term"/>.</summary>
+        /// <param name="term">The term to search for.</param>
+        /// <param name="minimumSimilarity">
+        /// Must not be negative; a value of 1 or more must be a whole number.
+        /// </param>
+        /// <param name="prefixLength">Must not be negative.</param>
+        /// <param name="maxExpansions">
+        /// Must not be negative; passed to the <c>TopTermsScoringBooleanQueryRewrite</c>
+        /// installed as the rewrite method.
+        /// </param>
+        /// <exception cref="ArgumentException">An argument violates the constraints above.</exception>
+        public SlowFuzzyQuery(Term term, float minimumSimilarity, int prefixLength, int maxExpansions)
+            : base(term.Field)
+        {
+            this.term = term;
+            if (minimumSimilarity >= 1.0f && minimumSimilarity != (int)minimumSimilarity)
+            {
+                throw new ArgumentException(@"fractional edit distances are not allowed");
+            }
+
+            if (minimumSimilarity < 0.0f)
+            {
+                throw new ArgumentException(@"minimumSimilarity < 0");
+            }
+
+            if (prefixLength < 0)
+            {
+                throw new ArgumentException(@"prefixLength < 0");
+            }
+
+            if (maxExpansions < 0)
+            {
+                throw new ArgumentException(@"maxExpansions < 0");
+            }
+
+            SetRewriteMethod(new TopTermsScoringBooleanQueryRewrite(maxExpansions));
+
+            // A non-empty term is "long enough" when minimumSimilarity is 1 or more,
+            // or when its length exceeds 1 / (1 - minimumSimilarity).
+            int len = term.Text.Length;
+            if (len > 0 && (minimumSimilarity >= 1f || len > 1.0f / (1.0f - minimumSimilarity)))
+            {
+                this.termLongEnough = true;
+            }
+
+            this.minimumSimilarity = minimumSimilarity;
+            this.prefixLength = prefixLength;
+        }
+
+        /// <summary>Uses <see cref="defaultMaxExpansions"/>.</summary>
+        public SlowFuzzyQuery(Term term, float minimumSimilarity, int prefixLength)
+            : this(term, minimumSimilarity, prefixLength, defaultMaxExpansions)
+        {
+        }
+
+        /// <summary>Uses <see cref="defaultPrefixLength"/> and <see cref="defaultMaxExpansions"/>.</summary>
+        public SlowFuzzyQuery(Term term, float minimumSimilarity)
+            : this(term, minimumSimilarity, defaultPrefixLength, defaultMaxExpansions)
+        {
+        }
+
+        /// <summary>Uses the default similarity, prefix length and maximum expansions.</summary>
+        public SlowFuzzyQuery(Term term)
+            : this(term, defaultMinSimilarity, defaultPrefixLength, defaultMaxExpansions)
+        {
+        }
+
+        /// <summary>Returns the minimum similarity given at construction.</summary>
+        public virtual float GetMinSimilarity()
+        {
+            return minimumSimilarity;
+        }
+
+        /// <summary>Returns the prefix length given at construction.</summary>
+        public virtual int GetPrefixLength()
+        {
+            return prefixLength;
+        }
+
+        /// <summary>
+        /// Returns an exact single-term enumeration when the term is not long enough for
+        /// fuzzy matching, otherwise a <see cref="SlowFuzzyTermsEnum"/>.
+        /// </summary>
+        protected override TermsEnum GetTermsEnum(Terms terms, AttributeSource atts)
+        {
+            if (!termLongEnough)
+            {
+                return new SingleTermsEnum(terms.Iterator(null), term.Bytes);
+            }
+
+            return new SlowFuzzyTermsEnum(terms, atts, GetTerm(), minimumSimilarity, prefixLength);
+        }
+
+        /// <summary>Returns the term this query searches for.</summary>
+        public virtual Term GetTerm()
+        {
+            return term;
+        }
+
+        /// <summary>
+        /// Renders the query as <c>text~similarity</c>, prefixed with the field name unless
+        /// it equals <paramref name="field"/>, followed by the boost suffix.
+        /// </summary>
+        public override string ToString(string field)
+        {
+            StringBuilder buffer = new StringBuilder();
+            if (!term.Field.Equals(field))
+            {
+                buffer.Append(term.Field);
+                buffer.Append(@":");
+            }
+
+            buffer.Append(term.Text);
+            buffer.Append('~');
+            // StringBuilder.Append(float) formats with the current culture, which would turn
+            // 0.5 into "0,5" under some locales; the invariant culture keeps the output stable.
+            buffer.Append(minimumSimilarity.ToString(System.Globalization.CultureInfo.InvariantCulture));
+            buffer.Append(ToStringUtils.Boost(Boost));
+            return buffer.ToString();
+        }
+
+        /// <summary>Combines the base hash with the similarity bits, prefix length and term.</summary>
+        public override int GetHashCode()
+        {
+            const int prime = 31;
+
+            // Hash mixing is expected to wrap around; without "unchecked" this would
+            // throw OverflowException in a build compiled with checked arithmetic.
+            unchecked
+            {
+                int result = base.GetHashCode();
+                result = prime * result + Number.FloatToIntBits(minimumSimilarity);
+                result = prime * result + prefixLength;
+                result = prime * result + ((term == null) ? 0 : term.GetHashCode());
+                return result;
+            }
+        }
+
+        /// <summary>
+        /// Two queries are equal when the base class considers them equal, they have the same
+        /// runtime type, and their similarity (compared bitwise), prefix length and term match.
+        /// </summary>
+        public override bool Equals(Object obj)
+        {
+            if (ReferenceEquals(this, obj))
+            {
+                return true;
+            }
+
+            // Guard explicitly so obj.GetType() below can never see null,
+            // whatever the base implementation does with a null argument.
+            if (obj == null || !base.Equals(obj))
+            {
+                return false;
+            }
+
+            if (GetType() != obj.GetType())
+            {
+                return false;
+            }
+
+            SlowFuzzyQuery other = (SlowFuzzyQuery)obj;
+            if (Number.FloatToIntBits(minimumSimilarity) != Number.FloatToIntBits(other.minimumSimilarity))
+            {
+                return false;
+            }
+
+            if (prefixLength != other.prefixLength)
+            {
+                return false;
+            }
+
+            if (term == null)
+            {
+                return other.term == null;
+            }
+
+            return term.Equals(other.term);
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b43f529b/src/contrib/Sandbox/Queries/SlowFuzzyTermsEnum.cs
----------------------------------------------------------------------
diff --git a/src/contrib/Sandbox/Queries/SlowFuzzyTermsEnum.cs b/src/contrib/Sandbox/Queries/SlowFuzzyTermsEnum.cs
new file mode 100644
index 0000000..e696d27
--- /dev/null
+++ b/src/contrib/Sandbox/Queries/SlowFuzzyTermsEnum.cs
@@ -0,0 +1,142 @@
+using Lucene.Net.Index;
+using Lucene.Net.Search;
+using Lucene.Net.Util;
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+
+namespace Lucene.Net.Sandbox.Queries
+{
+    /// <summary>
+    /// A <see cref="FuzzyTermsEnum"/> that falls back to a linear scan with a hand-written
+    /// edit-distance computation when no automaton-based enumeration is available.
+    /// </summary>
+    public sealed class SlowFuzzyTermsEnum : FuzzyTermsEnum
+    {
+        /// <summary>Forwards all arguments to the base class.</summary>
+        /// <remarks>
+        /// NOTE(review): the trailing <c>false</c> is presumably the base class's
+        /// transpositions flag; confirm against <see cref="FuzzyTermsEnum"/>.
+        /// </remarks>
+        public SlowFuzzyTermsEnum(Terms terms, AttributeSource atts, Term term, float minSimilarity, int prefixLength)
+            : base(terms, atts, term, minSimilarity, prefixLength, false)
+        {
+        }
+
+        /// <summary>
+        /// Installs the automaton enumeration for <paramref name="maxEdits"/> when one exists;
+        /// otherwise, only during initialisation, installs the linear fallback.
+        /// </summary>
+        protected override void MaxEditDistanceChanged(BytesRef lastTerm, int maxEdits, bool init)
+        {
+            TermsEnum automatonEnum = GetAutomatonEnum(maxEdits, lastTerm);
+            if (automatonEnum != null)
+            {
+                SetEnum(automatonEnum);
+                return;
+            }
+
+            if (init)
+            {
+                SetEnum(new LinearFuzzyTermsEnum(this));
+            }
+        }
+
+        /// <summary>
+        /// Walks every term that starts with the query prefix and accepts those whose
+        /// similarity to the query term exceeds the parent's minimum similarity.
+        /// </summary>
+        private class LinearFuzzyTermsEnum : FilteredTermsEnum
+        {
+            // Two rolling rows of the edit-distance table: "d" is the row being filled,
+            // "p" the previous one. They are swapped after every row.
+            private int[] d;
+            private int[] p;
+            // Code points of the query term after the prefix.
+            private readonly int[] text;
+            private readonly IBoostAttribute boostAtt;
+            private readonly BytesRef prefixBytesRef;
+            // Scratch buffer receiving the UTF-32 form of each candidate term.
+            private readonly IntsRef utf32 = new IntsRef(20);
+            private readonly SlowFuzzyTermsEnum parent;
+
+            /// <summary>
+            /// Captures the non-prefix part of the parent's term and seeks to the prefix.
+            /// </summary>
+            public LinearFuzzyTermsEnum(SlowFuzzyTermsEnum parent)
+                : base(parent.terms.Iterator(null))
+            {
+                this.parent = parent;
+                this.boostAtt = Attributes.AddAttribute<IBoostAttribute>();
+
+                this.text = new int[parent.termLength - parent.realPrefixLength];
+                Array.Copy(parent.termText, parent.realPrefixLength, this.text, 0, this.text.Length);
+
+                this.prefixBytesRef = new BytesRef(UnicodeUtil.NewString(parent.termText, 0, parent.realPrefixLength));
+
+                int rowSize = this.text.Length + 1;
+                this.d = new int[rowSize];
+                this.p = new int[rowSize];
+
+                InitialSeekTerm = this.prefixBytesRef;
+            }
+
+            /// <summary>
+            /// Ends the enumeration at the first term that lacks the prefix; otherwise accepts
+            /// the term, and sets its boost, when it is similar enough.
+            /// </summary>
+            protected override AcceptStatus Accept(BytesRef term)
+            {
+                if (!StringHelper.StartsWith(term, prefixBytesRef))
+                {
+                    return AcceptStatus.END;
+                }
+
+                UnicodeUtil.UTF8toUTF32(term, utf32);
+                float similarity = Similarity(utf32.ints, parent.realPrefixLength, utf32.length - parent.realPrefixLength);
+                if (similarity > parent.minSimilarity)
+                {
+                    boostAtt.Boost = (similarity - parent.minSimilarity) * parent.scale_factor;
+                    return AcceptStatus.YES;
+                }
+
+                return AcceptStatus.NO;
+            }
+
+            /// <summary>
+            /// Computes the similarity between the query text and <paramref name="length"/>
+            /// code points of <paramref name="target"/> starting at <paramref name="offset"/>.
+            /// </summary>
+            /// <returns>
+            /// <c>1 - distance / (prefix length + shorter length)</c>, or negative infinity
+            /// as soon as the allowed maximum distance is known to be exceeded.
+            /// </returns>
+            private float Similarity(int[] target, int offset, int length)
+            {
+                int targetLen = length;
+                int textLen = text.Length;
+
+                // One side is empty: the distance is simply the length of the other side.
+                if (textLen == 0)
+                {
+                    return parent.realPrefixLength == 0 ? 0.0f : 1.0f - ((float)targetLen / parent.realPrefixLength);
+                }
+
+                if (targetLen == 0)
+                {
+                    return parent.realPrefixLength == 0 ? 0.0f : 1.0f - ((float)textLen / parent.realPrefixLength);
+                }
+
+                int maxDistance = CalculateMaxDistance(targetLen);
+
+                // The length difference alone is a lower bound on the distance.
+                if (maxDistance < Math.Abs(targetLen - textLen))
+                {
+                    return float.NegativeInfinity;
+                }
+
+                // First row: distance from the empty target to each prefix of the text.
+                for (int col = 0; col <= textLen; ++col)
+                {
+                    p[col] = col;
+                }
+
+                for (int row = 1; row <= targetLen; ++row)
+                {
+                    int rowMinimum = targetLen;
+                    int targetChar = target[offset + row - 1];
+                    d[0] = row;
+
+                    for (int col = 1; col <= textLen; ++col)
+                    {
+                        // Substitution is free when the code points match, otherwise it costs 1.
+                        int substitutionCost = targetChar != text[col - 1] ? 1 : 0;
+                        d[col] = Math.Min(Math.Min(d[col - 1] + 1, p[col] + 1), p[col - 1] + substitutionCost);
+                        rowMinimum = Math.Min(rowMinimum, d[col]);
+                    }
+
+                    // Once every cell of a row exceeds the limit, give up on this candidate.
+                    if (row > maxDistance && rowMinimum > maxDistance)
+                    {
+                        return float.NegativeInfinity;
+                    }
+
+                    // Swap rows: the row just filled becomes "previous".
+                    int[] swap = p;
+                    p = d;
+                    d = swap;
+                }
+
+                // After the final swap the result sits in p.
+                return 1.0f - ((float)p[textLen] / (float)(parent.realPrefixLength + Math.Min(textLen, targetLen)));
+            }
+
+            /// <summary>
+            /// Returns the largest edit distance allowed for a candidate of length
+            /// <paramref name="m"/>: the parent's <c>maxEdits</c> in raw mode, otherwise the
+            /// smaller of <c>maxEdits</c> and a bound derived from the minimum similarity.
+            /// </summary>
+            private int CalculateMaxDistance(int m)
+            {
+                if (parent.raw)
+                {
+                    return parent.maxEdits;
+                }
+
+                return Math.Min(parent.maxEdits, (int)((1 - parent.minSimilarity) * (Math.Min(text.Length, m) + parent.realPrefixLength)));
+            }
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b43f529b/src/core/Search/FuzzyTermsEnum.cs
----------------------------------------------------------------------
diff --git a/src/core/Search/FuzzyTermsEnum.cs b/src/core/Search/FuzzyTermsEnum.cs
index dd680af..e77fe9a 100644
--- a/src/core/Search/FuzzyTermsEnum.cs
+++ b/src/core/Search/FuzzyTermsEnum.cs
@@ -34,7 +34,7 @@ namespace Lucene.Net.Search
     /// <p/>Term enumerations are always ordered by Term.compareTo().  Each term in
     /// the enumeration is greater than all that precede it.
     /// </summary>
-    public sealed class FuzzyTermsEnum : TermsEnum
+    public class FuzzyTermsEnum : TermsEnum
     {
         private TermsEnum actualEnum;
         private IBoostAttribute actualBoostAtt;
@@ -185,7 +185,7 @@ namespace Lucene.Net.Search
             }
         }
 
-        protected void MaxEditDistanceChanged(BytesRef lastTerm, int maxEdits, bool init)
+        protected virtual void MaxEditDistanceChanged(BytesRef lastTerm, int maxEdits, bool
init)
         {
             TermsEnum newEnum = GetAutomatonEnum(maxEdits, lastTerm);
             // instead of assert, we do a hard check in case someone uses our enum directly


Mime
View raw message