lucenenet-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From paulir...@apache.org
Subject [42/53] [abbrv] git commit: Port Facet.Sampling
Date Thu, 07 Nov 2013 13:53:57 GMT
Port Facet.Sampling


Project: http://git-wip-us.apache.org/repos/asf/lucenenet/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucenenet/commit/ff4fe045
Tree: http://git-wip-us.apache.org/repos/asf/lucenenet/tree/ff4fe045
Diff: http://git-wip-us.apache.org/repos/asf/lucenenet/diff/ff4fe045

Branch: refs/heads/branch_4x
Commit: ff4fe04539f3cba515d09ab67a86e000069326eb
Parents: 71e218c
Author: Paul Irwin <paulirwin@gmail.com>
Authored: Wed Nov 6 11:35:59 2013 -0500
Committer: Paul Irwin <paulirwin@gmail.com>
Committed: Wed Nov 6 11:35:59 2013 -0500

----------------------------------------------------------------------
 src/contrib/Facet/Contrib.Facet.csproj          |   9 +
 src/contrib/Facet/Sampling/ISampleFixer.cs      |  13 +
 src/contrib/Facet/Sampling/RandomSampler.cs     |  56 ++++
 src/contrib/Facet/Sampling/RepeatableSampler.cs | 294 +++++++++++++++++++
 src/contrib/Facet/Sampling/Sampler.cs           | 170 +++++++++++
 .../Facet/Sampling/SamplingAccumulator.cs       |  72 +++++
 src/contrib/Facet/Sampling/SamplingParams.cs    |  86 ++++++
 src/contrib/Facet/Sampling/SamplingWrapper.cs   |  58 ++++
 src/contrib/Facet/Sampling/TakmiSampleFixer.cs  | 125 ++++++++
 src/contrib/Facet/Search/DrillDownQuery.cs      | 201 +++++++++++++
 src/core/Lucene.Net.csproj                      |   2 +
 src/core/Support/DateTimeExtensions.cs          |  17 ++
 src/core/Support/IDictionaryExtensions.cs       |  18 ++
 13 files changed, 1121 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucenenet/blob/ff4fe045/src/contrib/Facet/Contrib.Facet.csproj
----------------------------------------------------------------------
diff --git a/src/contrib/Facet/Contrib.Facet.csproj b/src/contrib/Facet/Contrib.Facet.csproj
index 5c15e5b..9f15e12 100644
--- a/src/contrib/Facet/Contrib.Facet.csproj
+++ b/src/contrib/Facet/Contrib.Facet.csproj
@@ -86,11 +86,20 @@
     <Compile Include="Partitions\IIntermediateFacetResult.cs" />
     <Compile Include="Partitions\PartitionsFacetResultsHandler.cs" />
     <Compile Include="Properties\AssemblyInfo.cs" />
+    <Compile Include="Sampling\ISampleFixer.cs" />
+    <Compile Include="Sampling\RandomSampler.cs" />
+    <Compile Include="Sampling\RepeatableSampler.cs" />
+    <Compile Include="Sampling\Sampler.cs" />
+    <Compile Include="Sampling\SamplingAccumulator.cs" />
+    <Compile Include="Sampling\SamplingParams.cs" />
+    <Compile Include="Sampling\SamplingWrapper.cs" />
+    <Compile Include="Sampling\TakmiSampleFixer.cs" />
     <Compile Include="Search\CountFacetRequest.cs" />
     <Compile Include="Search\CountingAggregator.cs" />
     <Compile Include="Search\CountingFacetsAggregator.cs" />
     <Compile Include="Search\DepthOneFacetResultsHandler.cs" />
     <Compile Include="Search\DocValuesCategoryListIterator.cs" />
+    <Compile Include="Search\DrillDownQuery.cs" />
     <Compile Include="Search\FacetArrays.cs" />
     <Compile Include="Search\FacetRequest.cs" />
     <Compile Include="Search\FacetResult.cs" />

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/ff4fe045/src/contrib/Facet/Sampling/ISampleFixer.cs
----------------------------------------------------------------------
diff --git a/src/contrib/Facet/Sampling/ISampleFixer.cs b/src/contrib/Facet/Sampling/ISampleFixer.cs
new file mode 100644
index 0000000..57146b5
--- /dev/null
+++ b/src/contrib/Facet/Sampling/ISampleFixer.cs
@@ -0,0 +1,13 @@
+using Lucene.Net.Facet.Search;
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+
+namespace Lucene.Net.Facet.Sampling
+{
+    /// <summary>
+    /// Contract for adjusting a <see cref="FacetResult"/> given a set of document IDs.
+    /// NOTE(review): presumably the result was computed over a sample of
+    /// <c>origDocIds</c> and is corrected against the full set - confirm with implementers.
+    /// </summary>
+    public interface ISampleFixer
+    {
+        /// <summary>Fixes <paramref name="fres"/> using <paramref name="origDocIds"/>.</summary>
+        /// <param name="origDocIds">The original document IDs (pre-sampling, judging by the name).</param>
+        /// <param name="fres">The facet result to fix; nothing is returned, so any correction is presumably applied to this object.</param>
+        void FixResult(IScoredDocIDs origDocIds, FacetResult fres);
+    }
+}

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/ff4fe045/src/contrib/Facet/Sampling/RandomSampler.cs
----------------------------------------------------------------------
diff --git a/src/contrib/Facet/Sampling/RandomSampler.cs b/src/contrib/Facet/Sampling/RandomSampler.cs
new file mode 100644
index 0000000..1dc7f1c
--- /dev/null
+++ b/src/contrib/Facet/Sampling/RandomSampler.cs
@@ -0,0 +1,56 @@
+using Lucene.Net.Facet.Search;
+using Lucene.Net.Facet.Util;
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+
+namespace Lucene.Net.Facet.Sampling
+{
+    /// <summary>
+    /// A <see cref="Sampler"/> that walks the document iterator once and picks documents
+    /// by jumping ahead with pseudo-random strides.
+    /// </summary>
+    public class RandomSampler : Sampler
+    {
+        private readonly Random random;
+
+        /// <summary>Creates a sampler with default parameters and a new <see cref="Random"/>.</summary>
+        public RandomSampler()
+            : base()
+        {
+            random = new Random();
+        }
+
+        /// <summary>Creates a sampler with the given parameters and random source.</summary>
+        public RandomSampler(SamplingParams params_renamed, Random random)
+            : base(params_renamed)
+        {
+            this.random = random;
+        }
+
+        /// <summary>
+        /// Builds a sample of <paramref name="sampleSetSize"/> documents out of
+        /// <paramref name="docids"/>, which is expected to hold <paramref name="actualSize"/> documents.
+        /// </summary>
+        protected override SampleResult CreateSample(IScoredDocIDs docids, int actualSize, int sampleSetSize)
+        {
+            int[] picked = new int[sampleSetSize];
+            int strideLimit = (actualSize * 2) / sampleSetSize;
+            int docsLeft = actualSize;
+            IScoredDocIDsIterator cursor = docids.Iterator();
+            int filled = 0;
+
+            // Random phase: skip 1..strideLimit documents, then take the one we landed on.
+            // Stops once too few documents are left to keep skipping.
+            while (filled < picked.Length && docsLeft > (sampleSetSize - strideLimit - filled))
+            {
+                int stride = random.Next(strideLimit) + 1;
+                for (int step = stride; step > 0; step--)
+                {
+                    cursor.Next();
+                    docsLeft--;
+                }
+
+                picked[filled] = cursor.DocID;
+                filled++;
+            }
+
+            // Sequential phase: fill whatever is still missing with consecutive documents.
+            for (; filled < picked.Length; filled++)
+            {
+                cursor.Next();
+                picked[filled] = cursor.DocID;
+            }
+
+            IScoredDocIDs subset = ScoredDocIdsUtils.CreateScoredDocIDsSubset(docids, picked);
+            return new SampleResult(subset, sampleSetSize / (double)actualSize);
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/ff4fe045/src/contrib/Facet/Sampling/RepeatableSampler.cs
----------------------------------------------------------------------
diff --git a/src/contrib/Facet/Sampling/RepeatableSampler.cs b/src/contrib/Facet/Sampling/RepeatableSampler.cs
new file mode 100644
index 0000000..d0f10ee
--- /dev/null
+++ b/src/contrib/Facet/Sampling/RepeatableSampler.cs
@@ -0,0 +1,294 @@
+using Lucene.Net.Facet.Search;
+using Lucene.Net.Facet.Util;
+using Lucene.Net.Support;
+using Lucene.Net.Util;
+using System;
+using System.Collections.Generic;
+using System.Diagnostics;
+using System.IO;
+using System.Linq;
+using System.Text;
+
+namespace Lucene.Net.Facet.Sampling
+{
+    public class RepeatableSampler : Sampler
+    {
+        /// <summary>Creates a repeatable sampler; the parameters are passed unchanged to the <see cref="Sampler"/> constructor.</summary>
+        public RepeatableSampler(SamplingParams params_renamed)
+            : base(params_renamed)
+        {
+        }
+
+        /// <summary>
+        /// Samples <paramref name="docids"/> via <c>RepeatableSample</c>. If that fails with an
+        /// <see cref="IOException"/>, a warning is traced and the full, unsampled set is returned
+        /// with a ratio of 1.0.
+        /// </summary>
+        /// <param name="docids">Documents to sample from.</param>
+        /// <param name="actualSize">Number of documents in <paramref name="docids"/>.</param>
+        /// <param name="sampleSetSize">Requested sample size.</param>
+        /// <returns>The sampled documents with the ratio sampled.Size / docids.Size.</returns>
+        protected override SampleResult CreateSample(IScoredDocIDs docids, int actualSize, int sampleSetSize)
+        {
+            int[] sampleSet = null;
+            try
+            {
+                sampleSet = RepeatableSample(docids, actualSize, sampleSetSize);
+            }
+            catch (IOException e)
+            {
+                // TraceWarning interprets its first argument as a composite format string, so the
+                // exception text is supplied as an argument; concatenating it into the format would
+                // throw FormatException whenever the message contains '{' or '}'.
+                Trace.TraceWarning(@"sampling failed: {0} - falling back to no sampling!", e.Message);
+
+                return new SampleResult(docids, 1.0);
+            }
+
+            IScoredDocIDs sampled = ScoredDocIdsUtils.CreateScoredDocIDsSubset(docids, sampleSet);
+            Debug.WriteLine(@"******************** " + sampled.Size);
+
+            return new SampleResult(sampled, sampled.Size / (double)docids.Size);
+        }
+
+        /// <summary>
+        /// Convenience overload: samples with <c>Algorithm.HASHING</c> and <c>Sorted.NO</c>.
+        /// </summary>
+        private static int[] RepeatableSample(IScoredDocIDs collection, int collectionSize,
int sampleSize)
+        {
+            return RepeatableSample(collection, collectionSize, sampleSize, Algorithm.HASHING,
Sorted.NO);
+        }
+
+        /// <summary>
+        /// Validates the arguments, fills a sample array of <paramref name="sampleSize"/> entries
+        /// using the chosen algorithm, and optionally sorts it.
+        /// </summary>
+        /// <param name="collection">Documents to sample from; must not be null.</param>
+        /// <param name="collectionSize">Size of the collection; must be at least <paramref name="sampleSize"/>.</param>
+        /// <param name="sampleSize">Number of entries to sample; must be at least 1.</param>
+        /// <param name="algorithm">TRAVERSAL selects <c>Sample1</c>, HASHING selects <c>Sample2</c>.</param>
+        /// <param name="sorted">When YES the resulting array is sorted ascending.</param>
+        /// <exception cref="IOException">An argument check failed (the caller in this file catches this and falls back to no sampling).</exception>
+        /// <exception cref="ArgumentException">The algorithm is neither TRAVERSAL nor HASHING.</exception>
+        private static int[] RepeatableSample(IScoredDocIDs collection, int collectionSize,
int sampleSize, Algorithm algorithm, Sorted sorted)
+        {
+            if (collection == null)
+            {
+                throw new IOException(@"docIdSet is null");
+            }
+
+            if (sampleSize < 1)
+            {
+                throw new IOException(@"sampleSize < 1 (" + sampleSize + @")");
+            }
+
+            if (collectionSize < sampleSize)
+            {
+                throw new IOException(@"collectionSize (" + collectionSize + @") less than
sampleSize (" + sampleSize + @")");
+            }
+
+            int[] sample = new int[sampleSize];
+            // Timestamps: slots 0..2 are written by the sampling routine, slot 3 below;
+            // they are only populated when returnTimings is set.
+            long[] times = new long[4];
+            if (algorithm == Algorithm.TRAVERSAL)
+            {
+                Sample1(collection, collectionSize, sample, times);
+            }
+            else if (algorithm == Algorithm.HASHING)
+            {
+                Sample2(collection, collectionSize, sample, times);
+            }
+            else
+            {
+                throw new ArgumentException(@"Invalid algorithm selection");
+            }
+
+            if (sorted == Sorted.YES)
+            {
+                Array.Sort(sample);
+            }
+
+            if (returnTimings)
+            {
+                times[3] = DateTime.UtcNow.CurrentTimeMillis();
+                Debug.WriteLine(@"Times: " + (times[1] - times[0]) + @"ms, " + (times[2]
- times[1]) + @"ms, " + (times[3] - times[2]) + @"ms");
+            }
+
+            return sample;
+        }
+
+        /// <summary>
+        /// TRAVERSAL sampling: repeatedly advances through the collection by a fixed stride
+        /// (a step size from <c>FindGoodStepSize</c>, reduced modulo the collection size) and
+        /// records the document reached, restarting the iterator whenever the stride would
+        /// run past the end.
+        /// </summary>
+        /// <param name="collection">Documents to sample from.</param>
+        /// <param name="collectionSize">Number of documents assumed to be in the collection.</param>
+        /// <param name="sample">Output array; its length is the sample size.</param>
+        /// <param name="times">Receives timestamps in slots 0..2 when returnTimings is set.</param>
+        private static void Sample1(IScoredDocIDs collection, int collectionSize, int[] sample,
long[] times)
+        {
+            IScoredDocIDsIterator it = collection.Iterator();
+            if (returnTimings)
+            {
+                times[0] = DateTime.UtcNow.CurrentTimeMillis();
+            }
+
+            int sampleSize = sample.Length;
+            int prime = FindGoodStepSize(collectionSize, sampleSize);
+            int mod = prime % collectionSize;
+            if (returnTimings)
+            {
+                times[1] = DateTime.UtcNow.CurrentTimeMillis();
+            }
+
+            int sampleCount = 0;
+            int index = 0;
+            for (; sampleCount < sampleSize; )
+            {
+                if (index + mod < collectionSize)
+                {
+                    // Stride fits before the end: advance the current iterator by 'mod'.
+                    for (int i = 0; i < mod; i++, index++)
+                    {
+                        it.Next();
+                    }
+                }
+                else
+                {
+                    // Wrap around: compute the wrapped position and re-walk from a new iterator.
+                    index = index + mod - collectionSize;
+                    it = collection.Iterator();
+                    for (int i = 0; i < index; i++)
+                    {
+                        it.Next();
+                    }
+                }
+
+                sample[sampleCount++] = it.DocID;
+            }
+
+            if (returnTimings)
+            {
+                times[2] = DateTime.UtcNow.CurrentTimeMillis();
+            }
+        }
+
+        /// <summary>
+        /// Chooses a stride for traversal sampling: starts from sqrt(collectionSize), or from
+        /// collectionSize / sampleSize when the sample is smaller than that root, then moves to
+        /// successive results of <c>FindNextPrimeAfter</c> until one does not divide the
+        /// collection size.
+        /// </summary>
+        private static int FindGoodStepSize(int collectionSize, int sampleSize)
+        {
+            int root = (int)Math.Sqrt(collectionSize);
+            int step = sampleSize < root ? collectionSize / sampleSize : root;
+
+            // Always advance at least once, then keep going while the step divides the size.
+            step = FindNextPrimeAfter(step);
+            while (collectionSize % step == 0)
+            {
+                step = FindNextPrimeAfter(step);
+            }
+
+            return step;
+        }
+
+        /// <summary>
+        /// Searches odd numbers greater than <paramref name="n"/> and returns the first one that
+        /// has no divisor up to its square root. Divisors are taken first from the
+        /// <c>primes</c> table and then from odd numbers beyond the table's last entry.
+        /// NOTE(review): only odd candidates are examined, so 2 is never returned.
+        /// </summary>
+        private static int FindNextPrimeAfter(int n)
+        {
+            // Move to the next odd number strictly greater than n.
+            n += (n % 2 == 0) ? 1 : 2;
+        
+            for (; ; n += 2)
+            {
+                // Emulates a labeled 'continue' of the outer loop from inside the inner loops.
+                bool shouldContinueOuter = false;
+
+                int sri = (int)(Math.Sqrt(n));
+
+                for (int primeIndex = 0; primeIndex < N_PRIMES; primeIndex++)
+                {
+                    int p = primes[primeIndex];
+                    if (p > sri)
+                    {
+                        return n;
+                    }
+
+                    if (n % p == 0)
+                    {
+                        shouldContinueOuter = true;
+                        break;
+                    }
+                }
+
+                if (shouldContinueOuter)
+                    continue;
+
+                // Table exhausted without exceeding sqrt(n): keep trying odd divisors past it.
+                for (int p = primes[N_PRIMES - 1] + 2; ; p += 2)
+                {
+                    if (p > sri)
+                    {
+                        return n;
+                    }
+
+                    if (n % p == 0)
+                    {
+                        shouldContinueOuter = true;
+                        break;
+                    }
+                }
+
+                if (shouldContinueOuter)
+                    continue;
+            }
+        }
+
+        // Size of the precomputed divisor table used by FindNextPrimeAfter.
+        private static readonly int N_PRIMES = 4000;
+        // Table of odd primes starting at 3, filled once by the static constructor below.
+        private static int[] primes = new int[N_PRIMES];
+        // Seeds the table with 3 and derives each further entry from the previous one via
+        // FindNextPrimeAfter, which itself reads the entries filled so far.
+        static RepeatableSampler()
+        {
+            primes[0] = 3;
+            for (int count = 1; count < N_PRIMES; count++)
+            {
+                primes[count] = FindNextPrimeAfter(primes[count - 1]);
+            }
+        }
+
+        /// <summary>
+        /// HASHING sampling: maps every document ID to a 31-bit value by multiplying with
+        /// <c>PHI_32</c>, pushes the values through a priority queue bounded to the sample size,
+        /// and finally converts the queue's remaining values back by multiplying with
+        /// <c>PHI_32I</c>.
+        /// </summary>
+        /// <param name="collection">Documents to sample from.</param>
+        /// <param name="collectionSize">Not used by this method.</param>
+        /// <param name="sample">Output array; its length is the sample size.</param>
+        /// <param name="times">Receives timestamps in slots 0..2 when returnTimings is set.</param>
+        private static void Sample2(IScoredDocIDs collection, int collectionSize, int[] sample,
long[] times)
+        {
+            if (returnTimings)
+            {
+                times[0] = DateTime.UtcNow.CurrentTimeMillis();
+            }
+
+            int sampleSize = sample.Length;
+            IntPriorityQueue pq = new IntPriorityQueue(sampleSize);
+            IScoredDocIDsIterator it = collection.Iterator();
+            MI mi = null;
+            while (it.Next())
+            {
+                // Allocate a holder only when none was handed back by the previous insert.
+                if (mi == null)
+                {
+                    mi = new MI();
+                }
+
+                mi.value = (int)(it.DocID * PHI_32) & 0x7FFFFFFF;
+                // NOTE(review): presumably returns the element dropped from the queue (or null
+                // while the queue is not full), letting the holder be reused - confirm.
+                mi = pq.InsertWithOverflow(mi);
+            }
+
+            if (returnTimings)
+            {
+                times[1] = DateTime.UtcNow.CurrentTimeMillis();
+            }
+
+            Object[] heap = pq.GetHeap();
+            for (int si = 0; si < sampleSize; si++)
+            {
+                // Heap entries are read starting at index 1.
+                sample[si] = (int)(((MI)heap[si + 1]).value * PHI_32I) & 0x7FFFFFFF;
+            }
+
+            if (returnTimings)
+            {
+                times[2] = DateTime.UtcNow.CurrentTimeMillis();
+            }
+        }
+
+        /// <summary>Mutable holder for a single int, used as the element type of <c>IntPriorityQueue</c>.</summary>
+        private class MI
+        {
+            internal MI()
+            {
+            }
+
+            // The hashed value stored in the queue.
+            public int value;
+        }
+
+        /// <summary>Priority queue of <c>MI</c> holders ordered by their int value.</summary>
+        private class IntPriorityQueue : Lucene.Net.Util.PriorityQueue<MI>
+        {
+            public IntPriorityQueue(int size)
+                : base(size)
+            {
+            }
+
+            /// <summary>Exposes the array returned by the base class's <c>GetHeapArray()</c>.</summary>
+            public virtual Object[] GetHeap()
+            {
+                return GetHeapArray();
+            }
+
+            /// <summary>Orders elements by ascending <c>value</c>.</summary>
+            public override bool LessThan(MI o1, MI o2)
+            {
+                return o1.value < o2.value;
+            }
+        }
+
+        /// <summary>Sampling strategy: TRAVERSAL dispatches to <c>Sample1</c>, HASHING to <c>Sample2</c>.</summary>
+        private enum Algorithm
+        {
+            TRAVERSAL,
+            HASHING
+        }
+
+        /// <summary>Whether the sample array is sorted before being returned (YES triggers <c>Array.Sort</c>).</summary>
+        private enum Sorted
+        {
+            YES,
+            NO
+        }
+
+        // Multiplier applied to document IDs in Sample2.
+        // NOTE(review): equals round(2^32 / golden ratio), the usual multiplicative-hash constant - presumably intentional.
+        private static readonly long PHI_32 = 2654435769L;
+        // Multiplier applied in Sample2 to turn queued values back into document IDs.
+        // NOTE(review): appears to be the inverse of PHI_32 modulo 2^32 - confirm.
+        private static readonly long PHI_32I = 340573321L;
+        // When true, the sampling routines record timestamps and write them to Debug output.
+        private static bool returnTimings = false;
+    }
+}

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/ff4fe045/src/contrib/Facet/Sampling/Sampler.cs
----------------------------------------------------------------------
diff --git a/src/contrib/Facet/Sampling/Sampler.cs b/src/contrib/Facet/Sampling/Sampler.cs
new file mode 100644
index 0000000..1941a70
--- /dev/null
+++ b/src/contrib/Facet/Sampling/Sampler.cs
@@ -0,0 +1,170 @@
+using Lucene.Net.Facet.Params;
+using Lucene.Net.Facet.Search;
+using Lucene.Net.Facet.Taxonomy;
+using Lucene.Net.Index;
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+
+namespace Lucene.Net.Facet.Sampling
+{
+    public abstract class Sampler
+    {
+        // Parameters governing when and how much to sample; validated in the constructor.
+        protected readonly SamplingParams samplingParams;
+        /// <summary>Creates a sampler using a default-constructed <see cref="SamplingParams"/>.</summary>
+        public Sampler()
+            : this(new SamplingParams())
+        {
+        }
+
+        /// <summary>Creates a sampler with the given parameters.</summary>
+        /// <param name="params_renamed">Sampling parameters; must pass <c>Validate()</c>.</param>
+        /// <exception cref="ArgumentException">The parameters fail validation.</exception>
+        public Sampler(SamplingParams params_renamed)
+        {
+            if (!params_renamed.Validate())
+            {
+                throw new ArgumentException(@"The provided SamplingParams are not valid!!");
+            }
+
+            this.samplingParams = params_renamed;
+        }
+
+        /// <summary>Returns true when the set holds more documents than the configured sampling threshold.</summary>
+        public virtual bool ShouldSample(IScoredDocIDs docIds)
+        {
+            return docIds.Size > samplingParams.SamplingThreshold;
+        }
+
+        /// <summary>
+        /// Returns the documents to work on: the input itself (ratio 1.0) when sampling is not
+        /// warranted, otherwise a sample whose size is the configured ratio of the input,
+        /// raised to the minimum and then capped at the maximum sample size.
+        /// </summary>
+        public virtual SampleResult GetSampleSet(IScoredDocIDs docids)
+        {
+            if (ShouldSample(docids))
+            {
+                int actualSize = docids.Size;
+                int proportional = (int)(actualSize * samplingParams.SampleRatio);
+                int atLeastMin = Math.Max(proportional, samplingParams.MinSampleSize);
+                int clamped = Math.Min(atLeastMin, samplingParams.MaxSampleSize);
+                return CreateSample(docids, actualSize, clamped);
+            }
+
+            return new SampleResult(docids, 1.0);
+        }
+
+        /// <summary>
+        /// Creates the actual sample. Called by <c>GetSampleSet</c> with the size of
+        /// <paramref name="docids"/> and the already clamped target sample size.
+        /// </summary>
+        protected abstract SampleResult CreateSample(IScoredDocIDs docids, int actualSize,
int sampleSetSize);
+
+        /// <summary>Returns a new <c>TakmiSampleFixer</c> built from the given readers and search parameters.</summary>
+        public virtual ISampleFixer GetSampleFixer(IndexReader indexReader, TaxonomyReader
taxonomyReader, FacetSearchParams searchParams)
+        {
+            return new TakmiSampleFixer(indexReader, taxonomyReader, searchParams);
+        }
+
+        /// <summary>Immutable pair of a document set and the ratio it represents of the original set.</summary>
+        public sealed class SampleResult
+        {
+            // The (possibly sampled) documents.
+            public readonly IScoredDocIDs docids;
+            // Ratio reported for the sample; 1.0 is used in this file when no sampling took place.
+            public readonly double actualSampleRatio;
+
+            internal SampleResult(IScoredDocIDs docids, double actualSampleRatio)
+            {
+                this.docids = docids;
+                this.actualSampleRatio = actualSampleRatio;
+            }
+        }
+
+        /// <summary>Gets the sampling parameters this sampler was constructed with.</summary>
+        public SamplingParams SamplingParams
+        {
+            get
+            {
+                return samplingParams;
+            }
+        }
+
+        /// <summary>
+        /// Cuts an over-sampled result back to the number of results the original request
+        /// asked for. When the oversample factor is at most 1 the input is returned untouched.
+        /// </summary>
+        /// <param name="facetResult">A result produced for a request created by <c>OverSampledSearchParams</c>.</param>
+        /// <returns>A new result bound to the original request; the root node is trimmed in place and reused.</returns>
+        /// <exception cref="ArgumentException">The result's request was not created by <c>OverSampledSearchParams</c>.</exception>
+        public virtual FacetResult TrimResult(FacetResult facetResult)
+        {
+            double overSampleFactor = SamplingParams.OversampleFactor;
+            if (overSampleFactor <= 1)
+            {
+                return facetResult;
+            }
+
+            // Type-test instead of catching InvalidCastException; this also rejects a null
+            // request up front rather than failing later with a NullReferenceException.
+            OverSampledFacetRequest sampledFreq = facetResult.FacetRequest as OverSampledFacetRequest;
+            if (sampledFreq == null)
+            {
+                throw new ArgumentException(@"It is only valid to call this method with result obtained for a " + @"facet request created through sampler.OverSampledSearchParams()", "facetResult");
+            }
+
+            FacetRequest origFrq = sampledFreq.orig;
+            FacetResultNode trimmedRootNode = facetResult.FacetResultNode;
+            TrimSubResults(trimmedRootNode, origFrq.numResults);
+            return new FacetResult(origFrq, trimmedRootNode, facetResult.NumValidDescendants);
+        }
+
+        /// <summary>
+        /// Recursively keeps only the first <paramref name="size"/> children of
+        /// <paramref name="node"/> and of every kept descendant, replacing each child list.
+        /// </summary>
+        private void TrimSubResults(FacetResultNode node, int size)
+        {
+            if (node.subResults == FacetResultNode.EMPTY_SUB_RESULTS || node.subResults.Count == 0)
+            {
+                return;
+            }
+
+            // Size the new list by what will actually be kept: a very large requested size
+            // (e.g. int.MaxValue for "all results") must not pre-allocate that many slots.
+            int keep = Math.Min(size, node.subResults.Count);
+            List<FacetResultNode> trimmed = new List<FacetResultNode>(keep);
+            for (int i = 0; i < keep; i++)
+            {
+                FacetResultNode trimmedNode = node.subResults[i];
+                TrimSubResults(trimmedNode, size);
+                trimmed.Add(trimmedNode);
+            }
+
+            node.subResults = trimmed;
+        }
+
+        /// <summary>
+        /// Returns search parameters whose requests ask for more results than the originals,
+        /// scaled by the oversample factor. When the factor is at most 1 the original
+        /// parameters are returned unchanged.
+        /// </summary>
+        /// <param name="original">The caller's search parameters.</param>
+        /// <returns>Either <paramref name="original"/> or a new instance wrapping each request.</returns>
+        public virtual FacetSearchParams OverSampledSearchParams(FacetSearchParams original)
+        {
+            FacetSearchParams res = original;
+            double overSampleFactor = SamplingParams.OversampleFactor;
+            if (overSampleFactor > 1)
+            {
+                List<FacetRequest> facetRequests = new List<FacetRequest>();
+                foreach (FacetRequest frq in original.facetRequests)
+                {
+                    // Saturate at int.MaxValue: converting an out-of-range double to int yields
+                    // an unspecified value in unchecked C#, which could turn a large request
+                    // into a negative result count.
+                    double scaled = Math.Ceiling(frq.numResults * overSampleFactor);
+                    int overSampledNumResults = scaled >= int.MaxValue ? int.MaxValue : (int)scaled;
+                    facetRequests.Add(new OverSampledFacetRequest(frq, overSampledNumResults));
+                }
+
+                res = new FacetSearchParams(original.indexingParams, facetRequests);
+            }
+
+            return res;
+        }
+
+        /// <summary>
+        /// Wraps an existing request with a different result count: settings are copied from the
+        /// original and aggregation-related members delegate to it.
+        /// </summary>
+        private class OverSampledFacetRequest : FacetRequest
+        {
+            // The wrapped request; read back by TrimResult to restore the original result count.
+            internal readonly FacetRequest orig;
+
+            /// <summary>Wraps <paramref name="orig"/>, requesting <paramref name="num"/> results instead.</summary>
+            public OverSampledFacetRequest(FacetRequest orig, int num)
+                : base(orig.categoryPath, num)
+            {
+                this.orig = orig;
+                Depth = orig.Depth;
+                NumLabel = orig.NumLabel;
+                ResultModeValue = orig.ResultModeValue;
+                SortOrderValue = orig.SortOrderValue;
+            }
+
+            /// <summary>Delegates to the wrapped request.</summary>
+            public override IAggregator CreateAggregator(bool useComplements, FacetArrays
arrays, TaxonomyReader taxonomy)
+            {
+                return orig.CreateAggregator(useComplements, arrays, taxonomy);
+            }
+
+            /// <summary>Delegates to the wrapped request.</summary>
+            public override FacetArraysSource FacetArraysSourceValue
+            {
+                get
+                {
+                    return orig.FacetArraysSourceValue;
+                }
+            }
+
+            /// <summary>Delegates to the wrapped request.</summary>
+            public override double GetValueOf(FacetArrays arrays, int idx)
+            {
+                return orig.GetValueOf(arrays, idx);
+            }
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/ff4fe045/src/contrib/Facet/Sampling/SamplingAccumulator.cs
----------------------------------------------------------------------
diff --git a/src/contrib/Facet/Sampling/SamplingAccumulator.cs b/src/contrib/Facet/Sampling/SamplingAccumulator.cs
new file mode 100644
index 0000000..8aefb4b
--- /dev/null
+++ b/src/contrib/Facet/Sampling/SamplingAccumulator.cs
@@ -0,0 +1,72 @@
+using Lucene.Net.Facet.Params;
+using Lucene.Net.Facet.Partitions;
+using Lucene.Net.Facet.Search;
+using Lucene.Net.Facet.Taxonomy;
+using Lucene.Net.Index;
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+
+namespace Lucene.Net.Facet.Sampling
+{
+    public class SamplingAccumulator : StandardFacetsAccumulator
+    {
+        // Ratio of the last sample taken; negative until ActualDocsToAccumulate has run.
+        private double samplingRatio = -1.0;
+        // Produces the sample, the over-sampled request parameters and the result fixer.
+        private readonly Sampler sampler;
+
+        /// <summary>Creates an accumulator that samples with <paramref name="sampler"/>, using the supplied facet arrays.</summary>
+        public SamplingAccumulator(Sampler sampler, FacetSearchParams searchParams, IndexReader
indexReader, TaxonomyReader taxonomyReader, FacetArrays facetArrays)
+            : base(searchParams, indexReader, taxonomyReader, facetArrays)
+        {
+            this.sampler = sampler;
+        }
+
+        /// <summary>Creates an accumulator that samples with <paramref name="sampler"/>.</summary>
+        public SamplingAccumulator(Sampler sampler, FacetSearchParams searchParams, IndexReader
indexReader, TaxonomyReader taxonomyReader)
+            : base(searchParams, indexReader, taxonomyReader)
+        {
+            this.sampler = sampler;
+        }
+
+        /// <summary>
+        /// Accumulates facet results with over-sampled request parameters, then fixes,
+        /// rearranges, trims and labels each result before returning it.
+        /// </summary>
+        /// <param name="docids">The documents to accumulate over.</param>
+        /// <returns>One processed result per result returned by the base accumulation.</returns>
+        public override List<FacetResult> Accumulate(IScoredDocIDs docids)
+        {
+            FacetSearchParams original = searchParams;
+            searchParams = sampler.OverSampledSearchParams(original);
+            try
+            {
+                List<FacetResult> sampleRes = base.Accumulate(docids);
+                List<FacetResult> fixedRes = new List<FacetResult>();
+                foreach (FacetResult fres in sampleRes)
+                {
+                    // The foreach variable is read-only, so work on a reassignable copy.
+                    var freswritable = fres;
+
+                    PartitionsFacetResultsHandler frh = (PartitionsFacetResultsHandler)CreateFacetResultsHandler(freswritable.FacetRequest);
+                    sampler.GetSampleFixer(indexReader, taxonomyReader, searchParams).FixResult(docids, freswritable);
+                    freswritable = frh.RearrangeFacetResult(freswritable);
+                    freswritable = sampler.TrimResult(freswritable);
+                    frh.LabelResult(freswritable);
+                    fixedRes.Add(freswritable);
+                }
+
+                return fixedRes;
+            }
+            finally
+            {
+                // Restore the caller's parameters even when accumulation or fixing throws;
+                // otherwise this instance would stay on the over-sampled parameters.
+                searchParams = original;
+            }
+        }
+
+        /// <summary>
+        /// Asks the sampler for a sample of <paramref name="docids"/>, remembers the sample's
+        /// ratio for <c>TotalCountsFactor</c>, and returns the sampled documents.
+        /// </summary>
+        protected override IScoredDocIDs ActualDocsToAccumulate(IScoredDocIDs docids)
+        {
+            Sampler.SampleResult sampleRes = sampler.GetSampleSet(docids);
+            samplingRatio = sampleRes.actualSampleRatio;
+            return sampleRes.docids;
+        }
+
+        /// <summary>Gets the ratio recorded by the most recent <c>ActualDocsToAccumulate</c> call.</summary>
+        /// <exception cref="InvalidOperationException">No ratio has been recorded yet (the stored value is still negative).</exception>
+        protected override double TotalCountsFactor
+        {
+            get
+            {
+                if (samplingRatio < 0)
+                {
+                    throw new InvalidOperationException(@"Total counts ratio unavailable
because actualDocsToAccumulate() was not invoked");
+                }
+
+                return samplingRatio;
+            }
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/ff4fe045/src/contrib/Facet/Sampling/SamplingParams.cs
----------------------------------------------------------------------
diff --git a/src/contrib/Facet/Sampling/SamplingParams.cs b/src/contrib/Facet/Sampling/SamplingParams.cs
new file mode 100644
index 0000000..7ffc68e
--- /dev/null
+++ b/src/contrib/Facet/Sampling/SamplingParams.cs
@@ -0,0 +1,86 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+
+namespace Lucene.Net.Facet.Sampling
+{
+    /// <summary>
+    /// Tunable settings for facet sampling: when to sample, how large the sample is, and how
+    /// much to over-sample. A default-constructed instance passes <see cref="Validate"/>.
+    /// </summary>
+    public class SamplingParams
+    {
+        /// <summary>Default factor by which the number of requested results is multiplied.</summary>
+        public static readonly double DEFAULT_OVERSAMPLE_FACTOR = 2.0;
+
+        /// <summary>
+        /// Default fraction of documents to sample. It must lie strictly between 0 and 1,
+        /// because <see cref="Validate"/> rejects anything else; a default of 0.0 made every
+        /// default-constructed instance invalid.
+        /// </summary>
+        public static readonly double DEFAULT_SAMPLE_RATIO = 0.01;
+
+        /// <summary>Default upper bound on the sample size.</summary>
+        public static readonly int DEFAULT_MAX_SAMPLE_SIZE = 10000;
+
+        /// <summary>Default lower bound on the sample size.</summary>
+        public static readonly int DEFAULT_MIN_SAMPLE_SIZE = 100;
+
+        /// <summary>Default number of documents above which sampling is applied.</summary>
+        public static readonly int DEFAULT_SAMPLING_THRESHOLD = 75000;
+
+        private int maxSampleSize = DEFAULT_MAX_SAMPLE_SIZE;
+        private int minSampleSize = DEFAULT_MIN_SAMPLE_SIZE;
+        private double sampleRatio = DEFAULT_SAMPLE_RATIO;
+        private int samplingThreshold = DEFAULT_SAMPLING_THRESHOLD;
+        private double oversampleFactor = DEFAULT_OVERSAMPLE_FACTOR;
+
+        /// <summary>Gets or sets the largest allowed sample size.</summary>
+        public int MaxSampleSize
+        {
+            get
+            {
+                return maxSampleSize;
+            }
+            set
+            {
+                maxSampleSize = value;
+            }
+        }
+
+        /// <summary>Gets or sets the smallest allowed sample size.</summary>
+        public int MinSampleSize
+        {
+            get
+            {
+                return minSampleSize;
+            }
+            set
+            {
+                minSampleSize = value;
+            }
+        }
+
+        /// <summary>Gets or sets the fraction of documents to sample.</summary>
+        public double SampleRatio
+        {
+            get
+            {
+                return sampleRatio;
+            }
+            set
+            {
+                sampleRatio = value;
+            }
+        }
+
+        /// <summary>Gets or sets the document count above which sampling is applied.</summary>
+        public int SamplingThreshold
+        {
+            get
+            {
+                return samplingThreshold;
+            }
+            set
+            {
+                samplingThreshold = value;
+            }
+        }
+
+        /// <summary>
+        /// Checks the settings for consistency: threshold >= max sample size >= min sample
+        /// size, and the sample ratio strictly between 0 and 1.
+        /// </summary>
+        /// <returns>True when all conditions hold.</returns>
+        public virtual bool Validate()
+        {
+            return samplingThreshold >= maxSampleSize && maxSampleSize >= minSampleSize && sampleRatio > 0 && sampleRatio < 1;
+        }
+
+        /// <summary>Gets or sets the factor by which the number of requested results is multiplied.</summary>
+        public double OversampleFactor
+        {
+            get
+            {
+                return oversampleFactor;
+            }
+            set
+            {
+                oversampleFactor = value;
+            }
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/ff4fe045/src/contrib/Facet/Sampling/SamplingWrapper.cs
----------------------------------------------------------------------
diff --git a/src/contrib/Facet/Sampling/SamplingWrapper.cs b/src/contrib/Facet/Sampling/SamplingWrapper.cs
new file mode 100644
index 0000000..ce9400a
--- /dev/null
+++ b/src/contrib/Facet/Sampling/SamplingWrapper.cs
@@ -0,0 +1,58 @@
+using Lucene.Net.Facet.Params;
+using Lucene.Net.Facet.Partitions;
+using Lucene.Net.Facet.Search;
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+
+namespace Lucene.Net.Facet.Sampling
+{
+    public class SamplingWrapper : StandardFacetsAccumulator
+    {
+        // The wrapped accumulator that performs the actual accumulation.
+        private StandardFacetsAccumulator delegee;
+        // Produces the sample, the over-sampled request parameters and the result fixer.
+        private Sampler sampler;
+
+        /// <summary>
+        /// Wraps <paramref name="delegee"/>, initialising the base class from the delegee's
+        /// search parameters and readers.
+        /// </summary>
+        public SamplingWrapper(StandardFacetsAccumulator delegee, Sampler sampler)
+            : base(delegee.searchParams, delegee.indexReader, delegee.taxonomyReader)
+        {
+            this.delegee = delegee;
+            this.sampler = sampler;
+        }
+
+        /// <summary>
+        /// Lets the wrapped accumulator run over a sample of <paramref name="docids"/> with
+        /// over-sampled request parameters, then fixes, rearranges, trims and labels each result.
+        /// </summary>
+        /// <param name="docids">The full set of matching documents.</param>
+        /// <returns>One processed result per result returned by the wrapped accumulator.</returns>
+        public override List<FacetResult> Accumulate(IScoredDocIDs docids)
+        {
+            FacetSearchParams original = delegee.searchParams;
+            delegee.searchParams = sampler.OverSampledSearchParams(original);
+            try
+            {
+                Sampler.SampleResult sampleSet = sampler.GetSampleSet(docids);
+                List<FacetResult> sampleRes = delegee.Accumulate(sampleSet.docids);
+                List<FacetResult> fixedRes = new List<FacetResult>();
+                foreach (FacetResult fres in sampleRes)
+                {
+                    // The foreach variable is read-only, so work on a reassignable copy.
+                    var freswritable = fres;
+
+                    PartitionsFacetResultsHandler frh = (PartitionsFacetResultsHandler)CreateFacetResultsHandler(freswritable.FacetRequest);
+                    sampler.GetSampleFixer(indexReader, taxonomyReader, searchParams).FixResult(docids, freswritable);
+                    freswritable = frh.RearrangeFacetResult(freswritable);
+                    freswritable = sampler.TrimResult(freswritable);
+                    frh.LabelResult(freswritable);
+                    fixedRes.Add(freswritable);
+                }
+
+                return fixedRes;
+            }
+            finally
+            {
+                // Give the delegee its own parameters back even when sampling, accumulation
+                // or fixing throws; otherwise it would stay on the over-sampled parameters.
+                delegee.searchParams = original;
+            }
+        }
+
+        /// <summary>Gets or sets the complement threshold of the wrapped accumulator.</summary>
+        public override double ComplementThreshold
+        {
+            get
+            {
+                return delegee.ComplementThreshold;
+            }
+            set
+            {
+                delegee.ComplementThreshold = value;
+            }
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/ff4fe045/src/contrib/Facet/Sampling/TakmiSampleFixer.cs
----------------------------------------------------------------------
diff --git a/src/contrib/Facet/Sampling/TakmiSampleFixer.cs b/src/contrib/Facet/Sampling/TakmiSampleFixer.cs
new file mode 100644
index 0000000..1a6af01
--- /dev/null
+++ b/src/contrib/Facet/Sampling/TakmiSampleFixer.cs
@@ -0,0 +1,125 @@
+using Lucene.Net.Facet.Params;
+using Lucene.Net.Facet.Search;
+using Lucene.Net.Facet.Taxonomy;
+using Lucene.Net.Index;
+using Lucene.Net.Search;
+using Lucene.Net.Util;
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+
+namespace Lucene.Net.Facet.Sampling
+{
+    /// <summary>
+    /// Sample fixer that overwrites the value of every node in a facet result with the number
+    /// of documents that are both enumerated for the node's drill-down term and present in a
+    /// given doc id set.
+    /// </summary>
+    internal class TakmiSampleFixer : ISampleFixer
+    {
+        private TaxonomyReader taxonomyReader;
+        private IndexReader indexReader;
+        private FacetSearchParams searchParams;
+
+        /// <summary>Stores the readers and search params used when recounting.</summary>
+        public TakmiSampleFixer(IndexReader indexReader, TaxonomyReader taxonomyReader, FacetSearchParams searchParams)
+        {
+            this.searchParams = searchParams;
+            this.taxonomyReader = taxonomyReader;
+            this.indexReader = indexReader;
+        }
+
+        /// <summary>
+        /// Recounts the root node of <paramref name="fres"/> and all of its descendants
+        /// against <paramref name="origDocIds"/>.
+        /// </summary>
+        public void FixResult(IScoredDocIDs origDocIds, FacetResult fres)
+        {
+            FixResultNode(fres.FacetResultNode, origDocIds);
+        }
+
+        /// <summary>Recounts the given node first, then recurses into each of its sub-results.</summary>
+        private void FixResultNode(FacetResultNode facetResNode, IScoredDocIDs docIds)
+        {
+            Recount(facetResNode, docIds);
+            foreach (FacetResultNode child in facetResNode.subResults)
+            {
+                FixResultNode(child, docIds);
+            }
+        }
+
+        /// <summary>
+        /// Sets <c>fresNode.value</c> to the size of the intersection between the documents of
+        /// the node's drill-down term and <paramref name="docIds"/>.
+        /// </summary>
+        private void Recount(FacetResultNode fresNode, IScoredDocIDs docIds)
+        {
+            // The category path is needed to build the drill-down term; resolve it when missing.
+            if (fresNode.label == null)
+            {
+                fresNode.label = taxonomyReader.GetPath(fresNode.ordinal);
+            }
+
+            Term term = DrillDownQuery.Term(searchParams.indexingParams, fresNode.label);
+            IBits live = MultiFields.GetLiveDocs(indexReader);
+            DocsEnum termDocs = MultiFields.GetTermDocsEnum(indexReader, live, term.Field, term.Bytes, 0);
+            IScoredDocIDsIterator candidateDocs = docIds.Iterator();
+            fresNode.value = CountIntersection(termDocs, candidateDocs);
+        }
+
+        /// <summary>
+        /// Counts the doc ids produced by both iterators by stepping them forward in lock-step.
+        /// Returns 0 when <paramref name="p1"/> is null or either iterator has no first document.
+        /// </summary>
+        private static int CountIntersection(DocsEnum p1, IScoredDocIDsIterator p2)
+        {
+            // Both iterators must be moved to their first document before DocID is read.
+            if (p1 == null || p1.NextDoc() == DocIdSetIterator.NO_MORE_DOCS)
+            {
+                return 0;
+            }
+
+            if (!p2.Next())
+            {
+                return 0;
+            }
+
+            int termDoc = p1.DocID;
+            int candidateDoc = p2.DocID;
+            int matches = 0;
+            while (true)
+            {
+                if (termDoc < candidateDoc)
+                {
+                    // p1 is behind: jump it forward to the candidate.
+                    if (p1.Advance(candidateDoc) == DocIdSetIterator.NO_MORE_DOCS)
+                    {
+                        return matches;
+                    }
+
+                    termDoc = p1.DocID;
+                    continue;
+                }
+
+                if (termDoc == candidateDoc)
+                {
+                    matches++;
+                    if (p1.NextDoc() == DocIdSetIterator.NO_MORE_DOCS)
+                    {
+                        return matches;
+                    }
+
+                    termDoc = p1.DocID;
+                }
+
+                // Either p2 was behind, or a match was just consumed and p1 moved on;
+                // in both cases p2 has to catch up with p1.
+                if (!Advance(p2, termDoc))
+                {
+                    return matches;
+                }
+
+                candidateDoc = p2.DocID;
+            }
+        }
+
+        /// <summary>
+        /// Steps <paramref name="iterator"/> forward at least once until its doc id is at or
+        /// beyond <paramref name="targetDoc"/>; returns false when it runs out first.
+        /// </summary>
+        private static bool Advance(IScoredDocIDsIterator iterator, int targetDoc)
+        {
+            bool hasDoc = iterator.Next();
+            while (hasDoc && iterator.DocID < targetDoc)
+            {
+                hasDoc = iterator.Next();
+            }
+
+            return hasDoc;
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/ff4fe045/src/contrib/Facet/Search/DrillDownQuery.cs
----------------------------------------------------------------------
diff --git a/src/contrib/Facet/Search/DrillDownQuery.cs b/src/contrib/Facet/Search/DrillDownQuery.cs
new file mode 100644
index 0000000..a9cf63a
--- /dev/null
+++ b/src/contrib/Facet/Search/DrillDownQuery.cs
@@ -0,0 +1,201 @@
+using Lucene.Net.Facet.Params;
+using Lucene.Net.Facet.Taxonomy;
+using Lucene.Net.Index;
+using Lucene.Net.Search;
+using Lucene.Net.Support;
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using System.Text.RegularExpressions;
+
+namespace Lucene.Net.Facet.Search
+{
+    /// <summary>
+    /// A query that wraps an internal <see cref="BooleanQuery"/> holding an optional base query
+    /// followed by one MUST clause per drill-down dimension. The dimension names and the order
+    /// in which they were added are tracked in a dictionary.
+    /// </summary>
+    public sealed class DrillDownQuery : Query
+    {
+        /// <summary>
+        /// Builds the drill-down <see cref="Lucene.Net.Index.Term"/> for <paramref name="path"/>,
+        /// using the field of the category list params that <paramref name="iParams"/> maps the path to.
+        /// </summary>
+        public static Term Term(FacetIndexingParams iParams, CategoryPath path)
+        {
+            CategoryListParams clp = iParams.GetCategoryListParams(path);
+            char[] buffer = new char[path.FullPathLength()];
+            iParams.DrillDownTermText(path, buffer);
+            return new Term(clp.field, new string(buffer));
+        }
+
+        private readonly BooleanQuery query;
+        private readonly IDictionary<string, int?> drillDownDims = new HashMap<string, int?>();
+        readonly FacetIndexingParams fip;
+
+        /// <summary>Copy constructor used by <see cref="Clone"/>: clones the boolean query and copies the dimensions.</summary>
+        internal DrillDownQuery(FacetIndexingParams fip, BooleanQuery query, IDictionary<string, int?> drillDownDims)
+        {
+            this.fip = fip;
+            this.query = (BooleanQuery)query.Clone();
+            this.drillDownDims.PutAll(drillDownDims);
+        }
+
+        /// <summary>
+        /// Creates a copy of <paramref name="other"/> whose first clause is wrapped in a
+        /// <see cref="FilteredQuery"/> using <paramref name="filter"/>.
+        /// </summary>
+        /// <exception cref="ArgumentException">When <paramref name="other"/> has exactly as many
+        /// clauses as dimensions, i.e. it carries no base query clause to filter.</exception>
+        internal DrillDownQuery(Filter filter, DrillDownQuery other)
+        {
+            query = new BooleanQuery(true);
+            BooleanClause[] clauses = other.query.Clauses;
+            if (clauses.Length == other.drillDownDims.Count)
+            {
+                throw new ArgumentException(@"cannot apply filter unless baseQuery isn't null; pass ConstantScoreQuery instead");
+            }
+
+            drillDownDims.PutAll(other.drillDownDims);
+            query.Add(new FilteredQuery(clauses[0].Query, filter), Occur.MUST);
+            for (int i = 1; i < clauses.Length; i++)
+            {
+                query.Add(clauses[i].Query, Occur.MUST);
+            }
+
+            fip = other.fip;
+        }
+
+        /// <summary>
+        /// Creates a query from an optional base query and ready-made drill-down clauses; the
+        /// dimension of every clause is derived through <see cref="GetDim"/>.
+        /// </summary>
+        internal DrillDownQuery(FacetIndexingParams fip, Query baseQuery, List<Query> clauses)
+        {
+            this.fip = fip;
+            this.query = new BooleanQuery(true);
+            if (baseQuery != null)
+            {
+                query.Add(baseQuery, Occur.MUST);
+            }
+
+            foreach (Query clause in clauses)
+            {
+                query.Add(clause, Occur.MUST);
+                drillDownDims[GetDim(clause)] = drillDownDims.Count;
+            }
+        }
+
+        /// <summary>
+        /// Returns the dimension of a drill-down clause: the part of its (first) term text that
+        /// precedes the first facet delimiter character.
+        /// </summary>
+        /// <param name="clause">A <see cref="ConstantScoreQuery"/> wrapping either a
+        /// <see cref="TermQuery"/> or a <see cref="Lucene.Net.Search.BooleanQuery"/> of term queries.</param>
+        internal string GetDim(Query clause)
+        {
+            clause = ((ConstantScoreQuery)clause).Query;
+            string term;
+            if (clause is TermQuery)
+            {
+                term = ((TermQuery)clause).Term.Text;
+            }
+            else
+            {
+                term = ((TermQuery)((BooleanQuery)clause).Clauses[0].Query).Term.Text;
+            }
+
+            // string.Split with a string[] separator matches literal text, so the delimiter must
+            // NOT be regex-escaped (escaping would break delimiters such as '|' or '.').
+            // Only the first component is needed, hence the limit of 2.
+            return term.Split(new[] { fip.FacetDelimChar.ToString() }, 2, StringSplitOptions.None)[0];
+        }
+
+        /// <summary>Creates an empty drill-down query without a base query.</summary>
+        public DrillDownQuery(FacetIndexingParams fip)
+            : this(fip, null)
+        {
+        }
+
+        /// <summary>Creates a drill-down query over an optional base query (may be null).</summary>
+        public DrillDownQuery(FacetIndexingParams fip, Query baseQuery)
+        {
+            query = new BooleanQuery(true);
+            if (baseQuery != null)
+            {
+                query.Add(baseQuery, Occur.MUST);
+            }
+
+            this.fip = fip;
+        }
+
+        /// <summary>
+        /// Adds a drill-down on one dimension. A single path becomes a term query; several paths
+        /// are OR'd together in a boolean query. The resulting clause is wrapped in a
+        /// <see cref="ConstantScoreQuery"/> with boost 0 and added as a MUST clause.
+        /// </summary>
+        /// <param name="paths">One or more non-empty category paths, all under the same dimension.</param>
+        /// <exception cref="ArgumentException">When no paths are given, a path has length 0, the
+        /// dimension was already added, or the paths span more than one dimension.</exception>
+        public void Add(params CategoryPath[] paths)
+        {
+            if (paths == null)
+            {
+                throw new ArgumentNullException("paths");
+            }
+
+            if (paths.Length == 0)
+            {
+                throw new ArgumentException(@"at least one CategoryPath must be given", "paths");
+            }
+
+            Query q;
+            if (paths[0].length == 0)
+            {
+                throw new ArgumentException(@"all CategoryPaths must have length > 0");
+            }
+
+            string dim = paths[0].components[0];
+            if (drillDownDims.ContainsKey(dim))
+            {
+                throw new ArgumentException(@"dimension '" + dim + @"' was already added");
+            }
+
+            if (paths.Length == 1)
+            {
+                q = new TermQuery(Term(fip, paths[0]));
+            }
+            else
+            {
+                BooleanQuery bq = new BooleanQuery(true);
+                foreach (CategoryPath cp in paths)
+                {
+                    if (cp.length == 0)
+                    {
+                        throw new ArgumentException(@"all CategoryPaths must have length > 0");
+                    }
+
+                    if (!cp.components[0].Equals(dim))
+                    {
+                        throw new ArgumentException(@"multiple (OR'd) drill-down paths must be under same dimension; got '" + dim + @"' and '" + cp.components[0] + @"'");
+                    }
+
+                    bq.Add(new TermQuery(Term(fip, cp)), Occur.SHOULD);
+                }
+
+                q = bq;
+            }
+
+            // The stored value is the position at which the dimension was added.
+            drillDownDims[dim] = drillDownDims.Count;
+            ConstantScoreQuery drillDownQuery = new ConstantScoreQuery(q);
+            drillDownQuery.Boost = 0F;
+            query.Add(drillDownQuery, Occur.MUST);
+        }
+
+        /// <summary>Returns a new instance with a cloned boolean query and a copy of the dimensions.</summary>
+        public override object Clone()
+        {
+            return new DrillDownQuery(fip, query, drillDownDims);
+        }
+
+        /// <summary>Combines the base hash code with the hash code of the internal boolean query.</summary>
+        public override int GetHashCode()
+        {
+            const int prime = 31;
+
+            // Hash mixing is expected to wrap around; never throw in a checked build.
+            unchecked
+            {
+                return prime * base.GetHashCode() + query.GetHashCode();
+            }
+        }
+
+        /// <summary>Two drill-down queries are equal when their boolean queries and base state are equal.</summary>
+        public override bool Equals(Object obj)
+        {
+            if (!(obj is DrillDownQuery))
+            {
+                return false;
+            }
+
+            DrillDownQuery other = (DrillDownQuery)obj;
+            return query.Equals(other.query) && base.Equals(other);
+        }
+
+        /// <summary>Rewrites to the internal boolean query.</summary>
+        /// <exception cref="InvalidOperationException">When the query has no clauses at all.</exception>
+        public override Query Rewrite(IndexReader r)
+        {
+            if (query.Clauses.Length == 0)
+            {
+                throw new InvalidOperationException(@"no base query or drill-down categories given");
+            }
+
+            return query;
+        }
+
+        /// <summary>Returns the string form of the internal boolean query.</summary>
+        public override string ToString(string field)
+        {
+            return query.ToString(field);
+        }
+
+        /// <summary>The internal boolean query (not a copy).</summary>
+        internal BooleanQuery BooleanQuery
+        {
+            get
+            {
+                return query;
+            }
+        }
+
+        /// <summary>The dimension-name to insertion-position map (not a copy).</summary>
+        internal IDictionary<string, int?> Dims
+        {
+            get
+            {
+                return drillDownDims;
+            }
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/ff4fe045/src/core/Lucene.Net.csproj
----------------------------------------------------------------------
diff --git a/src/core/Lucene.Net.csproj b/src/core/Lucene.Net.csproj
index 47713a1..5941972 100644
--- a/src/core/Lucene.Net.csproj
+++ b/src/core/Lucene.Net.csproj
@@ -880,6 +880,7 @@
     <Compile Include="Support\BufferUnderflowException.cs" />
     <Compile Include="Support\BuildType.cs" />
     <Compile Include="Support\ByteBuffer.cs" />
+    <Compile Include="Support\DateTimeExtensions.cs" />
     <Compile Include="Support\ICallable.cs" />
     <Compile Include="Support\Character.cs" />
     <Compile Include="Support\CloseableThreadLocalProfiler.cs" />
@@ -911,6 +912,7 @@
     <Compile Include="Support\IdentityHashMap.cs" />
     <Compile Include="Support\IdentityHashSet.cs" />
     <Compile Include="Support\IdentityWeakReferenceT.cs" />
+    <Compile Include="Support\IDictionaryExtensions.cs" />
     <Compile Include="Support\IndexedLinkedList.cs" />
     <Compile Include="Support\Inflater.cs" />
     <Compile Include="Support\IThreadRunnable.cs" />

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/ff4fe045/src/core/Support/DateTimeExtensions.cs
----------------------------------------------------------------------
diff --git a/src/core/Support/DateTimeExtensions.cs b/src/core/Support/DateTimeExtensions.cs
new file mode 100644
index 0000000..aebaf32
--- /dev/null
+++ b/src/core/Support/DateTimeExtensions.cs
@@ -0,0 +1,17 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+
+namespace Lucene.Net.Support
+{
+    /// <summary>
+    /// Extension methods on <see cref="DateTime"/>.
+    /// </summary>
+    public static class DateTimeExtensions
+    {
+        private static readonly DateTime Jan1st1970 = new DateTime(1970, 1, 1, 0, 0, 0, DateTimeKind.Utc);
+
+        /// <summary>
+        /// Returns the number of whole milliseconds between 1970-01-01T00:00:00Z and
+        /// <paramref name="dt"/>.
+        /// </summary>
+        /// <param name="dt">The instant to convert. Values of kind <see cref="DateTimeKind.Local"/>
+        /// are converted to UTC first; <see cref="DateTimeKind.Utc"/> and
+        /// <see cref="DateTimeKind.Unspecified"/> values are used as they are.</param>
+        /// <returns>The elapsed milliseconds, truncated toward zero.</returns>
+        public static long CurrentTimeMillis(this DateTime dt)
+        {
+            // DateTime subtraction ignores Kind, so a local time (e.g. DateTime.Now) would be
+            // off by the local UTC offset unless it is normalized first.
+            if (dt.Kind == DateTimeKind.Local)
+            {
+                dt = dt.ToUniversalTime();
+            }
+
+            return (long)(dt - Jan1st1970).TotalMilliseconds;
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/ff4fe045/src/core/Support/IDictionaryExtensions.cs
----------------------------------------------------------------------
diff --git a/src/core/Support/IDictionaryExtensions.cs b/src/core/Support/IDictionaryExtensions.cs
new file mode 100644
index 0000000..121c188
--- /dev/null
+++ b/src/core/Support/IDictionaryExtensions.cs
@@ -0,0 +1,18 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+
+namespace Lucene.Net.Support
+{
+    /// <summary>
+    /// Extension methods on <see cref="IDictionary{TKey, TValue}"/>.
+    /// </summary>
+    public static class IDictionaryExtensions
+    {
+        /// <summary>
+        /// Copies every pair of <paramref name="kvps"/> into <paramref name="dict"/> through the
+        /// indexer, so a key that already exists has its value overwritten.
+        /// </summary>
+        /// <param name="dict">The dictionary that receives the entries.</param>
+        /// <param name="kvps">The key/value pairs to copy.</param>
+        public static void PutAll<TKey, TValue>(this IDictionary<TKey, TValue> dict, IEnumerable<KeyValuePair<TKey, TValue>> kvps)
+        {
+            using (IEnumerator<KeyValuePair<TKey, TValue>> cursor = kvps.GetEnumerator())
+            {
+                while (cursor.MoveNext())
+                {
+                    KeyValuePair<TKey, TValue> entry = cursor.Current;
+                    dict[entry.Key] = entry.Value;
+                }
+            }
+        }
+    }
+}


Mime
View raw message