Return-Path: X-Original-To: apmail-lucenenet-commits-archive@www.apache.org Delivered-To: apmail-lucenenet-commits-archive@www.apache.org Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by minotaur.apache.org (Postfix) with SMTP id 98D91C8E8 for ; Fri, 14 Nov 2014 11:59:23 +0000 (UTC) Received: (qmail 58742 invoked by uid 500); 14 Nov 2014 11:59:18 -0000 Delivered-To: apmail-lucenenet-commits-archive@lucenenet.apache.org Received: (qmail 58652 invoked by uid 500); 14 Nov 2014 11:59:18 -0000 Mailing-List: contact commits-help@lucenenet.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: lucene-net-dev@lucenenet.apache.org Delivered-To: mailing list commits@lucenenet.apache.org Received: (qmail 57824 invoked by uid 99); 14 Nov 2014 11:59:17 -0000 Received: from tyr.zones.apache.org (HELO tyr.zones.apache.org) (140.211.11.114) by apache.org (qpsmtpd/0.29) with ESMTP; Fri, 14 Nov 2014 11:59:17 +0000 Received: by tyr.zones.apache.org (Postfix, from userid 65534) id 6731E94066A; Fri, 14 Nov 2014 11:59:17 +0000 (UTC) Content-Type: text/plain; charset="us-ascii" MIME-Version: 1.0 Content-Transfer-Encoding: 8bit From: synhershko@apache.org To: commits@lucenenet.apache.org Date: Fri, 14 Nov 2014 11:59:29 -0000 Message-Id: <890e6615a8a14d34a3abfba1177daf47@git.apache.org> In-Reply-To: <25d3e5ad3026426c84d9af894c5dece8@git.apache.org> References: <25d3e5ad3026426c84d9af894c5dece8@git.apache.org> X-Mailer: ASF-Git Admin Mailer Subject: [14/26] lucenenet git commit: first commit of facet porting, failing tests will be fixed in next commits. 
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/982eaf60/Lucene.Net.Facet/Taxonomy/FacetLabel.cs ---------------------------------------------------------------------- diff --git a/Lucene.Net.Facet/Taxonomy/FacetLabel.cs b/Lucene.Net.Facet/Taxonomy/FacetLabel.cs new file mode 100644 index 0000000..287a18c --- /dev/null +++ b/Lucene.Net.Facet/Taxonomy/FacetLabel.cs @@ -0,0 +1,224 @@ +using System; +using System.Diagnostics; +using Lucene.Net.Support; + +namespace Lucene.Net.Facet.Taxonomy +{ + + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + using LruTaxonomyWriterCache = Lucene.Net.Facet.Taxonomy.WriterCache.LruTaxonomyWriterCache; + using NameHashIntCacheLRU = Lucene.Net.Facet.Taxonomy.WriterCache.NameHashIntCacheLRU; + + /// + /// Holds a sequence of string components, specifying the hierarchical name of a + /// category. + /// + /// @lucene.internal + /// + public class FacetLabel : IComparable + { + private static readonly int BYTE_BLOCK_SIZE = Lucene.Net.Util.ByteBlockPool.BYTE_BLOCK_SIZE; + /* + * copied from DocumentWriterPerThread -- if a FacetLabel is resolved to a + * drill-down term which is encoded to a larger term than that length, it is + * silently dropped! 
Therefore we limit the number of characters to MAX/4 to + * be on the safe side. + */ + /// + /// The maximum number of characters a can have. + /// + public static readonly int MAX_CATEGORY_PATH_LENGTH = (BYTE_BLOCK_SIZE - 2) / 4; + + /// + /// The components of this . Note that this array may be + /// shared with other instances, e.g. as a result of + /// , therefore you should traverse the array up to + /// for this path's components. + /// + public readonly string[] components; + + /// + /// The number of components of this . + public readonly int length; + + // Used by subpath + private FacetLabel(FacetLabel copyFrom, int prefixLen) + { + // while the code which calls this method is safe, at some point a test + // tripped on AIOOBE in toString, but we failed to reproduce. adding the + // assert as a safety check. + Debug.Assert(prefixLen >= 0 && prefixLen <= copyFrom.components.Length, "prefixLen cannot be negative nor larger than the given components' length: prefixLen=" + prefixLen + " components.length=" + copyFrom.components.Length); + this.components = copyFrom.components; + length = prefixLen; + } + + /// + /// Construct from the given path components. + public FacetLabel(params string[] components) + { + this.components = components; + length = components.Length; + CheckComponents(); + } + + /// + /// Construct from the dimension plus the given path components. 
+ public FacetLabel(string dim, string[] path) + { + components = new string[1 + path.Length]; + components[0] = dim; + Array.Copy(path, 0, components, 1, path.Length); + length = components.Length; + CheckComponents(); + } + + private void CheckComponents() + { + long len = 0; + foreach (string comp in components) + { + if (string.IsNullOrEmpty(comp)) + { + throw new System.ArgumentException("empty or null components not allowed: " + Arrays.ToString(components)); + } + len += comp.Length; + } + len += components.Length - 1; // add separators + if (len > MAX_CATEGORY_PATH_LENGTH) + { + throw new System.ArgumentException("category path exceeds maximum allowed path length: max=" + MAX_CATEGORY_PATH_LENGTH + " len=" + len + " path=" + Arrays.ToString(components).Substring(0, 30) + "..."); + } + } + + /// + /// Compares this path with another for lexicographic + /// order. + /// + public virtual int CompareTo(FacetLabel other) + { + int len = length < other.length ? length : other.length; + for (int i = 0, j = 0; i < len; i++, j++) + { + int cmp = components[i].CompareTo(other.components[j]); + if (cmp < 0) + { + return -1; // this is 'before' + } + if (cmp > 0) + { + return 1; // this is 'after' + } + } + + // one is a prefix of the other + return length - other.length; + } + + public override bool Equals(object obj) + { + if (!(obj is FacetLabel)) + { + return false; + } + + FacetLabel other = (FacetLabel)obj; + if (length != other.length) + { + return false; // not same length, cannot be equal + } + + // CategoryPaths are more likely to differ at the last components, so start + // from last-first + for (int i = length - 1; i >= 0; i--) + { + if (!components[i].Equals(other.components[i])) + { + return false; + } + } + return true; + } + + public override int GetHashCode() + { + if (length == 0) + { + return 0; + } + + int hash = length; + for (int i = 0; i < length; i++) + { + hash = hash * 31 + components[i].GetHashCode(); + } + return hash; + } + + /// + /// 
Calculate a 64-bit hash function for this path. This + /// is necessary for (the + /// default cache impl for {@link + /// LruTaxonomyWriterCache}) to reduce the chance of + /// "silent but deadly" collisions. + /// + public virtual long LongHashCode() + { + if (length == 0) + { + return 0; + } + + long hash = length; + for (int i = 0; i < length; i++) + { + hash = hash * 65599 + components[i].GetHashCode(); + } + return hash; + } + + /// + /// Returns a sub-path of this path up to {@code length} components. + public virtual FacetLabel Subpath(int len) + { + if (len >= this.length || len < 0) + { + return this; + } + else + { + return new FacetLabel(this, len); + } + } + + /// + /// Returns a string representation of the path. + /// + public override string ToString() + { + if (length == 0) + { + return "FacetLabel: []"; + } + string[] parts = new string[length]; + Array.Copy(components, 0, parts, 0, length); + return "FacetLabel: [" + Arrays.ToString(parts) + "]"; + } + } + +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/982eaf60/Lucene.Net.Facet/Taxonomy/FastTaxonomyFacetCounts.cs ---------------------------------------------------------------------- diff --git a/Lucene.Net.Facet/Taxonomy/FastTaxonomyFacetCounts.cs b/Lucene.Net.Facet/Taxonomy/FastTaxonomyFacetCounts.cs new file mode 100644 index 0000000..01545f0 --- /dev/null +++ b/Lucene.Net.Facet/Taxonomy/FastTaxonomyFacetCounts.cs @@ -0,0 +1,105 @@ +using System.Collections.Generic; +using Lucene.Net.Facet; +using Lucene.Net.Search; + +namespace Lucene.Net.Facet.Taxonomy +{ + + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + + using MatchingDocs = FacetsCollector.MatchingDocs; + using BinaryDocValues = Lucene.Net.Index.BinaryDocValues; + using DocIdSetIterator = Lucene.Net.Search.DocIdSetIterator; + using BytesRef = Lucene.Net.Util.BytesRef; + + /// + /// Computes facets counts, assuming the default encoding + /// into DocValues was used. + /// + /// @lucene.experimental + /// + public class FastTaxonomyFacetCounts : IntTaxonomyFacets + { + + /// + /// Create {@code FastTaxonomyFacetCounts}, which also + /// counts all facet labels. + /// + public FastTaxonomyFacetCounts(TaxonomyReader taxoReader, FacetsConfig config, FacetsCollector fc) + : this(FacetsConfig.DEFAULT_INDEX_FIELD_NAME, taxoReader, config, fc) + { + } + + /// + /// Create {@code FastTaxonomyFacetCounts}, using the + /// specified {@code indexFieldName} for ordinals. Use + /// this if you had set {@link + /// FacetsConfig#setIndexFieldName} to change the index + /// field name for certain dimensions. 
+ /// + public FastTaxonomyFacetCounts(string indexFieldName, TaxonomyReader taxoReader, FacetsConfig config, FacetsCollector fc) + : base(indexFieldName, taxoReader, config) + { + Count(fc.GetMatchingDocs); + } + + private void Count(IList matchingDocs) + { + foreach (FacetsCollector.MatchingDocs hits in matchingDocs) + { + BinaryDocValues dv = hits.context.AtomicReader.GetBinaryDocValues(IndexFieldName); + if (dv == null) // this reader does not have DocValues for the requested category list + { + continue; + } + + DocIdSetIterator docs = hits.bits.GetIterator(); + + int doc; + BytesRef bytesRef = new BytesRef(); + while ((doc = docs.NextDoc()) != DocIdSetIterator.NO_MORE_DOCS) + { + dv.Get(doc,bytesRef); + sbyte[] bytes = bytesRef.Bytes; + int end = bytesRef.Offset + bytesRef.Length; + int ord = 0; + int offset = bytesRef.Offset; + int prev = 0; + while (offset < end) + { + sbyte b = bytes[offset++]; + if (b >= 0) + { + prev = ord = ((ord << 7) | b) + prev; + ++values[ord]; + ord = 0; + } + else + { + ord = (ord << 7) | (b & 0x7F); + } + } + } + } + + Rollup(); + } + } + +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/982eaf60/Lucene.Net.Facet/Taxonomy/FloatAssociationFacetField.cs ---------------------------------------------------------------------- diff --git a/Lucene.Net.Facet/Taxonomy/FloatAssociationFacetField.cs b/Lucene.Net.Facet/Taxonomy/FloatAssociationFacetField.cs new file mode 100644 index 0000000..cc90e61 --- /dev/null +++ b/Lucene.Net.Facet/Taxonomy/FloatAssociationFacetField.cs @@ -0,0 +1,65 @@ +using Lucene.Net.Support; + +namespace Lucene.Net.Facet.Taxonomy +{ + + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + using Document = Lucene.Net.Documents.Document; + using BytesRef = Lucene.Net.Util.BytesRef; + + /// + /// Add an instance of this to your to add + /// a facet label associated with a float. Use {@link + /// TaxonomyFacetSumFloatAssociations} to aggregate float values + /// per facet label at search time. + /// + /// @lucene.experimental + /// + public class FloatAssociationFacetField : AssociationFacetField + { + + /// + /// Creates this from {@code dim} and {@code path} and a + /// float association + /// + public FloatAssociationFacetField(float assoc, string dim, params string[] path) : base(floatToBytesRef(assoc), dim, path) + { + } + + /// + /// Encodes a {@code float} as a 4-byte . + public static BytesRef floatToBytesRef(float v) + { + return IntAssociationFacetField.intToBytesRef(Number.FloatToIntBits(v)); + } + + /// + /// Decodes a previously encoded {@code float}. 
+ public static float bytesRefToFloat(BytesRef b) + { + return Number.IntBitsToFloat(IntAssociationFacetField.bytesRefToInt(b)); + } + + public override string ToString() + { + return "FloatAssociationFacetField(dim=" + dim + " path=" + Arrays.ToString(path) + " value=" + bytesRefToFloat(assoc) + ")"; + } + } + +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/982eaf60/Lucene.Net.Facet/Taxonomy/FloatTaxonomyFacets.cs ---------------------------------------------------------------------- diff --git a/Lucene.Net.Facet/Taxonomy/FloatTaxonomyFacets.cs b/Lucene.Net.Facet/Taxonomy/FloatTaxonomyFacets.cs new file mode 100644 index 0000000..9dbe71d --- /dev/null +++ b/Lucene.Net.Facet/Taxonomy/FloatTaxonomyFacets.cs @@ -0,0 +1,183 @@ +using System; +using System.Diagnostics; +using System.Collections.Generic; + +namespace Lucene.Net.Facet.Taxonomy +{ + + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + using DimConfig = Lucene.Net.Facet.FacetsConfig.DimConfig; + + /// + /// Base class for all taxonomy-based facets that aggregate + /// to a per-ords float[]. + /// + public abstract class FloatTaxonomyFacets : TaxonomyFacets + { + + /// + /// Per-ordinal value. 
+ protected readonly float[] values; + + /// + /// Sole constructor. + protected internal FloatTaxonomyFacets(string indexFieldName, TaxonomyReader taxoReader, FacetsConfig config) + : base(indexFieldName, taxoReader, config) + { + values = new float[taxoReader.Size]; + } + + /// + /// Rolls up any single-valued hierarchical dimensions. + protected virtual void Rollup() + { + // Rollup any necessary dims: + foreach (KeyValuePair ent in Config.DimConfigs) + { + string dim = ent.Key; + FacetsConfig.DimConfig ft = ent.Value; + if (ft.hierarchical && ft.multiValued == false) + { + int dimRootOrd = TaxoReader.GetOrdinal(new FacetLabel(dim)); + Debug.Assert(dimRootOrd > 0); + values[dimRootOrd] += Rollup(Children[dimRootOrd]); + } + } + } + + private float Rollup(int ord) + { + float sum = 0; + while (ord != TaxonomyReader.INVALID_ORDINAL) + { + float childValue = values[ord] + Rollup(Children[ord]); + values[ord] = childValue; + sum += childValue; + ord = Siblings[ord]; + } + return sum; + } + + public override float GetSpecificValue(string dim, params string[] path) + { + FacetsConfig.DimConfig dimConfig = VerifyDim(dim); + if (path.Length == 0) + { + if (dimConfig.hierarchical && dimConfig.multiValued == false) + { + // ok: rolled up at search time + } + else if (dimConfig.requireDimCount && dimConfig.multiValued) + { + // ok: we indexed all ords at index time + } + else + { + throw new System.ArgumentException("cannot return dimension-level value alone; use getTopChildren instead"); + } + } + int ord = TaxoReader.GetOrdinal(new FacetLabel(dim, path)); + if (ord < 0) + { + return -1; + } + return values[ord]; + } + + public override FacetResult GetTopChildren(int topN, string dim, params string[] path) + { + if (topN <= 0) + { + throw new System.ArgumentException("topN must be > 0 (got: " + topN + ")"); + } + FacetsConfig.DimConfig dimConfig = VerifyDim(dim); + FacetLabel cp = new FacetLabel(dim, path); + int dimOrd = TaxoReader.GetOrdinal(cp); + if (dimOrd == -1) + { 
+ return null; + } + + TopOrdAndFloatQueue q = new TopOrdAndFloatQueue(Math.Min(TaxoReader.Size, topN)); + float bottomValue = 0; + + int ord = Children[dimOrd]; + float sumValues = 0; + int childCount = 0; + + TopOrdAndFloatQueue.OrdAndValue reuse = null; + while (ord != TaxonomyReader.INVALID_ORDINAL) + { + if (values[ord] > 0) + { + sumValues += values[ord]; + childCount++; + if (values[ord] > bottomValue) + { + if (reuse == null) + { + reuse = new TopOrdAndFloatQueue.OrdAndValue(); + } + reuse.ord = ord; + reuse.value = values[ord]; + reuse = q.InsertWithOverflow(reuse); + if (q.Size() == topN) + { + bottomValue = q.Top().value; + } + } + } + + ord = Siblings[ord]; + } + + if (sumValues == 0) + { + return null; + } + + if (dimConfig.multiValued) + { + if (dimConfig.requireDimCount) + { + sumValues = values[dimOrd]; + } + else + { + // Our sum'd count is not correct, in general: + sumValues = -1; + } + } + else + { + // Our sum'd dim count is accurate, so we keep it + } + + LabelAndValue[] labelValues = new LabelAndValue[q.Size()]; + for (int i = labelValues.Length - 1; i >= 0; i--) + { + TopOrdAndFloatQueue.OrdAndValue ordAndValue = q.Pop(); + FacetLabel child = TaxoReader.GetPath(ordAndValue.ord); + labelValues[i] = new LabelAndValue(child.components[cp.length], ordAndValue.value); + } + + return new FacetResult(dim, path, sumValues, labelValues, childCount); + } + } +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/982eaf60/Lucene.Net.Facet/Taxonomy/IntAssociationFacetField.cs ---------------------------------------------------------------------- diff --git a/Lucene.Net.Facet/Taxonomy/IntAssociationFacetField.cs b/Lucene.Net.Facet/Taxonomy/IntAssociationFacetField.cs new file mode 100644 index 0000000..b3f1ddd --- /dev/null +++ b/Lucene.Net.Facet/Taxonomy/IntAssociationFacetField.cs @@ -0,0 +1,74 @@ +using Lucene.Net.Support; + +namespace Lucene.Net.Facet.Taxonomy +{ + + /* + * Licensed to the Apache Software Foundation 
(ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + using Document = Lucene.Net.Documents.Document; + using BytesRef = Lucene.Net.Util.BytesRef; + + /// + /// Add an instance of this to your to add + /// a facet label associated with an int. Use {@link + /// TaxonomyFacetSumIntAssociations} to aggregate int values + /// per facet label at search time. + /// + /// @lucene.experimental + /// + public class IntAssociationFacetField : AssociationFacetField + { + + /// + /// Creates this from {@code dim} and {@code path} and an + /// int association + /// + public IntAssociationFacetField(int assoc, string dim, params string[] path) + : base(intToBytesRef(assoc), dim, path) + { + } + + /// + /// Encodes an {@code int} as a 4-byte , + /// big-endian. + /// + public static BytesRef intToBytesRef(int v) + { + sbyte[] bytes = new sbyte[4]; + // big-endian: + bytes[0] = (sbyte)(v >> 24); + bytes[1] = (sbyte)(v >> 16); + bytes[2] = (sbyte)(v >> 8); + bytes[3] = (sbyte)v; + return new BytesRef(bytes); + } + + /// + /// Decodes a previously encoded {@code int}. 
+ public static int bytesRefToInt(BytesRef b) + { + return ((b.Bytes[b.Offset] & 0xFF) << 24) | ((b.Bytes[b.Offset + 1] & 0xFF) << 16) | ((b.Bytes[b.Offset + 2] & 0xFF) << 8) | (b.Bytes[b.Offset + 3] & 0xFF); + } + + public override string ToString() + { + return "IntAssociationFacetField(dim=" + dim + " path=" + Arrays.ToString(path) + " value=" + bytesRefToInt(assoc) + ")"; + } + } + +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/982eaf60/Lucene.Net.Facet/Taxonomy/IntTaxonomyFacets.cs ---------------------------------------------------------------------- diff --git a/Lucene.Net.Facet/Taxonomy/IntTaxonomyFacets.cs b/Lucene.Net.Facet/Taxonomy/IntTaxonomyFacets.cs new file mode 100644 index 0000000..ad40137 --- /dev/null +++ b/Lucene.Net.Facet/Taxonomy/IntTaxonomyFacets.cs @@ -0,0 +1,189 @@ +using System; +using System.Collections.Generic; + +namespace Lucene.Net.Facet.Taxonomy +{ + + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + + using DimConfig = Lucene.Net.Facet.FacetsConfig.DimConfig; + + /// + /// Base class for all taxonomy-based facets that aggregate + /// to a per-ords int[]. 
+ /// + + public abstract class IntTaxonomyFacets : TaxonomyFacets + { + + /// + /// Per-ordinal value. + protected internal readonly int[] values; + + /// + /// Sole constructor. + protected internal IntTaxonomyFacets(string indexFieldName, TaxonomyReader taxoReader, FacetsConfig config) + : base(indexFieldName, taxoReader, config) + { + values = new int[taxoReader.Size]; + } + + /// + /// Rolls up any single-valued hierarchical dimensions. + protected virtual void Rollup() + { + // Rollup any necessary dims: + foreach (KeyValuePair ent in Config.DimConfigs) + { + string dim = ent.Key; + FacetsConfig.DimConfig ft = ent.Value; + if (ft.hierarchical && ft.multiValued == false) + { + int dimRootOrd = TaxoReader.GetOrdinal(new FacetLabel(dim)); + // It can be -1 if this field was declared in the + // config but never indexed: + if (dimRootOrd > 0) + { + values[dimRootOrd] += Rollup(Children[dimRootOrd]); + } + } + } + } + + private int Rollup(int ord) + { + int sum = 0; + while (ord != TaxonomyReader.INVALID_ORDINAL) + { + int childValue = values[ord] + Rollup(Children[ord]); + values[ord] = childValue; + sum += childValue; + ord = Siblings[ord]; + } + return sum; + } + + public override float GetSpecificValue(string dim, params string[] path) + { + var dimConfig = VerifyDim(dim); + if (path.Length == 0) + { + if (dimConfig.hierarchical && dimConfig.multiValued == false) + { + // ok: rolled up at search time + } + else if (dimConfig.requireDimCount && dimConfig.multiValued) + { + // ok: we indexed all ords at index time + } + else + { + throw new System.ArgumentException("cannot return dimension-level value alone; use getTopChildren instead"); + } + } + int ord = TaxoReader.GetOrdinal(new FacetLabel(dim, path)); + if (ord < 0) + { + return -1; + } + return values[ord]; + } + + public override FacetResult GetTopChildren(int topN, string dim, params string[] path) + { + if (topN <= 0) + { + throw new System.ArgumentException("topN must be > 0 (got: " + topN + ")"); + } 
+ var dimConfig = VerifyDim(dim); + FacetLabel cp = new FacetLabel(dim, path); + int dimOrd = TaxoReader.GetOrdinal(cp); + if (dimOrd == -1) + { + return null; + } + + TopOrdAndIntQueue q = new TopOrdAndIntQueue(Math.Min(TaxoReader.Size, topN)); + + int bottomValue = 0; + + int ord = Children[dimOrd]; + int totValue = 0; + int childCount = 0; + + TopOrdAndIntQueue.OrdAndValue reuse = null; + while (ord != TaxonomyReader.INVALID_ORDINAL) + { + if (values[ord] > 0) + { + totValue += values[ord]; + childCount++; + if (values[ord] > bottomValue) + { + if (reuse == null) + { + reuse = new TopOrdAndIntQueue.OrdAndValue(); + } + reuse.ord = ord; + reuse.value = values[ord]; + reuse = q.InsertWithOverflow(reuse); + if (q.Size() == topN) + { + bottomValue = q.Top().value; + } + } + } + + ord = Siblings[ord]; + } + + if (totValue == 0) + { + return null; + } + + if (dimConfig.multiValued) + { + if (dimConfig.requireDimCount) + { + totValue = values[dimOrd]; + } + else + { + // Our sum'd value is not correct, in general: + totValue = -1; + } + } + else + { + // Our sum'd dim value is accurate, so we keep it + } + + LabelAndValue[] labelValues = new LabelAndValue[q.Size()]; + for (int i = labelValues.Length - 1; i >= 0; i--) + { + TopOrdAndIntQueue.OrdAndValue ordAndValue = q.Pop(); + FacetLabel child = TaxoReader.GetPath(ordAndValue.ord); + labelValues[i] = new LabelAndValue(child.components[cp.length], ordAndValue.value); + } + + return new FacetResult(dim, path, totValue, labelValues, childCount); + } + } +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/982eaf60/Lucene.Net.Facet/Taxonomy/LRUHashMap.cs ---------------------------------------------------------------------- diff --git a/Lucene.Net.Facet/Taxonomy/LRUHashMap.cs b/Lucene.Net.Facet/Taxonomy/LRUHashMap.cs new file mode 100644 index 0000000..d442992 --- /dev/null +++ b/Lucene.Net.Facet/Taxonomy/LRUHashMap.cs @@ -0,0 +1,154 @@ +using System; +using System.Collections.Concurrent; 
+using System.Collections.Generic; +using System.Linq; +using System.Threading; +using Lucene.Net.Support; + +namespace Lucene.Net.Facet.Taxonomy +{ + + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + + /// + /// LRUHashMap is an extension of Java's HashMap, which has a bounded size(); + /// When it reaches that size, each time a new element is added, the least + /// recently used (LRU) entry is removed. + /// + /// Java makes it very easy to implement LRUHashMap - all its functionality is + /// already available from LinkedHashMap, and we just need to + /// configure that properly. + /// + /// + /// Note that like HashMap, LRUHashMap is unsynchronized, and the user MUST + /// synchronize the access to it if used from several threads. Moreover, while + /// with HashMap this is only a concern if one of the threads modifies the + /// map, with LRUHashMap every read is a modification (because the LRU order + /// needs to be remembered) so proper synchronization is always necessary. 
+ /// + /// + /// With the usual synchronization mechanisms available to the user, this + /// unfortunately means that LRUHashMap will probably perform sub-optimally under + /// heavy contention: while one thread uses the hash table (reads or writes), any + /// other thread will be blocked from using it - or even just starting to use it + /// (e.g., calculating the hash function). A more efficient approach would be not + /// to use LinkedHashMap at all, but rather to use a non-locking (as much as + /// possible) thread-safe solution, something along the lines of + /// java.util.concurrent.ConcurrentHashMap (though that particular class does not + /// support the additional LRU semantics, which will need to be added separately + /// using a concurrent linked list or additional storage of timestamps (in an + /// array or inside the entry objects), or whatever). + /// + /// @lucene.experimental + /// + /// + public class LRUHashMap where TU : class //this is implementation of LRU Cache + { + + public int MaxSize { get; set; } + private int CleanSize; + private TimeSpan MaxDuration; + + + private readonly ConcurrentDictionary> _cache = new ConcurrentDictionary>(); + + public LRUHashMap(int maxSize = 50000, int cleanPercentage = 30, TimeSpan maxDuration = default(TimeSpan)) + { + MaxSize = maxSize; + CleanSize = (int)Math.Max(MaxSize * (1.0 * cleanPercentage / 100), 1); + if (maxDuration == default(TimeSpan)) + { + MaxDuration = TimeSpan.FromDays(1); + } + else + { + MaxDuration = maxDuration; + } + } + + + public bool Put(TV cacheKey, TU value) + { + return AddToCache(cacheKey, value); + } + + public bool AddToCache(TV cacheKey, TU value) + { + var cachedResult = new CacheDataObject + { + Usage = 1, //value == null ? 
1 : value.Usage + 1, + Value = value, + Timestamp = DateTime.UtcNow + }; + + _cache.AddOrUpdate(cacheKey, cachedResult, (_, __) => cachedResult); + if (_cache.Count > MaxSize) + { + foreach (var source in _cache + .OrderByDescending(x => x.Value.Usage) + .ThenBy(x => x.Value.Timestamp) + .Skip(MaxSize - CleanSize)) + { + if (EqualityComparer.Default.Equals(source.Key, cacheKey)) + continue; // we don't want to remove the one we just added + CacheDataObject ignored; + _cache.TryRemove(source.Key, out ignored); + } + } + return true; + } + + public TU Get(TV cacheKey, bool increment = false) + { + CacheDataObject value; + if (_cache.TryGetValue(cacheKey, out value) && (DateTime.UtcNow - value.Timestamp) <= MaxDuration) + { + if (increment) + { + Interlocked.Increment(ref value.Usage); + } + return value.Value; + } + return null; + } + + public bool IsExistInCache(TV cacheKey) + { + return (_cache.ContainsKey(cacheKey)); + } + + public int Size() + { + return _cache.Count; + } + + #region Nested type: CacheDataObject + + private class CacheDataObject where T : class + { + public DateTime Timestamp; + public int Usage; + public T Value; + } + + #endregion + + } + +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/982eaf60/Lucene.Net.Facet/Taxonomy/OrdinalsReader.cs ---------------------------------------------------------------------- diff --git a/Lucene.Net.Facet/Taxonomy/OrdinalsReader.cs b/Lucene.Net.Facet/Taxonomy/OrdinalsReader.cs new file mode 100644 index 0000000..544a1ef --- /dev/null +++ b/Lucene.Net.Facet/Taxonomy/OrdinalsReader.cs @@ -0,0 +1,68 @@ +namespace Lucene.Net.Facet.Taxonomy +{ + + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + using AtomicReaderContext = Lucene.Net.Index.AtomicReaderContext; + using IntsRef = Lucene.Net.Util.IntsRef; + + /// + /// Provides per-document ordinals. + /// + + public abstract class OrdinalsReader + { + + /// + /// Returns ordinals for documents in one segment. + public abstract class OrdinalsSegmentReader + { + /// + /// Get the ordinals for this document. ordinals.offset + /// must always be 0! + /// + public abstract void Get(int doc, IntsRef ordinals); + + /// + /// Default constructor. + /// + public OrdinalsSegmentReader() + { + } + } + + /// + /// Default constructor. + /// + public OrdinalsReader() + { + } + + /// + /// Set current atomic reader. + /// + public abstract OrdinalsSegmentReader GetReader(AtomicReaderContext context); + + /// + /// Returns the indexed field name this {@code + /// OrdinalsReader} is reading from. 
+ /// + public abstract string IndexFieldName { get; } + } + +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/982eaf60/Lucene.Net.Facet/Taxonomy/ParallelTaxonomyArrays.cs ---------------------------------------------------------------------- diff --git a/Lucene.Net.Facet/Taxonomy/ParallelTaxonomyArrays.cs b/Lucene.Net.Facet/Taxonomy/ParallelTaxonomyArrays.cs new file mode 100644 index 0000000..26f1d8a --- /dev/null +++ b/Lucene.Net.Facet/Taxonomy/ParallelTaxonomyArrays.cs @@ -0,0 +1,74 @@ +namespace Lucene.Net.Facet.Taxonomy +{ + + + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + /// + /// Returns 3 arrays for traversing the taxonomy: + ///
    + ///
  • {@code parents}: {@code parents[i]} denotes the parent of category + /// ordinal {@code i}.
  • + ///
  • {@code children}: {@code children[i]} denotes a child of category ordinal + /// {@code i}.
  • + ///
  • {@code siblings}: {@code siblings[i]} denotes the sibling of category + /// ordinal {@code i}.
  • + ///
+ /// + /// To traverse the taxonomy tree, you typically start with {@code children[0]} + /// (ordinal 0 is reserved for ROOT), and then depends if you want to do DFS or + /// BFS, you call {@code children[children[0]]} or {@code siblings[children[0]]} + /// and so forth, respectively. + /// + /// + /// NOTE: you are not expected to modify the values of the arrays, since + /// the arrays are shared with other threads. + /// + /// @lucene.experimental + /// + ///
+ public abstract class ParallelTaxonomyArrays + { + + /// + /// Sole constructor. + public ParallelTaxonomyArrays() + { + } + + /// + /// Returns the parents array, where {@code parents[i]} denotes the parent of + /// category ordinal {@code i}. + /// + public abstract int[] Parents(); + + /// + /// Returns the children array, where {@code children[i]} denotes a child of + /// category ordinal {@code i}. + /// + public abstract int[] Children(); + + /// + /// Returns the siblings array, where {@code siblings[i]} denotes the sibling + /// of category ordinal {@code i}. + /// + public abstract int[] Siblings(); + + } + +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/982eaf60/Lucene.Net.Facet/Taxonomy/PrintTaxonomyStats.cs ---------------------------------------------------------------------- diff --git a/Lucene.Net.Facet/Taxonomy/PrintTaxonomyStats.cs b/Lucene.Net.Facet/Taxonomy/PrintTaxonomyStats.cs new file mode 100644 index 0000000..bfcbda7 --- /dev/null +++ b/Lucene.Net.Facet/Taxonomy/PrintTaxonomyStats.cs @@ -0,0 +1,121 @@ +using System; +using System.IO; + +namespace Lucene.Net.Facet.Taxonomy +{ + + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + + using ChildrenIterator = Lucene.Net.Facet.Taxonomy.TaxonomyReader.ChildrenIterator; + using DirectoryTaxonomyReader = Lucene.Net.Facet.Taxonomy.Directory.DirectoryTaxonomyReader; + using Directory = Lucene.Net.Store.Directory; + using FSDirectory = Lucene.Net.Store.FSDirectory; + + /// + /// Prints how many ords are under each dimension. + + // java -cp ../build/core/classes/java:../build/facet/classes/java org.apache.lucene.facet.util.PrintTaxonomyStats -printTree /s2/scratch/indices/wikibig.trunk.noparents.facets.Lucene41.nd1M/facets + public class PrintTaxonomyStats + { + + /// + /// Sole constructor. + public PrintTaxonomyStats() + { + } + + /// + /// Command-line tool. + public static void Main(string[] args) + { + bool printTree = false; + string path = null; + for (int i = 0; i < args.Length; i++) + { + if (args[i].Equals("-printTree")) + { + printTree = true; + } + else + { + path = args[i]; + } + } + if (args.Length != (printTree ? 2 : 1)) + { + Console.WriteLine("\nUsage: java -classpath ... org.apache.lucene.facet.util.PrintTaxonomyStats [-printTree] /path/to/taxononmy/index\n"); + Environment.Exit(1); + } + Store.Directory dir = FSDirectory.Open(new DirectoryInfo(path)); + var r = new DirectoryTaxonomyReader(dir); + PrintStats(r, System.Console.Out, printTree); + r.Dispose(); + //dir.close(); + } + + /// + /// Recursively prints stats for all ordinals. 
+ public static void PrintStats(TaxonomyReader r, TextWriter @out, bool printTree) + { + @out.WriteLine(r.Size + " total categories."); + + ChildrenIterator it = r.GetChildren(TaxonomyReader.ROOT_ORDINAL); + int child; + while ((child = it.Next()) != TaxonomyReader.INVALID_ORDINAL) + { + ChildrenIterator chilrenIt = r.GetChildren(child); + int numImmediateChildren = 0; + while (chilrenIt.Next() != TaxonomyReader.INVALID_ORDINAL) + { + numImmediateChildren++; + } + FacetLabel cp = r.GetPath(child); + @out.WriteLine("/" + cp.components[0] + ": " + numImmediateChildren + " immediate children; " + (1 + CountAllChildren(r, child)) + " total categories"); + if (printTree) + { + PrintAllChildren(@out, r, child, " ", 1); + } + } + } + + private static int CountAllChildren(TaxonomyReader r, int ord) + { + int count = 0; + ChildrenIterator it = r.GetChildren(ord); + int child; + while ((child = it.Next()) != TaxonomyReader.INVALID_ORDINAL) + { + count += 1 + CountAllChildren(r, child); + } + return count; + } + + private static void PrintAllChildren(TextWriter @out, TaxonomyReader r, int ord, string indent, int depth) + { + ChildrenIterator it = r.GetChildren(ord); + int child; + while ((child = it.Next()) != TaxonomyReader.INVALID_ORDINAL) + { + @out.WriteLine(indent + "/" + r.GetPath(child).components[depth]); + PrintAllChildren(@out, r, child, indent + " ", depth + 1); + } + } + } + +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/982eaf60/Lucene.Net.Facet/Taxonomy/SearcherTaxonomyManager.cs ---------------------------------------------------------------------- diff --git a/Lucene.Net.Facet/Taxonomy/SearcherTaxonomyManager.cs b/Lucene.Net.Facet/Taxonomy/SearcherTaxonomyManager.cs new file mode 100644 index 0000000..d543aad --- /dev/null +++ b/Lucene.Net.Facet/Taxonomy/SearcherTaxonomyManager.cs @@ -0,0 +1,179 @@ +using System.Threading; +using Lucene.Net.Search; + +namespace Lucene.Net.Facet.Taxonomy +{ + + /* + * Licensed to the Apache 
Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + using DirectoryTaxonomyReader = Lucene.Net.Facet.Taxonomy.Directory.DirectoryTaxonomyReader; + using DirectoryTaxonomyWriter = Lucene.Net.Facet.Taxonomy.Directory.DirectoryTaxonomyWriter; + using DirectoryReader = Lucene.Net.Index.DirectoryReader; + using IndexReader = Lucene.Net.Index.IndexReader; + using IndexWriter = Lucene.Net.Index.IndexWriter; + using IndexSearcher = Lucene.Net.Search.IndexSearcher; + using Lucene.Net.Search; + using SearcherFactory = Lucene.Net.Search.SearcherFactory; + using SearcherManager = Lucene.Net.Search.SearcherManager; + using Directory = Lucene.Net.Store.Directory; + using IOUtils = Lucene.Net.Util.IOUtils; + + /// + /// Manages near-real-time reopen of both an IndexSearcher + /// and a TaxonomyReader. + /// + /// NOTE: If you call {@link + /// DirectoryTaxonomyWriter#replaceTaxonomy} then you must + /// open a new {@code SearcherTaxonomyManager} afterwards. + /// + /// + public class SearcherTaxonomyManager : ReferenceManager + { + + /// + /// Holds a matched pair of and + /// + /// + public class SearcherAndTaxonomy + { + /// + /// Point-in-time . + public readonly IndexSearcher searcher; + + /// + /// Matching point-in-time . 
+ public readonly DirectoryTaxonomyReader taxonomyReader; + + /// + /// Create a SearcherAndTaxonomy + public SearcherAndTaxonomy(IndexSearcher searcher, DirectoryTaxonomyReader taxonomyReader) + { + this.searcher = searcher; + this.taxonomyReader = taxonomyReader; + } + } + + private readonly SearcherFactory searcherFactory; + private readonly long taxoEpoch; + private readonly DirectoryTaxonomyWriter taxoWriter; + + /// + /// Creates near-real-time searcher and taxonomy reader + /// from the corresponding writers. + /// + public SearcherTaxonomyManager(IndexWriter writer, bool applyAllDeletes, SearcherFactory searcherFactory, DirectoryTaxonomyWriter taxoWriter) + { + if (searcherFactory == null) + { + searcherFactory = new SearcherFactory(); + } + this.searcherFactory = searcherFactory; + this.taxoWriter = taxoWriter; + var taxoReader = new DirectoryTaxonomyReader(taxoWriter); + Current = new SearcherAndTaxonomy(SearcherManager.GetSearcher(searcherFactory, DirectoryReader.Open(writer, applyAllDeletes)), taxoReader); + this.taxoEpoch = taxoWriter.TaxonomyEpoch; + } + + /// + /// Creates search and taxonomy readers over the corresponding directories. + /// + /// + /// NOTE: you should only use this constructor if you commit and call + /// in the same thread. Otherwise it could lead to an + /// unsync'd and pair. 
+ /// + /// + public SearcherTaxonomyManager(Store.Directory indexDir, Store.Directory taxoDir, SearcherFactory searcherFactory) + { + if (searcherFactory == null) + { + searcherFactory = new SearcherFactory(); + } + this.searcherFactory = searcherFactory; + var taxoReader = new DirectoryTaxonomyReader(taxoDir); + Current = new SearcherAndTaxonomy(SearcherManager.GetSearcher(searcherFactory, DirectoryReader.Open(indexDir)), taxoReader); + this.taxoWriter = null; + taxoEpoch = -1; + } + + protected override void DecRef(SearcherAndTaxonomy @ref) + { + @ref.searcher.IndexReader.DecRef(); + + // This decRef can fail, and then in theory we should + // tryIncRef the searcher to put back the ref count + // ... but 1) the below decRef should only fail because + // it decRef'd to 0 and closed and hit some IOException + // during close, in which case 2) very likely the + // searcher was also just closed by the above decRef and + // a tryIncRef would fail: + @ref.taxonomyReader.DecRef(); + } + + protected override bool TryIncRef(SearcherAndTaxonomy @ref) + { + if (@ref.searcher.IndexReader.TryIncRef()) + { + if (@ref.taxonomyReader.TryIncRef()) + { + return true; + } + else + { + @ref.searcher.IndexReader.DecRef(); + } + } + return false; + } + + protected override SearcherAndTaxonomy RefreshIfNeeded(SearcherAndTaxonomy @ref) + { + // Must re-open searcher first, otherwise we may get a + // new reader that references ords not yet known to the + // taxonomy reader: + IndexReader r = @ref.searcher.IndexReader; + IndexReader newReader = DirectoryReader.OpenIfChanged((DirectoryReader)r); + if (newReader == null) + { + return null; + } + else + { + var tr = TaxonomyReader.OpenIfChanged(@ref.taxonomyReader); + if (tr == null) + { + @ref.taxonomyReader.IncRef(); + tr = @ref.taxonomyReader; + } + else if (taxoWriter != null && taxoWriter.TaxonomyEpoch != taxoEpoch) + { + IOUtils.Close(newReader, tr); + throw new ThreadStateException("DirectoryTaxonomyWriter.replaceTaxonomy was 
called, which is not allowed when using SearcherTaxonomyManager"); + } + + return new SearcherAndTaxonomy(SearcherManager.GetSearcher(searcherFactory, newReader), tr); + } + } + + protected override int GetRefCount(SearcherAndTaxonomy reference) + { + return reference.searcher.IndexReader.RefCount; + } + } + +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/982eaf60/Lucene.Net.Facet/Taxonomy/TaxonomyFacetCounts.cs ---------------------------------------------------------------------- diff --git a/Lucene.Net.Facet/Taxonomy/TaxonomyFacetCounts.cs b/Lucene.Net.Facet/Taxonomy/TaxonomyFacetCounts.cs new file mode 100644 index 0000000..49be839 --- /dev/null +++ b/Lucene.Net.Facet/Taxonomy/TaxonomyFacetCounts.cs @@ -0,0 +1,77 @@ +using System.Collections.Generic; +using Lucene.Net.Facet; + +namespace Lucene.Net.Facet.Taxonomy +{ + + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + + using MatchingDocs = FacetsCollector.MatchingDocs; + using BinaryDocValues = Lucene.Net.Index.BinaryDocValues; + using DocIdSetIterator = Lucene.Net.Search.DocIdSetIterator; + using IntsRef = Lucene.Net.Util.IntsRef; + + /// + /// Reads from any ; use {@link + /// FastTaxonomyFacetCounts} if you are using the + /// default encoding from . + /// + /// @lucene.experimental + /// + public class TaxonomyFacetCounts : IntTaxonomyFacets + { + private readonly OrdinalsReader ordinalsReader; + + /// + /// Create {@code TaxonomyFacetCounts}, which also + /// counts all facet labels. Use this for a non-default + /// ; otherwise use {@link + /// FastTaxonomyFacetCounts}. + /// + public TaxonomyFacetCounts(OrdinalsReader ordinalsReader, TaxonomyReader taxoReader, FacetsConfig config, FacetsCollector fc) + : base(ordinalsReader.IndexFieldName, taxoReader, config) + { + this.ordinalsReader = ordinalsReader; + Count(fc.GetMatchingDocs); + } + + private void Count(IList matchingDocs) + { + IntsRef scratch = new IntsRef(); + foreach (FacetsCollector.MatchingDocs hits in matchingDocs) + { + OrdinalsReader.OrdinalsSegmentReader ords = ordinalsReader.GetReader(hits.context); + DocIdSetIterator docs = hits.bits.GetIterator(); + + int doc; + while ((doc = docs.NextDoc()) != DocIdSetIterator.NO_MORE_DOCS) + { + ords.Get(doc, scratch); + for (int i = 0; i < scratch.Length; i++) + { + values[scratch.Ints[scratch.Offset + i]]++; + } + } + } + + Rollup(); + } + } + +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/982eaf60/Lucene.Net.Facet/Taxonomy/TaxonomyFacetSumFloatAssociations.cs ---------------------------------------------------------------------- diff --git a/Lucene.Net.Facet/Taxonomy/TaxonomyFacetSumFloatAssociations.cs b/Lucene.Net.Facet/Taxonomy/TaxonomyFacetSumFloatAssociations.cs new file mode 100644 index 0000000..bf0d417 --- /dev/null +++ b/Lucene.Net.Facet/Taxonomy/TaxonomyFacetSumFloatAssociations.cs @@ -0,0 +1,98 @@ +using 
System.Collections.Generic; +using Lucene.Net.Facet; +using Lucene.Net.Search; +using Lucene.Net.Support; + +namespace Lucene.Net.Facet.Taxonomy +{ + + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + + using MatchingDocs = FacetsCollector.MatchingDocs; + using BinaryDocValues = Lucene.Net.Index.BinaryDocValues; + using DocIdSetIterator = Lucene.Net.Search.DocIdSetIterator; + using BytesRef = Lucene.Net.Util.BytesRef; + + /// + /// Aggregates sum of int values previously indexed with + /// , assuming the default + /// encoding. + /// + /// @lucene.experimental + /// + public class TaxonomyFacetSumFloatAssociations : FloatTaxonomyFacets + { + + /// + /// Create {@code TaxonomyFacetSumFloatAssociations} against + /// the default index field. + /// + public TaxonomyFacetSumFloatAssociations(TaxonomyReader taxoReader, FacetsConfig config, FacetsCollector fc) + : this(FacetsConfig.DEFAULT_INDEX_FIELD_NAME, taxoReader, config, fc) + { + } + + /// + /// Create {@code TaxonomyFacetSumFloatAssociations} against + /// the specified index field. 
+ /// + public TaxonomyFacetSumFloatAssociations(string indexFieldName, TaxonomyReader taxoReader, FacetsConfig config, FacetsCollector fc) + : base(indexFieldName, taxoReader, config) + { + SumValues(fc.GetMatchingDocs); + } + + private void SumValues(IList matchingDocs) + { + //System.out.println("count matchingDocs=" + matchingDocs + " facetsField=" + facetsFieldName); + foreach (FacetsCollector.MatchingDocs hits in matchingDocs) + { + BinaryDocValues dv = hits.context.AtomicReader.GetBinaryDocValues(IndexFieldName); + if (dv == null) // this reader does not have DocValues for the requested category list + { + continue; + } + + DocIdSetIterator docs = hits.bits.GetIterator(); + + int doc; + while ((doc = docs.NextDoc()) != DocIdSetIterator.NO_MORE_DOCS) + { + //System.out.println(" doc=" + doc); + // TODO: use OrdinalsReader? we'd need to add a + // BytesRef getAssociation()? + BytesRef bytesRef = new BytesRef(); + dv.Get(doc, bytesRef); + sbyte[] bytes = bytesRef.Bytes; + int end = bytesRef.Offset + bytesRef.Length; + int offset = bytesRef.Offset; + while (offset < end) + { + int ord = ((bytes[offset] & 0xFF) << 24) | ((bytes[offset + 1] & 0xFF) << 16) | ((bytes[offset + 2] & 0xFF) << 8) | (bytes[offset + 3] & 0xFF); + offset += 4; + int value = ((bytes[offset] & 0xFF) << 24) | ((bytes[offset + 1] & 0xFF) << 16) | ((bytes[offset + 2] & 0xFF) << 8) | (bytes[offset + 3] & 0xFF); + offset += 4; + values[ord] += Number.IntBitsToFloat(value); + } + } + } + } + } + +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/982eaf60/Lucene.Net.Facet/Taxonomy/TaxonomyFacetSumIntAssociations.cs ---------------------------------------------------------------------- diff --git a/Lucene.Net.Facet/Taxonomy/TaxonomyFacetSumIntAssociations.cs b/Lucene.Net.Facet/Taxonomy/TaxonomyFacetSumIntAssociations.cs new file mode 100644 index 0000000..849cb30 --- /dev/null +++ b/Lucene.Net.Facet/Taxonomy/TaxonomyFacetSumIntAssociations.cs @@ -0,0 +1,96 @@ +using 
System.Collections.Generic; +using Lucene.Net.Facet; + +namespace Lucene.Net.Facet.Taxonomy +{ + + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + + using MatchingDocs = FacetsCollector.MatchingDocs; + using BinaryDocValues = Lucene.Net.Index.BinaryDocValues; + using DocIdSetIterator = Lucene.Net.Search.DocIdSetIterator; + using BytesRef = Lucene.Net.Util.BytesRef; + + /// + /// Aggregates sum of int values previously indexed with + /// , assuming the default + /// encoding. + /// + /// @lucene.experimental + /// + public class TaxonomyFacetSumIntAssociations : IntTaxonomyFacets + { + + /// + /// Create {@code TaxonomyFacetSumIntAssociations} against + /// the default index field. + /// + public TaxonomyFacetSumIntAssociations(TaxonomyReader taxoReader, FacetsConfig config, FacetsCollector fc) + : this(FacetsConfig.DEFAULT_INDEX_FIELD_NAME, taxoReader, config, fc) + { + } + + /// + /// Create {@code TaxonomyFacetSumIntAssociations} against + /// the specified index field. 
+ /// + public TaxonomyFacetSumIntAssociations(string indexFieldName, TaxonomyReader taxoReader, FacetsConfig config, FacetsCollector fc) + : base(indexFieldName, taxoReader, config) + { + SumValues(fc.GetMatchingDocs); + } + + private void SumValues(IList matchingDocs) + { + //System.out.println("count matchingDocs=" + matchingDocs + " facetsField=" + facetsFieldName); + foreach (FacetsCollector.MatchingDocs hits in matchingDocs) + { + BinaryDocValues dv = hits.context.AtomicReader.GetBinaryDocValues(IndexFieldName); + if (dv == null) // this reader does not have DocValues for the requested category list + { + continue; + } + + DocIdSetIterator docs = hits.bits.GetIterator(); + + int doc; + while ((doc = docs.NextDoc()) != DocIdSetIterator.NO_MORE_DOCS) + { + //System.out.println(" doc=" + doc); + // TODO: use OrdinalsReader? we'd need to add a + // BytesRef getAssociation()? + BytesRef bytesRef = new BytesRef(); + dv.Get(doc, bytesRef); + sbyte[] bytes = bytesRef.Bytes; + int end = bytesRef.Offset + bytesRef.Length; + int offset = bytesRef.Offset; + while (offset < end) + { + int ord = ((bytes[offset] & 0xFF) << 24) | ((bytes[offset + 1] & 0xFF) << 16) | ((bytes[offset + 2] & 0xFF) << 8) | (bytes[offset + 3] & 0xFF); + offset += 4; + int value = ((bytes[offset] & 0xFF) << 24) | ((bytes[offset + 1] & 0xFF) << 16) | ((bytes[offset + 2] & 0xFF) << 8) | (bytes[offset + 3] & 0xFF); + offset += 4; + values[ord] += value; + } + } + } + } + } + +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/982eaf60/Lucene.Net.Facet/Taxonomy/TaxonomyFacetSumValueSource.cs ---------------------------------------------------------------------- diff --git a/Lucene.Net.Facet/Taxonomy/TaxonomyFacetSumValueSource.cs b/Lucene.Net.Facet/Taxonomy/TaxonomyFacetSumValueSource.cs new file mode 100644 index 0000000..4d4fc76 --- /dev/null +++ b/Lucene.Net.Facet/Taxonomy/TaxonomyFacetSumValueSource.cs @@ -0,0 +1,247 @@ +using System; +using System.Collections; 
+using System.Collections.Generic; +using System.IO; +using System.Threading; +using Lucene.Net.Facet; + +namespace Lucene.Net.Facet.Taxonomy +{ + + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + + using MatchingDocs = FacetsCollector.MatchingDocs; + using AtomicReaderContext = Lucene.Net.Index.AtomicReaderContext; + using FunctionValues = Lucene.Net.Queries.Function.FunctionValues; + using ValueSource = Lucene.Net.Queries.Function.ValueSource; + using DoubleDocValues = Lucene.Net.Queries.Function.DocValues.DoubleDocValues; + using DocIdSetIterator = Lucene.Net.Search.DocIdSetIterator; + using Scorer = Lucene.Net.Search.Scorer; + using Weight = Lucene.Net.Search.Weight; + using IntsRef = Lucene.Net.Util.IntsRef; + + /// + /// Aggregates sum of values from {@link + /// FunctionValues#doubleVal}, for each facet label. + /// + /// @lucene.experimental + /// + public class TaxonomyFacetSumValueSource : FloatTaxonomyFacets + { + private readonly OrdinalsReader ordinalsReader; + + /// + /// Aggreggates float facet values from the provided + /// , pulling ordinals using {@link + /// DocValuesOrdinalsReader} against the default indexed + /// facet field {@link + /// FacetsConfig#DEFAULT_INDEX_FIELD_NAME}. 
+        ///
+        public TaxonomyFacetSumValueSource(TaxonomyReader taxoReader, FacetsConfig config, FacetsCollector fc, ValueSource valueSource)
+            : this(new DocValuesOrdinalsReader(FacetsConfig.DEFAULT_INDEX_FIELD_NAME), taxoReader, config, fc, valueSource)
+        {
+        }
+
+        /// <summary>
+        /// Aggregates float facet values from the provided
+        /// <see cref="ValueSource"/>, and pulls ordinals from the
+        /// provided <see cref="OrdinalsReader"/>.
+        /// </summary>
+        public TaxonomyFacetSumValueSource(OrdinalsReader ordinalsReader, TaxonomyReader taxoReader, FacetsConfig config, FacetsCollector fc, ValueSource valueSource)
+            : base(ordinalsReader.IndexFieldName, taxoReader, config)
+        {
+            this.ordinalsReader = ordinalsReader;
+            SumValues(fc.GetMatchingDocs, fc.KeepScores, valueSource);
+        }
+
+        /// <summary>
+        /// Minimal <see cref="Scorer"/> stand-in. It only reports the doc id and
+        /// score that <c>SumValues</c> stores in it; <c>Cost()</c> returns 0 and
+        /// every other member throws <see cref="System.NotSupportedException"/>.
+        /// </summary>
+        private sealed class FakeScorer : Scorer
+        {
+            internal float score_Renamed;
+            internal int docID_Renamed;
+
+            internal FakeScorer()
+                : base(null)
+            {
+            }
+
+            public override float Score()
+            {
+                return score_Renamed;
+            }
+
+            public override int Freq()
+            {
+                throw new System.NotSupportedException();
+            }
+
+            public override int DocID()
+            {
+                return docID_Renamed;
+            }
+
+            public override int NextDoc()
+            {
+                throw new System.NotSupportedException();
+            }
+
+            public override int Advance(int target)
+            {
+                throw new System.NotSupportedException();
+            }
+
+            public override long Cost()
+            {
+                return 0;
+            }
+
+            public override Weight Weight
+            {
+                get
+                {
+                    throw new System.NotSupportedException();
+                }
+            }
+
+            // NOTE(review): the generic type argument of this collection appears
+            // to have been lost when the diff was archived -- confirm against the
+            // base Scorer declaration.
+            public override ICollection Children
+            {
+                get
+                {
+                    throw new System.NotSupportedException();
+                }
+            }
+        }
+
+        /// <summary>
+        /// For every matching document, evaluates <paramref name="valueSource"/>
+        /// and adds the result to <c>values</c> for each ordinal the ordinals
+        /// reader returns for that document; finally calls <c>Rollup()</c>.
+        /// </summary>
+        /// <param name="matchingDocs"> per-segment hits to aggregate </param>
+        /// <param name="keepScores"> when true, a scorer carrying each hit's doc id
+        /// and score is published to the value source under the "scorer" key </param>
+        /// <param name="valueSource"> supplies the value summed for each hit </param>
+        private void SumValues(IList<MatchingDocs> matchingDocs, bool keepScores, ValueSource valueSource)
+        {
+            FakeScorer scorer = new FakeScorer();
+            // NOTE(review): type arguments were stripped from the archived diff;
+            // <string, Scorer> matches the only key/value stored here -- confirm.
+            IDictionary context = new Dictionary<string, Scorer>();
+            if (keepScores)
+            {
+                context["scorer"] = scorer;
+            }
+
+            IntsRef scratch = new IntsRef();
+            foreach (MatchingDocs hits in matchingDocs)
+            {
+                OrdinalsReader.OrdinalsSegmentReader ords = ordinalsReader.GetReader(hits.context);
+
+                // One score is consumed per iterated document, in iteration order.
+                int scoresIdx = 0;
+                float[] scores = hits.scores;
+
+                FunctionValues functionValues = valueSource.GetValues(context, hits.context);
+                DocIdSetIterator docs = hits.bits.GetIterator();
+
+                int doc;
+                while ((doc = docs.NextDoc()) != DocIdSetIterator.NO_MORE_DOCS)
+                {
+                    ords.Get(doc, scratch);
+                    if (keepScores)
+                    {
+                        // Must be set before DoubleVal so a score-based value
+                        // source reads the current hit.
+                        scorer.docID_Renamed = doc;
+                        scorer.score_Renamed = scores[scoresIdx++];
+                    }
+                    float value = (float)functionValues.DoubleVal(doc);
+                    for (int i = 0; i < scratch.Length; i++)
+                    {
+                        values[scratch.Ints[i]] += value;
+                    }
+                }
+            }
+
+            Rollup();
+        }
+
+        /// <summary>
+        /// <see cref="ValueSource"/> that returns the score for each
+        /// hit; use this to aggregate the sum of all hit scores
+        /// for each facet label.
+        /// </summary>
+        public class ScoreValueSource : ValueSource
+        {
+
+            /// <summary>
+            /// Sole constructor. </summary>
+            public ScoreValueSource()
+            {
+            }
+
+            /// <summary>
+            /// Returns values that report the score of the scorer stored in
+            /// <paramref name="context"/> under the "scorer" key.
+            /// </summary>
+            /// <exception cref="ThreadStateException"> if no scorer is present in the context </exception>
+            public override FunctionValues GetValues(IDictionary context, AtomicReaderContext readerContext)
+            {
+                Scorer scorer = (Scorer)context["scorer"];
+                if (scorer == null)
+                {
+                    // NOTE(review): InvalidOperationException would be the usual .NET
+                    // type for this condition; kept as-is because callers may catch it.
+                    throw new ThreadStateException("scores are missing; be sure to pass keepScores=true to FacetsCollector");
+                }
+                return new DoubleDocValuesAnonymousInnerClassHelper(this, scorer);
+            }
+
+            /// <summary>
+            /// Function values whose value for any document is the wrapped scorer's
+            /// current score.
+            /// </summary>
+            private class DoubleDocValuesAnonymousInnerClassHelper : DoubleDocValues
+            {
+                private readonly ScoreValueSource outerInstance;
+
+                private readonly Scorer scorer;
+
+                public DoubleDocValuesAnonymousInnerClassHelper(ScoreValueSource outerInstance, Scorer scorer)
+                    : base(outerInstance)
+                {
+                    this.outerInstance = outerInstance;
+                    this.scorer = scorer;
+                }
+
+                public override double DoubleVal(int document)
+                {
+                    // The former try/catch only re-threw, so it was dropped.
+                    return scorer.Score();
+                }
+            }
+
+            public override bool Equals(object o)
+            {
+                if (ReferenceEquals(null, o)) return false;
+                if (ReferenceEquals(this, o)) return true;
+                if (o.GetType() != this.GetType()) return false;
+                return Equals((ScoreValueSource) o);
+            }
+
+            /// <summary>
+            /// Typed equality. This class declares no instance state, so any
+            /// non-null instance compares equal, which agrees with the type-wide
+            /// constant returned by <see cref="GetHashCode"/>.
+            /// </summary>
+            protected bool Equals(ScoreValueSource other)
+            {
+                // This used to be "Equals(this, other)", which binds to the static
+                // object.Equals(object, object) and re-enters Equals(object),
+                // recursing without end for two distinct instances.
+                return !ReferenceEquals(null, other);
+            }
+
+            public override int GetHashCode()
+            {
+                return hcode;
+            }
+
+            // Same hash code for every instance.
+            private static readonly int hcode = typeof(DoubleDocValuesAnonymousInnerClassHelper).GetHashCode();
+
+            public override string Description
+            {
+                get
+                {
+                    return "score()";
+                }
+
+            }
+        }
+
+    }
+
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/982eaf60/Lucene.Net.Facet/Taxonomy/TaxonomyFacets.cs
----------------------------------------------------------------------
diff --git a/Lucene.Net.Facet/Taxonomy/TaxonomyFacets.cs b/Lucene.Net.Facet/Taxonomy/TaxonomyFacets.cs
new file mode 100644
index 0000000..b23530d
--- /dev/null
+++ b/Lucene.Net.Facet/Taxonomy/TaxonomyFacets.cs
@@ -0,0 +1,137 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using Lucene.Net.Facet;
+
+namespace Lucene.Net.Facet.Taxonomy
+{
+
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements. See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License. You may obtain a copy of the License at
+     *
+     * http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+
+    using DimConfig = Lucene.Net.Facet.FacetsConfig.DimConfig; // javadocs
+
+    /// <summary>
+    /// Base class for all taxonomy-based facets impls. </summary>
+    public abstract class TaxonomyFacets : Facets
+    {
+
+        // Orders results by descending value; ties are broken by dimension name.
+        private static readonly IComparer<FacetResult> BY_VALUE_THEN_DIM = new ComparatorAnonymousInnerClassHelper();
+
+        /// <summary>
+        /// Compares two results so that the larger value sorts first and equal
+        /// values fall back to an ordinal comparison of the dimension names.
+        /// </summary>
+        private class ComparatorAnonymousInnerClassHelper : IComparer<FacetResult>
+        {
+            public virtual int Compare(FacetResult a, FacetResult b)
+            {
+                double left = (double)a.value;
+                double right = (double)b.value;
+                if (left > right)
+                {
+                    return -1;
+                }
+                if (right > left)
+                {
+                    return 1;
+                }
+                // Ordinal rather than string.CompareTo: the latter is
+                // culture-sensitive, so the tie-break order would depend on the
+                // current thread culture.
+                return string.CompareOrdinal(a.dim, b.dim);
+            }
+        }
+
+        /// <summary>
+        /// Index field name provided to the constructor. </summary>
+        protected internal readonly string IndexFieldName;
+
+        /// <summary>
+        /// <see cref="TaxonomyReader"/> provided to the constructor. </summary>
+        protected internal readonly TaxonomyReader TaxoReader;
+
+        /// <summary>
+        /// <see cref="FacetsConfig"/> provided to the constructor. </summary>
+        protected internal readonly FacetsConfig Config;
+
+        /// <summary>
+        /// Maps parent ordinal to its child, or -1 if the parent
+        /// is childless.
+        /// </summary>
+        protected internal readonly int[] Children;
+
+        /// <summary>
+        /// Maps an ordinal to its sibling, or -1 if there is no
+        /// sibling.
+        /// </summary>
+        protected internal readonly int[] Siblings;
+
+        /// <summary>
+        /// Sole constructor. Stores the arguments and reads the child and sibling
+        /// arrays from the reader's parallel taxonomy arrays.
+        /// </summary>
+        protected internal TaxonomyFacets(string indexFieldName, TaxonomyReader taxoReader, FacetsConfig config)
+        {
+            this.IndexFieldName = indexFieldName;
+            this.TaxoReader = taxoReader;
+            this.Config = config;
+            ParallelTaxonomyArrays pta = taxoReader.ParallelTaxonomyArrays;
+            Children = pta.Children();
+            Siblings = pta.Siblings();
+        }
+
+        /// <summary>
+        /// Returns the <see cref="DimConfig"/> for <paramref name="dim"/>.
+        /// </summary>
+        /// <exception cref="System.ArgumentException"> if the dimension's configured
+        /// index field differs from <see cref="IndexFieldName"/> </exception>
+        protected internal virtual DimConfig VerifyDim(string dim)
+        {
+            DimConfig dimConfig = Config.GetDimConfig(dim);
+            if (!dimConfig.indexFieldName.Equals(IndexFieldName))
+            {
+                // The closing quote after the field name was missing before.
+                throw new System.ArgumentException("dimension \"" + dim + "\" was not indexed into field \"" + IndexFieldName + "\"");
+            }
+            return dimConfig;
+        }
+
+        /// <summary>
+        /// Collects the top <paramref name="topN"/> children of every dimension
+        /// under the taxonomy root whose configured index field equals
+        /// <see cref="IndexFieldName"/>, skipping dimensions with no result, and
+        /// returns them sorted by descending value, then by dimension name.
+        /// </summary>
+        public override IList<FacetResult> GetAllDims(int topN)
+        {
+            List<FacetResult> results = new List<FacetResult>();
+
+            // Walk the root's children through the child/sibling arrays.
+            for (int ord = Children[TaxonomyReader.ROOT_ORDINAL];
+                 ord != TaxonomyReader.INVALID_ORDINAL;
+                 ord = Siblings[ord])
+            {
+                string dim = TaxoReader.GetPath(ord).components[0];
+                DimConfig dimConfig = Config.GetDimConfig(dim);
+                if (!dimConfig.indexFieldName.Equals(IndexFieldName))
+                {
+                    continue;
+                }
+
+                FacetResult result = GetTopChildren(topN, dim);
+                if (result != null)
+                {
+                    results.Add(result);
+                }
+            }
+
+            // Sort by highest value, tie break by dim:
+            FacetResult[] resultArray = results.ToArray();
+            Array.Sort(resultArray, BY_VALUE_THEN_DIM);
+            return resultArray;
+        }
+
+    }
+}
\ No newline at end of file