Return-Path: X-Original-To: apmail-lucenenet-commits-archive@www.apache.org Delivered-To: apmail-lucenenet-commits-archive@www.apache.org Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by minotaur.apache.org (Postfix) with SMTP id 988731887F for ; Sun, 23 Aug 2015 22:34:01 +0000 (UTC) Received: (qmail 92557 invoked by uid 500); 23 Aug 2015 22:34:01 -0000 Delivered-To: apmail-lucenenet-commits-archive@lucenenet.apache.org Received: (qmail 92455 invoked by uid 500); 23 Aug 2015 22:34:01 -0000 Mailing-List: contact commits-help@lucenenet.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: lucene-net-dev@lucenenet.apache.org Delivered-To: mailing list commits@lucenenet.apache.org Received: (qmail 91790 invoked by uid 99); 23 Aug 2015 22:34:01 -0000 Received: from git1-us-west.apache.org (HELO git1-us-west.apache.org) (140.211.11.23) by apache.org (qpsmtpd/0.29) with ESMTP; Sun, 23 Aug 2015 22:34:01 +0000 Received: by git1-us-west.apache.org (ASF Mail Server at git1-us-west.apache.org, from userid 33) id E54FBE1782; Sun, 23 Aug 2015 22:34:00 +0000 (UTC) Content-Type: text/plain; charset="us-ascii" MIME-Version: 1.0 Content-Transfer-Encoding: 8bit From: synhershko@apache.org To: commits@lucenenet.apache.org Date: Sun, 23 Aug 2015 22:34:11 -0000 Message-Id: <6f21b9fa95264a82a5ff3499a9135788@git.apache.org> In-Reply-To: References: X-Mailer: ASF-Git Admin Mailer Subject: [12/17] lucenenet git commit: Lucene.Net.Join tests now passing http://git-wip-us.apache.org/repos/asf/lucenenet/blob/4820f236/src/Lucene.Net.Grouping/TopGroups.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Grouping/TopGroups.cs b/src/Lucene.Net.Grouping/TopGroups.cs new file mode 100644 index 0000000..017c975 --- /dev/null +++ b/src/Lucene.Net.Grouping/TopGroups.cs @@ -0,0 +1,249 @@ +using System; +using Lucene.Net.Search; + +namespace Lucene.Net.Grouping +{ + /* + * Licensed to the 
Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + /// + /// Represents result returned by a grouping search. + /// + /// @lucene.experimental + /// + public class TopGroups + { + /// + /// Number of documents matching the search + public readonly int TotalHitCount; + + /// + /// Number of documents grouped into the topN groups + public readonly int TotalGroupedHitCount; + + /// + /// The total number of unique groups. If null this value is not computed. + public readonly int? TotalGroupCount; + + /// + /// Group results in groupSort order + public readonly GroupDocs[] Groups; + + /// + /// How groups are sorted against each other + public readonly SortField[] GroupSort; + + /// + /// How docs are sorted within each group + public readonly SortField[] WithinGroupSort; + + /// + /// Highest score across all hits, or + /// Float.NaN if scores were not computed. 
+ /// + public readonly float MaxScore; + + public TopGroups(SortField[] groupSort, SortField[] withinGroupSort, int totalHitCount, int totalGroupedHitCount, GroupDocs[] groups, float maxScore) + { + GroupSort = groupSort; + WithinGroupSort = withinGroupSort; + TotalHitCount = totalHitCount; + TotalGroupedHitCount = totalGroupedHitCount; + Groups = groups; + TotalGroupCount = null; + MaxScore = maxScore; + } + + public TopGroups(TopGroups oldTopGroups, int? totalGroupCount) + { + GroupSort = oldTopGroups.GroupSort; + WithinGroupSort = oldTopGroups.WithinGroupSort; + TotalHitCount = oldTopGroups.TotalHitCount; + TotalGroupedHitCount = oldTopGroups.TotalGroupedHitCount; + Groups = oldTopGroups.Groups; + MaxScore = oldTopGroups.MaxScore; + TotalGroupCount = totalGroupCount; + } + + /// + /// How the GroupDocs score (if any) should be merged. + public enum ScoreMergeMode + { + /// + /// Set score to Float.NaN + /// + None, + + /// + /// Sum score across all shards for this group. + /// + Total, + + /// + /// Avg score across all shards for this group. + /// + Avg, + } + + /// + /// Merges an array of TopGroups, for example obtained from the second-pass + /// collector across multiple shards. Each TopGroups must have been sorted by the + /// same groupSort and docSort, and the top groups passed to all second-pass + /// collectors must be the same. + /// + /// NOTE: We can't always compute an exact totalGroupCount. + /// Documents belonging to a group may occur on more than + /// one shard and thus the merged totalGroupCount can be + /// higher than the actual totalGroupCount. In this case the + /// totalGroupCount represents a upper bound. If the documents + /// of one group do only reside in one shard then the + /// totalGroupCount is exact. 
+ /// + /// NOTE: the topDocs in each GroupDocs is actually + /// an instance of TopDocsAndShards + /// + public static TopGroups Merge(TopGroups[] shardGroups, Sort groupSort, Sort docSort, int docOffset, int docTopN, ScoreMergeMode scoreMergeMode) + { + //System.out.println("TopGroups.merge"); + + if (shardGroups.Length == 0) + { + return null; + } + + int totalHitCount = 0; + int totalGroupedHitCount = 0; + // Optionally merge the totalGroupCount. + int? totalGroupCount = null; + + int numGroups = shardGroups[0].Groups.Length; + foreach (var shard in shardGroups) + { + if (numGroups != shard.Groups.Length) + { + throw new ArgumentException("number of groups differs across shards; you must pass same top groups to all shards' second-pass collector"); + } + totalHitCount += shard.TotalHitCount; + totalGroupedHitCount += shard.TotalGroupedHitCount; + if (shard.TotalGroupCount != null) + { + if (totalGroupCount == null) + { + totalGroupCount = 0; + } + + totalGroupCount += shard.TotalGroupCount; + } + } + + var mergedGroupDocs = new GroupDocs[numGroups]; + + TopDocs[] shardTopDocs = new TopDocs[shardGroups.Length]; + float totalMaxScore = float.MinValue; + + for (int groupIDX = 0; groupIDX < numGroups; groupIDX++) + { + T groupValue = shardGroups[0].Groups[groupIDX].GroupValue; + //System.out.println(" merge groupValue=" + groupValue + " sortValues=" + Arrays.toString(shardGroups[0].groups[groupIDX].groupSortValues)); + float maxScore = float.MinValue; + int totalHits = 0; + double scoreSum = 0.0; + for (int shardIdx = 0; shardIdx < shardGroups.Length; shardIdx++) + { + //System.out.println(" shard=" + shardIDX); + TopGroups shard = shardGroups[shardIdx]; + var shardGroupDocs = shard.Groups[groupIDX]; + if (groupValue == null) + { + if (shardGroupDocs.GroupValue != null) + { + throw new ArgumentException("group values differ across shards; you must pass same top groups to all shards' second-pass collector"); + } + } + else if 
(!groupValue.Equals(shardGroupDocs.GroupValue)) + { + throw new ArgumentException("group values differ across shards; you must pass same top groups to all shards' second-pass collector"); + } + + /* + for(ScoreDoc sd : shardGroupDocs.scoreDocs) { + System.out.println(" doc=" + sd.doc); + } + */ + + shardTopDocs[shardIdx] = new TopDocs(shardGroupDocs.TotalHits, shardGroupDocs.ScoreDocs, shardGroupDocs.MaxScore); + maxScore = Math.Max(maxScore, shardGroupDocs.MaxScore); + totalHits += shardGroupDocs.TotalHits; + scoreSum += shardGroupDocs.Score; + } + + TopDocs mergedTopDocs = TopDocs.Merge(docSort, docOffset + docTopN, shardTopDocs); + + // Slice; + ScoreDoc[] mergedScoreDocs; + if (docOffset == 0) + { + mergedScoreDocs = mergedTopDocs.ScoreDocs; + } + else if (docOffset >= mergedTopDocs.ScoreDocs.Length) + { + mergedScoreDocs = new ScoreDoc[0]; + } + else + { + mergedScoreDocs = new ScoreDoc[mergedTopDocs.ScoreDocs.Length - docOffset]; + Array.Copy(mergedTopDocs.ScoreDocs, docOffset, mergedScoreDocs, 0, mergedTopDocs.ScoreDocs.Length - docOffset); + } + + float groupScore; + switch (scoreMergeMode) + { + case ScoreMergeMode.None: + groupScore = float.NaN; + break; + case ScoreMergeMode.Avg: + if (totalHits > 0) + { + groupScore = (float)(scoreSum / totalHits); + } + else + { + groupScore = float.NaN; + } + break; + case ScoreMergeMode.Total: + groupScore = (float)scoreSum; + break; + default: + throw new ArgumentException("can't handle ScoreMergeMode " + scoreMergeMode); + } + + //System.out.println("SHARDS=" + Arrays.toString(mergedTopDocs.shardIndex)); + mergedGroupDocs[groupIDX] = new GroupDocs(groupScore, maxScore, totalHits, mergedScoreDocs, groupValue, shardGroups[0].Groups[groupIDX].GroupSortValues); + totalMaxScore = Math.Max(totalMaxScore, maxScore); + } + + if (totalGroupCount != null) + { + var result = new TopGroups(groupSort.GetSort(), docSort == null ? 
null : docSort.GetSort(), totalHitCount, totalGroupedHitCount, mergedGroupDocs, totalMaxScore); + return new TopGroups(result, totalGroupCount); + } + + return new TopGroups(groupSort.GetSort(), docSort == null ? null : docSort.GetSort(), totalHitCount, totalGroupedHitCount, mergedGroupDocs, totalMaxScore); + } + } +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/4820f236/src/Lucene.Net.Join/FakeScorer.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Join/FakeScorer.cs b/src/Lucene.Net.Join/FakeScorer.cs new file mode 100644 index 0000000..42bf91b --- /dev/null +++ b/src/Lucene.Net.Join/FakeScorer.cs @@ -0,0 +1,76 @@ +using System; +using System.Collections.Generic; +using Lucene.Net.Search; + +namespace Lucene.Net.Join +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + /// + /// Passed to during join collection. 
+ /// + internal sealed class FakeScorer : Scorer + { + internal float _score; + internal int doc = -1; + + public FakeScorer() : base(null) + { + } + + public override int DocID() + { + return doc; + } + + public override int NextDoc() + { + throw new NotSupportedException("FakeScorer doesn't support NextDoc()"); + } + + public override int Advance(int target) + { + throw new NotSupportedException("FakeScorer doesn't support Advance(int)"); + } + + public override long Cost() + { + return 1; + } + + public override int Freq() + { + throw new NotSupportedException("FakeScorer doesn't support Freq()"); + } + + public override float Score() + { + return _score; + } + + public override Weight Weight + { + get { throw new NotSupportedException(); } + } + + public override ICollection Children + { + get { throw new NotSupportedException(); } + } + } +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/4820f236/src/Lucene.Net.Join/FixedBitSetCachingWrapperFilter.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Join/FixedBitSetCachingWrapperFilter.cs b/src/Lucene.Net.Join/FixedBitSetCachingWrapperFilter.cs new file mode 100644 index 0000000..da8b0b8 --- /dev/null +++ b/src/Lucene.Net.Join/FixedBitSetCachingWrapperFilter.cs @@ -0,0 +1,62 @@ +using Lucene.Net.Index; +using Lucene.Net.Search; +using Lucene.Net.Util; + +namespace Lucene.Net.Join +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + /// + /// A that caches sets using a , + /// as required for joins. + /// + public sealed class FixedBitSetCachingWrapperFilter : CachingWrapperFilter + { + /// + /// Sole constructor, see . + /// + public FixedBitSetCachingWrapperFilter(Filter filter) : base(filter) + { + } + + protected override DocIdSet DocIdSetToCache(DocIdSet docIdSet, AtomicReader reader) + { + if (docIdSet == null) + { + return EMPTY_DOCIDSET; + } + + if (docIdSet is FixedBitSet) + { + // this is different from CachingWrapperFilter: even when the DocIdSet is + // cacheable, we convert it to a FixedBitSet since we require all the + // cached filters to be FixedBitSets + return docIdSet; + } + + DocIdSetIterator it = docIdSet.GetIterator(); + if (it == null) + { + return EMPTY_DOCIDSET; + } + FixedBitSet copy = new FixedBitSet(reader.MaxDoc); + copy.Or(it); + return copy; + } + } +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/4820f236/src/Lucene.Net.Join/JoinUtil.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Join/JoinUtil.cs b/src/Lucene.Net.Join/JoinUtil.cs new file mode 100644 index 0000000..726731e --- /dev/null +++ b/src/Lucene.Net.Join/JoinUtil.cs @@ -0,0 +1,80 @@ +using System.IO; +using Lucene.Net.Search; + +namespace Lucene.Net.Join +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + + /// + /// Utility for query time joining using TermsQuery and TermsCollector. + /// + /// @lucene.experimental + /// + public sealed class JoinUtil + { + // No instances allowed + private JoinUtil() + { + } + + /// + /// Method for query time joining. + ///

+ /// Execute the returned query with an IndexSearcher to retrieve all documents that have the same terms in the + /// to field that match with documents matching the specified fromQuery and have the same terms in the from field. + ///

+ /// In the case a single document relates to more than one document the multipleValuesPerDocument option + /// should be set to true. When multipleValuesPerDocument is set to true, only + /// the score from the first encountered join value originating from the 'from' side is mapped into the 'to' side, + /// even in the case when a second join value related to a specific document yields a higher score. Obviously this + /// doesn't apply in the case that ScoreMode.None is used, since no scores are computed at all. + ///

+ /// Memory considerations: During joining all unique join values are kept in memory. On top of that, when the scoreMode + /// isn't set to ScoreMode.None, a float value per unique join value is kept in memory for computing scores. + /// When scoreMode is set to ScoreMode.Avg, an additional integer value is also kept in memory per unique + /// join value. + ///
+ /// The from field to join from + /// Whether the from field has multiple terms per document + /// The to field to join to + /// The query to match documents on the from side + /// The searcher that executed the specified fromQuery + /// Instructs how scores from the fromQuery are mapped to the returned query + /// A instance that can be used to join documents based on the terms in the from and to field + /// If I/O related errors occur + public static Query CreateJoinQuery(string fromField, bool multipleValuesPerDocument, string toField, Query fromQuery, IndexSearcher fromSearcher, ScoreMode scoreMode) + { + switch (scoreMode) + { + case ScoreMode.None: + TermsCollector termsCollector = TermsCollector.Create(fromField, multipleValuesPerDocument); + fromSearcher.Search(fromQuery, termsCollector); + return new TermsQuery(toField, fromQuery, termsCollector.CollectorTerms); + case ScoreMode.Total: + case ScoreMode.Max: + case ScoreMode.Avg: + TermsWithScoreCollector termsWithScoreCollector = TermsWithScoreCollector.Create(fromField, multipleValuesPerDocument, scoreMode); + fromSearcher.Search(fromQuery, termsWithScoreCollector); + return new TermsIncludingScoreQuery(toField, multipleValuesPerDocument, termsWithScoreCollector.CollectedTerms, termsWithScoreCollector.ScoresPerTerm, fromQuery); + default: + throw new System.ArgumentException(string.Format("Score mode {0} isn't supported.", scoreMode)); + } + } + } +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/4820f236/src/Lucene.Net.Join/Lucene.Net.Join.csproj ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Join/Lucene.Net.Join.csproj b/src/Lucene.Net.Join/Lucene.Net.Join.csproj new file mode 100644 index 0000000..72bda4a --- /dev/null +++ b/src/Lucene.Net.Join/Lucene.Net.Join.csproj @@ -0,0 +1,76 @@ + + + + + Debug + AnyCPU + {E8A339C7-FCF6-4A72-8586-56D8961D7B99} + Library + Properties + Lucene.Net.Join + Lucene.Net.Join + v4.5.1 
+ 512 + + + true + full + false + bin\Debug\ + DEBUG;TRACE + prompt + 4 + + + pdbonly + true + bin\Release\ + TRACE + prompt + 4 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + {02BAB603-067D-48B1-AEDD-316849652568} + Lucene.Net.Grouping + + + {5D4AD9BE-1FFB-41AB-9943-25737971BF57} + Lucene.Net + + + + + \ No newline at end of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/4820f236/src/Lucene.Net.Join/Properties/AssemblyInfo.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Join/Properties/AssemblyInfo.cs b/src/Lucene.Net.Join/Properties/AssemblyInfo.cs new file mode 100644 index 0000000..2c17c13 --- /dev/null +++ b/src/Lucene.Net.Join/Properties/AssemblyInfo.cs @@ -0,0 +1,36 @@ +using System.Reflection; +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; + +// General Information about an assembly is controlled through the following +// set of attributes. Change these attribute values to modify the information +// associated with an assembly. +[assembly: AssemblyTitle("Lucene.Net.Join")] +[assembly: AssemblyDescription("")] +[assembly: AssemblyConfiguration("")] +[assembly: AssemblyCompany("")] +[assembly: AssemblyProduct("Lucene.Net.Join")] +[assembly: AssemblyCopyright("Copyright © 2015")] +[assembly: AssemblyTrademark("")] +[assembly: AssemblyCulture("")] + +// Setting ComVisible to false makes the types in this assembly not visible +// to COM components. If you need to access a type in this assembly from +// COM, set the ComVisible attribute to true on that type. 
+[assembly: ComVisible(false)] + +// The following GUID is for the ID of the typelib if this project is exposed to COM +[assembly: Guid("e8a339c7-fcf6-4a72-8586-56d8961d7b99")] + +// Version information for an assembly consists of the following four values: +// +// Major Version +// Minor Version +// Build Number +// Revision +// +// You can specify all the values or you can default the Build and Revision Numbers +// by using the '*' as shown below: +// [assembly: AssemblyVersion("1.0.*")] +[assembly: AssemblyVersion("1.0.0.0")] +[assembly: AssemblyFileVersion("1.0.0.0")] http://git-wip-us.apache.org/repos/asf/lucenenet/blob/4820f236/src/Lucene.Net.Join/ScoreMode.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Join/ScoreMode.cs b/src/Lucene.Net.Join/ScoreMode.cs new file mode 100644 index 0000000..a5b91be --- /dev/null +++ b/src/Lucene.Net.Join/ScoreMode.cs @@ -0,0 +1,45 @@ +namespace Lucene.Net.Join +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + /// + /// How to aggregate multiple child hit scores into a single parent score. + /// + public enum ScoreMode + { + /// + /// Do no scoring. 
+ /// + None, + + /// + /// Parent hit's score is the average of all child scores. + /// + Avg, + + /// + /// Parent hit's score is the max of all child scores. + /// + Max, + + /// + /// Parent hit's score is the sum of all child scores. + /// + Total + } +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/4820f236/src/Lucene.Net.Join/TermsCollector.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Join/TermsCollector.cs b/src/Lucene.Net.Join/TermsCollector.cs new file mode 100644 index 0000000..2ccf1ed --- /dev/null +++ b/src/Lucene.Net.Join/TermsCollector.cs @@ -0,0 +1,127 @@ +using Lucene.Net.Index; +using Lucene.Net.Search; +using Lucene.Net.Util; + +namespace Lucene.Net.Join +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + /// + /// A collector that collects all terms from a specified field matching the query. 
+ /// + /// @lucene.experimental + /// + internal abstract class TermsCollector : Collector + { + private readonly string _field; + private readonly BytesRefHash _collectorTerms = new BytesRefHash(); + + internal TermsCollector(string field) + { + _field = field; + } + + public BytesRefHash CollectorTerms + { + get + { + return _collectorTerms; + } + } + + public override Scorer Scorer + { + set {} + } + + public override bool AcceptsDocsOutOfOrder() + { + return true; + } + + /// + /// Chooses the right implementation. + /// + /// The field to collect terms for. + /// Whether the field to collect terms for has multiple values per document. + /// A instance. + internal static TermsCollector Create(string field, bool multipleValuesPerDocument) + { + return multipleValuesPerDocument ? (TermsCollector) new MV(field) : new SV(field); + } + + // impl that works with multiple values per document + private class MV : TermsCollector + { + private readonly BytesRef _scratch = new BytesRef(); + private SortedSetDocValues _docTermOrds; + + internal MV(string field) : base(field) + { + } + + public override void Collect(int doc) + { + _docTermOrds.Document = doc; + long ord; + while ((ord = _docTermOrds.NextOrd()) != SortedSetDocValues.NO_MORE_ORDS) + { + _docTermOrds.LookupOrd(ord, _scratch); + _collectorTerms.Add(_scratch); + } + } + + public override AtomicReaderContext NextReader + { + set { _docTermOrds = FieldCache.DEFAULT.GetDocTermOrds(value.AtomicReader, _field); } + } + + public override bool AcceptsDocsOutOfOrder() + { + throw new System.NotImplementedException(); + } + } + + // impl that works with single value per document + private class SV : TermsCollector + { + private readonly BytesRef _spare = new BytesRef(); + private BinaryDocValues _fromDocTerms; + + internal SV(string field) : base(field) + { + } + + public override void Collect(int doc) + { + _fromDocTerms.Get(doc, _spare); + _collectorTerms.Add(_spare); + } + + public override AtomicReaderContext 
NextReader + { + set { _fromDocTerms = FieldCache.DEFAULT.GetTerms(value.AtomicReader, _field, false); } + } + + public override bool AcceptsDocsOutOfOrder() + { + return base.AcceptsDocsOutOfOrder(); + } + } + } +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/4820f236/src/Lucene.Net.Join/TermsIncludingScoreQuery.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Join/TermsIncludingScoreQuery.cs b/src/Lucene.Net.Join/TermsIncludingScoreQuery.cs new file mode 100644 index 0000000..9f3befc --- /dev/null +++ b/src/Lucene.Net.Join/TermsIncludingScoreQuery.cs @@ -0,0 +1,472 @@ +using System.Collections.Generic; +using Lucene.Net.Index; +using Lucene.Net.Search; +using Lucene.Net.Util; + +namespace Lucene.Net.Join +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + internal class TermsIncludingScoreQuery : Query + { + private readonly string _field; + private readonly bool _multipleValuesPerDocument; + private readonly BytesRefHash _terms; + private readonly float[] _scores; + private readonly int[] _ords; + private readonly Query _originalQuery; + private readonly Query _unwrittenOriginalQuery; + + internal TermsIncludingScoreQuery(string field, bool multipleValuesPerDocument, BytesRefHash terms, + float[] scores, Query originalQuery) + { + _field = field; + _multipleValuesPerDocument = multipleValuesPerDocument; + _terms = terms; + _scores = scores; + _originalQuery = originalQuery; + _ords = terms.Sort(BytesRef.UTF8SortedAsUnicodeComparer); + _unwrittenOriginalQuery = originalQuery; + } + + private TermsIncludingScoreQuery(string field, bool multipleValuesPerDocument, BytesRefHash terms, + float[] scores, int[] ords, Query originalQuery, Query unwrittenOriginalQuery) + { + _field = field; + _multipleValuesPerDocument = multipleValuesPerDocument; + _terms = terms; + _scores = scores; + _originalQuery = originalQuery; + _ords = ords; + _unwrittenOriginalQuery = unwrittenOriginalQuery; + } + + public override string ToString(string @string) + { + return string.Format("TermsIncludingScoreQuery{{field={0};originalQuery={1}}}", _field, + _unwrittenOriginalQuery); + } + + public override void ExtractTerms(ISet terms) + { + _originalQuery.ExtractTerms(terms); + } + + public override Query Rewrite(IndexReader reader) + { + Query originalQueryRewrite = _originalQuery.Rewrite(reader); + if (originalQueryRewrite != _originalQuery) + { + Query rewritten = new TermsIncludingScoreQuery(_field, _multipleValuesPerDocument, _terms, _scores, + _ords, originalQueryRewrite, _originalQuery); + rewritten.Boost = Boost; + return rewritten; + } + + return this; + } + + protected bool Equals(TermsIncludingScoreQuery other) + { + return base.Equals(other) && string.Equals(_field, other._field) && + Equals(_unwrittenOriginalQuery, 
other._unwrittenOriginalQuery); + } + + public override bool Equals(object obj) + { + if (ReferenceEquals(null, obj)) return false; + if (ReferenceEquals(this, obj)) return true; + if (obj.GetType() != GetType()) return false; + return Equals((TermsIncludingScoreQuery) obj); + } + + public override int GetHashCode() + { + unchecked + { + int hashCode = base.GetHashCode(); + hashCode = (hashCode*397) ^ (_field != null ? _field.GetHashCode() : 0); + hashCode = (hashCode*397) ^ + (_unwrittenOriginalQuery != null ? _unwrittenOriginalQuery.GetHashCode() : 0); + return hashCode; + } + } + + public override Weight CreateWeight(IndexSearcher searcher) + { + Weight originalWeight = _originalQuery.CreateWeight(searcher); + return new WeightAnonymousInnerClassHelper(this, originalWeight); + } + + private class WeightAnonymousInnerClassHelper : Weight + { + private readonly TermsIncludingScoreQuery outerInstance; + + private Weight originalWeight; + + public WeightAnonymousInnerClassHelper(TermsIncludingScoreQuery outerInstance, Weight originalWeight) + { + this.outerInstance = outerInstance; + this.originalWeight = originalWeight; + } + + + private TermsEnum segmentTermsEnum; + + public override Explanation Explain(AtomicReaderContext context, int doc) + { + SVInnerScorer scorer = (SVInnerScorer) BulkScorer(context, false, null); + if (scorer != null) + { + return scorer.Explain(doc); + } + return new ComplexExplanation(false, 0.0f, "Not a match"); + } + + public override bool ScoresDocsOutOfOrder() + { + // We have optimized impls below if we are allowed + // to score out-of-order: + return true; + } + + public override Query Query + { + get { return outerInstance; } + } + + public override float ValueForNormalization + { + get { return originalWeight.ValueForNormalization*outerInstance.Boost*outerInstance.Boost; } + } + + public override void Normalize(float norm, float topLevelBoost) + { + originalWeight.Normalize(norm, topLevelBoost*outerInstance.Boost); + } + + public 
override Scorer Scorer(AtomicReaderContext context, Bits acceptDocs) + { + Terms terms = context.AtomicReader.Terms(outerInstance._field); + if (terms == null) + { + return null; + } + + // what is the runtime...seems ok? + long cost = context.AtomicReader.MaxDoc * terms.Size(); + + segmentTermsEnum = terms.Iterator(segmentTermsEnum); + if (outerInstance._multipleValuesPerDocument) + { + return new MVInOrderScorer(outerInstance, this, acceptDocs, segmentTermsEnum, context.AtomicReader.MaxDoc, cost); + } + + return new SVInOrderScorer(outerInstance, this, acceptDocs, segmentTermsEnum, context.AtomicReader.MaxDoc, cost); + } + + public override BulkScorer BulkScorer(AtomicReaderContext context, bool scoreDocsInOrder, Bits acceptDocs) + { + if (scoreDocsInOrder) + { + return base.BulkScorer(context, scoreDocsInOrder, acceptDocs); + } + + Terms terms = context.AtomicReader.Terms(outerInstance._field); + if (terms == null) + { + return null; + } + // what is the runtime...seems ok? + long cost = context.AtomicReader.MaxDoc * terms.Size(); + + segmentTermsEnum = terms.Iterator(segmentTermsEnum); + // Optimized impls that take advantage of docs + // being allowed to be out of order: + if (outerInstance._multipleValuesPerDocument) + { + return new MVInnerScorer(outerInstance, this, acceptDocs, segmentTermsEnum, context.AtomicReader.MaxDoc, cost); + } + + return new SVInnerScorer(outerInstance, this, acceptDocs, segmentTermsEnum, cost); + } + } + + // This impl assumes that the 'join' values are used uniquely per doc per field. Used for one to many relations. 
+ internal class SVInnerScorer : BulkScorer + { + private readonly TermsIncludingScoreQuery outerInstance; + + private readonly BytesRef _spare = new BytesRef(); + private readonly Bits _acceptDocs; + private readonly TermsEnum _termsEnum; + private readonly long _cost; + + private int _upto; + internal DocsEnum DocsEnum; + private DocsEnum _reuse; + private int _scoreUpto; + private int _doc; + + internal SVInnerScorer(TermsIncludingScoreQuery outerInstance, Weight weight, Bits acceptDocs, TermsEnum termsEnum, long cost) + { + this.outerInstance = outerInstance; + _acceptDocs = acceptDocs; + _termsEnum = termsEnum; + _cost = cost; + _doc = -1; + } + + public override bool Score(Collector collector, int max) + { + FakeScorer fakeScorer = new FakeScorer(); + collector.Scorer = fakeScorer; + if (_doc == -1) + { + _doc = NextDocOutOfOrder(); + } + while (_doc < max) + { + fakeScorer.doc = _doc; + fakeScorer._score = outerInstance._scores[outerInstance._ords[_scoreUpto]]; + collector.Collect(_doc); + _doc = NextDocOutOfOrder(); + } + + return _doc != DocIdSetIterator.NO_MORE_DOCS; + } + + private int NextDocOutOfOrder() + { + while (true) + { + if (DocsEnum != null) + { + int docId = DocsEnumNextDoc(); + if (docId == DocIdSetIterator.NO_MORE_DOCS) + { + DocsEnum = null; + } + else + { + return _doc = docId; + } + } + + if (_upto == outerInstance._terms.Size()) + { + return _doc = DocIdSetIterator.NO_MORE_DOCS; + } + + _scoreUpto = _upto; + if (_termsEnum.SeekExact(outerInstance._terms.Get(outerInstance._ords[_upto++], _spare))) + { + DocsEnum = _reuse = _termsEnum.Docs(_acceptDocs, _reuse, DocsEnum.FLAG_NONE); + } + } + } + + protected virtual int DocsEnumNextDoc() + { + return DocsEnum.NextDoc(); + } + + internal Explanation Explain(int target) + { + int docId; + do + { + docId = NextDocOutOfOrder(); + if (docId < target) + { + int tempDocId = DocsEnum.Advance(target); + if (tempDocId == target) + { + docId = tempDocId; + break; + } + } + else if (docId == target) + { 
+ break; + } + DocsEnum = null; // goto the next ord. + } while (docId != DocIdSetIterator.NO_MORE_DOCS); + + return new ComplexExplanation(true, outerInstance._scores[outerInstance._ords[_scoreUpto]], + "Score based on join value " + _termsEnum.Term().Utf8ToString()); + } + } + + // This impl that tracks whether a docid has already been emitted. This check makes sure that docs aren't emitted + // twice for different join values. This means that the first encountered join value determines the score of a document + // even if other join values yield a higher score. + internal class MVInnerScorer : SVInnerScorer + { + private readonly TermsIncludingScoreQuery outerInstance; + + + internal readonly FixedBitSet alreadyEmittedDocs; + + internal MVInnerScorer(TermsIncludingScoreQuery outerInstance, Weight weight, Bits acceptDocs, + TermsEnum termsEnum, int maxDoc, long cost) : base(outerInstance, weight, acceptDocs, termsEnum, cost) + { + this.outerInstance = outerInstance; + alreadyEmittedDocs = new FixedBitSet(maxDoc); + } + + protected override int DocsEnumNextDoc() + { + while (true) + { + int docId = DocsEnum.NextDoc(); + if (docId == DocIdSetIterator.NO_MORE_DOCS) + { + return docId; + } + if (!alreadyEmittedDocs.GetAndSet(docId)) + { + return docId; //if it wasn't previously set, return it + } + } + } + } + + internal class SVInOrderScorer : Scorer + { + private readonly TermsIncludingScoreQuery outerInstance; + + + internal readonly DocIdSetIterator matchingDocsIterator; + internal readonly float[] scores; + internal readonly long cost_Renamed; + + internal int currentDoc = -1; + + internal SVInOrderScorer(TermsIncludingScoreQuery outerInstance, Weight weight, Bits acceptDocs, + TermsEnum termsEnum, int maxDoc, long cost) : base(weight) + { + this.outerInstance = outerInstance; + FixedBitSet matchingDocs = new FixedBitSet(maxDoc); + scores = new float[maxDoc]; + FillDocsAndScores(matchingDocs, acceptDocs, termsEnum); + matchingDocsIterator = 
matchingDocs.GetIterator(); + cost_Renamed = cost; + } + + protected virtual void FillDocsAndScores(FixedBitSet matchingDocs, Bits acceptDocs, + TermsEnum termsEnum) + { + BytesRef spare = new BytesRef(); + DocsEnum docsEnum = null; + for (int i = 0; i < outerInstance._terms.Size(); i++) + { + if (termsEnum.SeekExact(outerInstance._terms.Get(outerInstance._ords[i], spare))) + { + docsEnum = termsEnum.Docs(acceptDocs, docsEnum, FLAG_NONE); + float score = outerInstance._scores[outerInstance._ords[i]]; + for (int doc = docsEnum.NextDoc(); + doc != NO_MORE_DOCS; + doc = docsEnum.NextDoc()) + { + matchingDocs.Set(doc); + // In the case the same doc is also related to a another doc, a score might be overwritten. I think this + // can only happen in a many-to-many relation + scores[doc] = score; + } + } + } + } + + public override float Score() + { + return scores[currentDoc]; + } + + public override int Freq() + { + return 1; + } + + public override int DocID() + { + return currentDoc; + } + + public override int NextDoc() + { + return currentDoc = matchingDocsIterator.NextDoc(); + } + + public override int Advance(int target) + { + return currentDoc = matchingDocsIterator.Advance(target); + } + + public override long Cost() + { + return cost_Renamed; + } + } + + // This scorer deals with the fact that a document can have more than one score from multiple related documents. 
+ internal class MVInOrderScorer : SVInOrderScorer + { + private readonly TermsIncludingScoreQuery outerInstance; + + + internal MVInOrderScorer(TermsIncludingScoreQuery outerInstance, Weight weight, Bits acceptDocs, + TermsEnum termsEnum, int maxDoc, long cost) + : base(outerInstance, weight, acceptDocs, termsEnum, maxDoc, cost) + { + this.outerInstance = outerInstance; + } + + protected override void FillDocsAndScores(FixedBitSet matchingDocs, Bits acceptDocs, + TermsEnum termsEnum) + { + BytesRef spare = new BytesRef(); + DocsEnum docsEnum = null; + for (int i = 0; i < outerInstance._terms.Size(); i++) + { + if (termsEnum.SeekExact(outerInstance._terms.Get(outerInstance._ords[i], spare))) + { + docsEnum = termsEnum.Docs(acceptDocs, docsEnum, FLAG_NONE); + float score = outerInstance._scores[outerInstance._ords[i]]; + for (int doc = docsEnum.NextDoc(); + doc != NO_MORE_DOCS; + doc = docsEnum.NextDoc()) + { + // I prefer this: + /*if (scores[doc] < score) { + scores[doc] = score; + matchingDocs.set(doc); + }*/ + // But this behaves the same as MVInnerScorer and only then the tests will pass: + if (!matchingDocs.Get(doc)) + { + scores[doc] = score; + matchingDocs.Set(doc); + } + } + } + } + } + } + } +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/4820f236/src/Lucene.Net.Join/TermsQuery.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Join/TermsQuery.cs b/src/Lucene.Net.Join/TermsQuery.cs new file mode 100644 index 0000000..2d5ccf8 --- /dev/null +++ b/src/Lucene.Net.Join/TermsQuery.cs @@ -0,0 +1,147 @@ +using System.Collections.Generic; +using Lucene.Net.Index; +using Lucene.Net.Search; +using Lucene.Net.Util; + +namespace Lucene.Net.Join +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + /// + /// A query that has an array of terms from a specific field. This query will match documents have one or more terms in + /// the specified field that match with the terms specified in the array. + /// + /// @lucene.experimental + /// + internal class TermsQuery : MultiTermQuery + { + private readonly BytesRefHash _terms; + private readonly int[] _ords; + private readonly Query _fromQuery; // Used for equals() only + + /// + /// + /// + /// The field that should contain terms that are specified in the previous parameter. + /// + /// The terms that matching documents should have. The terms must be sorted by natural order. 
+ internal TermsQuery(string field, Query fromQuery, BytesRefHash terms) : base(field) + { + _fromQuery = fromQuery; + _terms = terms; + _ords = terms.Sort(BytesRef.UTF8SortedAsUnicodeComparer); + } + + public override TermsEnum GetTermsEnum(Terms terms, AttributeSource atts) + { + if (_terms.Size() == 0) + { + return TermsEnum.EMPTY; + } + + return new SeekingTermSetTermsEnum(terms.Iterator(null), _terms, _ords); + + } + + public override string ToString(string field) + { + return string.Format("TermsQuery{{field={0}}}", field); + } + + private class SeekingTermSetTermsEnum : FilteredTermsEnum + { + private readonly BytesRefHash Terms; + private readonly int[] Ords; + private readonly int _lastElement; + + private readonly BytesRef _lastTerm; + private readonly BytesRef _spare = new BytesRef(); + private readonly IComparer _comparator; + + private BytesRef _seekTerm; + private int _upto; + + internal SeekingTermSetTermsEnum(TermsEnum tenum, BytesRefHash terms, int[] ords) : base(tenum) + { + Terms = terms; + Ords = ords; + _comparator = BytesRef.UTF8SortedAsUnicodeComparer; + _lastElement = terms.Size() - 1; + _lastTerm = terms.Get(ords[_lastElement], new BytesRef()); + _seekTerm = terms.Get(ords[_upto], _spare); + } + + + + protected override BytesRef NextSeekTerm(BytesRef currentTerm) + { + BytesRef temp = _seekTerm; + _seekTerm = null; + return temp; + } + + protected override AcceptStatus Accept(BytesRef term) + { + if (_comparator.Compare(term, _lastTerm) > 0) + { + return AcceptStatus.END; + } + + BytesRef currentTerm = Terms.Get(Ords[_upto], _spare); + if (_comparator.Compare(term, currentTerm) == 0) + { + if (_upto == _lastElement) + { + return AcceptStatus.YES; + } + + _seekTerm = Terms.Get(Ords[++_upto], _spare); + return AcceptStatus.YES_AND_SEEK; + } + + if (_upto == _lastElement) + { + return AcceptStatus.NO; + } // Our current term doesn't match the the given term. + + int cmp; + do // We maybe are behind the given term by more than one step. 
Keep incrementing till we're the same or higher. + { + if (_upto == _lastElement) + { + return AcceptStatus.NO; + } + // typically the terms dict is a superset of query's terms so it's unusual that we have to skip many of + // our terms so we don't do a binary search here + _seekTerm = Terms.Get(Ords[++_upto], _spare); + } while ((cmp = _comparator.Compare(_seekTerm, term)) < 0); + if (cmp == 0) + { + if (_upto == _lastElement) + { + return AcceptStatus.YES; + } + _seekTerm = Terms.Get(Ords[++_upto], _spare); + return AcceptStatus.YES_AND_SEEK; + } + + return AcceptStatus.NO_AND_SEEK; + } + } + } +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/4820f236/src/Lucene.Net.Join/TermsWithScoreCollector.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Join/TermsWithScoreCollector.cs b/src/Lucene.Net.Join/TermsWithScoreCollector.cs new file mode 100644 index 0000000..e823293 --- /dev/null +++ b/src/Lucene.Net.Join/TermsWithScoreCollector.cs @@ -0,0 +1,333 @@ +using System; +using Lucene.Net.Index; +using Lucene.Net.Search; +using Lucene.Net.Util; + +namespace Lucene.Net.Join +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + + internal abstract class TermsWithScoreCollector : Collector + { + private const int InitialArraySize = 256; + + private readonly string _field; + private readonly BytesRefHash _collectedTerms = new BytesRefHash(); + private readonly ScoreMode _scoreMode; + + private Scorer _scorer; + private float[] _scoreSums = new float[InitialArraySize]; + + internal TermsWithScoreCollector(string field, ScoreMode scoreMode) + { + this._field = field; + this._scoreMode = scoreMode; + } + + public BytesRefHash CollectedTerms + { + get + { + return _collectedTerms; + } + } + + public virtual float[] ScoresPerTerm + { + get + { + return _scoreSums; + } + } + + //JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET: + //ORIGINAL LINE: @Override public void setScorer(org.apache.lucene.search.Scorer scorer) throws java.io.IOException + public override Scorer Scorer + { + set + { + _scorer = value; + } + } + + public override bool AcceptsDocsOutOfOrder() + { + return true; + } + + /// + /// Chooses the right implementation. + /// + /// The field to collect terms for. + /// Whether the field to collect terms for has multiple values per document. 
+ /// A instance + internal static TermsWithScoreCollector Create(string field, bool multipleValuesPerDocument, ScoreMode scoreMode) + { + if (multipleValuesPerDocument) + { + switch (scoreMode) + { + case ScoreMode.Avg: + return new Mv.Avg(field); + default: + return new Mv(field, scoreMode); + } + } + + switch (scoreMode) + { + case ScoreMode.Avg: + return new Sv.Avg(field); + default: + return new Sv(field, scoreMode); + } + } + + // impl that works with single value per document + internal class Sv : TermsWithScoreCollector + { + private readonly BytesRef _spare = new BytesRef(); + private BinaryDocValues _fromDocTerms; + + internal Sv(string field, ScoreMode scoreMode) : base(field, scoreMode) + { + } + + public override void Collect(int doc) + { + _fromDocTerms.Get(doc, _spare); + int ord = _collectedTerms.Add(_spare); + if (ord < 0) + { + ord = -ord - 1; + } + else + { + if (ord >= _scoreSums.Length) + { + _scoreSums = ArrayUtil.Grow(_scoreSums); + } + } + + float current = _scorer.Score(); + float existing = _scoreSums[ord]; + if (existing.CompareTo(0.0f) == 0) + { + _scoreSums[ord] = current; + } + else + { + switch (_scoreMode) + { + case ScoreMode.Total: + _scoreSums[ord] = _scoreSums[ord] + current; + break; + case ScoreMode.Max: + if (current > existing) + { + _scoreSums[ord] = current; + } + break; + } + } + } + + public override AtomicReaderContext NextReader + { + set + { + _fromDocTerms = FieldCache.DEFAULT.GetTerms(value.AtomicReader, _field, false); + } + } + + public override bool AcceptsDocsOutOfOrder() + { + return base.AcceptsDocsOutOfOrder(); + } + + internal class Avg : Sv + { + private int[] _scoreCounts = new int[InitialArraySize]; + + internal Avg(string field) : base(field, ScoreMode.Avg) + { + } + + public override void Collect(int doc) + { + _fromDocTerms.Get(doc, _spare); + int ord = _collectedTerms.Add(_spare); + if (ord < 0) + { + ord = -ord - 1; + } + else + { + if (ord >= _scoreSums.Length) + { + _scoreSums = 
ArrayUtil.Grow(_scoreSums); + _scoreCounts = ArrayUtil.Grow(_scoreCounts); + } + } + + float current = _scorer.Score(); + float existing = _scoreSums[ord]; + if (existing.CompareTo(0.0f) == 0) + { + _scoreSums[ord] = current; + _scoreCounts[ord] = 1; + } + else + { + _scoreSums[ord] = _scoreSums[ord] + current; + _scoreCounts[ord]++; + } + } + + public override float[] ScoresPerTerm + { + get + { + if (_scoreCounts != null) + { + for (int i = 0; i < _scoreCounts.Length; i++) + { + _scoreSums[i] = _scoreSums[i] / _scoreCounts[i]; + } + _scoreCounts = null; + } + return _scoreSums; + } + } + } + } + + // impl that works with multiple values per document + internal class Mv : TermsWithScoreCollector + { + private SortedSetDocValues _fromDocTermOrds; + private readonly BytesRef _scratch = new BytesRef(); + + internal Mv(string field, ScoreMode scoreMode) : base(field, scoreMode) + { + } + + public override void Collect(int doc) + { + _fromDocTermOrds.Document = doc; + long ord; + while ((ord = _fromDocTermOrds.NextOrd()) != SortedSetDocValues.NO_MORE_ORDS) + { + _fromDocTermOrds.LookupOrd(ord, _scratch); + + int termId = _collectedTerms.Add(_scratch); + if (termId < 0) + { + termId = -termId - 1; + } + else + { + if (termId >= _scoreSums.Length) + { + _scoreSums = ArrayUtil.Grow(_scoreSums); + } + } + + switch (_scoreMode) + { + case ScoreMode.Total: + _scoreSums[termId] += _scorer.Score(); + break; + case ScoreMode.Max: + _scoreSums[termId] = Math.Max(_scoreSums[termId], _scorer.Score()); + break; + } + } + } + + //JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET: + //ORIGINAL LINE: @Override public void setNextReader(org.apache.lucene.index.AtomicReaderContext context) throws java.io.IOException + public override AtomicReaderContext NextReader + { + set + { + _fromDocTermOrds = FieldCache.DEFAULT.GetDocTermOrds(value.AtomicReader, _field); + } + } + + public override bool AcceptsDocsOutOfOrder() + { + throw new 
NotImplementedException(); + } + + internal class Avg : Mv + { + private int[] _scoreCounts = new int[InitialArraySize]; + + internal Avg(string field) : base(field, ScoreMode.Avg) + { + } + + public override void Collect(int doc) + { + _fromDocTermOrds.Document = doc; + long ord; + while ((ord = _fromDocTermOrds.NextOrd()) != SortedSetDocValues.NO_MORE_ORDS) + { + _fromDocTermOrds.LookupOrd(ord, _scratch); + + int termId = _collectedTerms.Add(_scratch); + if (termId < 0) + { + termId = -termId - 1; + } + else + { + if (termId >= _scoreSums.Length) + { + _scoreSums = ArrayUtil.Grow(_scoreSums); + _scoreCounts = ArrayUtil.Grow(_scoreCounts); + } + } + + _scoreSums[termId] += _scorer.Score(); + _scoreCounts[termId]++; + } + } + + public override float[] ScoresPerTerm + { + get + { + if (_scoreCounts != null) + { + for (int i = 0; i < _scoreCounts.Length; i++) + { + _scoreSums[i] = _scoreSums[i] / _scoreCounts[i]; + } + _scoreCounts = null; + } + return _scoreSums; + } + } + } + } + + } +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/4820f236/src/Lucene.Net.Join/ToChildBlockJoinQuery.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Join/ToChildBlockJoinQuery.cs b/src/Lucene.Net.Join/ToChildBlockJoinQuery.cs new file mode 100644 index 0000000..3d4f2d5 --- /dev/null +++ b/src/Lucene.Net.Join/ToChildBlockJoinQuery.cs @@ -0,0 +1,396 @@ +using System; +using System.Collections.Generic; +using System.Diagnostics; +using Lucene.Net.Index; +using Lucene.Net.Search; +using Lucene.Net.Util; + +namespace Lucene.Net.Join +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + /// + /// Just like , except this + /// query joins in reverse: you provide a Query matching + /// parent documents and it joins down to child + /// documents. + /// + /// @lucene.experimental + /// + public class ToChildBlockJoinQuery : Query + { + /// + /// Message thrown from + /// on mis-use, when the parent query incorrectly returns child docs. + /// + public const string InvalidQueryMessage = "Parent query yields document which is not matched by parents filter, docID="; + + private readonly Filter _parentsFilter; + private readonly Query _parentQuery; + + // If we are rewritten, this is the original parentQuery we + // were passed; we use this for .equals() and + // .hashCode(). This makes rewritten query equal the + // original, so that user does not have to .rewrite() their + // query before searching: + private readonly Query _origParentQuery; + private readonly bool _doScores; + + /// + /// Create a ToChildBlockJoinQuery. + /// + /// Query that matches parent documents + /// Filter (must produce FixedBitSet per-segment, like ) + /// identifying the parent documents. + /// True if parent scores should be calculated. 
+ public ToChildBlockJoinQuery(Query parentQuery, Filter parentsFilter, bool doScores) + { + _origParentQuery = parentQuery; + _parentQuery = parentQuery; + _parentsFilter = parentsFilter; + _doScores = doScores; + } + + private ToChildBlockJoinQuery(Query origParentQuery, Query parentQuery, Filter parentsFilter, bool doScores) : base() + { + _origParentQuery = origParentQuery; + _parentQuery = parentQuery; + _parentsFilter = parentsFilter; + _doScores = doScores; + } + + public override Weight CreateWeight(IndexSearcher searcher) + { + return new ToChildBlockJoinWeight(this, _parentQuery.CreateWeight(searcher), _parentsFilter, _doScores); + } + + private class ToChildBlockJoinWeight : Weight + { + private readonly Query _joinQuery; + private readonly Weight _parentWeight; + private readonly Filter _parentsFilter; + private readonly bool _doScores; + + public ToChildBlockJoinWeight(Query joinQuery, Weight parentWeight, Filter parentsFilter, bool doScores) : base() + { + _joinQuery = joinQuery; + _parentWeight = parentWeight; + _parentsFilter = parentsFilter; + _doScores = doScores; + } + + public override Query Query + { + get { return _joinQuery; } + } + + public override float ValueForNormalization + { + get { return _parentWeight.ValueForNormalization*_joinQuery.Boost*_joinQuery.Boost; } + } + + public override void Normalize(float norm, float topLevelBoost) + { + _parentWeight.Normalize(norm, topLevelBoost * _joinQuery.Boost); + } + + // NOTE: acceptDocs applies (and is checked) only in the child document space + public override Scorer Scorer(AtomicReaderContext readerContext, Bits acceptDocs) + { + Scorer parentScorer = _parentWeight.Scorer(readerContext, null); + + if (parentScorer == null) + { + // No matches + return null; + } + + // NOTE: we cannot pass acceptDocs here because this + // will (most likely, justifiably) cause the filter to + // not return a FixedBitSet but rather a + // BitsFilteredDocIdSet. 
Instead, we filter by + // acceptDocs when we score: + DocIdSet parents = _parentsFilter.GetDocIdSet(readerContext, null); + + if (parents == null) + { + // No matches + return null; + } + if (!(parents is FixedBitSet)) + { + throw new InvalidOperationException("parentFilter must return FixedBitSet; got " + parents); + } + + return new ToChildBlockJoinScorer(this, parentScorer, (FixedBitSet)parents, _doScores, acceptDocs); + } + + public override Explanation Explain(AtomicReaderContext reader, int doc) + { + // TODO + throw new NotSupportedException(GetType().Name + " cannot explain match on parent document"); + } + + public override bool ScoresDocsOutOfOrder() + { + return false; + } + } + + private sealed class ToChildBlockJoinScorer : Scorer + { + private readonly Scorer _parentScorer; + private readonly FixedBitSet _parentBits; + private readonly bool _doScores; + private readonly Bits _acceptDocs; + + private float _parentScore; + private int _parentFreq = 1; + + private int _childDoc = -1; + private int _parentDoc; + + public ToChildBlockJoinScorer(Weight weight, Scorer parentScorer, FixedBitSet parentBits, bool doScores, Bits acceptDocs) : base(weight) + { + _doScores = doScores; + _parentBits = parentBits; + _parentScorer = parentScorer; + _acceptDocs = acceptDocs; + } + + public override ICollection Children + { + get { return Collections.Singleton(new ChildScorer(_parentScorer, "BLOCK_JOIN")); } + } + + public override int NextDoc() + { + //System.out.println("Q.nextDoc() parentDoc=" + parentDoc + " childDoc=" + childDoc); + + // Loop until we hit a childDoc that's accepted + while (true) + { + if (_childDoc + 1 == _parentDoc) + { + // OK, we are done iterating through all children + // matching this one parent doc, so we now nextDoc() + // the parent. 
Use a while loop because we may have + // to skip over some number of parents w/ no + // children: + while (true) + { + _parentDoc = _parentScorer.NextDoc(); + ValidateParentDoc(); + + if (_parentDoc == 0) + { + // Degenerate but allowed: first parent doc has no children + // TODO: would be nice to pull initial parent + // into ctor so we can skip this if... but it's + // tricky because scorer must return -1 for + // .doc() on init... + _parentDoc = _parentScorer.NextDoc(); + ValidateParentDoc(); + } + + if (_parentDoc == NO_MORE_DOCS) + { + _childDoc = NO_MORE_DOCS; + //System.out.println(" END"); + return _childDoc; + } + + // Go to first child for this next parentDoc: + _childDoc = 1 + _parentBits.PrevSetBit(_parentDoc - 1); + + if (_childDoc == _parentDoc) + { + // This parent has no children; continue + // parent loop so we move to next parent + continue; + } + + if (_acceptDocs != null && !_acceptDocs.Get(_childDoc)) + { + goto nextChildDocContinue; + } + + if (_childDoc < _parentDoc) + { + if (_doScores) + { + _parentScore = _parentScorer.Score(); + _parentFreq = _parentScorer.Freq(); + } + //System.out.println(" " + childDoc); + return _childDoc; + } + else + { + // Degenerate but allowed: parent has no children + } + } + } + + Debug.Assert(_childDoc < _parentDoc, "childDoc=" + _childDoc + " parentDoc=" + _parentDoc); + _childDoc++; + if (_acceptDocs != null && !_acceptDocs.Get(_childDoc)) + { + continue; + } + //System.out.println(" " + childDoc); + return _childDoc; + nextChildDocContinue:; + } + } + + /// + /// Detect mis-use, where provided parent query in fact sometimes returns child documents. 
+ /// + private void ValidateParentDoc() + { + if (_parentDoc != NO_MORE_DOCS && !_parentBits.Get(_parentDoc)) + { + throw new InvalidOperationException(InvalidQueryMessage + _parentDoc); + } + } + + public override int DocID() + { + return _childDoc; + } + + public override float Score() + { + return _parentScore; + } + + public override int Freq() + { + return _parentFreq; + } + + public override int Advance(int childTarget) + { + Debug.Assert(childTarget >= _parentBits.Length() || !_parentBits.Get(childTarget)); + + //System.out.println("Q.advance childTarget=" + childTarget); + if (childTarget == NO_MORE_DOCS) + { + //System.out.println(" END"); + return _childDoc = _parentDoc = NO_MORE_DOCS; + } + + Debug.Assert(_childDoc == -1 || childTarget != _parentDoc, "childTarget=" + childTarget); + if (_childDoc == -1 || childTarget > _parentDoc) + { + // Advance to new parent: + _parentDoc = _parentScorer.Advance(childTarget); + ValidateParentDoc(); + //System.out.println(" advance to parentDoc=" + parentDoc); + Debug.Assert(_parentDoc > childTarget); + if (_parentDoc == NO_MORE_DOCS) + { + //System.out.println(" END"); + return _childDoc = NO_MORE_DOCS; + } + if (_doScores) + { + _parentScore = _parentScorer.Score(); + _parentFreq = _parentScorer.Freq(); + } + int firstChild = _parentBits.PrevSetBit(_parentDoc - 1); + //System.out.println(" firstChild=" + firstChild); + childTarget = Math.Max(childTarget, firstChild); + } + + Debug.Assert(childTarget < _parentDoc); + + // Advance within children of current parent: + _childDoc = childTarget; + //System.out.println(" " + childDoc); + if (_acceptDocs != null && !_acceptDocs.Get(_childDoc)) + { + NextDoc(); + } + return _childDoc; + } + + public override long Cost() + { + return _parentScorer.Cost(); + } + } + + public override void ExtractTerms(ISet terms) + { + _parentQuery.ExtractTerms(terms); + } + + public override Query Rewrite(IndexReader reader) + { + Query parentRewrite = _parentQuery.Rewrite(reader); + if 
(parentRewrite != _parentQuery) + { + Query rewritten = new ToChildBlockJoinQuery(_parentQuery, parentRewrite, _parentsFilter, _doScores); + rewritten.Boost = Boost; + return rewritten; + } + + return this; + } + + public override string ToString(string field) + { + return "ToChildBlockJoinQuery (" + _parentQuery + ")"; + } + + protected bool Equals(ToChildBlockJoinQuery other) + { + return base.Equals(other) && + Equals(_origParentQuery, other._origParentQuery) && + _doScores == other._doScores && + Equals(_parentsFilter, other._parentsFilter); + } + + public override bool Equals(object obj) + { + if (ReferenceEquals(null, obj)) return false; + if (ReferenceEquals(this, obj)) return true; + if (obj.GetType() != GetType()) return false; + return Equals((ToChildBlockJoinQuery) obj); + } + + public override int GetHashCode() + { + unchecked + { + int hashCode = base.GetHashCode(); + hashCode = (hashCode*397) ^ (_origParentQuery != null ? _origParentQuery.GetHashCode() : 0); + hashCode = (hashCode*397) ^ _doScores.GetHashCode(); + hashCode = (hashCode*397) ^ (_parentsFilter != null ? _parentsFilter.GetHashCode() : 0); + return hashCode; + } + } + + public override object Clone() + { + return new ToChildBlockJoinQuery((Query) _origParentQuery.Clone(), _parentsFilter, _doScores); + } + } +} \ No newline at end of file