Return-Path: X-Original-To: archive-asf-public-internal@cust-asf2.ponee.io Delivered-To: archive-asf-public-internal@cust-asf2.ponee.io Received: from cust-asf.ponee.io (cust-asf.ponee.io [163.172.22.183]) by cust-asf2.ponee.io (Postfix) with ESMTP id 365C9200BD8 for ; Wed, 7 Dec 2016 14:48:23 +0100 (CET) Received: by cust-asf.ponee.io (Postfix) id 34F8A160AFD; Wed, 7 Dec 2016 13:48:23 +0000 (UTC) Delivered-To: archive-asf-public@cust-asf.ponee.io Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by cust-asf.ponee.io (Postfix) with SMTP id 34F76160B26 for ; Wed, 7 Dec 2016 14:48:21 +0100 (CET) Received: (qmail 67867 invoked by uid 500); 7 Dec 2016 13:48:20 -0000 Mailing-List: contact commits-help@lucenenet.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: lucene-net-dev@lucenenet.apache.org Delivered-To: mailing list commits@lucenenet.apache.org Received: (qmail 67836 invoked by uid 99); 7 Dec 2016 13:48:20 -0000 Received: from git1-us-west.apache.org (HELO git1-us-west.apache.org) (140.211.11.23) by apache.org (qpsmtpd/0.29) with ESMTP; Wed, 07 Dec 2016 13:48:20 +0000 Received: by git1-us-west.apache.org (ASF Mail Server at git1-us-west.apache.org, from userid 33) id 00A2EE7DFC; Wed, 7 Dec 2016 13:48:20 +0000 (UTC) Content-Type: text/plain; charset="us-ascii" MIME-Version: 1.0 Content-Transfer-Encoding: 8bit From: nightowl888@apache.org To: commits@lucenenet.apache.org Date: Wed, 07 Dec 2016 13:48:20 -0000 Message-Id: <9642922abbe04a0e85e954f8e4eef9e8@git.apache.org> In-Reply-To: <6d35022a72064a70a76ceb9d7e599654@git.apache.org> References: <6d35022a72064a70a76ceb9d7e599654@git.apache.org> X-Mailer: ASF-Git Admin Mailer Subject: [02/11] lucenenet git commit: Ported enough of Sandbox to support QueryParser.Xml archived-at: Wed, 07 Dec 2016 13:48:23 -0000 http://git-wip-us.apache.org/repos/asf/lucenenet/blob/3395a8b4/Lucene.Net.Tests.Sandbox/Queries/FuzzyLikeThisQueryTest.cs 
---------------------------------------------------------------------- diff --git a/Lucene.Net.Tests.Sandbox/Queries/FuzzyLikeThisQueryTest.cs b/Lucene.Net.Tests.Sandbox/Queries/FuzzyLikeThisQueryTest.cs new file mode 100644 index 0000000..4b830c6 --- /dev/null +++ b/Lucene.Net.Tests.Sandbox/Queries/FuzzyLikeThisQueryTest.cs @@ -0,0 +1,159 @@ +using Lucene.Net.Analysis; +using Lucene.Net.Documents; +using Lucene.Net.Index; +using Lucene.Net.Search; +using Lucene.Net.Store; +using Lucene.Net.Util; +using NUnit.Framework; +using System.Collections.Generic; + +namespace Lucene.Net.Sandbox.Queries +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + public class FuzzyLikeThisQueryTest : LuceneTestCase + { + private Directory directory; + private IndexSearcher searcher; + private IndexReader reader; + private Analyzer analyzer; + + public override void SetUp() + { + base.SetUp(); + + analyzer = new MockAnalyzer(Random()); + directory = NewDirectory(); + RandomIndexWriter writer = new RandomIndexWriter(Random(), directory, NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random())).SetMergePolicy(NewLogMergePolicy())); + + //Add series of docs with misspelt names + AddDoc(writer, "jonathon smythe", "1"); + AddDoc(writer, "jonathan smith", "2"); + AddDoc(writer, "johnathon smyth", "3"); + AddDoc(writer, "johnny smith", "4"); + AddDoc(writer, "jonny smith", "5"); + AddDoc(writer, "johnathon smythe", "6"); + reader = writer.Reader; + writer.Dispose(); + searcher = NewSearcher(reader); + } + + public override void TearDown() + { + reader.Dispose(); + directory.Dispose(); + base.TearDown(); + } + + private void AddDoc(RandomIndexWriter writer, string name, string id) + { + Document doc = new Document(); + doc.Add(NewTextField("name", name, Field.Store.YES)); + doc.Add(NewTextField("id", id, Field.Store.YES)); + writer.AddDocument(doc); + } + + + //Tests that idf ranking is not favouring rare mis-spellings over a strong edit-distance match + [Test] + public void TestClosestEditDistanceMatchComesFirst() + { + FuzzyLikeThisQuery flt = new FuzzyLikeThisQuery(10, analyzer); + flt.AddTerms("smith", "name", 0.3f, 1); + Query q = flt.Rewrite(searcher.IndexReader); + HashSet queryTerms = new HashSet(); + q.ExtractTerms(queryTerms); + assertTrue("Should have variant smythe", queryTerms.contains(new Term("name", "smythe"))); + assertTrue("Should have variant smith", queryTerms.contains(new Term("name", "smith"))); + assertTrue("Should have variant smyth", queryTerms.contains(new Term("name", "smyth"))); + TopDocs topDocs = searcher.Search(flt, 1); + ScoreDoc[] sd = topDocs.ScoreDocs; + assertTrue("score docs 
must match 1 doc", (sd != null) && (sd.Length > 0)); + Document doc = searcher.Doc(sd[0].Doc); + assertEquals("Should match most similar not most rare variant", "2", doc.Get("id")); + } + + //Test multiple input words are having variants produced + [Test] + public void TestMultiWord() + { + FuzzyLikeThisQuery flt = new FuzzyLikeThisQuery(10, analyzer); + flt.AddTerms("jonathin smoth", "name", 0.3f, 1); + Query q = flt.Rewrite(searcher.IndexReader); + HashSet queryTerms = new HashSet(); + q.ExtractTerms(queryTerms); + assertTrue("Should have variant jonathan", queryTerms.contains(new Term("name", "jonathan"))); + assertTrue("Should have variant smith", queryTerms.contains(new Term("name", "smith"))); + TopDocs topDocs = searcher.Search(flt, 1); + ScoreDoc[] sd = topDocs.ScoreDocs; + assertTrue("score docs must match 1 doc", (sd != null) && (sd.Length > 0)); + Document doc = searcher.Doc(sd[0].Doc); + assertEquals("Should match most similar when using 2 words", "2", doc.Get("id")); + } + + // LUCENE-4809 + [Test] + public void TestNonExistingField() + { + FuzzyLikeThisQuery flt = new FuzzyLikeThisQuery(10, analyzer); + flt.AddTerms("jonathin smoth", "name", 0.3f, 1); + flt.AddTerms("jonathin smoth", "this field does not exist", 0.3f, 1); + // don't fail here just because the field doesn't exits + Query q = flt.Rewrite(searcher.IndexReader); + HashSet queryTerms = new HashSet(); + q.ExtractTerms(queryTerms); + assertTrue("Should have variant jonathan", queryTerms.contains(new Term("name", "jonathan"))); + assertTrue("Should have variant smith", queryTerms.contains(new Term("name", "smith"))); + TopDocs topDocs = searcher.Search(flt, 1); + ScoreDoc[] sd = topDocs.ScoreDocs; + assertTrue("score docs must match 1 doc", (sd != null) && (sd.Length > 0)); + Document doc = searcher.Doc(sd[0].Doc); + assertEquals("Should match most similar when using 2 words", "2", doc.Get("id")); + } + + + //Test bug found when first query word does not match anything + [Test] + public void 
TestNoMatchFirstWordBug() + { + FuzzyLikeThisQuery flt = new FuzzyLikeThisQuery(10, analyzer); + flt.AddTerms("fernando smith", "name", 0.3f, 1); + Query q = flt.Rewrite(searcher.IndexReader); + HashSet queryTerms = new HashSet(); + q.ExtractTerms(queryTerms); + assertTrue("Should have variant smith", queryTerms.contains(new Term("name", "smith"))); + TopDocs topDocs = searcher.Search(flt, 1); + ScoreDoc[] sd = topDocs.ScoreDocs; + assertTrue("score docs must match 1 doc", (sd != null) && (sd.Length > 0)); + Document doc = searcher.Doc(sd[0].Doc); + assertEquals("Should match most similar when using 2 words", "2", doc.Get("id")); + } + + [Test] + public void TestFuzzyLikeThisQueryEquals() + { + Analyzer analyzer = new MockAnalyzer(Random()); + FuzzyLikeThisQuery fltq1 = new FuzzyLikeThisQuery(10, analyzer); + fltq1.AddTerms("javi", "subject", 0.5f, 2); + FuzzyLikeThisQuery fltq2 = new FuzzyLikeThisQuery(10, analyzer); + fltq2.AddTerms("javi", "subject", 0.5f, 2); + assertEquals("FuzzyLikeThisQuery with same attributes is not equal", fltq1, + fltq2); + } + } +} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/3395a8b4/Lucene.Net.Tests.Sandbox/Queries/TestSlowFuzzyQuery.cs ---------------------------------------------------------------------- diff --git a/Lucene.Net.Tests.Sandbox/Queries/TestSlowFuzzyQuery.cs b/Lucene.Net.Tests.Sandbox/Queries/TestSlowFuzzyQuery.cs new file mode 100644 index 0000000..4baa0e7 --- /dev/null +++ b/Lucene.Net.Tests.Sandbox/Queries/TestSlowFuzzyQuery.cs @@ -0,0 +1,516 @@ +using Lucene.Net.Documents; +using Lucene.Net.Index; +using Lucene.Net.Search; +using Lucene.Net.Store; +using Lucene.Net.Support; +using Lucene.Net.Util; +using NUnit.Framework; +using System; +using System.Collections.Generic; + +#pragma warning disable 612, 618 +namespace Lucene.Net.Sandbox.Queries +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + /// + /// Tests + /// + public class TestSlowFuzzyQuery : LuceneTestCase + { + [Test] + public void TestFuzziness() + { + //every test with SlowFuzzyQuery.defaultMinSimilarity + //is exercising the Automaton, not the brute force linear method + + Directory directory = NewDirectory(); + RandomIndexWriter writer = new RandomIndexWriter(Random(), directory, Similarity, TimeZone); + addDoc("aaaaa", writer); + addDoc("aaaab", writer); + addDoc("aaabb", writer); + addDoc("aabbb", writer); + addDoc("abbbb", writer); + addDoc("bbbbb", writer); + addDoc("ddddd", writer); + + IndexReader reader = writer.Reader; + IndexSearcher searcher = NewSearcher(reader); + writer.Dispose(); + + SlowFuzzyQuery query = new SlowFuzzyQuery(new Term("field", "aaaaa"), SlowFuzzyQuery.defaultMinSimilarity, 0); + ScoreDoc[] hits = searcher.Search(query, null, 1000).ScoreDocs; + assertEquals(3, hits.Length); + + // same with prefix + query = new SlowFuzzyQuery(new Term("field", "aaaaa"), SlowFuzzyQuery.defaultMinSimilarity, 1); + hits = searcher.Search(query, null, 1000).ScoreDocs; + assertEquals(3, hits.Length); + query = new SlowFuzzyQuery(new Term("field", "aaaaa"), SlowFuzzyQuery.defaultMinSimilarity, 2); + hits = searcher.Search(query, null, 1000).ScoreDocs; + assertEquals(3, hits.Length); + query = 
new SlowFuzzyQuery(new Term("field", "aaaaa"), SlowFuzzyQuery.defaultMinSimilarity, 3); + hits = searcher.Search(query, null, 1000).ScoreDocs; + assertEquals(3, hits.Length); + query = new SlowFuzzyQuery(new Term("field", "aaaaa"), SlowFuzzyQuery.defaultMinSimilarity, 4); + hits = searcher.Search(query, null, 1000).ScoreDocs; + assertEquals(2, hits.Length); + query = new SlowFuzzyQuery(new Term("field", "aaaaa"), SlowFuzzyQuery.defaultMinSimilarity, 5); + hits = searcher.Search(query, null, 1000).ScoreDocs; + assertEquals(1, hits.Length); + query = new SlowFuzzyQuery(new Term("field", "aaaaa"), SlowFuzzyQuery.defaultMinSimilarity, 6); + hits = searcher.Search(query, null, 1000).ScoreDocs; + assertEquals(1, hits.Length); + + // test scoring + query = new SlowFuzzyQuery(new Term("field", "bbbbb"), SlowFuzzyQuery.defaultMinSimilarity, 0); + hits = searcher.Search(query, null, 1000).ScoreDocs; + assertEquals("3 documents should match", 3, hits.Length); + List order = Arrays.AsList("bbbbb", "abbbb", "aabbb"); + for (int i = 0; i < hits.Length; i++) + { + string term = searcher.Doc(hits[i].Doc).Get("field"); + //System.out.println(hits[i].score); + assertEquals(order[i], term); + } + + // test pq size by supplying maxExpansions=2 + // This query would normally return 3 documents, because 3 terms match (see above): + query = new SlowFuzzyQuery(new Term("field", "bbbbb"), SlowFuzzyQuery.defaultMinSimilarity, 0, 2); + hits = searcher.Search(query, null, 1000).ScoreDocs; + assertEquals("only 2 documents should match", 2, hits.Length); + order = Arrays.AsList("bbbbb", "abbbb"); + for (int i = 0; i < hits.Length; i++) + { + string term = searcher.Doc(hits[i].Doc).Get("field"); + //System.out.println(hits[i].score); + assertEquals(order[i], term); + } + + // not similar enough: + query = new SlowFuzzyQuery(new Term("field", "xxxxx"), SlowFuzzyQuery.defaultMinSimilarity, 0); + hits = searcher.Search(query, null, 1000).ScoreDocs; + assertEquals(0, hits.Length); + query = new 
SlowFuzzyQuery(new Term("field", "aaccc"), SlowFuzzyQuery.defaultMinSimilarity, 0); // edit distance to "aaaaa" = 3 + hits = searcher.Search(query, null, 1000).ScoreDocs; + assertEquals(0, hits.Length); + + // query identical to a word in the index: + query = new SlowFuzzyQuery(new Term("field", "aaaaa"), SlowFuzzyQuery.defaultMinSimilarity, 0); + hits = searcher.Search(query, null, 1000).ScoreDocs; + assertEquals(3, hits.Length); + assertEquals(searcher.Doc(hits[0].Doc).Get("field"), ("aaaaa")); + // default allows for up to two edits: + assertEquals(searcher.Doc(hits[1].Doc).Get("field"), ("aaaab")); + assertEquals(searcher.Doc(hits[2].Doc).Get("field"), ("aaabb")); + + // query similar to a word in the index: + query = new SlowFuzzyQuery(new Term("field", "aaaac"), SlowFuzzyQuery.defaultMinSimilarity, 0); + hits = searcher.Search(query, null, 1000).ScoreDocs; + assertEquals(3, hits.Length); + assertEquals(searcher.Doc(hits[0].Doc).Get("field"), ("aaaaa")); + assertEquals(searcher.Doc(hits[1].Doc).Get("field"), ("aaaab")); + assertEquals(searcher.Doc(hits[2].Doc).Get("field"), ("aaabb")); + + // now with prefix + query = new SlowFuzzyQuery(new Term("field", "aaaac"), SlowFuzzyQuery.defaultMinSimilarity, 1); + hits = searcher.Search(query, null, 1000).ScoreDocs; + assertEquals(3, hits.Length); + assertEquals(searcher.Doc(hits[0].Doc).Get("field"), ("aaaaa")); + assertEquals(searcher.Doc(hits[1].Doc).Get("field"), ("aaaab")); + assertEquals(searcher.Doc(hits[2].Doc).Get("field"), ("aaabb")); + query = new SlowFuzzyQuery(new Term("field", "aaaac"), SlowFuzzyQuery.defaultMinSimilarity, 2); + hits = searcher.Search(query, null, 1000).ScoreDocs; + assertEquals(3, hits.Length); + assertEquals(searcher.Doc(hits[0].Doc).Get("field"), ("aaaaa")); + assertEquals(searcher.Doc(hits[1].Doc).Get("field"), ("aaaab")); + assertEquals(searcher.Doc(hits[2].Doc).Get("field"), ("aaabb")); + query = new SlowFuzzyQuery(new Term("field", "aaaac"), SlowFuzzyQuery.defaultMinSimilarity, 
3); + hits = searcher.Search(query, null, 1000).ScoreDocs; + assertEquals(3, hits.Length); + assertEquals(searcher.Doc(hits[0].Doc).Get("field"), ("aaaaa")); + assertEquals(searcher.Doc(hits[1].Doc).Get("field"), ("aaaab")); + assertEquals(searcher.Doc(hits[2].Doc).Get("field"), ("aaabb")); + query = new SlowFuzzyQuery(new Term("field", "aaaac"), SlowFuzzyQuery.defaultMinSimilarity, 4); + hits = searcher.Search(query, null, 1000).ScoreDocs; + assertEquals(2, hits.Length); + assertEquals(searcher.Doc(hits[0].Doc).Get("field"), ("aaaaa")); + assertEquals(searcher.Doc(hits[1].Doc).Get("field"), ("aaaab")); + query = new SlowFuzzyQuery(new Term("field", "aaaac"), SlowFuzzyQuery.defaultMinSimilarity, 5); + hits = searcher.Search(query, null, 1000).ScoreDocs; + assertEquals(0, hits.Length); + + + query = new SlowFuzzyQuery(new Term("field", "ddddX"), SlowFuzzyQuery.defaultMinSimilarity, 0); + hits = searcher.Search(query, null, 1000).ScoreDocs; + assertEquals(1, hits.Length); + assertEquals(searcher.Doc(hits[0].Doc).Get("field"), ("ddddd")); + + // now with prefix + query = new SlowFuzzyQuery(new Term("field", "ddddX"), SlowFuzzyQuery.defaultMinSimilarity, 1); + hits = searcher.Search(query, null, 1000).ScoreDocs; + assertEquals(1, hits.Length); + assertEquals(searcher.Doc(hits[0].Doc).Get("field"), ("ddddd")); + query = new SlowFuzzyQuery(new Term("field", "ddddX"), SlowFuzzyQuery.defaultMinSimilarity, 2); + hits = searcher.Search(query, null, 1000).ScoreDocs; + assertEquals(1, hits.Length); + assertEquals(searcher.Doc(hits[0].Doc).Get("field"), ("ddddd")); + query = new SlowFuzzyQuery(new Term("field", "ddddX"), SlowFuzzyQuery.defaultMinSimilarity, 3); + hits = searcher.Search(query, null, 1000).ScoreDocs; + assertEquals(1, hits.Length); + assertEquals(searcher.Doc(hits[0].Doc).Get("field"), ("ddddd")); + query = new SlowFuzzyQuery(new Term("field", "ddddX"), SlowFuzzyQuery.defaultMinSimilarity, 4); + hits = searcher.Search(query, null, 1000).ScoreDocs; + 
assertEquals(1, hits.Length); + assertEquals(searcher.Doc(hits[0].Doc).Get("field"), ("ddddd")); + query = new SlowFuzzyQuery(new Term("field", "ddddX"), SlowFuzzyQuery.defaultMinSimilarity, 5); + hits = searcher.Search(query, null, 1000).ScoreDocs; + assertEquals(0, hits.Length); + + + // different field = no match: + query = new SlowFuzzyQuery(new Term("anotherfield", "ddddX"), SlowFuzzyQuery.defaultMinSimilarity, 0); + hits = searcher.Search(query, null, 1000).ScoreDocs; + assertEquals(0, hits.Length); + + reader.Dispose(); + directory.Dispose(); + } + + [Test] + public void TestFuzzinessLong2() + { + //Lucene-5033 + Directory directory = NewDirectory(); + RandomIndexWriter writer = new RandomIndexWriter(Random(), directory, Similarity, TimeZone); + addDoc("abcdef", writer); + addDoc("segment", writer); + + IndexReader reader = writer.Reader; + IndexSearcher searcher = NewSearcher(reader); + writer.Dispose(); + + SlowFuzzyQuery query; + + query = new SlowFuzzyQuery(new Term("field", "abcxxxx"), 3f, 0); + ScoreDoc[] hits = searcher.Search(query, null, 1000).ScoreDocs; + assertEquals(0, hits.Length); + + query = new SlowFuzzyQuery(new Term("field", "abcxxxx"), 4f, 0); + hits = searcher.Search(query, null, 1000).ScoreDocs; + assertEquals(1, hits.Length); + reader.Dispose(); + directory.Dispose(); + } + + [Test] + public void TestFuzzinessLong() + { + Directory directory = NewDirectory(); + RandomIndexWriter writer = new RandomIndexWriter(Random(), directory, Similarity, TimeZone); + addDoc("aaaaaaa", writer); + addDoc("segment", writer); + + IndexReader reader = writer.Reader; + IndexSearcher searcher = NewSearcher(reader); + writer.Dispose(); + + SlowFuzzyQuery query; + // not similar enough: + query = new SlowFuzzyQuery(new Term("field", "xxxxx"), 0.5f, 0); + ScoreDoc[] hits = searcher.Search(query, null, 1000).ScoreDocs; + assertEquals(0, hits.Length); + // edit distance to "aaaaaaa" = 3, this matches because the string is longer than + // in 
testDefaultFuzziness so a bigger difference is allowed: + query = new SlowFuzzyQuery(new Term("field", "aaaaccc"), 0.5f, 0); + hits = searcher.Search(query, null, 1000).ScoreDocs; + assertEquals(1, hits.Length); + assertEquals(searcher.Doc(hits[0].Doc).Get("field"), ("aaaaaaa")); + + // now with prefix + query = new SlowFuzzyQuery(new Term("field", "aaaaccc"), 0.5f, 1); + hits = searcher.Search(query, null, 1000).ScoreDocs; + assertEquals(1, hits.Length); + assertEquals(searcher.Doc(hits[0].Doc).Get("field"), ("aaaaaaa")); + query = new SlowFuzzyQuery(new Term("field", "aaaaccc"), 0.5f, 4); + hits = searcher.Search(query, null, 1000).ScoreDocs; + assertEquals(1, hits.Length); + assertEquals(searcher.Doc(hits[0].Doc).Get("field"), ("aaaaaaa")); + query = new SlowFuzzyQuery(new Term("field", "aaaaccc"), 0.5f, 5); + hits = searcher.Search(query, null, 1000).ScoreDocs; + assertEquals(0, hits.Length); + + // no match, more than half of the characters is wrong: + query = new SlowFuzzyQuery(new Term("field", "aaacccc"), 0.5f, 0); + hits = searcher.Search(query, null, 1000).ScoreDocs; + assertEquals(0, hits.Length); + + // now with prefix + query = new SlowFuzzyQuery(new Term("field", "aaacccc"), 0.5f, 2); + hits = searcher.Search(query, null, 1000).ScoreDocs; + assertEquals(0, hits.Length); + + // "student" and "stellent" are indeed similar to "segment" by default: + query = new SlowFuzzyQuery(new Term("field", "student"), 0.5f, 0); + hits = searcher.Search(query, null, 1000).ScoreDocs; + assertEquals(1, hits.Length); + query = new SlowFuzzyQuery(new Term("field", "stellent"), 0.5f, 0); + hits = searcher.Search(query, null, 1000).ScoreDocs; + assertEquals(1, hits.Length); + + // now with prefix + query = new SlowFuzzyQuery(new Term("field", "student"), 0.5f, 1); + hits = searcher.Search(query, null, 1000).ScoreDocs; + assertEquals(1, hits.Length); + query = new SlowFuzzyQuery(new Term("field", "stellent"), 0.5f, 1); + hits = searcher.Search(query, null, 1000).ScoreDocs; + 
assertEquals(1, hits.Length); + query = new SlowFuzzyQuery(new Term("field", "student"), 0.5f, 2); + hits = searcher.Search(query, null, 1000).ScoreDocs; + assertEquals(0, hits.Length); + query = new SlowFuzzyQuery(new Term("field", "stellent"), 0.5f, 2); + hits = searcher.Search(query, null, 1000).ScoreDocs; + assertEquals(0, hits.Length); + + // "student" doesn't match anymore thanks to increased minimum similarity: + query = new SlowFuzzyQuery(new Term("field", "student"), 0.6f, 0); + hits = searcher.Search(query, null, 1000).ScoreDocs; + assertEquals(0, hits.Length); + + try + { + query = new SlowFuzzyQuery(new Term("field", "student"), 1.1f); + fail("Expected IllegalArgumentException"); + } +#pragma warning disable 168 + catch (ArgumentException e) +#pragma warning restore 168 + { + // expecting exception + } + try + { + query = new SlowFuzzyQuery(new Term("field", "student"), -0.1f); + fail("Expected IllegalArgumentException"); + } +#pragma warning disable 168 + catch (ArgumentException e) +#pragma warning restore 168 + { + // expecting exception + } + + reader.Dispose(); + directory.Dispose(); + } + + /** + * MultiTermQuery provides (via attribute) information about which values + * must be competitive to enter the priority queue. + * + * SlowFuzzyQuery optimizes itself around this information, if the attribute + * is not implemented correctly, there will be problems! 
+ */ + [Test] + public void TestTieBreaker() + { + Directory directory = NewDirectory(); + RandomIndexWriter writer = new RandomIndexWriter(Random(), directory, Similarity, TimeZone); + addDoc("a123456", writer); + addDoc("c123456", writer); + addDoc("d123456", writer); + addDoc("e123456", writer); + + Directory directory2 = NewDirectory(); + RandomIndexWriter writer2 = new RandomIndexWriter(Random(), directory2, Similarity, TimeZone); + addDoc("a123456", writer2); + addDoc("b123456", writer2); + addDoc("b123456", writer2); + addDoc("b123456", writer2); + addDoc("c123456", writer2); + addDoc("f123456", writer2); + + IndexReader ir1 = writer.Reader; + IndexReader ir2 = writer2.Reader; + + MultiReader mr = new MultiReader(ir1, ir2); + IndexSearcher searcher = NewSearcher(mr); + SlowFuzzyQuery fq = new SlowFuzzyQuery(new Term("field", "z123456"), 1f, 0, 2); + TopDocs docs = searcher.Search(fq, 2); + assertEquals(5, docs.TotalHits); // 5 docs, from the a and b's + mr.Dispose(); + ir1.Dispose(); + ir2.Dispose(); + writer.Dispose(); + writer2.Dispose(); + directory.Dispose(); + directory2.Dispose(); + } + + [Test] + public void TestTokenLengthOpt() + { + Directory directory = NewDirectory(); + RandomIndexWriter writer = new RandomIndexWriter(Random(), directory, Similarity, TimeZone); + addDoc("12345678911", writer); + addDoc("segment", writer); + + IndexReader reader = writer.Reader; + IndexSearcher searcher = NewSearcher(reader); + writer.Dispose(); + + Query query; + // term not over 10 chars, so optimization shortcuts + query = new SlowFuzzyQuery(new Term("field", "1234569"), 0.9f); + ScoreDoc[] hits = searcher.Search(query, null, 1000).ScoreDocs; + assertEquals(0, hits.Length); + + // 10 chars, so no optimization + query = new SlowFuzzyQuery(new Term("field", "1234567891"), 0.9f); + hits = searcher.Search(query, null, 1000).ScoreDocs; + assertEquals(0, hits.Length); + + // over 10 chars, so no optimization + query = new SlowFuzzyQuery(new Term("field", 
"12345678911"), 0.9f); + hits = searcher.Search(query, null, 1000).ScoreDocs; + assertEquals(1, hits.Length); + + // over 10 chars, no match + query = new SlowFuzzyQuery(new Term("field", "sdfsdfsdfsdf"), 0.9f); + hits = searcher.Search(query, null, 1000).ScoreDocs; + assertEquals(0, hits.Length); + + reader.Dispose(); + directory.Dispose(); + } + + /** Test the TopTermsBoostOnlyBooleanQueryRewrite rewrite method. */ + [Test] + public void TestBoostOnlyRewrite() + { + Directory directory = NewDirectory(); + RandomIndexWriter writer = new RandomIndexWriter(Random(), directory, Similarity, TimeZone); + addDoc("Lucene", writer); + addDoc("Lucene", writer); + addDoc("Lucenne", writer); + + IndexReader reader = writer.Reader; + IndexSearcher searcher = NewSearcher(reader); + writer.Dispose(); + + SlowFuzzyQuery query = new SlowFuzzyQuery(new Term("field", "lucene")); + query.SetRewriteMethod(new MultiTermQuery.TopTermsBoostOnlyBooleanQueryRewrite(50)); + ScoreDoc[] hits = searcher.Search(query, null, 1000).ScoreDocs; + assertEquals(3, hits.Length); + // normally, 'Lucenne' would be the first result as IDF will skew the score. + assertEquals("Lucene", reader.Document(hits[0].Doc).Get("field")); + assertEquals("Lucene", reader.Document(hits[1].Doc).Get("field")); + assertEquals("Lucenne", reader.Document(hits[2].Doc).Get("field")); + reader.Dispose(); + directory.Dispose(); + } + + [Test] + public void TestGiga() + { + + Directory index = NewDirectory(); + RandomIndexWriter w = new RandomIndexWriter(Random(), index, Similarity, TimeZone); + + addDoc("Lucene in Action", w); + addDoc("Lucene for Dummies", w); + + //addDoc("Giga", w); + addDoc("Giga byte", w); + + addDoc("ManagingGigabytesManagingGigabyte", w); + addDoc("ManagingGigabytesManagingGigabytes", w); + + addDoc("The Art of Computer Science", w); + addDoc("J. K. 
Rowling", w); + addDoc("JK Rowling", w); + addDoc("Joanne K Roling", w); + addDoc("Bruce Willis", w); + addDoc("Willis bruce", w); + addDoc("Brute willis", w); + addDoc("B. willis", w); + IndexReader r = w.Reader; + w.Dispose(); + + Query q = new SlowFuzzyQuery(new Term("field", "giga"), 0.9f); + + // 3. search + IndexSearcher searcher = NewSearcher(r); + ScoreDoc[] hits = searcher.Search(q, 10).ScoreDocs; + assertEquals(1, hits.Length); + assertEquals("Giga byte", searcher.Doc(hits[0].Doc).Get("field")); + r.Dispose(); + index.Dispose(); + } + + [Test] + public void TestDistanceAsEditsSearching() + { + Directory index = NewDirectory(); + RandomIndexWriter w = new RandomIndexWriter(Random(), index, Similarity, TimeZone); + addDoc("foobar", w); + addDoc("test", w); + addDoc("working", w); + IndexReader reader = w.Reader; + IndexSearcher searcher = NewSearcher(reader); + w.Dispose(); + + SlowFuzzyQuery q = new SlowFuzzyQuery(new Term("field", "fouba"), 2); + ScoreDoc[] hits = searcher.Search(q, 10).ScoreDocs; + assertEquals(1, hits.Length); + assertEquals("foobar", searcher.Doc(hits[0].Doc).Get("field")); + + q = new SlowFuzzyQuery(new Term("field", "foubara"), 2); + hits = searcher.Search(q, 10).ScoreDocs; + assertEquals(1, hits.Length); + assertEquals("foobar", searcher.Doc(hits[0].Doc).Get("field")); + + q = new SlowFuzzyQuery(new Term("field", "t"), 3); + hits = searcher.Search(q, 10).ScoreDocs; + assertEquals(1, hits.Length); + assertEquals("test", searcher.Doc(hits[0].Doc).Get("field")); + + q = new SlowFuzzyQuery(new Term("field", "a"), 4f, 0, 50); + hits = searcher.Search(q, 10).ScoreDocs; + assertEquals(1, hits.Length); + assertEquals("test", searcher.Doc(hits[0].Doc).Get("field")); + + q = new SlowFuzzyQuery(new Term("field", "a"), 6f, 0, 50); + hits = searcher.Search(q, 10).ScoreDocs; + assertEquals(2, hits.Length); + assertEquals("test", searcher.Doc(hits[0].Doc).Get("field")); + assertEquals("foobar", searcher.Doc(hits[1].Doc).Get("field")); + + 
reader.Dispose(); + index.Dispose(); + } + + private void addDoc(string text, RandomIndexWriter writer) + { + Document doc = new Document(); + doc.Add(NewTextField("field", text, Field.Store.YES)); + writer.AddDocument(doc); + } + } +} +#pragma warning restore 612, 618 \ No newline at end of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/3395a8b4/Lucene.Net.Tests.Sandbox/Queries/TestSlowFuzzyQuery2.cs ---------------------------------------------------------------------- diff --git a/Lucene.Net.Tests.Sandbox/Queries/TestSlowFuzzyQuery2.cs b/Lucene.Net.Tests.Sandbox/Queries/TestSlowFuzzyQuery2.cs new file mode 100644 index 0000000..6a2988e --- /dev/null +++ b/Lucene.Net.Tests.Sandbox/Queries/TestSlowFuzzyQuery2.cs @@ -0,0 +1,194 @@ +using Lucene.Net.Analysis; +using Lucene.Net.Documents; +using Lucene.Net.Index; +using Lucene.Net.Search; +using Lucene.Net.Search.Similarities; +using Lucene.Net.Support; +using Lucene.Net.Util; +using NUnit.Framework; +using System; +using System.Globalization; +using System.IO; +using System.Text; + +namespace Lucene.Net.Sandbox.Queries +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+     */
+
+    /// <summary>
+    /// Tests the results of fuzzy against pre-recorded output.
+    /// The format of the file is the following:
+    /// <para/>
+    /// Header Row: # of bits: generate 2^n sequential documents
+    /// with a value of Integer.toBinaryString
+    /// <para/>
+    /// Entries: an entry is a param spec line, a resultCount line, and
+    /// then 'resultCount' results lines. The results lines are in the
+    /// expected order.
+    /// <para/>
+    /// param spec line: a comma-separated list of params to FuzzyQuery
+    /// (query, prefixLen, pqSize, minScore)
+    /// query = query text as a number (expand with Integer.toBinaryString)
+    /// prefixLen = prefix length
+    /// pqSize = priority queue maximum size for TopTermsBoostOnlyBooleanQueryRewrite
+    /// minScore = minimum similarity
+    /// <para/>
+    /// resultCount line: total number of expected hits.
+    /// <para/>
+    /// results line: comma-separated docID, score pair
+    /// </summary>
+    public class TestSlowFuzzyQuery2 : LuceneTestCase
+    {
+        /// <summary>Epsilon for score comparisons.</summary>
+        static readonly float epsilon = 0.00001f;
+
+        // Each entry is a pair of code points: element 0 replaces the binary digit '0'
+        // and element 1 replaces the binary digit '1' when MapInt builds a term.
+        static readonly int[][] mappings = new int[][] {
+            new int[] { 0x40, 0x41 },
+            new int[] { 0x40, 0x0195 },
+            new int[] { 0x40, 0x0906 },
+            new int[] { 0x40, 0x1040F },
+            new int[] { 0x0194, 0x0195 },
+            new int[] { 0x0194, 0x0906 },
+            new int[] { 0x0194, 0x1040F },
+            new int[] { 0x0905, 0x0906 },
+            new int[] { 0x0905, 0x1040F },
+            new int[] { 0x1040E, 0x1040F }
+        };
+
+        /// <summary>
+        /// Runs the recorded expectations against one randomly chosen code point mapping.
+        /// </summary>
+        [Test]
+        public void TestFromTestData()
+        {
+            // TODO: randomize!
+            assertFromTestData(mappings[Random().Next(mappings.Length)]);
+        }
+
+        /// <summary>
+        /// Indexes 2^bits keyword documents (bits is the first line of the embedded
+        /// fuzzyTestData.txt resource), then replays every recorded query from that
+        /// resource and asserts hit count, doc ids and scores.
+        /// </summary>
+        /// <param name="codePointTable">Two code points standing in for the binary digits '0' and '1'.</param>
+        public void assertFromTestData(int[] codePointTable)
+        {
+            if (VERBOSE)
+            {
+                // Print the code points themselves; concatenating the array only prints its type name.
+                Console.WriteLine("TEST: codePointTable=" + string.Join(",", codePointTable));
+            }
+            //Stream stream = GetType().getResourceAsStream("fuzzyTestData.txt");
+            Stream stream = GetType().Assembly.GetManifestResourceStream("Lucene.Net.Sandbox.Queries.fuzzyTestData.txt");
+            // GetManifestResourceStream returns null when the resource is missing; fail with a readable message.
+            Assert.NotNull(stream, "embedded resource Lucene.Net.Sandbox.Queries.fuzzyTestData.txt was not found");
+
+            // Disposing the StreamReader also closes the underlying resource stream.
+            using (TextReader reader = new StreamReader(stream, Encoding.UTF8))
+            {
+                int bits = int.Parse(reader.ReadLine(), CultureInfo.InvariantCulture);
+                int terms = (int)Math.Pow(2, bits);
+
+                Store.Directory dir = NewDirectory();
+                RandomIndexWriter writer = new RandomIndexWriter(Random(), dir, NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random(), MockTokenizer.KEYWORD, false)).SetMergePolicy(NewLogMergePolicy()));
+
+                // A single Document/Field pair is reused; only the field value changes per document.
+                Document doc = new Document();
+                Field field = NewTextField("field", "", Field.Store.NO);
+                doc.Add(field);
+
+                for (int i = 0; i < terms; i++)
+                {
+                    field.StringValue = (MapInt(codePointTable, i));
+                    writer.AddDocument(doc);
+                }
+
+                IndexReader r = writer.Reader;
+                IndexSearcher searcher = NewSearcher(r);
+                if (VERBOSE)
+                {
+                    Console.WriteLine("TEST: searcher=" + searcher);
+                }
+                // even though this uses a boost-only rewrite, this test relies upon queryNorm being the default implementation,
+                // otherwise scores are different!
+                searcher.Similarity = (new DefaultSimilarity());
+
+                writer.Dispose();
+
+                // Each entry: a param spec line, a result-count line, then that many "docID,score" lines.
+                string line;
+                while ((line = reader.ReadLine()) != null)
+                {
+                    string[] @params = line.Split(new char[] { ',' }, StringSplitOptions.RemoveEmptyEntries);
+                    string query = MapInt(codePointTable, int.Parse(@params[0], CultureInfo.InvariantCulture));
+                    int prefix = int.Parse(@params[1], CultureInfo.InvariantCulture);
+                    int pqSize = int.Parse(@params[2], CultureInfo.InvariantCulture);
+                    float minScore = float.Parse(@params[3], CultureInfo.InvariantCulture);
+#pragma warning disable 612, 618
+                    SlowFuzzyQuery q = new SlowFuzzyQuery(new Term("field", query), minScore, prefix);
+#pragma warning restore 612, 618
+                    q.SetRewriteMethod(new MultiTermQuery.TopTermsBoostOnlyBooleanQueryRewrite(pqSize));
+                    int expectedResults = int.Parse(reader.ReadLine(), CultureInfo.InvariantCulture);
+                    TopDocs docs = searcher.Search(q, expectedResults);
+                    assertEquals(expectedResults, docs.TotalHits);
+                    for (int i = 0; i < expectedResults; i++)
+                    {
+                        string[] scoreDoc = reader.ReadLine().Split(new char[] { ',' }, StringSplitOptions.RemoveEmptyEntries);
+                        assertEquals(int.Parse(scoreDoc[0], CultureInfo.InvariantCulture), docs.ScoreDocs[i].Doc);
+                        assertEquals(float.Parse(scoreDoc[1], CultureInfo.InvariantCulture), docs.ScoreDocs[i].Score, epsilon);
+                    }
+                }
+                r.Dispose();
+                dir.Dispose();
+            }
+        }
+
+        /// <summary>
+        /// Map bits to unicode codepoints: every digit of the binary representation of
+        /// <paramref name="i"/> is replaced by the matching entry of <paramref name="codePointTable"/>.
+        /// </summary>
+        private static string MapInt(int[] codePointTable, int i)
+        {
+            StringBuilder sb = new StringBuilder();
+            string binary = Number.ToBinaryString(i);
+            for (int j = 0; j < binary.Length; j++)
+            {
+                // binary[j] is '0' or '1'; subtracting '0' turns it into the table index 0 or 1.
+                sb.AppendCodePoint(codePointTable[binary[j] - '0']);
+            }
+            return sb.ToString();
+        }
+
+        /* Code to generate test data
+        public static void main(String args[]) throws Exception {
+          int bits = 3;
+          System.out.println(bits);
+          int terms = (int) Math.pow(2, bits);
+
+          RAMDirectory dir = new RAMDirectory();
+          IndexWriter writer = new IndexWriter(dir, new KeywordAnalyzer(),
+              IndexWriter.MaxFieldLength.UNLIMITED);
+
+          Document doc = new Document();
+          Field field = newField("field", "", Field.Store.NO, Field.Index.ANALYZED);
+          doc.add(field);
+
+          for (int i = 0; i < terms; i++) {
+            field.setValue(Integer.toBinaryString(i));
+            writer.addDocument(doc);
+          }
+
+          writer.forceMerge(1);
+          writer.close();
+
+          IndexSearcher searcher = new IndexSearcher(dir);
+          for (int prefix = 0; prefix < bits; prefix++)
+            for (int pqsize = 1; pqsize <= terms; pqsize++)
+              for (float minscore = 0.1F; minscore < 1F; minscore += 0.2F)
+                for (int query = 0; query < terms; query++) {
+                  FuzzyQuery q = new FuzzyQuery(
+                      new Term("field", Integer.toBinaryString(query)), minscore, prefix);
+                  q.setRewriteMethod(new MultiTermQuery.TopTermsBoostOnlyBooleanQueryRewrite(pqsize));
+                  System.out.println(query + "," + prefix + "," + pqsize + "," + minscore);
+                  TopDocs docs = searcher.search(q, terms);
+                  System.out.println(docs.totalHits);
+                  for (int i = 0; i < docs.totalHits; i++)
+                    System.out.println(docs.scoreDocs[i].doc + "," + docs.scoreDocs[i].score);
+                }
+        }
+        */
+    }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/3395a8b4/Lucene.Net.Tests.Sandbox/Queries/TestSortedSetSortField.cs
----------------------------------------------------------------------
diff --git a/Lucene.Net.Tests.Sandbox/Queries/TestSortedSetSortField.cs b/Lucene.Net.Tests.Sandbox/Queries/TestSortedSetSortField.cs
new file mode 100644
index 0000000..4a14356
--- /dev/null
+++ b/Lucene.Net.Tests.Sandbox/Queries/TestSortedSetSortField.cs
@@ -0,0 +1,235 @@
+using Lucene.Net.Documents;
+using Lucene.Net.Index;
+using Lucene.Net.Search;
+using Lucene.Net.Store;
+using Lucene.Net.Util;
+using NUnit.Framework;
+using System;
+
+namespace Lucene.Net.Sandbox.Queries
+{
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements. See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License. You may obtain a copy of the License at
+     *
+     * http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    /// <summary>
+    /// Simple tests for SortedSetSortField
+    /// </summary>
+    public class TestSortedSetSortField : LuceneTestCase
+    {
+        /// <summary>
+        /// Indexes one document per spec, in order, and returns a reader over them.
+        /// Each spec is { id, value... }: the values go into the unstored "value"
+        /// field first, then the id into the stored "id" field.
+        /// </summary>
+        private IndexReader IndexDocs(Directory directory, params string[][] specs)
+        {
+            RandomIndexWriter indexWriter = new RandomIndexWriter(Random(), directory, Similarity, TimeZone);
+            foreach (string[] spec in specs)
+            {
+                Document document = new Document();
+                for (int v = 1; v < spec.Length; v++)
+                {
+                    document.Add(NewStringField("value", spec[v], Field.Store.NO));
+                }
+                document.Add(NewStringField("id", spec[0], Field.Store.YES));
+                indexWriter.AddDocument(document);
+            }
+            IndexReader reader = indexWriter.Reader;
+            indexWriter.Dispose();
+            return reader;
+        }
+
+        /// <summary>
+        /// Runs a match-all search sorted by <paramref name="sortField"/> and asserts that
+        /// the hits are exactly the documents with the given ids, in that order.
+        /// </summary>
+        private void AssertIdOrder(IndexReader reader, SortField sortField, params string[] expectedIds)
+        {
+            IndexSearcher searcher = NewSearcher(reader);
+            TopDocs hits = searcher.Search(new MatchAllDocsQuery(), 10, new Sort(sortField));
+            assertEquals(expectedIds.Length, hits.TotalHits);
+            for (int rank = 0; rank < expectedIds.Length; rank++)
+            {
+                assertEquals(expectedIds[rank], searcher.Doc(hits.ScoreDocs[rank].Doc).Get("id"));
+            }
+        }
+
+        [Test]
+        public void TestForward()
+        {
+            Directory directory = NewDirectory();
+            IndexReader reader = IndexDocs(directory,
+                new string[] { "2", "baz" },
+                new string[] { "1", "foo", "bar" });
+
+            // 'bar' comes before 'baz'
+            AssertIdOrder(reader, new SortedSetSortField("value", false), "1", "2");
+
+            reader.Dispose();
+            directory.Dispose();
+        }
+
+        [Test]
+        public void TestReverse()
+        {
+            Directory directory = NewDirectory();
+            IndexReader reader = IndexDocs(directory,
+                new string[] { "1", "foo", "bar" },
+                new string[] { "2", "baz" });
+
+            // reversed sort: the 'baz' document is expected ahead of the 'bar' document
+            AssertIdOrder(reader, new SortedSetSortField("value", true), "2", "1");
+
+            reader.Dispose();
+            directory.Dispose();
+        }
+
+        [Test]
+        public void TestMissingFirst()
+        {
+            Directory directory = NewDirectory();
+            IndexReader reader = IndexDocs(directory,
+                new string[] { "2", "baz" },
+                new string[] { "1", "foo", "bar" },
+                new string[] { "3" });
+
+            SortField missingFirst = new SortedSetSortField("value", false);
+            missingFirst.MissingValue = (SortField.STRING_FIRST);
+
+            // null comes first, then 'bar' comes before 'baz'
+            AssertIdOrder(reader, missingFirst, "3", "1", "2");
+
+            reader.Dispose();
+            directory.Dispose();
+        }
+
+        [Test]
+        public void TestMissingLast()
+        {
+            Directory directory = NewDirectory();
+            IndexReader reader = IndexDocs(directory,
+                new string[] { "2", "baz" },
+                new string[] { "1", "foo", "bar" },
+                new string[] { "3" });
+
+            SortField missingLast = new SortedSetSortField("value", false);
+            missingLast.MissingValue = (SortField.STRING_LAST);
+
+            // 'bar' comes before 'baz', null comes last
+            AssertIdOrder(reader, missingLast, "1", "2", "3");
+
+            reader.Dispose();
+            directory.Dispose();
+        }
+
+        [Test]
+        public void TestSingleton()
+        {
+            Directory directory = NewDirectory();
+            IndexReader reader = IndexDocs(directory,
+                new string[] { "2", "baz" },
+                new string[] { "1", "bar" });
+
+            // 'bar' comes before 'baz'
+            AssertIdOrder(reader, new SortedSetSortField("value", false), "1", "2");
+
+            reader.Dispose();
+            directory.Dispose();
+        }
+
+        [Test]
+        public void TestEmptyIndex()
+        {
+            IndexSearcher emptySearcher = NewSearcher(new MultiReader());
+            Query termQuery = new TermQuery(new Term("contents", "foo"));
+
+            Sort sort = new Sort();
+            sort.SetSort(new SortedSetSortField("sortedset", false));
+            TopDocs hits = emptySearcher.Search(termQuery, null, 10, sort, true, true);
+            assertEquals(0, hits.TotalHits);
+
+            // for an empty index, any selector should work
+            foreach (Selector selector in Enum.GetValues(typeof(Selector)))
+            {
+                sort.SetSort(new SortedSetSortField("sortedset", false, selector));
+                hits = emptySearcher.Search(termQuery, null, 10, sort, true, true);
+                assertEquals(0, hits.TotalHits);
+            }
+        }
+
+        [Test]
+        public void TestEquals()
+        {
+            SortField baseline = new SortedSetSortField("a", false);
+            assertFalse(baseline.equals(null));
+
+            // equal to itself
+            assertEquals(baseline, baseline);
+
+            // equal (with equal hash code) to a field built from the same arguments
+            SortField twin = new SortedSetSortField("a", false);
+            assertEquals(baseline, twin);
+            assertEquals(baseline.GetHashCode(), twin.GetHashCode());
+
+            // differs when the reverse flag, field name, selector or type changes
+            assertFalse(baseline.equals(new SortedSetSortField("a", true)));
+            assertFalse(baseline.equals(new SortedSetSortField("b", false)));
+            assertFalse(baseline.equals(new SortedSetSortField("a", false, Selector.MAX)));
+            assertFalse(baseline.equals("foo"));
+        }
+    }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/3395a8b4/Lucene.Net.Tests.Sandbox/Queries/TestSortedSetSortFieldDocValues.cs
----------------------------------------------------------------------
diff --git a/Lucene.Net.Tests.Sandbox/Queries/TestSortedSetSortFieldDocValues.cs b/Lucene.Net.Tests.Sandbox/Queries/TestSortedSetSortFieldDocValues.cs
new file mode 100644
index 0000000..342f679
--- /dev/null
+++
b/Lucene.Net.Tests.Sandbox/Queries/TestSortedSetSortFieldDocValues.cs
@@ -0,0 +1,213 @@
+using Lucene.Net.Documents;
+using Lucene.Net.Index;
+using Lucene.Net.Search;
+using Lucene.Net.Store;
+using Lucene.Net.Util;
+using NUnit.Framework;
+
+namespace Lucene.Net.Sandbox.Queries
+{
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements. See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License. You may obtain a copy of the License at
+     *
+     * http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    /// <summary>
+    /// Simple tests for SortedSetSortField, indexing the sortedset up front
+    /// </summary>
+    [SuppressCodecs("Lucene40", "Lucene41", "Appending", "Lucene3x")]// avoid codecs that don't support sortedset
+    public class TestSortedSetSortFieldDocValues : LuceneTestCase
+    {
+        public override void SetUp()
+        {
+            base.SetUp();
+            // ensure there is nothing in fieldcache before test starts
+            FieldCache.DEFAULT.PurgeAllCaches();
+        }
+
+        private void assertNoFieldCaches()
+        {
+            // docvalues sorting should NOT create any fieldcache entries!
+            assertEquals(0, FieldCache.DEFAULT.CacheEntries.Length);
+        }
+
+        /// <summary>
+        /// Indexes one document per spec, in order, and returns a reader over them.
+        /// Each spec is { id, value... }: the values are added as "value" sorted-set
+        /// doc values first, then the id as the stored "id" field.
+        /// </summary>
+        private IndexReader IndexDocs(Directory directory, params string[][] specs)
+        {
+            RandomIndexWriter indexWriter = new RandomIndexWriter(Random(), directory, Similarity, TimeZone);
+            foreach (string[] spec in specs)
+            {
+                Document document = new Document();
+                for (int v = 1; v < spec.Length; v++)
+                {
+                    document.Add(new SortedSetDocValuesField("value", new BytesRef(spec[v])));
+                }
+                document.Add(NewStringField("id", spec[0], Field.Store.YES));
+                indexWriter.AddDocument(document);
+            }
+            IndexReader reader = indexWriter.Reader;
+            indexWriter.Dispose();
+            return reader;
+        }
+
+        /// <summary>
+        /// Runs a match-all search sorted by <paramref name="sortField"/> and asserts that
+        /// the hits are exactly the documents with the given ids, in that order.
+        /// </summary>
+        private void AssertIdOrder(IndexReader reader, SortField sortField, params string[] expectedIds)
+        {
+            IndexSearcher searcher = NewSearcher(reader);
+            TopDocs hits = searcher.Search(new MatchAllDocsQuery(), 10, new Sort(sortField));
+            assertEquals(expectedIds.Length, hits.TotalHits);
+            for (int rank = 0; rank < expectedIds.Length; rank++)
+            {
+                assertEquals(expectedIds[rank], searcher.Doc(hits.ScoreDocs[rank].Doc).Get("id"));
+            }
+        }
+
+        [Test]
+        public void TestForward()
+        {
+            Directory directory = NewDirectory();
+            IndexReader reader = IndexDocs(directory,
+                new string[] { "2", "baz" },
+                new string[] { "1", "foo", "bar" });
+
+            // 'bar' comes before 'baz'
+            AssertIdOrder(reader, new SortedSetSortField("value", false), "1", "2");
+            assertNoFieldCaches();
+
+            reader.Dispose();
+            directory.Dispose();
+        }
+
+        [Test]
+        public void TestReverse()
+        {
+            Directory directory = NewDirectory();
+            IndexReader reader = IndexDocs(directory,
+                new string[] { "1", "foo", "bar" },
+                new string[] { "2", "baz" });
+
+            // reversed sort: the 'baz' document is expected ahead of the 'bar' document
+            AssertIdOrder(reader, new SortedSetSortField("value", true), "2", "1");
+            assertNoFieldCaches();
+
+            reader.Dispose();
+            directory.Dispose();
+        }
+
+        [Test]
+        public void TestMissingFirst()
+        {
+            Directory directory = NewDirectory();
+            IndexReader reader = IndexDocs(directory,
+                new string[] { "2", "baz" },
+                new string[] { "1", "foo", "bar" },
+                new string[] { "3" });
+
+            SortField missingFirst = new SortedSetSortField("value", false);
+            missingFirst.MissingValue = (SortField.STRING_FIRST);
+
+            // null comes first, then 'bar' comes before 'baz'
+            AssertIdOrder(reader, missingFirst, "3", "1", "2");
+            assertNoFieldCaches();
+
+            reader.Dispose();
+            directory.Dispose();
+        }
+
+        [Test]
+        public void TestMissingLast()
+        {
+            Directory directory = NewDirectory();
+            IndexReader reader = IndexDocs(directory,
+                new string[] { "2", "baz" },
+                new string[] { "1", "foo", "bar" },
+                new string[] { "3" });
+
+            SortField missingLast = new SortedSetSortField("value", false);
+            missingLast.MissingValue = (SortField.STRING_LAST);
+
+            // 'bar' comes before 'baz', null comes last
+            AssertIdOrder(reader, missingLast, "1", "2", "3");
+            assertNoFieldCaches();
+
+            reader.Dispose();
+            directory.Dispose();
+        }
+
+        [Test]
+        public void TestSingleton()
+        {
+            Directory directory = NewDirectory();
+            IndexReader reader = IndexDocs(directory,
+                new string[] { "2", "baz" },
+                new string[] { "1", "bar" });
+
+            // 'bar' comes before 'baz'
+            AssertIdOrder(reader, new SortedSetSortField("value", false), "1", "2");
+            assertNoFieldCaches();
+
+            reader.Dispose();
+            directory.Dispose();
+        }
+    }
+}