From: digy@apache.org
To: lucene-net-commits@lucene.apache.org
Date: Sun, 17 Jul 2011 16:32:30 -0000
Message-Id: <20110717163230.865FF23888C2@eris.apache.org>
Subject: [Lucene.Net] svn commit: r1147679 [1/2] - in /incubator/lucene.net/branches/Lucene.Net_2_9_4g/test/contrib/Analyzers: ./ Filters/ Miscellaneous/ Properties/ Shingle/

Author: digy
Date: Sun Jul 17 16:32:29 2011
New Revision: 1147679

URL: http://svn.apache.org/viewvc?rev=1147679&view=rev
Log:
[LUCENENET-437] for 2.9.4g

Added:
    incubator/lucene.net/branches/Lucene.Net_2_9_4g/test/contrib/Analyzers/Miscellaneous/
    incubator/lucene.net/branches/Lucene.Net_2_9_4g/test/contrib/Analyzers/Miscellaneous/ChainedFilterTest.cs
    incubator/lucene.net/branches/Lucene.Net_2_9_4g/test/contrib/Analyzers/Miscellaneous/TestPrefixAndSuffixAwareTokenFilter.cs
    incubator/lucene.net/branches/Lucene.Net_2_9_4g/test/contrib/Analyzers/Miscellaneous/TestPrefixAwareTokenFilter.cs
    incubator/lucene.net/branches/Lucene.Net_2_9_4g/test/contrib/Analyzers/Shingle/
    incubator/lucene.net/branches/Lucene.Net_2_9_4g/test/contrib/Analyzers/Shingle/ShingleAnalyzerWrapperTest.cs
    incubator/lucene.net/branches/Lucene.Net_2_9_4g/test/contrib/Analyzers/Shingle/ShingleFilterTest.cs
    incubator/lucene.net/branches/Lucene.Net_2_9_4g/test/contrib/Analyzers/Shingle/TestShingleMatrixFilter.cs
Removed:
    incubator/lucene.net/branches/Lucene.Net_2_9_4g/test/contrib/Analyzers/Filters/
Modified:
    incubator/lucene.net/branches/Lucene.Net_2_9_4g/test/contrib/Analyzers/Contrib.Analyzers.Test.csproj
    incubator/lucene.net/branches/Lucene.Net_2_9_4g/test/contrib/Analyzers/Properties/AssemblyInfo.cs

Modified: incubator/lucene.net/branches/Lucene.Net_2_9_4g/test/contrib/Analyzers/Contrib.Analyzers.Test.csproj
URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net_2_9_4g/test/contrib/Analyzers/Contrib.Analyzers.Test.csproj?rev=1147679&r1=1147678&r2=1147679&view=diff
==============================================================================
--- incubator/lucene.net/branches/Lucene.Net_2_9_4g/test/contrib/Analyzers/Contrib.Analyzers.Test.csproj (original)
+++ incubator/lucene.net/branches/Lucene.Net_2_9_4g/test/contrib/Analyzers/Contrib.Analyzers.Test.csproj Sun Jul 17 16:32:29 2011
@@ -8,7 +8,7 @@
     {67D27628-F1D5-4499-9818-B669731925C8}
     Library
     Properties
-    Lucene.Net.Analyzers
+    Lucene.Net.Analysis
     Lucene.Net.Analyzers.Test
     v4.0
     512
@@ -36,7 +36,7 @@
     true
     full
     false
-    ..\..\..\bin\contrib\Analyzers\Debug\
+    ..\..\..\bin\contrib\Analyzers\
     DEBUG;TRACE
     prompt
     4
@@ -59,12 +59,17 @@
-
+
+
+
+
+
+

Added: incubator/lucene.net/branches/Lucene.Net_2_9_4g/test/contrib/Analyzers/Miscellaneous/ChainedFilterTest.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net_2_9_4g/test/contrib/Analyzers/Miscellaneous/ChainedFilterTest.cs?rev=1147679&view=auto
==============================================================================
--- incubator/lucene.net/branches/Lucene.Net_2_9_4g/test/contrib/Analyzers/Miscellaneous/ChainedFilterTest.cs (added)
+++ incubator/lucene.net/branches/Lucene.Net_2_9_4g/test/contrib/Analyzers/Miscellaneous/ChainedFilterTest.cs Sun Jul 17 16:32:29 2011
@@ -0,0 +1,218 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+
+using Lucene.Net.Documents;
+using Lucene.Net.Index;
+using Lucene.Net.Search;
+using Lucene.Net.Store;
+using Lucene.Net.Analysis;
+using Lucene.Net.Util;
+
+using NUnit.Framework;
+
+namespace Lucene.Net.Analysis.Miscellaneous
+{
+    public class ChainedFilterTest : Lucene.Net.TestCase
+    {
+        public static int MAX = 500;
+
+        private RAMDirectory directory;
+        private IndexSearcher searcher;
+        private Query query;
+        // private DateFilter dateFilter;  DateFilter was deprecated and removed
+        private TermRangeFilter dateFilter;
+        private QueryWrapperFilter bobFilter;
+        private QueryWrapperFilter sueFilter;
+
+        [SetUp]
+        public void SetUp()
+        {
+            directory = new RAMDirectory();
+            IndexWriter writer =
+                new IndexWriter(directory, new WhitespaceAnalyzer(), true, IndexWriter.MaxFieldLength.UNLIMITED);
+
+            // DateTime ticks count from year 1, not the Unix epoch, so build the
+            // date from the epoch instead of multiplying the raw millisecond value.
+            DateTime cal = new DateTime(1970, 1, 1).AddMilliseconds(1041397200000L); // 2003 January 01
+
+            for (int i = 0; i < MAX; i++)
+            {
+                Document doc = new Document();
+                doc.Add(new Field("key", "" + (i + 1), Field.Store.YES, Field.Index.NOT_ANALYZED));
+                doc.Add(new Field("owner", (i < MAX / 2) ?
"bob" : "sue", Field.Store.YES, Field.Index.NOT_ANALYZED)); + doc.Add(new Field("date", (cal.Ticks / TimeSpan.TicksPerMillisecond).ToString(), Field.Store.YES, Field.Index.NOT_ANALYZED)); + writer.AddDocument(doc); + + cal.AddMilliseconds(1); + } + + writer.Close(); + + searcher = new IndexSearcher(directory, true); + + // query for everything to make life easier + BooleanQuery bq = new BooleanQuery(); + bq.Add(new TermQuery(new Term("owner", "bob")), BooleanClause.Occur.SHOULD); + bq.Add(new TermQuery(new Term("owner", "sue")), BooleanClause.Occur.SHOULD); + query = bq; + + // date filter matches everything too + //Date pastTheEnd = parseDate("2099 Jan 1"); + // dateFilter = DateFilter.Before("date", pastTheEnd); + // just treat dates as strings and select the whole range for now... + dateFilter = new TermRangeFilter("date", "", "ZZZZ", true, true); + + bobFilter = new QueryWrapperFilter( + new TermQuery(new Term("owner", "bob"))); + sueFilter = new QueryWrapperFilter( + new TermQuery(new Term("owner", "sue"))); + } + + private ChainedFilter GetChainedFilter(Filter[] chain, ChainedFilter.Logic[] logic) + { + if (logic == null) + { + return new ChainedFilter(chain); + } + else + { + return new ChainedFilter(chain, logic); + } + } + + private ChainedFilter GetChainedFilter(Filter[] chain, ChainedFilter.Logic logic) + { + return new ChainedFilter(chain, logic); + } + + + [Test] + public void TestSingleFilter() + { + ChainedFilter chain = GetChainedFilter(new Filter[] { dateFilter }, null); + + int numHits = searcher.Search(query, chain, 1000).TotalHits; + Assert.AreEqual(MAX, numHits); + + chain = new ChainedFilter(new Filter[] { bobFilter }); + numHits = searcher.Search(query, chain, 1000).TotalHits; + Assert.AreEqual(MAX / 2, numHits); + + chain = GetChainedFilter(new Filter[] { bobFilter }, new ChainedFilter.Logic[] { ChainedFilter.Logic.AND }); + TopDocs hits = searcher.Search(query, chain, 1000); + numHits = hits.TotalHits; + Assert.AreEqual(MAX / 2, numHits); + Assert.AreEqual("bob", searcher.Doc(hits.ScoreDocs[0].doc).Get("owner")); + + chain = GetChainedFilter(new Filter[] { bobFilter }, new ChainedFilter.Logic[] { ChainedFilter.Logic.ANDNOT }); + hits = searcher.Search(query, chain, 1000); + numHits = hits.TotalHits; + Assert.AreEqual(MAX / 2, numHits); + Assert.AreEqual("sue", searcher.Doc(hits.ScoreDocs[0].doc).Get("owner")); + } + + [Test] + public void TestOR() + { + ChainedFilter chain = GetChainedFilter( + new Filter[] { sueFilter, bobFilter }, null); + + int numHits = searcher.Search(query, chain, 1000).TotalHits; + Assert.AreEqual(MAX, numHits, "OR matches all"); + } + + [Test] + public void TestAND() + { + ChainedFilter chain = GetChainedFilter( + new Filter[] { dateFilter, bobFilter }, ChainedFilter.Logic.AND); + + TopDocs hits = searcher.Search(query, chain, 1000); + Assert.AreEqual(MAX / 2, hits.TotalHits, "AND matches just bob"); + Assert.AreEqual("bob", searcher.Doc(hits.ScoreDocs[0].doc).Get("owner")); + } + + [Test] + public void TestXOR() + { + ChainedFilter chain = GetChainedFilter( + new Filter[] { dateFilter, bobFilter }, ChainedFilter.Logic.XOR); + + TopDocs hits = searcher.Search(query, chain, 1000); + Assert.AreEqual(MAX / 2, hits.TotalHits, "XOR matches sue"); + Assert.AreEqual("sue", searcher.Doc(hits.ScoreDocs[0].doc).Get("owner")); + } + + [Test] + public void TestANDNOT() + { + ChainedFilter chain = GetChainedFilter( + new Filter[] { dateFilter, sueFilter }, + new ChainedFilter.Logic[] { ChainedFilter.Logic.AND, ChainedFilter.Logic.ANDNOT }); + + TopDocs 
+            TopDocs hits = searcher.Search(query, chain, 1000);
+            Assert.AreEqual(MAX / 2, hits.TotalHits, "ANDNOT matches just bob");
+            Assert.AreEqual("bob", searcher.Doc(hits.ScoreDocs[0].doc).Get("owner"));
+
+            chain = GetChainedFilter(
+                new Filter[] { bobFilter, bobFilter },
+                new ChainedFilter.Logic[] { ChainedFilter.Logic.ANDNOT, ChainedFilter.Logic.ANDNOT });
+
+            hits = searcher.Search(query, chain, 1000);
+            Assert.AreEqual(MAX / 2, hits.TotalHits, "ANDNOT bob ANDNOT bob matches all sues");
+            Assert.AreEqual("sue", searcher.Doc(hits.ScoreDocs[0].doc).Get("owner"));
+        }
+
+        /*
+        private Date parseDate(String s) throws ParseException {
+            return new SimpleDateFormat("yyyy MMM dd", Locale.US).parse(s);
+        }
+        */
+
+        [Test]
+        public void TestWithCachingFilter()
+        {
+            Directory dir = new RAMDirectory();
+            Analyzer analyzer = new WhitespaceAnalyzer();
+
+            IndexWriter writer = new IndexWriter(dir, analyzer, true, IndexWriter.MaxFieldLength.LIMITED);
+            writer.Close();
+
+            Searcher searcher = new IndexSearcher(dir, true);
+
+            Query query = new TermQuery(new Term("none", "none"));
+
+            QueryWrapperFilter queryFilter = new QueryWrapperFilter(query);
+            CachingWrapperFilter cachingFilter = new CachingWrapperFilter(queryFilter);
+
+            searcher.Search(query, cachingFilter, 1);
+
+            CachingWrapperFilter cachingFilter2 = new CachingWrapperFilter(queryFilter);
+            Filter[] chain = new Filter[2];
+            chain[0] = cachingFilter;
+            chain[1] = cachingFilter2;
+            ChainedFilter cf = new ChainedFilter(chain);
+
+            // throws java.lang.ClassCastException: org.apache.lucene.util.OpenBitSet cannot be cast to java.util.BitSet
+            searcher.Search(new MatchAllDocsQuery(), cf, 1);
+        }
+
+    }
+}
\ No newline at end of file
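
The boolean semantics these assertions encode are plain set algebra over document ids. The sketch below is a behavioural model inferred from the assertions above, not the committed ChainedFilter code; the ChainedLogicSketch helper, its Logic enum, and the seeding rule (AND/ANDNOT chains start from the full doc set, OR/XOR chains from an empty one) are assumptions that merely reproduce what the tests observe, e.g. that a lone ANDNOT(bob) link selects exactly the "sue" half of the index.

using System;
using System.Collections;

// Illustrative model only, NOT the ChainedFilter implementation: fold each
// filter's doc-id set into a running result using the per-link logic.
internal enum ChainLogic { Or, And, Xor, AndNot }

internal static class ChainedLogicSketch
{
    public static BitArray Combine(int numDocs, BitArray[] filters, ChainLogic[] logic)
    {
        // Assumed seeding rule: narrowing logics start from the full universe,
        // accumulating logics from the empty set (consistent with TestSingleFilter).
        bool startFull = logic[0] == ChainLogic.And || logic[0] == ChainLogic.AndNot;
        var result = new BitArray(numDocs, startFull);

        for (int i = 0; i < filters.Length; i++)
        {
            switch (logic[i])
            {
                case ChainLogic.Or:     result.Or(filters[i]);  break;
                case ChainLogic.And:    result.And(filters[i]); break;
                case ChainLogic.Xor:    result.Xor(filters[i]); break;
                case ChainLogic.AndNot: result.And(new BitArray(filters[i]).Not()); break;
            }
        }
        return result;
    }
}

With 500 documents split half "bob", half "sue", Combine(500, new[] { bob, bob }, new[] { ChainLogic.AndNot, ChainLogic.AndNot }) yields the 250 "sue" documents, matching the "ANDNOT bob ANDNOT bob matches all sues" assertion.
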

Added: incubator/lucene.net/branches/Lucene.Net_2_9_4g/test/contrib/Analyzers/Miscellaneous/TestPrefixAndSuffixAwareTokenFilter.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net_2_9_4g/test/contrib/Analyzers/Miscellaneous/TestPrefixAndSuffixAwareTokenFilter.cs?rev=1147679&view=auto
==============================================================================
--- incubator/lucene.net/branches/Lucene.Net_2_9_4g/test/contrib/Analyzers/Miscellaneous/TestPrefixAndSuffixAwareTokenFilter.cs (added)
+++ incubator/lucene.net/branches/Lucene.Net_2_9_4g/test/contrib/Analyzers/Miscellaneous/TestPrefixAndSuffixAwareTokenFilter.cs Sun Jul 17 16:32:29 2011
@@ -0,0 +1,48 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System;
+using System.IO;
+using Lucene.Net.Analysis;
+using NUnit.Framework;
+
+namespace Lucene.Net.Analysis.Miscellaneous
+{
+    public class TestPrefixAndSuffixAwareTokenFilter : BaseTokenStreamTestCase
+    {
+        [Test]
+        public void TestTokenStreamContents()
+        {
+            var ts = new PrefixAndSuffixAwareTokenFilter(
+                new SingleTokenTokenStream(CreateToken("^", 0, 0)),
+                new WhitespaceTokenizer(new StringReader("hello world")),
+                new SingleTokenTokenStream(CreateToken("$", 0, 0)));
+
+            AssertTokenStreamContents(ts,
+                new[] {"^", "hello", "world", "$"},
+                new[] {0, 0, 6, 11},
+                new[] {0, 5, 11, 11});
+        }
+
+        private static Token CreateToken(String term, int start, int offset)
+        {
+            var token = new Token(start, offset);
+            token.SetTermBuffer(term);
+            return token;
+        }
+    }
+}
\ No newline at end of file
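
The offset assertions above deserve a second look: the "$" token is created with offsets (0, 0) but asserted at (11, 11). A plausible reading, sketched below with a hypothetical helper (Rebase is not part of the committed filter), is that suffix tokens keep their relative offsets and are re-based onto the final end offset of the stream that precedes them:

using System;

internal static class SuffixOffsetSketch
{
    // Hypothetical helper: shift a suffix token's offset by the end offset of
    // the preceding stream ("hello world" ends at offset 11).
    private static int Rebase(int offset, int precedingStreamEnd)
    {
        return offset + precedingStreamEnd;
    }

    private static void Main()
    {
        int mainEnd = 11; // end offset of "world"
        Console.WriteLine("$ start: {0}", Rebase(0, mainEnd)); // prints 11
        Console.WriteLine("$ end:   {0}", Rebase(0, mainEnd)); // prints 11
    }
}
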

Added: incubator/lucene.net/branches/Lucene.Net_2_9_4g/test/contrib/Analyzers/Miscellaneous/TestPrefixAwareTokenFilter.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net_2_9_4g/test/contrib/Analyzers/Miscellaneous/TestPrefixAwareTokenFilter.cs?rev=1147679&view=auto
==============================================================================
--- incubator/lucene.net/branches/Lucene.Net_2_9_4g/test/contrib/Analyzers/Miscellaneous/TestPrefixAwareTokenFilter.cs (added)
+++ incubator/lucene.net/branches/Lucene.Net_2_9_4g/test/contrib/Analyzers/Miscellaneous/TestPrefixAwareTokenFilter.cs Sun Jul 17 16:32:29 2011
@@ -0,0 +1,58 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System;
+using System.IO;
+using Lucene.Net.Analysis;
+using NUnit.Framework;
+
+namespace Lucene.Net.Analysis.Miscellaneous
+{
+    public class TestPrefixAwareTokenFilter : BaseTokenStreamTestCase
+    {
+        [Test]
+        public void TestTokenStreamContents()
+        {
+            var ts = new PrefixAwareTokenFilter(
+                new SingleTokenTokenStream(CreateToken("a", 0, 1)),
+                new SingleTokenTokenStream(CreateToken("b", 0, 1)));
+
+            AssertTokenStreamContents(ts,
+                new[] {"a", "b"},
+                new[] {0, 1},
+                new[] {1, 2});
+
+            // prefix and suffix using 2x prefix
+
+            ts = new PrefixAwareTokenFilter(new SingleTokenTokenStream(CreateToken("^", 0, 0)),
+                new WhitespaceTokenizer(new StringReader("hello world")));
+            ts = new PrefixAwareTokenFilter(ts, new SingleTokenTokenStream(CreateToken("$", 0, 0)));
+
+            AssertTokenStreamContents(ts,
+                new[] {"^", "hello", "world", "$"},
+                new[] {0, 0, 6, 11},
+                new[] {0, 5, 11, 11});
+        }
+
+        private static Token CreateToken(String term, int start, int offset)
+        {
+            var token = new Token(start, offset);
+            token.SetTermBuffer(term);
+            return token;
+        }
+    }
+}
\ No newline at end of file

Modified: incubator/lucene.net/branches/Lucene.Net_2_9_4g/test/contrib/Analyzers/Properties/AssemblyInfo.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net_2_9_4g/test/contrib/Analyzers/Properties/AssemblyInfo.cs?rev=1147679&r1=1147678&r2=1147679&view=diff
==============================================================================
--- incubator/lucene.net/branches/Lucene.Net_2_9_4g/test/contrib/Analyzers/Properties/AssemblyInfo.cs (original)
+++ incubator/lucene.net/branches/Lucene.Net_2_9_4g/test/contrib/Analyzers/Properties/AssemblyInfo.cs Sun Jul 17 16:32:29 2011
@@ -32,5 +32,5 @@ using System.Runtime.InteropServices;
 // You can specify all the values or you can default the Build and Revision Numbers
 // by using the '*' as shown below:
 // [assembly: AssemblyVersion("1.0.*")]
-[assembly: AssemblyVersion("2.9.2.1")]
-[assembly: AssemblyFileVersion("2.9.2.1")]
+[assembly: AssemblyVersion("2.9.4.2")]
+[assembly: AssemblyFileVersion("2.9.4.2")]

Added: incubator/lucene.net/branches/Lucene.Net_2_9_4g/test/contrib/Analyzers/Shingle/ShingleAnalyzerWrapperTest.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net_2_9_4g/test/contrib/Analyzers/Shingle/ShingleAnalyzerWrapperTest.cs?rev=1147679&view=auto
==============================================================================
--- incubator/lucene.net/branches/Lucene.Net_2_9_4g/test/contrib/Analyzers/Shingle/ShingleAnalyzerWrapperTest.cs (added)
+++ incubator/lucene.net/branches/Lucene.Net_2_9_4g/test/contrib/Analyzers/Shingle/ShingleAnalyzerWrapperTest.cs Sun Jul 17 16:32:29 2011
@@ -0,0 +1,293 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System;
+using System.IO;
+using Lucene.Net.Analysis;
+using Lucene.Net.Analysis.Tokenattributes;
+using Lucene.Net.Documents;
+using Lucene.Net.Index;
+using Lucene.Net.QueryParsers;
+using Lucene.Net.Search;
+using Lucene.Net.Store;
+using NUnit.Framework;
+using Directory = Lucene.Net.Store.Directory;
+
+namespace Lucene.Net.Analysis.Shingle
+{
+    /// <summary>
+    /// A test class for ShingleAnalyzerWrapper as regards queries and scoring.
+    /// </summary>
+    public class ShingleAnalyzerWrapperTest : BaseTokenStreamTestCase
+    {
+        public IndexSearcher Searcher;
+
+        /// <summary>
+        /// Set up a new index in RAM with three test phrases and the supplied Analyzer.
+        /// </summary>
+        /// <param name="analyzer">the analyzer to use</param>
+        /// <returns>an indexSearcher on the test index.</returns>
+        public IndexSearcher SetUpSearcher(Analyzer analyzer)
+        {
+            Directory dir = new RAMDirectory();
+            var writer = new IndexWriter(dir, analyzer, true);
+
+            var doc = new Document();
+            doc.Add(new Field("content", "please divide this sentence into shingles",
+                              Field.Store.YES, Field.Index.ANALYZED));
+            writer.AddDocument(doc);
+
+            doc = new Document();
+            doc.Add(new Field("content", "just another test sentence",
+                              Field.Store.YES, Field.Index.ANALYZED));
+            writer.AddDocument(doc);
+
+            doc = new Document();
+            doc.Add(new Field("content", "a sentence which contains no test",
+                              Field.Store.YES, Field.Index.ANALYZED));
+            writer.AddDocument(doc);
+
+            writer.Close();
+
+            return new IndexSearcher(dir);
+        }
+
+        protected Hits QueryParsingTest(Analyzer analyzer, String qs)
+        {
+            Searcher = SetUpSearcher(analyzer);
+
+            var qp = new QueryParser("content", analyzer);
+
+            var q = qp.Parse(qs);
+
+            return Searcher.Search(q);
+        }
+
+        protected void CompareRanks(Hits hits, int[] ranks)
+        {
+            Assert.AreEqual(ranks.Length, hits.Length());
+            for (int i = 0; i < ranks.Length; i++)
+            {
+                Assert.AreEqual(ranks[i], hits.Id(i));
+            }
+        }
+
+        /// <summary>
+        /// Will not work on an index without unigrams, since QueryParser automatically tokenizes on whitespace.
+        /// </summary>
+        [Test]
+        public void TestShingleAnalyzerWrapperQueryParsing()
+        {
+            var hits = QueryParsingTest(new ShingleAnalyzerWrapper(new WhitespaceAnalyzer(), 2), "test sentence");
+            var ranks = new[] {1, 2, 0};
+            CompareRanks(hits, ranks);
+        }
+
+        /// <summary>
+        /// This one fails with an exception.
+        /// </summary>
+        [Test]
+        public void TestShingleAnalyzerWrapperPhraseQueryParsingFails()
+        {
+            var hits = QueryParsingTest(new ShingleAnalyzerWrapper(new WhitespaceAnalyzer(), 2), "\"this sentence\"");
+            var ranks = new[] {0};
+            CompareRanks(hits, ranks);
+        }
+
+        /// <summary>
+        /// This one works, actually.
+        /// </summary>
+        [Test]
+        public void TestShingleAnalyzerWrapperPhraseQueryParsing()
+        {
+            var hits = QueryParsingTest(new ShingleAnalyzerWrapper
+                                            (new WhitespaceAnalyzer(), 2),
+                                        "\"test sentence\"");
+            var ranks = new[] {1};
+            CompareRanks(hits, ranks);
+        }
+
+        /// <summary>
+        /// Same as above, is tokenized without using the analyzer.
+        /// </summary>
+        [Test]
+        public void TestShingleAnalyzerWrapperRequiredQueryParsing()
+        {
+            var hits = QueryParsingTest(new ShingleAnalyzerWrapper
+                                            (new WhitespaceAnalyzer(), 2),
+                                        "+test +sentence");
+            var ranks = new[] {1, 2};
+            CompareRanks(hits, ranks);
+        }
+
+        /// <summary>
+        /// This shows how to construct a phrase query containing shingles.
+        /// </summary>
+        [Test]
+        public void TestShingleAnalyzerWrapperPhraseQuery()
+        {
+            Analyzer analyzer = new ShingleAnalyzerWrapper(new WhitespaceAnalyzer(), 2);
+            Searcher = SetUpSearcher(analyzer);
+
+            var q = new PhraseQuery();
+
+            var ts = analyzer.TokenStream("content", new StringReader("this sentence"));
+            var j = -1;
+
+            var posIncrAtt = (PositionIncrementAttribute) ts.AddAttribute(typeof (PositionIncrementAttribute));
+            var termAtt = (TermAttribute) ts.AddAttribute(typeof (TermAttribute));
+
+            while (ts.IncrementToken())
+            {
+                j += posIncrAtt.GetPositionIncrement();
+                var termText = termAtt.Term();
+                q.Add(new Term("content", termText), j);
+            }
+
+            var hits = Searcher.Search(q);
+            var ranks = new[] {0};
+            CompareRanks(hits, ranks);
+        }
+
+        /// <summary>
+        /// How to construct a boolean query with shingles. A query like this will
+        /// implicitly score those documents higher that contain the words in the query
+        /// in the right order and adjacent to each other.
+        /// </summary>
+        [Test]
+        public void TestShingleAnalyzerWrapperBooleanQuery()
+        {
+            Analyzer analyzer = new ShingleAnalyzerWrapper(new WhitespaceAnalyzer(), 2);
+            Searcher = SetUpSearcher(analyzer);
+
+            var q = new BooleanQuery();
+
+            var ts = analyzer.TokenStream("content", new StringReader("test sentence"));
+
+            var termAtt = (TermAttribute) ts.AddAttribute(typeof (TermAttribute));
+
+            while (ts.IncrementToken())
+            {
+                var termText = termAtt.Term();
+                q.Add(new TermQuery(new Term("content", termText)),
+                      BooleanClause.Occur.SHOULD);
+            }
+
+            var hits = Searcher.Search(q);
+            var ranks = new[] {1, 2, 0};
+            CompareRanks(hits, ranks);
+        }
+
+        [Test]
+        public void TestReusableTokenStream()
+        {
+            Analyzer a = new ShingleAnalyzerWrapper(new WhitespaceAnalyzer(), 2);
+            AssertAnalyzesToReuse(a, "please divide into shingles",
+                new[]
+                {
+                    "please", "please divide", "divide", "divide into", "into", "into shingles",
+                    "shingles"
+                },
+                new[] {0, 0, 7, 7, 14, 14, 19},
+                new[] {6, 13, 13, 18, 18, 27, 27},
+                new[] {1, 0, 1, 0, 1, 0, 1});
+            AssertAnalyzesToReuse(a, "divide me up again",
+                new[] {"divide", "divide me", "me", "me up", "up", "up again", "again"},
+                new[] {0, 0, 7, 7, 10, 10, 13},
+                new[] {6, 9, 9, 12, 12, 18, 18},
+                new[] {1, 0, 1, 0, 1, 0, 1});
+        }
+
+        /// <summary>
+        /// Subclass that acts just like a whitespace analyzer, for testing.
+        /// </summary>
+        [Test]
+        public void TestLucene1678BwComp()
+        {
+            Analyzer a = new ShingleWrapperSubclassAnalyzer();
+            AssertAnalyzesToReuse(a, "this is a test",
+                new[] {"this", "is", "a", "test"},
+                new[] {0, 5, 8, 10},
+                new[] {4, 7, 9, 14});
+        }
+
+        /// <summary>
+        /// Analyzer that does not support reuse; it is LetterTokenizer on odd invocations, WhitespaceTokenizer on even.
+        /// </summary>
+        [Test]
+        public void TestWrappedAnalyzerDoesNotReuse()
+        {
+            Analyzer a = new ShingleAnalyzerWrapper(new NonreusableAnalyzer());
+            AssertAnalyzesToReuse(a, "please divide into shingles.",
+                new[]
+                {
+                    "please", "please divide", "divide", "divide into", "into", "into shingles",
+                    "shingles"
+                },
+                new[] {0, 0, 7, 7, 14, 14, 19},
+                new[] {6, 13, 13, 18, 18, 27, 27},
+                new[] {1, 0, 1, 0, 1, 0, 1});
+            AssertAnalyzesToReuse(a, "please divide into shingles.",
+                new[]
+                {
+                    "please", "please divide", "divide", "divide into", "into", "into shingles.",
+                    "shingles."
+                },
+                new[] {0, 0, 7, 7, 14, 14, 19},
+                new[] {6, 13, 13, 18, 18, 28, 28},
+                new[] {1, 0, 1, 0, 1, 0, 1});
+            AssertAnalyzesToReuse(a, "please divide into shingles.",
+                new[]
+                {
+                    "please", "please divide", "divide", "divide into", "into", "into shingles",
+                    "shingles"
+                },
+                new[] {0, 0, 7, 7, 14, 14, 19},
+                new[] {6, 13, 13, 18, 18, 27, 27},
+                new[] {1, 0, 1, 0, 1, 0, 1});
+        }
+
+        #region Nested type: NonreusableAnalyzer
+
+        private class NonreusableAnalyzer : Analyzer
+        {
+            private int _invocationCount;
+
+            public override TokenStream TokenStream(String fieldName, TextReader reader)
+            {
+                if (++_invocationCount % 2 == 0)
+                    return new WhitespaceTokenizer(reader);
+
+                return new LetterTokenizer(reader);
+            }
+        }
+
+        #endregion
+
+        #region Nested type: ShingleWrapperSubclassAnalyzer
+
+        private class ShingleWrapperSubclassAnalyzer : ShingleAnalyzerWrapper
+        {
+            public override TokenStream TokenStream(String fieldName, TextReader reader)
+            {
+                return new WhitespaceTokenizer(reader);
+            }
+        };
+
+        #endregion
+    }
+}
\ No newline at end of file
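
For orientation before the ShingleFilter fixtures that follow: a shingle is a word n-gram, and the expected streams asserted above (unigrams interleaved with the overlapping bigrams they start) can be derived from plain text as below. The helper is illustrative only and uses no Lucene API:

using System;
using System.Collections.Generic;

internal static class ShingleSketch
{
    // Emit each word (position increment 1) followed by the bigram it starts
    // (position increment 0), mirroring the expected token streams above.
    private static IEnumerable<string> UnigramsAndBigrams(string text)
    {
        string[] words = text.Split(' ');
        for (int i = 0; i < words.Length; i++)
        {
            yield return words[i];
            if (i + 1 < words.Length)
                yield return words[i] + " " + words[i + 1];
        }
    }

    private static void Main()
    {
        // Prints: please, please divide, divide, divide into, into, into shingles, shingles
        Console.WriteLine(string.Join(", ",
            UnigramsAndBigrams("please divide into shingles")));
    }
}
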

Added: incubator/lucene.net/branches/Lucene.Net_2_9_4g/test/contrib/Analyzers/Shingle/ShingleFilterTest.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net_2_9_4g/test/contrib/Analyzers/Shingle/ShingleFilterTest.cs?rev=1147679&view=auto
==============================================================================
--- incubator/lucene.net/branches/Lucene.Net_2_9_4g/test/contrib/Analyzers/Shingle/ShingleFilterTest.cs (added)
+++ incubator/lucene.net/branches/Lucene.Net_2_9_4g/test/contrib/Analyzers/Shingle/ShingleFilterTest.cs Sun Jul 17 16:32:29 2011
@@ -0,0 +1,530 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System;
+using System.IO;
+using Lucene.Net.Analysis;
+using Lucene.Net.Analysis.Tokenattributes;
+using NUnit.Framework;
+
+namespace Lucene.Net.Analysis.Shingle
+{
+    public class ShingleFilterTests : BaseTokenStreamTestCase
+    {
+        public static readonly Token[] TestToken = new[]
+        {
+            CreateToken("please", 0, 6),
+            CreateToken("divide", 7, 13),
+            CreateToken("this", 14, 18),
+            CreateToken("sentence", 19, 27),
+            CreateToken("into", 28, 32),
+            CreateToken("shingles", 33, 39),
+        };
+
+        public static Token[] TestTokenWithHoles;
+
+        public static readonly Token[] BiGramTokens = new[]
+        {
+            CreateToken("please", 0, 6),
+            CreateToken("please divide", 0, 13),
+            CreateToken("divide", 7, 13),
+            CreateToken("divide this", 7, 18),
+            CreateToken("this", 14, 18),
+            CreateToken("this sentence", 14, 27),
+            CreateToken("sentence", 19, 27),
+            CreateToken("sentence into", 19, 32),
+            CreateToken("into", 28, 32),
+            CreateToken("into shingles", 28, 39),
+            CreateToken("shingles", 33, 39),
+        };
+
+        public static readonly int[] BiGramPositionIncrements = new[]
+        {
+            1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
+        };
+
+        public static readonly String[] BiGramTypes = new[]
+        {
+            "word", "shingle", "word", "shingle", "word", "shingle", "word",
+            "shingle", "word", "shingle", "word"
+        };
+
+        public static readonly Token[] BiGramTokensWithHoles = new[]
+        {
+            CreateToken("please", 0, 6),
+            CreateToken("please divide", 0, 13),
+            CreateToken("divide", 7, 13),
+            CreateToken("divide _", 7, 19),
+            CreateToken("_", 19, 19),
+            CreateToken("_ sentence", 19, 27),
+            CreateToken("sentence", 19, 27),
+            CreateToken("sentence _", 19, 33),
+            CreateToken("_", 33, 33),
+            CreateToken("_ shingles", 33, 39),
+            CreateToken("shingles", 33, 39),
+        };
+
+        public static readonly int[] BiGramPositionIncrementsWithHoles = new[]
+        {
+            1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
+        };
+
+        public static readonly Token[] BiGramTokensWithoutUnigrams = new[]
+        {
+            CreateToken("please divide", 0, 13),
+            CreateToken("divide this", 7, 18),
+            CreateToken("this sentence", 14, 27),
+            CreateToken("sentence into", 19, 32),
+            CreateToken("into shingles", 28, 39),
+        };
+
+        public static readonly int[] BiGramPositionIncrementsWithoutUnigrams = new[]
+        {
+            1, 1, 1, 1, 1
+        };
+
+        public static readonly String[] BiGramTypesWithoutUnigrams = new[]
+        {
+            "shingle", "shingle", "shingle",
+            "shingle", "shingle"
+        };
+
+        public static readonly Token[] BiGramTokensWithHolesWithoutUnigrams = new[]
+        {
+            CreateToken("please divide", 0, 13),
+            CreateToken("divide _", 7, 19),
+            CreateToken("_ sentence", 19, 27),
+            CreateToken("sentence _", 19, 33),
+            CreateToken("_ shingles", 33, 39),
+        };
+
+        public static readonly int[] BiGramPositionIncrementsWithHolesWithoutUnigrams = new[]
+        {
+            1, 1, 1, 1, 1, 1
+        };
+
+
+        public static readonly Token[] TestSingleToken = new[] { CreateToken("please", 0, 6) };
+
+        public static readonly Token[] SingleToken = new[] { CreateToken("please", 0, 6) };
+
+        public static readonly int[] SingleTokenIncrements = new[] { 1 };
+
+        public static readonly String[] SingleTokenTypes = new[] { "word" };
+
+        public static readonly Token[] EmptyTokenArray = new Token[] { };
+
+        public static readonly int[] EmptyTokenIncrementsArray = new int[] { };
+
+        public static readonly String[] EmptyTokenTypesArray = new String[] { };
+
+        public static readonly Token[] TriGramTokens = new[]
+        {
+            CreateToken("please", 0, 6),
+            CreateToken("please divide", 0, 13),
+            CreateToken("please divide this", 0, 18),
+            CreateToken("divide", 7, 13),
+            CreateToken("divide this", 7, 18),
+            CreateToken("divide this sentence", 7, 27),
+            CreateToken("this", 14, 18),
+            CreateToken("this sentence", 14, 27),
+            CreateToken("this sentence into", 14, 32),
+            CreateToken("sentence", 19, 27),
+            CreateToken("sentence into", 19, 32),
+            CreateToken("sentence into shingles", 19, 39),
+            CreateToken("into", 28, 32),
+            CreateToken("into shingles", 28, 39),
+            CreateToken("shingles", 33, 39)
+        };
+
+        public static readonly int[] TriGramPositionIncrements = new[]
+        {
+            1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1
+        };
+
+        public static readonly String[] TriGramTypes = new[]
+        {
+            "word", "shingle", "shingle",
+            "word", "shingle", "shingle",
+            "word", "shingle", "shingle",
+            "word", "shingle", "shingle",
+            "word", "shingle",
+            "word"
+        };
+
+        public static readonly Token[] TriGramTokensWithoutUnigrams = new[]
+        {
+            CreateToken("please divide", 0, 13),
+            CreateToken("please divide this", 0, 18),
+            CreateToken("divide this", 7, 18),
+            CreateToken("divide this sentence", 7, 27),
+            CreateToken("this sentence", 14, 27),
+            CreateToken("this sentence into", 14, 32),
+            CreateToken("sentence into", 19, 32),
+            CreateToken("sentence into shingles", 19, 39),
+            CreateToken("into shingles", 28, 39),
+        };
+
+        public static readonly int[] TriGramPositionIncrementsWithoutUnigrams = new[]
+        {
+            1, 0, 1, 0, 1, 0, 1, 0, 1
+        };
+
+        public static readonly String[] TriGramTypesWithoutUnigrams = new[]
+        {
+            "shingle", "shingle",
+            "shingle", "shingle",
+            "shingle", "shingle",
+            "shingle", "shingle",
+            "shingle",
+        };
+
+        public static readonly Token[] FourGramTokens = new[]
+        {
+            CreateToken("please", 0, 6),
+            CreateToken("please divide", 0, 13),
+            CreateToken("please divide this", 0, 18),
+            CreateToken("please divide this sentence", 0, 27),
+            CreateToken("divide", 7, 13),
+            CreateToken("divide this", 7, 18),
+            CreateToken("divide this sentence", 7, 27),
+            CreateToken("divide this sentence into", 7, 32),
+            CreateToken("this", 14, 18),
+            CreateToken("this sentence", 14, 27),
+            CreateToken("this sentence into", 14, 32),
+            CreateToken("this sentence into shingles", 14, 39),
+            CreateToken("sentence", 19, 27),
+            CreateToken("sentence into", 19, 32),
+            CreateToken("sentence into shingles", 19, 39),
+            CreateToken("into", 28, 32),
+            CreateToken("into shingles", 28, 39),
+            CreateToken("shingles", 33, 39)
+        };
+
+        public static readonly int[] FourGramPositionIncrements = new[]
+        {
+            1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1
+        };
+
+        public static readonly String[] FourGramTypes = new[]
+        {
+            "word", "shingle", "shingle", "shingle",
+            "word", "shingle", "shingle", "shingle",
+            "word", "shingle", "shingle", "shingle",
+            "word", "shingle", "shingle",
+            "word", "shingle",
+            "word"
+        };
+
+        public static readonly Token[] FourGramTokensWithoutUnigrams = new[]
+        {
+            CreateToken("please divide", 0, 13),
+            CreateToken("please divide this", 0, 18),
+            CreateToken("please divide this sentence", 0, 27),
+            CreateToken("divide this", 7, 18),
+            CreateToken("divide this sentence", 7, 27),
+            CreateToken("divide this sentence into", 7, 32),
+            CreateToken("this sentence", 14, 27),
+            CreateToken("this sentence into", 14, 32),
+            CreateToken("this sentence into shingles", 14, 39),
+            CreateToken("sentence into", 19, 32),
+            CreateToken("sentence into shingles", 19, 39),
+            CreateToken("into shingles", 28, 39),
+        };
+
+        public static readonly int[] FourGramPositionIncrementsWithoutUnigrams = new[]
+        {
+            1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1
+        };
+
+        public static readonly String[] FourGramTypesWithoutUnigrams = new[]
+        {
+            "shingle", "shingle",
+            "shingle", "shingle",
+            "shingle", "shingle",
+            "shingle", "shingle",
+            "shingle", "shingle",
+            "shingle", "shingle",
+        };
+
+        private static Token CreateToken(String term, int start, int offset)
+        {
+            var token = new Token(start, offset);
+            token.SetTermBuffer(term);
+            return token;
+        }
+
+        [SetUp]
+        public override void SetUp()
+        {
+            base.SetUp();
+            TestTokenWithHoles = new[]
+            {
+                CreateToken("please", 0, 6),
+                CreateToken("divide", 7, 13),
+                CreateToken("sentence", 19, 27),
+                CreateToken("shingles", 33, 39),
+            };
+
+            TestTokenWithHoles[2].SetPositionIncrement(2);
+            TestTokenWithHoles[3].SetPositionIncrement(2);
+        }
+
+
+        /// <summary>
+        /// Class under test for void ShingleFilter(TokenStream, int)
+        /// </summary>
+        [Test]
+        public void TestBiGramFilter()
+        {
+            ShingleFilterTest(2, TestToken, BiGramTokens,
+                              BiGramPositionIncrements, BiGramTypes,
+                              true);
+        }
+
+        [Test]
+        public void TestBiGramFilterWithHoles()
+        {
+            ShingleFilterTest(2, TestTokenWithHoles, BiGramTokensWithHoles,
+                              BiGramPositionIncrements, BiGramTypes,
+                              true);
+        }
+
+        [Test]
+        public void TestBiGramFilterWithoutUnigrams()
+        {
+            ShingleFilterTest(2, TestToken, BiGramTokensWithoutUnigrams,
+                              BiGramPositionIncrementsWithoutUnigrams, BiGramTypesWithoutUnigrams,
+                              false);
+        }
+
+        [Test]
+        public void TestBiGramFilterWithHolesWithoutUnigrams()
+        {
+            ShingleFilterTest(2, TestTokenWithHoles, BiGramTokensWithHolesWithoutUnigrams,
+                              BiGramPositionIncrementsWithHolesWithoutUnigrams, BiGramTypesWithoutUnigrams,
+                              false);
+        }
+
+        [Test]
+        public void TestBiGramFilterWithSingleToken()
+        {
+            ShingleFilterTest(2, TestSingleToken, SingleToken,
+                              SingleTokenIncrements, SingleTokenTypes,
+                              true);
+        }
+
+        [Test]
+        public void TestBiGramFilterWithSingleTokenWithoutUnigrams()
+        {
+            ShingleFilterTest(2, TestSingleToken, EmptyTokenArray,
+                              EmptyTokenIncrementsArray, EmptyTokenTypesArray,
+                              false);
+        }
+
+        [Test]
+        public void TestBiGramFilterWithEmptyTokenStream()
+        {
+            ShingleFilterTest(2, EmptyTokenArray, EmptyTokenArray,
+                              EmptyTokenIncrementsArray, EmptyTokenTypesArray,
+                              true);
+        }
+
+        [Test]
+        public void TestBiGramFilterWithEmptyTokenStreamWithoutUnigrams()
+        {
+            ShingleFilterTest(2, EmptyTokenArray, EmptyTokenArray,
+                              EmptyTokenIncrementsArray, EmptyTokenTypesArray,
+                              false);
+        }
+
+        [Test]
+        public void TestTriGramFilter()
+        {
+            ShingleFilterTest(3, TestToken, TriGramTokens,
+                              TriGramPositionIncrements, TriGramTypes,
+                              true);
+        }
+
+        [Test]
+        public void TestTriGramFilterWithoutUnigrams()
+        {
+            ShingleFilterTest(3, TestToken, TriGramTokensWithoutUnigrams,
+                              TriGramPositionIncrementsWithoutUnigrams, TriGramTypesWithoutUnigrams,
+                              false);
+        }
+
+        [Test]
+        public void TestFourGramFilter()
+        {
+            ShingleFilterTest(4, TestToken, FourGramTokens,
+                              FourGramPositionIncrements, FourGramTypes,
+                              true);
+        }
+
+        [Test]
+        public void TestFourGramFilterWithoutUnigrams()
+        {
+            ShingleFilterTest(4, TestToken, FourGramTokensWithoutUnigrams,
+                              FourGramPositionIncrementsWithoutUnigrams,
+                              FourGramTypesWithoutUnigrams, false);
+        }
+
+        [Test]
+        public void TestReset()
+        {
+            Tokenizer wsTokenizer = new WhitespaceTokenizer(new StringReader("please divide this sentence"));
+            TokenStream filter = new ShingleFilter(wsTokenizer, 2);
+
+            AssertTokenStreamContents(filter,
+                new[]
+                {
+                    "please", "please divide", "divide", "divide this", "this",
+                    "this sentence",
+                    "sentence"
+                },
+                new[] {0, 0, 7, 7, 14, 14, 19}, new[] {6, 13, 13, 18, 18, 27, 27},
+                new[]
+                {
+                    TypeAttributeImpl.DEFAULT_TYPE, "shingle", TypeAttributeImpl.DEFAULT_TYPE,
+                    "shingle", TypeAttributeImpl.DEFAULT_TYPE, "shingle",
+                    TypeAttributeImpl.DEFAULT_TYPE
+                },
+                new[] {1, 0, 1, 0, 1, 0, 1}
+                );
+
+            wsTokenizer.Reset(new StringReader("please divide this sentence"));
+
+            AssertTokenStreamContents(filter,
+                new[]
+                {
+                    "please", "please divide", "divide", "divide this", "this",
+                    "this sentence",
+                    "sentence"
+                },
+                new[] {0, 0, 7, 7, 14, 14, 19}, new[] {6, 13, 13, 18, 18, 27, 27},
+                new[]
+                {
+                    TypeAttributeImpl.DEFAULT_TYPE, "shingle", TypeAttributeImpl.DEFAULT_TYPE,
+                    "shingle", TypeAttributeImpl.DEFAULT_TYPE, "shingle",
+                    TypeAttributeImpl.DEFAULT_TYPE
+                },
+                new[] {1, 0, 1, 0, 1, 0, 1}
+                );
+        }
+
+        protected void ShingleFilterTest(int maxSize, Token[] tokensToShingle, Token[] tokensToCompare,
+                                         int[] positionIncrements, String[] types, bool outputUnigrams)
+        {
+            var filter = new ShingleFilter(new TestTokenStream(tokensToShingle), maxSize);
+            filter.SetOutputUnigrams(outputUnigrams);
+
+            var termAtt = (TermAttribute) filter.AddAttribute(typeof (TermAttribute));
+            var offsetAtt = (OffsetAttribute) filter.AddAttribute(typeof (OffsetAttribute));
+            var posIncrAtt = (PositionIncrementAttribute) filter.AddAttribute(typeof (PositionIncrementAttribute));
+            var typeAtt = (TypeAttribute) filter.AddAttribute(typeof (TypeAttribute));
+
+            int i = 0;
+            while (filter.IncrementToken())
+            {
+                Assert.IsTrue(i < tokensToCompare.Length, "ShingleFilter outputted more tokens than expected");
+
+                String termText = termAtt.Term();
+                String goldText = tokensToCompare[i].Term();
+
+                Assert.AreEqual(goldText, termText, "Wrong termText");
+                Assert.AreEqual(tokensToCompare[i].StartOffset(), offsetAtt.StartOffset(),
+                                "Wrong startOffset for token \"" + termText + "\"");
+                Assert.AreEqual(tokensToCompare[i].EndOffset(), offsetAtt.EndOffset(),
+                                "Wrong endOffset for token \"" + termText + "\"");
+                Assert.AreEqual(positionIncrements[i], posIncrAtt.GetPositionIncrement(),
+                                "Wrong positionIncrement for token \"" + termText + "\"");
+                Assert.AreEqual(types[i], typeAtt.Type(), "Wrong type for token \"" + termText + "\"");
+
+                i++;
+            }
+
+            Assert.AreEqual(tokensToCompare.Length, i,
+                            "ShingleFilter outputted wrong # of tokens. (# output = " + i + "; # expected =" +
+                            tokensToCompare.Length + ")");
+        }
+
+        #region Nested type: TestTokenStream
+
+        public sealed class TestTokenStream : TokenStream
+        {
+            private readonly OffsetAttribute _offsetAtt;
+            private readonly PositionIncrementAttribute _posIncrAtt;
+            private readonly TermAttribute _termAtt;
+            private readonly Token[] _testToken;
+            private readonly TypeAttribute _typeAtt;
+            private int _index;
+
+            public TestTokenStream(Token[] testToken)
+            {
+                _testToken = testToken;
+
+                _termAtt = (TermAttribute) AddAttribute(typeof (TermAttribute));
+                _offsetAtt = (OffsetAttribute) AddAttribute(typeof (OffsetAttribute));
+                _posIncrAtt = (PositionIncrementAttribute) AddAttribute(typeof (PositionIncrementAttribute));
+                _typeAtt = (TypeAttribute) AddAttribute(typeof (TypeAttribute));
+            }
+
+            public override bool IncrementToken()
+            {
+                ClearAttributes();
+
+                if (_index >= _testToken.Length)
+                    return false;
+
+                Token t = _testToken[_index++];
+
+                _termAtt.SetTermBuffer(t.TermBuffer(), 0, t.TermLength());
+                _offsetAtt.SetOffset(t.StartOffset(), t.EndOffset());
+                _posIncrAtt.SetPositionIncrement(t.GetPositionIncrement());
+                _typeAtt.SetType(TypeAttributeImpl.DEFAULT_TYPE);
+
+                return true;
+            }
+        }
+
+        #endregion
+    }
+}
\ No newline at end of file
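
As a closing illustration, the same 2.9.4g calls these tests rely on (WhitespaceTokenizer, ShingleFilter, AddAttribute, IncrementToken) compose into a minimal standalone consumer. The program below is a sketch for orientation, not part of the commit:

using System;
using System.IO;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Shingle;
using Lucene.Net.Analysis.Tokenattributes;

internal static class ShingleFilterUsageSketch
{
    private static void Main()
    {
        // Wrap a whitespace tokenizer in a bigram ShingleFilter, then walk the
        // stream and print each term with its type ("word" or "shingle").
        TokenStream ts = new ShingleFilter(
            new WhitespaceTokenizer(new StringReader("please divide this sentence")), 2);

        var termAtt = (TermAttribute) ts.AddAttribute(typeof (TermAttribute));
        var typeAtt = (TypeAttribute) ts.AddAttribute(typeof (TypeAttribute));

        while (ts.IncrementToken())
        {
            // e.g. "please (word)", "please divide (shingle)", ...
            Console.WriteLine("{0} ({1})", termAtt.Term(), typeAtt.Type());
        }
    }
}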