Return-Path: Delivered-To: apmail-lucene-commits-archive@www.apache.org Received: (qmail 10201 invoked from network); 9 Jan 2009 03:29:43 -0000 Received: from hermes.apache.org (HELO mail.apache.org) (140.211.11.2) by minotaur.apache.org with SMTP; 9 Jan 2009 03:29:43 -0000 Received: (qmail 93834 invoked by uid 500); 9 Jan 2009 03:29:43 -0000 Mailing-List: contact commits-help@lucene.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@lucene.apache.org Delivered-To: mailing list commits@lucene.apache.org Received: (qmail 93817 invoked by uid 99); 9 Jan 2009 03:29:43 -0000 Received: from athena.apache.org (HELO athena.apache.org) (140.211.11.136) by apache.org (qpsmtpd/0.29) with ESMTP; Thu, 08 Jan 2009 19:29:43 -0800 X-ASF-Spam-Status: No, hits=-2000.0 required=10.0 tests=ALL_TRUSTED X-Spam-Check-By: apache.org Received: from [140.211.11.4] (HELO eris.apache.org) (140.211.11.4) by apache.org (qpsmtpd/0.29) with ESMTP; Fri, 09 Jan 2009 03:29:38 +0000 Received: by eris.apache.org (Postfix, from userid 65534) id F20712388B5D; Thu, 8 Jan 2009 19:28:52 -0800 (PST) Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 8bit Subject: svn commit: r732916 [9/14] - in /lucene/pylucene/trunk: ./ java/ java/org/ java/org/osafoundation/ java/org/osafoundation/lucene/ java/org/osafoundation/lucene/analysis/ java/org/osafoundation/lucene/queryParser/ java/org/osafoundation/lucene/search/ j... 
Date: Fri, 09 Jan 2009 03:28:41 -0000 To: commits@lucene.apache.org From: vajda@apache.org X-Mailer: svnmailer-1.0.8 Message-Id: <20090109032852.F20712388B5D@eris.apache.org> X-Virus-Checked: Checked by ClamAV on apache.org Added: lucene/pylucene/trunk/samples/LuceneInAction/lia/advsearching/BooksLikeThis.py URL: http://svn.apache.org/viewvc/lucene/pylucene/trunk/samples/LuceneInAction/lia/advsearching/BooksLikeThis.py?rev=732916&view=auto ============================================================================== --- lucene/pylucene/trunk/samples/LuceneInAction/lia/advsearching/BooksLikeThis.py (added) +++ lucene/pylucene/trunk/samples/LuceneInAction/lia/advsearching/BooksLikeThis.py Thu Jan 8 19:28:33 2009 @@ -0,0 +1,98 @@ +# ==================================================================== +# Copyright (c) 2004-2007 Open Source Applications Foundation. +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. 
+# ==================================================================== +# + +import os + +from lucene import \ + Document, IndexReader, Term, BooleanQuery, IndexSearcher, TermQuery, \ + FSDirectory, System, BooleanClause, Hit + + +class BooksLikeThis(object): + + def main(cls, argv): + + indexDir = System.getProperty("index.dir") + directory = FSDirectory.getDirectory(indexDir, False) + + reader = IndexReader.open(directory) + blt = BooksLikeThis(reader) + + for id in xrange(reader.maxDoc()): + if reader.isDeleted(id): + continue + doc = reader.document(id) + print '' + print doc.get("title").encode('utf-8') + + docs = blt.docsLike(id, doc, 10) + if not docs: + print " None like this" + else: + for doc in docs: + print " ->", doc.get("title").encode('utf-8') + + def __init__(self, reader): + + self.reader = reader + self.searcher = IndexSearcher(reader) + + def docsLike(self, id, doc, max): + + authors = doc.getValues("author") + authorQuery = BooleanQuery() + for author in authors: + authorQuery.add(TermQuery(Term("author", author)), + BooleanClause.Occur.SHOULD) + authorQuery.setBoost(2.0) + + vector = self.reader.getTermFreqVector(id, "subject") + + subjectQuery = BooleanQuery() + for term in vector.getTerms(): + tq = TermQuery(Term("subject", term)) + subjectQuery.add(tq, BooleanClause.Occur.SHOULD) + + likeThisQuery = BooleanQuery() + likeThisQuery.add(authorQuery, BooleanClause.Occur.SHOULD) + likeThisQuery.add(subjectQuery, BooleanClause.Occur.SHOULD) + + # exclude myself + likeThisQuery.add(TermQuery(Term("isbn", doc.get("isbn"))), + BooleanClause.Occur.MUST_NOT) + + print " Query:", likeThisQuery.toString("contents") + hits = self.searcher.search(likeThisQuery) + + docs = [] + for hit in hits: + hit = Hit.cast_(hit) + doc = hit.getDocument() + if len(docs) < max: + docs.append(doc) + else: + break + + return docs + + main = classmethod(main) Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/advsearching/BooksLikeThis.py 
------------------------------------------------------------------------------ svn:eol-style = native Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/advsearching/BooksLikeThis.py ------------------------------------------------------------------------------ svn:mime-type = text/plain Added: lucene/pylucene/trunk/samples/LuceneInAction/lia/advsearching/CategorizerTest.py URL: http://svn.apache.org/viewvc/lucene/pylucene/trunk/samples/LuceneInAction/lia/advsearching/CategorizerTest.py?rev=732916&view=auto ============================================================================== --- lucene/pylucene/trunk/samples/LuceneInAction/lia/advsearching/CategorizerTest.py (added) +++ lucene/pylucene/trunk/samples/LuceneInAction/lia/advsearching/CategorizerTest.py Thu Jan 8 19:28:33 2009 @@ -0,0 +1,123 @@ +# ==================================================================== +# Copyright (c) 2004-2007 Open Source Applications Foundation. +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. +# ==================================================================== +# + +from math import pi, sqrt, acos +from lia.common.LiaTestCase import LiaTestCase + +from lucene import Document, IndexReader + + +class CategorizerTest(LiaTestCase): + + def setUp(self): + + super(CategorizerTest, self).setUp() + self.categoryMap = {} + + self.buildCategoryVectors() + self.dumpCategoryVectors() + + def testCategorization(self): + + self.assertEqual("/technology/computers/programming/methodology", + self.getCategory("extreme agile methodology")) + self.assertEqual("/education/pedagogy", + self.getCategory("montessori education philosophy")) + + def dumpCategoryVectors(self): + + for category, vectorMap in self.categoryMap.iteritems(): + print "Category", category + for term, freq in vectorMap.iteritems(): + print " ", term, "=", freq + + def buildCategoryVectors(self): + + reader = IndexReader.open(self.directory) + + for id in xrange(reader.maxDoc()): + doc = reader.document(id) + category = doc.get("category") + vectorMap = self.categoryMap.get(category, None) + if vectorMap is None: + vectorMap = self.categoryMap[category] = {} + + termFreqVector = reader.getTermFreqVector(id, "subject") + self.addTermFreqToMap(vectorMap, termFreqVector) + + def addTermFreqToMap(self, vectorMap, termFreqVector): + + terms = termFreqVector.getTerms() + freqs = termFreqVector.getTermFrequencies() + + i = 0 + for term in terms: + if term in vectorMap: + vectorMap[term] += freqs[i] + else: + vectorMap[term] = freqs[i] + i += 1 + + def getCategory(self, subject): + + words = subject.split(' ') + + bestAngle = 2 * pi + bestCategory = None + + for category, vectorMap in self.categoryMap.iteritems(): + angle = 
self.computeAngle(words, category, vectorMap) + if angle != 'nan' and angle < bestAngle: + bestAngle = angle + bestCategory = category + + return bestCategory + + def computeAngle(self, words, category, vectorMap): + + # assume words are unique and only occur once + + dotProduct = 0 + sumOfSquares = 0 + + for word in words: + categoryWordFreq = 0 + + if word in vectorMap: + categoryWordFreq = vectorMap[word] + + # optimized because we assume frequency in words is 1 + dotProduct += categoryWordFreq + sumOfSquares += categoryWordFreq ** 2 + + if sumOfSquares == 0: + return 'nan' + + if sumOfSquares == len(words): + # avoid precision issues for special case + # sqrt x * sqrt x = x + denominator = sumOfSquares + else: + denominator = sqrt(sumOfSquares) * sqrt(len(words)) + + return acos(dotProduct / denominator) Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/advsearching/CategorizerTest.py ------------------------------------------------------------------------------ svn:eol-style = native Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/advsearching/CategorizerTest.py ------------------------------------------------------------------------------ svn:mime-type = text/plain Added: lucene/pylucene/trunk/samples/LuceneInAction/lia/advsearching/FilterTest.py URL: http://svn.apache.org/viewvc/lucene/pylucene/trunk/samples/LuceneInAction/lia/advsearching/FilterTest.py?rev=732916&view=auto ============================================================================== --- lucene/pylucene/trunk/samples/LuceneInAction/lia/advsearching/FilterTest.py (added) +++ lucene/pylucene/trunk/samples/LuceneInAction/lia/advsearching/FilterTest.py Thu Jan 8 19:28:33 2009 @@ -0,0 +1,100 @@ +# ==================================================================== +# Copyright (c) 2004-2007 Open Source Applications Foundation. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. 
class FilterTest(LiaTestCase):
    """Exercise the different ways of filtering a search."""

    def setUp(self):

        super(FilterTest, self).setUp()

        # A query matching every book in the index; the filters under
        # test narrow this down.
        self.allBooks = RangeQuery(Term("pubmonth", "190001"),
                                   Term("pubmonth", "200512"), True)
        self.searcher = IndexSearcher(self.directory)
        self.numAllBooks = len(self.searcher.search(self.allBooks))

    def testDateFilter(self):

        jan1 = self.parseDate("2004-01-01")
        jan31 = self.parseDate("2004-01-31")
        dec31 = self.parseDate("2004-12-31")

        # The whole of 2004 keeps everything...
        wholeYear = DateFilter("modified", jan1, dec31)
        hits = self.searcher.search(self.allBooks, wholeYear)
        self.assertEqual(self.numAllBooks, len(hits), "all modified in 2004")

        # ...January alone keeps nothing.
        januaryOnly = DateFilter("modified", jan1, jan31)
        hits = self.searcher.search(self.allBooks, januaryOnly)
        self.assertEqual(0, len(hits), "none modified in January")

    def testQueryFilter(self):

        categoryFilter = QueryFilter(
            TermQuery(Term("category", "/philosophy/eastern")))

        hits = self.searcher.search(self.allBooks, categoryFilter)
        self.assertEqual(1, len(hits), "only tao te ching")

    def testFilterAlternative(self):

        # Same effect as testQueryFilter, expressed as a pure query.
        categoryQuery = TermQuery(Term("category", "/philosophy/eastern"))

        constrainedQuery = BooleanQuery()
        constrainedQuery.add(self.allBooks, BooleanClause.Occur.MUST)
        constrainedQuery.add(categoryQuery, BooleanClause.Occur.MUST)

        hits = self.searcher.search(constrainedQuery)
        self.assertEqual(1, len(hits), "only tao te ching")

    def testQueryFilterWithRangeQuery(self):

        jan1 = self.parseDate("2004-01-01")
        dec31 = self.parseDate("2004-12-31")

        rangeQuery = RangeQuery(
            Term("modified", DateField.dateToString(jan1)),
            Term("modified", DateField.dateToString(dec31)),
            True)

        hits = self.searcher.search(self.allBooks, QueryFilter(rangeQuery))
        self.assertEqual(self.numAllBooks, len(hits), "all of 'em")

    def testCachingWrapper(self):

        jan1 = self.parseDate("2004-01-01")
        dec31 = self.parseDate("2004-12-31")

        cachingFilter = CachingWrapperFilter(
            DateFilter("modified", jan1, dec31))

        hits = self.searcher.search(self.allBooks, cachingFilter)
        self.assertEqual(self.numAllBooks, len(hits), "all of 'em")
+# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. 
class MultiFieldQueryParserTest(LiaTestCase):
    """Parse one query string against several fields at once."""

    def testDefaultOperator(self):

        # OR semantics: a hit in either field is enough.
        SHOULD = BooleanClause.Occur.SHOULD
        query = MultiFieldQueryParser.parse("development",
                                            ["title", "subject"],
                                            [SHOULD, SHOULD],
                                            SimpleAnalyzer())

        searcher = IndexSearcher(self.directory)
        hits = searcher.search(query)

        self.assertHitsIncludeTitle(hits, "Java Development with Ant")

        # has "development" in the subject field
        self.assertHitsIncludeTitle(hits, "Extreme Programming Explained")

    def testSpecifiedOperator(self):

        # AND semantics: "development" must appear in both fields.
        MUST = BooleanClause.Occur.MUST
        query = MultiFieldQueryParser.parse("development",
                                            ["title", "subject"],
                                            [MUST, MUST],
                                            SimpleAnalyzer())

        searcher = IndexSearcher(self.directory)
        hits = searcher.search(query)

        self.assertHitsIncludeTitle(hits, "Java Development with Ant")
        self.assertEqual(1, hits.length(), "one and only one")
lucene/pylucene/trunk/samples/LuceneInAction/lia/advsearching/MultiSearcherTest.py Thu Jan 8 19:28:33 2009 @@ -0,0 +1,74 @@ +# ==================================================================== +# Copyright (c) 2004-2007 Open Source Applications Foundation. +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. 
class MultiSearcherTest(TestCase):
    """Search two indexes (a-m and n-z) as one via MultiSearcher."""

    def setUp(self):

        animals = [ "aardvark", "beaver", "coati",
                    "dog", "elephant", "frog", "gila monster",
                    "horse", "iguana", "javelina", "kangaroo",
                    "lemur", "moose", "nematode", "orca",
                    "python", "quokka", "rat", "scorpion",
                    "tarantula", "uromastyx", "vicuna",
                    "walrus", "xiphias", "yak", "zebra" ]

        analyzer = WhitespaceAnalyzer()

        aTOmDirectory = RAMDirectory()
        nTOzDirectory = RAMDirectory()

        aTOmWriter = IndexWriter(aTOmDirectory, analyzer, True)
        nTOzWriter = IndexWriter(nTOzDirectory, analyzer, True)

        # Split the alphabet at "n": each animal goes into exactly one
        # of the two indexes.
        for animal in animals:
            doc = Document()
            doc.add(Field("animal", animal,
                          Field.Store.YES, Field.Index.UN_TOKENIZED))

            if animal[0].lower() < "n":
                writer = aTOmWriter
            else:
                writer = nTOzWriter
            writer.addDocument(doc)

        aTOmWriter.close()
        nTOzWriter.close()

        self.searchers = [ IndexSearcher(aTOmDirectory),
                           IndexSearcher(nTOzDirectory) ]

    def testMulti(self):

        searcher = MultiSearcher(self.searchers)

        # range spans documents across both indexes
        query = RangeQuery(Term("animal", "h"), Term("animal", "t"), True)

        hits = searcher.search(query)
        self.assertEqual(12, hits.length(), "tarantula not included")
http://svn.apache.org/viewvc/lucene/pylucene/trunk/samples/LuceneInAction/lia/advsearching/PhrasePrefixQueryTest.py?rev=732916&view=auto ============================================================================== --- lucene/pylucene/trunk/samples/LuceneInAction/lia/advsearching/PhrasePrefixQueryTest.py (added) +++ lucene/pylucene/trunk/samples/LuceneInAction/lia/advsearching/PhrasePrefixQueryTest.py Thu Jan 8 19:28:33 2009 @@ -0,0 +1,84 @@ +# ==================================================================== +# Copyright (c) 2004-2007 Open Source Applications Foundation. +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. 
+# ==================================================================== +# + +from unittest import TestCase +from lucene import \ + WhitespaceAnalyzer, Document, Field, IndexWriter, Term, BooleanQuery, \ + IndexSearcher, PhrasePrefixQuery, PhraseQuery, RAMDirectory, BooleanClause + + +class PhrasePrefixQueryTest(TestCase): + + def setUp(self): + + directory = RAMDirectory() + writer = IndexWriter(directory, WhitespaceAnalyzer(), True) + + doc1 = Document() + doc1.add(Field("field", "the quick brown fox jumped over the lazy dog", + Field.Store.YES, Field.Index.TOKENIZED)) + writer.addDocument(doc1) + + doc2 = Document() + doc2.add(Field("field", "the fast fox hopped over the hound", + Field.Store.YES, Field.Index.TOKENIZED)) + writer.addDocument(doc2) + writer.close() + + self.searcher = IndexSearcher(directory) + + def testBasic(self): + + query = PhrasePrefixQuery() + query.add([Term("field", "quick"), Term("field", "fast")]) + query.add(Term("field", "fox")) + print query + + hits = self.searcher.search(query) + self.assertEqual(1, len(hits), "fast fox match") + + query.setSlop(1) + hits = self.searcher.search(query) + self.assertEqual(2, len(hits), "both match") + + def testAgainstOR(self): + + quickFox = PhraseQuery() + quickFox.setSlop(1) + quickFox.add(Term("field", "quick")) + quickFox.add(Term("field", "fox")) + + fastFox = PhraseQuery() + fastFox.add(Term("field", "fast")) + fastFox.add(Term("field", "fox")) + + query = BooleanQuery() + query.add(quickFox, BooleanClause.Occur.SHOULD) + query.add(fastFox, BooleanClause.Occur.SHOULD) + hits = self.searcher.search(query) + self.assertEqual(2, len(hits)) + + def debug(self, hits): + + for i, doc in hits: + print "%s: %s" %(hits.score(i), doc['field']) Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/advsearching/PhrasePrefixQueryTest.py ------------------------------------------------------------------------------ svn:eol-style = native Propchange: 
lucene/pylucene/trunk/samples/LuceneInAction/lia/advsearching/PhrasePrefixQueryTest.py ------------------------------------------------------------------------------ svn:mime-type = text/plain Added: lucene/pylucene/trunk/samples/LuceneInAction/lia/advsearching/SecurityFilterTest.py URL: http://svn.apache.org/viewvc/lucene/pylucene/trunk/samples/LuceneInAction/lia/advsearching/SecurityFilterTest.py?rev=732916&view=auto ============================================================================== --- lucene/pylucene/trunk/samples/LuceneInAction/lia/advsearching/SecurityFilterTest.py (added) +++ lucene/pylucene/trunk/samples/LuceneInAction/lia/advsearching/SecurityFilterTest.py Thu Jan 8 19:28:33 2009 @@ -0,0 +1,68 @@ +# ==================================================================== +# Copyright (c) 2004-2007 Open Source Applications Foundation. +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. 
class SecurityFilterTest(TestCase):
    """Restrict search results to one owner's documents with a filter."""

    def setUp(self):

        self.directory = RAMDirectory()
        writer = IndexWriter(self.directory, WhitespaceAnalyzer(), True)

        # One document per owner, each with its own sensitive keywords.
        for owner, keywords in [("elwood", "elwoods sensitive info"),
                                ("jake", "jakes sensitive info")]:
            document = Document()
            document.add(Field("owner", owner,
                               Field.Store.YES, Field.Index.UN_TOKENIZED))
            document.add(Field("keywords", keywords,
                               Field.Store.YES, Field.Index.TOKENIZED))
            writer.addDocument(document)

        writer.close()

    def testSecurityFilter(self):

        query = TermQuery(Term("keywords", "info"))
        searcher = IndexSearcher(self.directory)

        # Unfiltered, both owners' documents come back.
        hits = searcher.search(query)
        self.assertEqual(2, len(hits), "Both documents match")

        # Filtered down to jake's documents only.
        jakeFilter = QueryFilter(TermQuery(Term("owner", "jake")))

        hits = searcher.search(query, jakeFilter)
        self.assertEqual(1, len(hits))
        self.assertEqual("jakes sensitive info", hits[0].get("keywords"),
                         "elwood is safe")
http://svn.apache.org/viewvc/lucene/pylucene/trunk/samples/LuceneInAction/lia/advsearching/SortingExample.py?rev=732916&view=auto ============================================================================== --- lucene/pylucene/trunk/samples/LuceneInAction/lia/advsearching/SortingExample.py (added) +++ lucene/pylucene/trunk/samples/LuceneInAction/lia/advsearching/SortingExample.py Thu Jan 8 19:28:33 2009 @@ -0,0 +1,84 @@ +# ==================================================================== +# Copyright (c) 2004-2007 Open Source Applications Foundation. +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. 
+# ==================================================================== +# + +import os + +from lucene import \ + FSDirectory, Document, Field, IndexSearcher, SimpleAnalyzer, \ + RangeQuery, Sort, SortField, DecimalFormat, System, Term + + +class SortingExample(object): + + def __init__(self, directory): + + self.directory = directory + + def displayHits(self, query, sort): + + searcher = IndexSearcher(self.directory) + hits = searcher.search(query, sort) + + print "\nResults for:", query, "sorted by", sort + print "Title".rjust(30), "pubmonth".rjust(10), \ + "id".center(4), "score".center(15) + + scoreFormatter = DecimalFormat("0.######") + for i, doc in hits: + title = doc["title"] + if len(title) > 30: + title = title[:30] + print title.encode('ascii', 'replace').rjust(30), \ + doc["pubmonth"].rjust(10), \ + str(hits.id(i)).center(4), \ + scoreFormatter.format(hits.score(i)).ljust(12) + print " ", doc["category"] + # print searcher.explain(query, hits.id(i)) + + searcher.close() + + def main(cls, argv): + + earliest = Term("pubmonth", "190001") + latest = Term("pubmonth", "201012") + allBooks = RangeQuery(earliest, latest, True) + + indexDir = System.getProperty("index.dir") + directory = FSDirectory.getDirectory(indexDir, False) + example = SortingExample(directory) + + example.displayHits(allBooks, Sort.RELEVANCE) + example.displayHits(allBooks, Sort.INDEXORDER) + example.displayHits(allBooks, Sort("category")) + example.displayHits(allBooks, Sort("pubmonth", True)) + + example.displayHits(allBooks, + Sort([SortField("category"), + SortField.FIELD_SCORE, + SortField("pubmonth", SortField.INT, True)])) + + example.displayHits(allBooks, + Sort([SortField.FIELD_SCORE, + SortField("category")])) + + main = classmethod(main) Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/advsearching/SortingExample.py ------------------------------------------------------------------------------ svn:eol-style = native Propchange: 
lucene/pylucene/trunk/samples/LuceneInAction/lia/advsearching/SortingExample.py ------------------------------------------------------------------------------ svn:mime-type = text/plain Added: lucene/pylucene/trunk/samples/LuceneInAction/lia/advsearching/SpanQueryTest.py URL: http://svn.apache.org/viewvc/lucene/pylucene/trunk/samples/LuceneInAction/lia/advsearching/SpanQueryTest.py?rev=732916&view=auto ============================================================================== --- lucene/pylucene/trunk/samples/LuceneInAction/lia/advsearching/SpanQueryTest.py (added) +++ lucene/pylucene/trunk/samples/LuceneInAction/lia/advsearching/SpanQueryTest.py Thu Jan 8 19:28:33 2009 @@ -0,0 +1,221 @@ +# ==================================================================== +# Copyright (c) 2004-2007 Open Source Applications Foundation. +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. 
class SpanQueryTest(TestCase):
    """Exercises the span query family (SpanTermQuery, SpanFirstQuery,
    SpanNearQuery, SpanNotQuery, SpanOrQuery) against a two-document
    index, and pretty-prints the matching spans.

    NOTE(review): reconstructed from a collapsed diff -- indentation is
    best-effort, code tokens are unchanged.
    """

    def setUp(self):

        # Two one-field documents sharing most terms: doc 0 is the
        # "brown fox ... lazy dog", doc 1 the "red fox ... sleepy cat".
        # WhitespaceAnalyzer keeps every token, including "the".
        self.directory = RAMDirectory()
        self.analyzer = WhitespaceAnalyzer()

        writer = IndexWriter(self.directory, self.analyzer, True)

        doc = Document()
        doc.add(Field("f", "the quick brown fox jumps over the lazy dog",
                      Field.Store.YES, Field.Index.TOKENIZED))
        writer.addDocument(doc)

        doc = Document()
        doc.add(Field("f", "the quick red fox jumps over the sleepy cat",
                      Field.Store.YES, Field.Index.TOKENIZED))
        writer.addDocument(doc)

        writer.close()

        self.searcher = IndexSearcher(self.directory)
        # dumpSpans needs a raw IndexReader to enumerate spans directly.
        self.reader = IndexReader.open(self.directory)

        # One SpanTermQuery per interesting term, reused by all tests.
        self.quick = SpanTermQuery(Term("f", "quick"))
        self.brown = SpanTermQuery(Term("f", "brown"))
        self.red = SpanTermQuery(Term("f", "red"))
        self.fox = SpanTermQuery(Term("f", "fox"))
        self.lazy = SpanTermQuery(Term("f", "lazy"))
        self.sleepy = SpanTermQuery(Term("f", "sleepy"))
        self.dog = SpanTermQuery(Term("f", "dog"))
        self.cat = SpanTermQuery(Term("f", "cat"))

    def assertOnlyBrownFox(self, query):
        # Exactly one hit, and it must be doc 0 (the brown-fox document).
        hits = self.searcher.search(query)
        self.assertEqual(1, len(hits))
        self.assertEqual(0, hits.id(0), "wrong doc")

    def assertBothFoxes(self, query):
        # Both documents match.
        hits = self.searcher.search(query)
        self.assertEqual(2, len(hits))

    def assertNoMatches(self, query):
        # Neither document matches.
        hits = self.searcher.search(query)
        self.assertEquals(0, len(hits))

    def testSpanTermQuery(self):

        self.assertOnlyBrownFox(self.brown)
        self.dumpSpans(self.brown)

    def testSpanFirstQuery(self):

        # "brown" is the third token, so a window of 2 misses it ...
        sfq = SpanFirstQuery(self.brown, 2)
        self.assertNoMatches(sfq)

        self.dumpSpans(sfq)

        # ... and a window of 3 catches it.
        sfq = SpanFirstQuery(self.brown, 3)
        self.dumpSpans(sfq)
        self.assertOnlyBrownFox(sfq)

    def testSpanNearQuery(self):

        quick_brown_dog = [self.quick, self.brown, self.dog]
        # slop counts the positions between span ends; 0 and 4 are too
        # tight for quick..brown..dog, 5 is just enough.
        snq = SpanNearQuery(quick_brown_dog, 0, True)
        self.assertNoMatches(snq)
        self.dumpSpans(snq)

        snq = SpanNearQuery(quick_brown_dog, 4, True)
        self.assertNoMatches(snq)
        self.dumpSpans(snq)

        snq = SpanNearQuery(quick_brown_dog, 5, True)
        self.assertOnlyBrownFox(snq)
        self.dumpSpans(snq)

        # interesting - even a sloppy phrase query would require
        # more slop to match
        snq = SpanNearQuery([self.lazy, self.fox], 3, False)
        self.assertOnlyBrownFox(snq)
        self.dumpSpans(snq)

        pq = PhraseQuery()
        pq.add(Term("f", "lazy"))
        pq.add(Term("f", "fox"))
        pq.setSlop(4)
        self.assertNoMatches(pq)

        pq.setSlop(5)
        self.assertOnlyBrownFox(pq)

    def testSpanNotQuery(self):

        quick_fox = SpanNearQuery([self.quick, self.fox], 1, True)
        self.assertBothFoxes(quick_fox)
        self.dumpSpans(quick_fox)

        # Excluding "dog" does not change anything: "dog" never overlaps
        # the quick..fox span, so both docs still match.
        quick_fox_dog = SpanNotQuery(quick_fox, self.dog)
        self.assertBothFoxes(quick_fox_dog)
        self.dumpSpans(quick_fox_dog)

        # Excluding "red" removes doc 1, whose span contains "red".
        no_quick_red_fox = SpanNotQuery(quick_fox, self.red)
        self.assertOnlyBrownFox(no_quick_red_fox)
        self.dumpSpans(no_quick_red_fox)

    def testSpanOrQuery(self):

        quick_fox = SpanNearQuery([self.quick, self.fox], 1, True)
        lazy_dog = SpanNearQuery([self.lazy, self.dog], 0, True)
        sleepy_cat = SpanNearQuery([self.sleepy, self.cat], 0, True)
        qf_near_ld = SpanNearQuery([quick_fox, lazy_dog], 3, True)

        self.assertOnlyBrownFox(qf_near_ld)
        self.dumpSpans(qf_near_ld)

        qf_near_sc = SpanNearQuery([quick_fox, sleepy_cat], 3, True)
        self.dumpSpans(qf_near_sc)

        # OR of the two near queries covers both documents.
        orQ = SpanOrQuery([qf_near_ld, qf_near_sc])
        self.assertBothFoxes(orQ)
        self.dumpSpans(orQ)

    def testPlay(self):
        # Free-form exploration of span output; no assertions, just dumps.

        orQ = SpanOrQuery([self.quick, self.fox])
        self.dumpSpans(orQ)

        quick_fox = SpanNearQuery([self.quick, self.fox], 1, True)
        sfq = SpanFirstQuery(quick_fox, 4)
        self.dumpSpans(sfq)

        self.dumpSpans(SpanTermQuery(Term("f", "the")))

        quick_brown = SpanNearQuery([self.quick, self.brown], 0, False)
        self.dumpSpans(quick_brown)

    def dumpSpans(self, query):
        """Print each matching span as the document's tokens with the
        span bracketed <like this>, followed by the document's score."""

        spans = query.getSpans(self.reader)
        print "%s:" % query
        numSpans = 0

        # Collect each doc's score first so it can be shown per span.
        hits = self.searcher.search(query)
        scores = [0, 0]
        for hit in hits:
            hit = Hit.cast_(hit)
            scores[hit.getId()] = hit.getScore()

        while spans.next():
            numSpans += 1

            id = spans.doc()
            doc = self.reader.document(id)

            # for simplicity - assume tokens are in sequential,
            # positions, starting from 0
            tokens = AnalyzerUtils.tokensFromAnalysis(self.analyzer, doc["f"])
            buffer = StringIO()
            buffer.write(" ")

            # Re-tokenize the stored field and bracket the span's
            # [start, end) positions while writing the tokens out.
            i = 0
            for token in tokens:
                if i == spans.start():
                    buffer.write("<")

                buffer.write(token.termText())
                if i + 1 == spans.end():
                    buffer.write(">")

                buffer.write(" ")
                i += 1

            buffer.write("(")
            buffer.write(str(scores[id]))
            buffer.write(") ")

            print buffer.getvalue()
            # print self.searcher.explain(query, id)

        if numSpans == 0:
            print " No spans"

        print ''
19:28:33 2009 @@ -0,0 +1 @@ +# advsearching package Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/advsearching/__init__.py ------------------------------------------------------------------------------ svn:eol-style = native Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/advsearching/__init__.py ------------------------------------------------------------------------------ svn:mime-type = text/plain Added: lucene/pylucene/trunk/samples/LuceneInAction/lia/analysis/AnalyzerDemo.py URL: http://svn.apache.org/viewvc/lucene/pylucene/trunk/samples/LuceneInAction/lia/analysis/AnalyzerDemo.py?rev=732916&view=auto ============================================================================== --- lucene/pylucene/trunk/samples/LuceneInAction/lia/analysis/AnalyzerDemo.py (added) +++ lucene/pylucene/trunk/samples/LuceneInAction/lia/analysis/AnalyzerDemo.py Thu Jan 8 19:28:33 2009 @@ -0,0 +1,69 @@ +# ==================================================================== +# Copyright (c) 2004-2007 Open Source Applications Foundation. +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. +# ==================================================================== +# + + +from lia.analysis.AnalyzerUtils import AnalyzerUtils +from lucene import \ + StopAnalyzer, SimpleAnalyzer, WhitespaceAnalyzer, StandardAnalyzer + + +class AnalyzerDemo(object): + + examples = ["The quick brown fox jumped over the lazy dogs", + "XY&Z Corporation - xyz@example.com"] + + analyzers = [WhitespaceAnalyzer(), + SimpleAnalyzer(), + StopAnalyzer(), + StandardAnalyzer()] + + def main(cls, argv): + + # Use the embedded example strings, unless + # command line arguments are specified, then use those. + strings = cls.examples + + if len(argv) > 1: + strings = argv[1:] + + for string in strings: + cls.analyze(string) + + def analyze(cls, text): + + print'"Analyzing "', text, '"' + + for analyzer in cls.analyzers: + name = type(analyzer).__name__ + print " %s:" %(name), + AnalyzerUtils.displayTokens(analyzer, text) + print '' + print '' + + main = classmethod(main) + analyze = classmethod(analyze) + + +if __name__ == "__main__": + import sys + AnalyzerDemo.main(sys.argv) Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/analysis/AnalyzerDemo.py ------------------------------------------------------------------------------ svn:eol-style = native Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/analysis/AnalyzerDemo.py ------------------------------------------------------------------------------ svn:mime-type = text/plain Added: lucene/pylucene/trunk/samples/LuceneInAction/lia/analysis/AnalyzerUtils.py URL: http://svn.apache.org/viewvc/lucene/pylucene/trunk/samples/LuceneInAction/lia/analysis/AnalyzerUtils.py?rev=732916&view=auto 
============================================================================== --- lucene/pylucene/trunk/samples/LuceneInAction/lia/analysis/AnalyzerUtils.py (added) +++ lucene/pylucene/trunk/samples/LuceneInAction/lia/analysis/AnalyzerUtils.py Thu Jan 8 19:28:33 2009 @@ -0,0 +1,94 @@ +# ==================================================================== +# Copyright (c) 2004-2007 Open Source Applications Foundation. +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. 
+# ==================================================================== +# + +from lucene import \ + SimpleAnalyzer, Token, TokenStream, StandardAnalyzer, StringReader + + +class AnalyzerUtils(object): + + def main(cls, argv): + + print "SimpleAnalyzer" + cls.displayTokensWithFullDetails(SimpleAnalyzer(), + "The quick brown fox....") + + print "\n----" + print "StandardAnalyzer" + cls.displayTokensWithFullDetails(StandardAnalyzer(), + "I'll e-mail you at xyz@example.com") + + def tokensFromAnalysis(cls, analyzer, text): + return [token for token in analyzer.tokenStream("contents", StringReader(text))] + + def displayTokens(cls, analyzer, text): + + for token in cls.tokensFromAnalysis(analyzer, text): + print "[%s]" %(token.termText()), + + def displayTokensWithPositions(cls, analyzer, text): + + position = 0 + for token in cls.tokensFromAnalysis(analyzer, text): + increment = token.getPositionIncrement() + if increment > 0: + position += increment + print "\n%d:" %(position), + + print "[%s]" %(token.termText()), + + def displayTokensWithFullDetails(cls, analyzer, text): + + position = 0 + for token in cls.tokensFromAnalysis(analyzer, text): + increment = token.getPositionIncrement() + + if increment > 0: + position += increment + print "\n%s:" %(position), + + print "[%s:%d->%d:%s]" %(token.termText(), + token.startOffset(), + token.endOffset(), + token.type()), + + def assertTokensEqual(cls, unittest, tokens, strings): + + unittest.assertEqual(len(strings), len(tokens)) + + i = 0 + for token in tokens: + unittest.assertEqual(strings[i], token.termText(), "index %d" %(i)) + i += 1 + + main = classmethod(main) + tokensFromAnalysis = classmethod(tokensFromAnalysis) + displayTokens = classmethod(displayTokens) + displayTokensWithPositions = classmethod(displayTokensWithPositions) + displayTokensWithFullDetails = classmethod(displayTokensWithFullDetails) + assertTokensEqual = classmethod(assertTokensEqual) + + +if __name__ == "__main__": + import sys + 
AnalyzerUtils.main(sys.argv) Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/analysis/AnalyzerUtils.py ------------------------------------------------------------------------------ svn:eol-style = native Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/analysis/AnalyzerUtils.py ------------------------------------------------------------------------------ svn:mime-type = text/plain Added: lucene/pylucene/trunk/samples/LuceneInAction/lia/analysis/UsingAnalyzersExample.py URL: http://svn.apache.org/viewvc/lucene/pylucene/trunk/samples/LuceneInAction/lia/analysis/UsingAnalyzersExample.py?rev=732916&view=auto ============================================================================== --- lucene/pylucene/trunk/samples/LuceneInAction/lia/analysis/UsingAnalyzersExample.py (added) +++ lucene/pylucene/trunk/samples/LuceneInAction/lia/analysis/UsingAnalyzersExample.py Thu Jan 8 19:28:33 2009 @@ -0,0 +1,53 @@ +# ==================================================================== +# Copyright (c) 2004-2007 Open Source Applications Foundation. +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
class UsingAnalyzersExample(object):
    """Snippet holder showing where an Analyzer plugs into indexing and
    query parsing.  The method is never meant to be executed."""

    #
    # This method doesn't do anything, except compile correctly.
    # This is used to show snippets of how Analyzers are used.
    #
    def someMethod(self):

        directory = RAMDirectory()
        analyzer = StandardAnalyzer()

        # Indexing side: the writer applies the analyzer to every
        # tokenized field it stores.
        writer = IndexWriter(directory, analyzer, True)

        doc = Document()
        doc.add(Field.Text("title", "This is the title"))
        doc.add(Field.UnStored("contents", "...document contents..."))
        writer.addDocument(doc)
        writer.addDocument(doc, analyzer)

        # Query-parsing side: the same analyzer should be used so query
        # terms are processed like the indexed text.
        expression = "some query"
        query = QueryParser.parse(expression, "contents", analyzer)

        parser = QueryParser("contents", analyzer)
        # NOTE(review): `parseQuery` does not match the usual QueryParser
        # API (instance method is `parse`) -- verify against the PyLucene
        # version this sample targets.
        query = parser.parseQuery(expression)
2009 @@ -0,0 +1 @@ +# analysis package Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/analysis/__init__.py ------------------------------------------------------------------------------ svn:eol-style = native Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/analysis/__init__.py ------------------------------------------------------------------------------ svn:mime-type = text/plain Added: lucene/pylucene/trunk/samples/LuceneInAction/lia/analysis/i18n/ChineseTest.py URL: http://svn.apache.org/viewvc/lucene/pylucene/trunk/samples/LuceneInAction/lia/analysis/i18n/ChineseTest.py?rev=732916&view=auto ============================================================================== --- lucene/pylucene/trunk/samples/LuceneInAction/lia/analysis/i18n/ChineseTest.py (added) +++ lucene/pylucene/trunk/samples/LuceneInAction/lia/analysis/i18n/ChineseTest.py Thu Jan 8 19:28:33 2009 @@ -0,0 +1,36 @@ +# -*- coding: utf-8 -*- +# ==================================================================== +# Copyright (c) 2004-2007 Open Source Applications Foundation. +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
class ChineseTest(LiaTestCase):
    """Verifies that a single CJK character stored in the fixture index
    can be found with a plain TermQuery."""

    def testChinese(self):

        # The fixture index contains exactly one book whose contents
        # include this character ("tao").
        query = TermQuery(Term("contents", "道"))
        hits = IndexSearcher(self.directory).search(query)

        self.assertEqual(1, hits.length(), "tao")
lucene/pylucene/trunk/samples/LuceneInAction/lia/analysis/keyword/KeywordAnalyzer.py URL: http://svn.apache.org/viewvc/lucene/pylucene/trunk/samples/LuceneInAction/lia/analysis/keyword/KeywordAnalyzer.py?rev=732916&view=auto ============================================================================== --- lucene/pylucene/trunk/samples/LuceneInAction/lia/analysis/keyword/KeywordAnalyzer.py (added) +++ lucene/pylucene/trunk/samples/LuceneInAction/lia/analysis/keyword/KeywordAnalyzer.py Thu Jan 8 19:28:33 2009 @@ -0,0 +1,48 @@ +# ==================================================================== +# Copyright (c) 2004-2007 Open Source Applications Foundation. +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. +# ==================================================================== +# + +from lucene import Token, PythonAnalyzer, PythonTokenStream, JArray + +# +# "Tokenizes" the entire stream as a single token. 
class KeywordAnalyzer(PythonAnalyzer):
    """Analyzer that emits the entire input stream as one single token."""

    def tokenStream(self, fieldName, reader):

        class _singleTokenStream(PythonTokenStream):
            # Yields exactly one token, then reports exhaustion (None).

            def __init__(self):
                super(_singleTokenStream, self).__init__()
                self.emitted = False

            def next(self):
                if self.emitted:
                    return None
                self.emitted = True

                # One 1024-char gulp; `size` is the count actually read.
                text = JArray('char')(1024)
                size = reader.read(text, 0, 1024)
                return Token(text, 0, size, 0, size)

        return _singleTokenStream()
+# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. 
class KeywordAnalyzerTest(TestCase):
    """Shows why an exact-match field ("partnum") needs a keyword
    analyzer at query time, and how PerFieldAnalyzerWrapper routes
    different analyzers to different fields."""

    def setUp(self):

        # One document: an untokenized part number plus a tokenized
        # description, indexed with SimpleAnalyzer.
        self.directory = RAMDirectory()
        writer = IndexWriter(self.directory, SimpleAnalyzer(), True)

        doc = Document()
        doc.add(Field("partnum", "Q36",
                      Field.Store.YES, Field.Index.UN_TOKENIZED))
        doc.add(Field("description", "Illidium Space Modulator",
                      Field.Store.YES, Field.Index.TOKENIZED))
        writer.addDocument(doc)
        writer.close()

        self.searcher = IndexSearcher(self.directory)

    def testTermQuery(self):

        # A raw TermQuery bypasses analysis, so "Q36" matches as stored.
        query = TermQuery(Term("partnum", "Q36"))
        hits = self.searcher.search(query)
        self.assertEqual(1, hits.length())

    def testBasicQueryParser(self):

        # SimpleAnalyzer lower-cases and strips digits, mangling "Q36"
        # into "q" -- so the parsed query can no longer find the doc.
        query = QueryParser("description",
                            SimpleAnalyzer()).parse("partnum:Q36 AND SPACE")

        hits = self.searcher.search(query)
        self.assertEqual("+partnum:q +space", query.toString("description"),
                         "note Q36 -> q")
        self.assertEqual(0, hits.length(), "doc not found :(")

    def testPerFieldAnalyzer(self):

        # Routing "partnum" through KeywordAnalyzer preserves "Q36"
        # verbatim while "description" still gets SimpleAnalyzer.
        analyzer = PerFieldAnalyzerWrapper(SimpleAnalyzer())
        analyzer.addAnalyzer("partnum", KeywordAnalyzer())

        query = QueryParser("description",
                            analyzer).parse("partnum:Q36 AND SPACE")
        hits = self.searcher.search(query)

        self.assertEqual("+partnum:Q36 +space", query.toString("description"),
                         "Q36 kept as-is")
        self.assertEqual(1, hits.length(), "doc found!")

    def testSimpleKeywordAnalyzer(self):

        analyzer = SimpleKeywordAnalyzer()

        # The whole input comes back as a single, unmodified token.
        input = "Hello World"
        ts = analyzer.tokenStream("dummy", StringReader(input))
        self.assertEqual(ts.next().termText(), input)
        # NOTE(review): list(ts) is never None, so this assertion is
        # vacuous; presumably it was meant to check the stream is empty.
        self.assert_(not list(ts) is None)
        ts.close()
#
# CharTokenizer limits token width to 255 characters, though.
# This implementation assumes keywords are 255 in length or less.
#

class SimpleKeywordAnalyzer(PythonAnalyzer):
    """Keyword analyzer built on CharTokenizer: every character is a
    token character and none are normalized, so the whole input comes
    back as one token (up to CharTokenizer's 255-char cap)."""

    def tokenStream(self, fieldName, reader):

        class _keywordTokenizer(PythonCharTokenizer):
            # Never split, never transform.
            def isTokenChar(self, c):
                return True

            def normalize(self, c):
                return c

        return _keywordTokenizer(reader)
native Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/analysis/keyword/__init__.py ------------------------------------------------------------------------------ svn:mime-type = text/plain Added: lucene/pylucene/trunk/samples/LuceneInAction/lia/analysis/positional/PositionalPorterStopAnalyzer.py URL: http://svn.apache.org/viewvc/lucene/pylucene/trunk/samples/LuceneInAction/lia/analysis/positional/PositionalPorterStopAnalyzer.py?rev=732916&view=auto ============================================================================== --- lucene/pylucene/trunk/samples/LuceneInAction/lia/analysis/positional/PositionalPorterStopAnalyzer.py (added) +++ lucene/pylucene/trunk/samples/LuceneInAction/lia/analysis/positional/PositionalPorterStopAnalyzer.py Thu Jan 8 19:28:33 2009 @@ -0,0 +1,57 @@ +# ==================================================================== +# Copyright (c) 2004-2007 Open Source Applications Foundation. +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
import sys

from lucene import \
    LowerCaseTokenizer, PorterStemFilter, StopAnalyzer, StopFilter, \
    TokenStream, PythonAnalyzer

from lia.analysis.positional.PositionalStopFilter import PositionalStopFilter

# Compare version tuples, not formatted strings: the original
# "'%d.%d.%d' % sys.version_info[:3] < '2.4'" test is lexicographic,
# so e.g. '2.10.0' < '2.4' would (wrongly) be True.
if sys.version_info[:2] < (2, 4):
    # Python < 2.4 has no builtin set type.
    from sets import Set as set


#
# An Analyzer extension
#

class PositionalPorterStopAnalyzer(PythonAnalyzer):
    """Analyzer that lower-cases, removes stop words while recording the
    position holes they leave, and Porter-stems the surviving tokens."""

    def __init__(self, stopWords=None):
        """stopWords: iterable of stop words; None selects
        StopAnalyzer.ENGLISH_STOP_WORDS."""

        super(PositionalPorterStopAnalyzer, self).__init__()

        if stopWords is None:
            stopWords = StopAnalyzer.ENGLISH_STOP_WORDS

        # Stored as a set for O(1) membership tests in the stop filter.
        self.stopWords = set(stopWords)

    def tokenStream(self, fieldName, reader):
        """Build the chain: tokenize -> positional stop filter -> stemmer."""

        tokenStream = LowerCaseTokenizer(reader)
        stopFilter = PositionalStopFilter(tokenStream, self.stopWords)

        return PorterStemFilter(stopFilter)
lucene/pylucene/trunk/samples/LuceneInAction/lia/analysis/positional/PositionalPorterStopAnalyzerTest.py (added) +++ lucene/pylucene/trunk/samples/LuceneInAction/lia/analysis/positional/PositionalPorterStopAnalyzerTest.py Thu Jan 8 19:28:33 2009 @@ -0,0 +1,92 @@ +# ==================================================================== +# Copyright (c) 2004-2007 Open Source Applications Foundation. +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. 
+# ==================================================================== +# + +from unittest import TestCase + +from lucene import \ + IndexWriter, Term, RAMDirectory, Document, Field, \ + IndexSearcher, QueryParser + +from lia.analysis.AnalyzerUtils import AnalyzerUtils +from lia.analysis.positional.PositionalPorterStopAnalyzer import \ + PositionalPorterStopAnalyzer + + +class PositionalPorterStopAnalyzerTest(TestCase): + + porterAnalyzer = PositionalPorterStopAnalyzer() + + def setUp(self): + + self.directory = RAMDirectory() + writer = IndexWriter(self.directory, self.porterAnalyzer, True) + + doc = Document() + doc.add(Field("contents", + "The quick brown fox jumps over the lazy dogs", + Field.Store.YES, Field.Index.TOKENIZED)) + writer.addDocument(doc) + writer.close() + + def testStems(self): + + searcher = IndexSearcher(self.directory) + query = QueryParser("contents", self.porterAnalyzer).parse("laziness") + hits = searcher.search(query) + + self.assertEqual(1, hits.length(), "lazi") + + query = QueryParser("contents", + self.porterAnalyzer).parse('"fox jumped"') + hits = searcher.search(query) + + self.assertEqual(1, hits.length(), "jump jumps jumped jumping") + + def testExactPhrase(self): + + searcher = IndexSearcher(self.directory) + query = QueryParser("contents", + self.porterAnalyzer).parse('"over the lazy"') + hits = searcher.search(query) + + self.assertEqual(0, hits.length(), "exact match not found!") + + def testWithSlop(self): + + searcher = IndexSearcher(self.directory) + + parser = QueryParser("contents", self.porterAnalyzer) + parser.setPhraseSlop(1) + + query = parser.parse('"over the lazy"') + hits = searcher.search(query) + + self.assertEqual(1, hits.length(), "hole accounted for") + + def main(cls): + + text = "The quick brown fox jumps over the lazy dogs" + AnalyzerUtils.displayTokensWithPositions(cls.porterAnalyzer, text) + print '' + + main = classmethod(main) Propchange: 
lucene/pylucene/trunk/samples/LuceneInAction/lia/analysis/positional/PositionalPorterStopAnalyzerTest.py ------------------------------------------------------------------------------ svn:eol-style = native Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/analysis/positional/PositionalPorterStopAnalyzerTest.py ------------------------------------------------------------------------------ svn:mime-type = text/plain Added: lucene/pylucene/trunk/samples/LuceneInAction/lia/analysis/positional/PositionalStopFilter.py URL: http://svn.apache.org/viewvc/lucene/pylucene/trunk/samples/LuceneInAction/lia/analysis/positional/PositionalStopFilter.py?rev=732916&view=auto ============================================================================== --- lucene/pylucene/trunk/samples/LuceneInAction/lia/analysis/positional/PositionalStopFilter.py (added) +++ lucene/pylucene/trunk/samples/LuceneInAction/lia/analysis/positional/PositionalStopFilter.py Thu Jan 8 19:28:33 2009 @@ -0,0 +1,51 @@ +# ==================================================================== +# Copyright (c) 2004-2007 Open Source Applications Foundation. +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
class PositionalStopFilter(PythonTokenFilter):
    """TokenFilter that drops stop words but preserves positional
    information by enlarging the position increment of the next token
    that is kept."""

    def __init__(self, tokenStream, stopWords):
        """tokenStream: the wrapped stream; stopWords: container
        supporting membership tests on term text."""

        super(PositionalStopFilter, self).__init__(tokenStream)

        # Keep our own handles on the wrapped stream and the stop set.
        self.input = tokenStream
        self.stopWords = stopWords

    def next(self):
        """Return the next non-stop token with its position increment
        grown by the number of stop words skipped, or None at end of
        stream."""

        skipped = 0

        for token in self.input:
            if token.termText() in self.stopWords:
                # Stop word: drop it, but remember the hole it leaves.
                skipped += 1
            else:
                token.setPositionIncrement(token.getPositionIncrement()
                                           + skipped)
                return token

        return None
------------------------------------------------------------------------------ svn:eol-style = native Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/analysis/positional/__init__.py ------------------------------------------------------------------------------ svn:mime-type = text/plain Added: lucene/pylucene/trunk/samples/LuceneInAction/lia/analysis/queryparser/AnalysisParalysisTest.py URL: http://svn.apache.org/viewvc/lucene/pylucene/trunk/samples/LuceneInAction/lia/analysis/queryparser/AnalysisParalysisTest.py?rev=732916&view=auto ============================================================================== --- lucene/pylucene/trunk/samples/LuceneInAction/lia/analysis/queryparser/AnalysisParalysisTest.py (added) +++ lucene/pylucene/trunk/samples/LuceneInAction/lia/analysis/queryparser/AnalysisParalysisTest.py Thu Jan 8 19:28:33 2009 @@ -0,0 +1,48 @@ +# ==================================================================== +# Copyright (c) 2004-2007 Open Source Applications Foundation. +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
class AnalysisParalysisTest(LiaTestCase):
    """Shows how PerFieldAnalyzerWrapper keeps StandardAnalyzer from
    mangling a path-like field value."""

    def testAnalyzer(self):
        standard = StandardAnalyzer()
        queryString = "category:/philosophy/eastern"

        # StandardAnalyzer splits the path into a phrase - not wanted.
        splitQuery = QueryParser("contents", standard).parse(queryString)

        self.assertEqual("category:\"philosophy eastern\"",
                         splitQuery.toString("contents"),
                         "path got split, yikes!")

        # Route the category field through WhitespaceAnalyzer instead,
        # leaving every other field on StandardAnalyzer.
        wrapped = PerFieldAnalyzerWrapper(standard)
        wrapped.addAnalyzer("category", WhitespaceAnalyzer())
        intactQuery = QueryParser("contents", wrapped).parse(queryString)

        self.assertEqual("category:/philosophy/eastern",
                         intactQuery.toString("contents"),
                         "leave category field alone")
lucene/pylucene/trunk/samples/LuceneInAction/lia/analysis/queryparser/__init__.py Thu Jan 8 19:28:33 2009 @@ -0,0 +1,2 @@ +# queryparser package + Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/analysis/queryparser/__init__.py ------------------------------------------------------------------------------ svn:eol-style = native Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/analysis/queryparser/__init__.py ------------------------------------------------------------------------------ svn:mime-type = text/plain Added: lucene/pylucene/trunk/samples/LuceneInAction/lia/analysis/stopanalyzer/StopAnalyzer2.py URL: http://svn.apache.org/viewvc/lucene/pylucene/trunk/samples/LuceneInAction/lia/analysis/stopanalyzer/StopAnalyzer2.py?rev=732916&view=auto ============================================================================== --- lucene/pylucene/trunk/samples/LuceneInAction/lia/analysis/stopanalyzer/StopAnalyzer2.py (added) +++ lucene/pylucene/trunk/samples/LuceneInAction/lia/analysis/stopanalyzer/StopAnalyzer2.py Thu Jan 8 19:28:33 2009 @@ -0,0 +1,43 @@ +# ==================================================================== +# Copyright (c) 2004-2007 Open Source Applications Foundation. +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. 
class StopAnalyzer2(object):
    """Analyzer variant that lower-cases BEFORE stop-word filtering,
    built from LetterTokenizer + LowerCaseFilter + StopFilter."""

    def __init__(self, stopWords=None):
        """stopWords: sequence of stop words, or None to fall back to
        StopAnalyzer.ENGLISH_STOP_WORDS."""

        if stopWords is not None:
            self.stopWords = stopWords
        else:
            self.stopWords = StopAnalyzer.ENGLISH_STOP_WORDS

    def tokenStream(self, fieldName, reader):
        """Tokenize on letters, lower-case, then remove stop words."""

        tokenizer = LetterTokenizer(reader)
        lowered = LowerCaseFilter(tokenizer)

        return StopFilter(lowered, self.stopWords)