Return-Path: X-Original-To: apmail-lucene-pylucene-commits-archive@minotaur.apache.org Delivered-To: apmail-lucene-pylucene-commits-archive@minotaur.apache.org Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by minotaur.apache.org (Postfix) with SMTP id 8F861F94E for ; Sun, 7 Apr 2013 20:58:36 +0000 (UTC) Received: (qmail 46925 invoked by uid 500); 7 Apr 2013 20:58:36 -0000 Delivered-To: apmail-lucene-pylucene-commits-archive@lucene.apache.org Received: (qmail 46905 invoked by uid 500); 7 Apr 2013 20:58:36 -0000 Mailing-List: contact pylucene-commits-help@lucene.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: pylucene-dev@lucene.apache.org Delivered-To: mailing list pylucene-commits@lucene.apache.org Received: (qmail 46893 invoked by uid 99); 7 Apr 2013 20:58:36 -0000 Received: from athena.apache.org (HELO athena.apache.org) (140.211.11.136) by apache.org (qpsmtpd/0.29) with ESMTP; Sun, 07 Apr 2013 20:58:36 +0000 X-ASF-Spam-Status: No, hits=-2000.0 required=5.0 tests=ALL_TRUSTED X-Spam-Check-By: apache.org Received: from [140.211.11.4] (HELO eris.apache.org) (140.211.11.4) by apache.org (qpsmtpd/0.29) with ESMTP; Sun, 07 Apr 2013 20:58:35 +0000 Received: from eris.apache.org (localhost [127.0.0.1]) by eris.apache.org (Postfix) with ESMTP id 1A0E423889FA; Sun, 7 Apr 2013 20:58:15 +0000 (UTC) Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Subject: svn commit: r1465461 - /lucene/pylucene/trunk/samples/TermPositionVector.py Date: Sun, 07 Apr 2013 20:58:15 -0000 To: pylucene-commits@lucene.apache.org From: vajda@apache.org X-Mailer: svnmailer-1.0.8-patched Message-Id: <20130407205815.1A0E423889FA@eris.apache.org> X-Virus-Checked: Checked by ClamAV on apache.org Author: vajda Date: Sun Apr 7 20:58:14 2013 New Revision: 1465461 URL: http://svn.apache.org/r1465461 Log: migrated TermPositionVector.py Modified: lucene/pylucene/trunk/samples/TermPositionVector.py Modified: lucene/pylucene/trunk/samples/TermPositionVector.py URL: http://svn.apache.org/viewvc/lucene/pylucene/trunk/samples/TermPositionVector.py?rev=1465461&r1=1465460&r2=1465461&view=diff ============================================================================== --- lucene/pylucene/trunk/samples/TermPositionVector.py (original) +++ lucene/pylucene/trunk/samples/TermPositionVector.py Sun Apr 7 20:58:14 2013 @@ -1,37 +1,53 @@ -from lucene import \ - StandardAnalyzer, RAMDirectory, Document, Field, Version, \ - IndexWriter, IndexReader, TermPositionVector, initVM + +import lucene + +from org.apache.lucene.analysis.miscellaneous import LimitTokenCountAnalyzer +from org.apache.lucene.analysis.standard import StandardAnalyzer +from org.apache.lucene.store import RAMDirectory +from org.apache.lucene.document import Document, Field, FieldType +from org.apache.lucene.util import BytesRef, BytesRefIterator, Version +from org.apache.lucene.index import \ + IndexWriterConfig, IndexWriter, DirectoryReader if __name__ == '__main__': - initVM() + lucene.initVM() directory = RAMDirectory() -iwriter = IndexWriter(directory, StandardAnalyzer(Version.LUCENE_CURRENT), - True, IndexWriter.MaxFieldLength.LIMITED) +iconfig = IndexWriterConfig(Version.LUCENE_CURRENT, LimitTokenCountAnalyzer(StandardAnalyzer(Version.LUCENE_CURRENT), 100)) +iwriter = IndexWriter(directory, iconfig) + +ft = FieldType() +ft.setIndexed(True) +ft.setStored(True) +ft.setTokenized(True) +ft.setStoreTermVectors(True) +ft.setStoreTermVectorOffsets(True) +ft.setStoreTermVectorPositions(True) + ts = ["this bernhard is the text to be index text", - "this claudia is the text to be index"] + "this claudia is the text to be indexed"] for t in ts: doc = Document() - doc.add(Field("fieldname", t, - Field.Store.YES, Field.Index.ANALYZED, - Field.TermVector.WITH_POSITIONS_OFFSETS)) + doc.add(Field("fieldname", t, ft)) iwriter.addDocument(doc) + iwriter.commit() iwriter.close() +ireader = DirectoryReader.open(directory) + +for doc in xrange(0, len(ts)): + tv = ireader.getTermVector(doc, "fieldname") + termsEnum = tv.iterator(None) + + for term in BytesRefIterator.cast_(termsEnum): + dpEnum = termsEnum.docsAndPositions(None, None) + dpEnum.nextDoc() # prime the enum which works only for the current doc + freq = dpEnum.freq() -ireader = IndexReader.open(directory, True) -tpv = TermPositionVector.cast_(ireader.getTermFreqVector(0, 'fieldname')) + print 'term:', term.utf8ToString() + print ' freq:', freq -for (t,f,i) in zip(tpv.getTerms(),tpv.getTermFrequencies(),xrange(100000)): - print 'term %s' % t - print ' freq: %i' % f - try: - print ' pos: ' + str([p for p in tpv.getTermPositions(i)]) - except: - print ' no pos' - try: - print ' off: ' + \ - str(["%i-%i" % (o.getStartOffset(), o.getEndOffset()) - for o in tpv.getOffsets(i)]) - except: - print ' no offsets' + for i in xrange(freq): + print " pos:", dpEnum.nextPosition() + print " off: %i-%i" %(dpEnum.startOffset(), dpEnum.endOffset()) + print