lucene-pylucene-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From va...@apache.org
Subject svn commit: r1465461 - /lucene/pylucene/trunk/samples/TermPositionVector.py
Date Sun, 07 Apr 2013 20:58:15 GMT
Author: vajda
Date: Sun Apr  7 20:58:14 2013
New Revision: 1465461

URL: http://svn.apache.org/r1465461
Log:
migrated TermPositionVector.py

Modified:
    lucene/pylucene/trunk/samples/TermPositionVector.py

Modified: lucene/pylucene/trunk/samples/TermPositionVector.py
URL: http://svn.apache.org/viewvc/lucene/pylucene/trunk/samples/TermPositionVector.py?rev=1465461&r1=1465460&r2=1465461&view=diff
==============================================================================
--- lucene/pylucene/trunk/samples/TermPositionVector.py (original)
+++ lucene/pylucene/trunk/samples/TermPositionVector.py Sun Apr  7 20:58:14 2013
@@ -1,37 +1,53 @@
-from lucene import \
-    StandardAnalyzer, RAMDirectory, Document, Field, Version, \
-    IndexWriter, IndexReader, TermPositionVector, initVM
+
+import lucene
+
+from org.apache.lucene.analysis.miscellaneous import LimitTokenCountAnalyzer
+from org.apache.lucene.analysis.standard import StandardAnalyzer
+from org.apache.lucene.store import RAMDirectory
+from org.apache.lucene.document import Document, Field, FieldType
+from org.apache.lucene.util import BytesRef, BytesRefIterator, Version
+from org.apache.lucene.index import \
+    IndexWriterConfig, IndexWriter, DirectoryReader
 
 if __name__ == '__main__':
-    initVM()
+    lucene.initVM()
 
 directory = RAMDirectory()
-iwriter = IndexWriter(directory, StandardAnalyzer(Version.LUCENE_CURRENT),
-                      True, IndexWriter.MaxFieldLength.LIMITED)
+iconfig = IndexWriterConfig(Version.LUCENE_CURRENT, LimitTokenCountAnalyzer(StandardAnalyzer(Version.LUCENE_CURRENT), 100))
+iwriter = IndexWriter(directory, iconfig)
+
+ft = FieldType()
+ft.setIndexed(True)
+ft.setStored(True)
+ft.setTokenized(True)
+ft.setStoreTermVectors(True)
+ft.setStoreTermVectorOffsets(True)
+ft.setStoreTermVectorPositions(True)
+
 ts = ["this bernhard is the text to be index text",
-      "this claudia is the text to be index"]
+      "this claudia is the text to be indexed"]
 for t in ts:
     doc = Document()
-    doc.add(Field("fieldname", t,
-                  Field.Store.YES, Field.Index.ANALYZED,
-                  Field.TermVector.WITH_POSITIONS_OFFSETS))
+    doc.add(Field("fieldname", t, ft))
     iwriter.addDocument(doc)
+
 iwriter.commit()
 iwriter.close()
+ireader = DirectoryReader.open(directory)
+
+for doc in xrange(0, len(ts)):
+    tv = ireader.getTermVector(doc, "fieldname")
+    termsEnum = tv.iterator(None)
+
+    for term in BytesRefIterator.cast_(termsEnum):
+        dpEnum = termsEnum.docsAndPositions(None, None)
+        dpEnum.nextDoc()  # prime the enum which works only for the current doc
+        freq = dpEnum.freq()
 
-ireader = IndexReader.open(directory, True)
-tpv = TermPositionVector.cast_(ireader.getTermFreqVector(0, 'fieldname'))
+        print 'term:', term.utf8ToString()
+        print '  freq:', freq
 
-for (t,f,i) in zip(tpv.getTerms(),tpv.getTermFrequencies(),xrange(100000)):
-    print 'term %s' % t
-    print '  freq: %i' % f
-    try:
-        print '  pos: ' + str([p for p in tpv.getTermPositions(i)])
-    except:
-        print '  no pos'
-    try:
-        print '  off: ' + \
-              str(["%i-%i" % (o.getStartOffset(), o.getEndOffset())
-                   for o in tpv.getOffsets(i)])
-    except:
-        print '  no offsets'
+        for i in xrange(freq):
+            print "  pos:", dpEnum.nextPosition()
+            print "  off: %i-%i" %(dpEnum.startOffset(), dpEnum.endOffset())
+    print



Mime
View raw message