lucene-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From va...@apache.org
Subject svn commit: r732916 [11/14] - in /lucene/pylucene/trunk: ./ java/ java/org/ java/org/osafoundation/ java/org/osafoundation/lucene/ java/org/osafoundation/lucene/analysis/ java/org/osafoundation/lucene/queryParser/ java/org/osafoundation/lucene/search/ ...
Date Fri, 09 Jan 2009 03:28:41 GMT
Added: lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/data/RTF.rtf
URL: http://svn.apache.org/viewvc/lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/data/RTF.rtf?rev=732916&view=auto
==============================================================================
--- lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/data/RTF.rtf (added)
+++ lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/data/RTF.rtf Thu Jan  8 19:28:33 2009
@@ -0,0 +1,25 @@
+{\rtf1\ansi\deff1\adeflang1025
+{\fonttbl{\f0\froman\fprq2\fcharset0 Nimbus Roman No9 L;}{\f1\froman\fprq2\fcharset0 Times New Roman;}{\f2\froman\fprq2\fcharset0 Times New Roman;}}
+{\colortbl;\red0\green0\blue0;\red128\green128\blue128;}
+{\stylesheet{\s1\cf0{\*\hyphen2\hyphlead2\hyphtrail2\hyphmax0}\rtlch\af1\afs24\lang1033\ltrch\dbch\af1\afs24\langfe1033\loch\f1\fs24\lang1033\snext1 Default;}
+{\s2\cf0{\*\hyphen2\hyphlead2\hyphtrail2\hyphmax0}\aspalpha\rtlch\lang1025\ltrch\dbch\af1\loch\f1\sbasedon1\snext2 Normal;}
+{\s3\cf0\tqc\tx4320\tqr\tx8640{\*\hyphen2\hyphlead2\hyphtrail2\hyphmax0}\rtlch\af1\afs24\lang1033\ltrch\dbch\af1\afs24\langfe1033\loch\f1\fs24\lang1033\sbasedon1\snext3 Header;}
+{\s4\cf0\tqc\tx4320\tqr\tx8640{\*\hyphen2\hyphlead2\hyphtrail2\hyphmax0}\rtlch\af1\afs24\lang1033\ltrch\dbch\af1\afs24\langfe1033\loch\f1\fs24\lang1033\sbasedon1\snext4 Footer;}
+{\*\cs6\cf0\rtlch\lang1033\ltrch\dbch\af1\loch\f1\sbasedon7 Default Paragraph Font;}
+{\*\cs7\cf0\rtlch\lang1025\ltrch\dbch\af1\loch\f1 Normal;}
+}
+{\info{\comment StarWriter}{\vern6410}}\deftab720
+{\*\pgdsctbl
+{\pgdsc0\pgdscuse195\pgwsxn12240\pghsxn15840\marglsxn1800\margrsxn1800\margtsxn1440\margbsxn1440\pgdscnxt0 Default;}
+{\pgdsc1\pgdscuse195\pgwsxn12240\pghsxn15840\marglsxn1800\margrsxn1800\margtsxn720\margbsxn720\headery0{\*\headeryb0\headerxl0\headerxr0\headeryh720}{\header \pard\plain \s3\cf0\tqc\tx4320\tqr\tx8640{\*\hyphen2\hyphlead2\hyphtrail2\hyphmax0}\rtlch\af1\afs24\lang1033\ltrch\dbch\af1\afs24\langfe1033\loch\f1\fs24\lang1033 
+\par }
+\footery0{\*\footeryt720\footerxl0\footerxr0\footeryh0}{\footer \pard\plain \s4\cf0\tqc\tx4320\tqr\tx8640{\*\hyphen2\hyphlead2\hyphtrail2\hyphmax0}\rtlch\af1\afs24\lang1033\ltrch\dbch\af1\afs24\langfe1033\loch\f1\fs24\lang1033 
+\par }
+\pgdscnxt1 Convert 1;}}
+{\*\pgdscno1}\paperh15840\paperw12240\margl1800\margr1800\margt720\margb720\sectd\sbknone\pgwsxn12240\pghsxn15840\marglsxn1800\margrsxn1800\margtsxn1440\margbsxn1728\headery720{\header \pard\plain \s3\cf0\tqc\tx4320\tqr\tx8640{\*\hyphen2\hyphlead2\hyphtrail2\hyphmax0}\rtlch\af1\afs24\lang1033\ltrch\dbch\af1\afs24\langfe1033\loch\f1\fs24\lang1033 
+\par }
+\footery720{\footer \pard\plain \s4\cf0\tqc\tx4320\tqr\tx8640{\*\hyphen2\hyphlead2\hyphtrail2\hyphmax0}\rtlch\af1\afs24\lang1033\ltrch\dbch\af1\afs24\langfe1033\loch\f1\fs24\lang1033 
+\par }
+\ftnbj\ftnstart1\ftnrstcont\ftnnar\aenddoc\aftnrstcont\aftnstart1\aftnnrlc
+\pard\plain \s2\cf0{\*\hyphen2\hyphlead2\hyphtrail2\hyphmax0}\aspalpha\rtlch\lang1025\ltrch\dbch\af1\loch\f1 {\ltrch\loch\f1 This is the content of the RTF document}
+\par }
\ No newline at end of file

Added: lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/data/addressbook-entry.xml
URL: http://svn.apache.org/viewvc/lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/data/addressbook-entry.xml?rev=732916&view=auto
==============================================================================
--- lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/data/addressbook-entry.xml (added)
+++ lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/data/addressbook-entry.xml Thu Jan  8 19:28:33 2009
@@ -0,0 +1,12 @@
+<?xml version='1.0' encoding='utf-8'?>
+<address-book>
+    <contact type="individual">
+        <name>Zane Pasolini</name>
+        <address>999 W. Prince St.</address>
+        <city>New York</city>
+        <province>NY</province>
+        <postalcode>10013</postalcode>
+        <country>USA</country>
+        <telephone>+1 212 345 6789</telephone>
+    </contact>
+</address-book>

Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/data/addressbook-entry.xml
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/data/addressbook-entry.xml
------------------------------------------------------------------------------
    svn:mime-type = text/xml

Added: lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/data/addressbook.xml
URL: http://svn.apache.org/viewvc/lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/data/addressbook.xml?rev=732916&view=auto
==============================================================================
--- lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/data/addressbook.xml (added)
+++ lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/data/addressbook.xml Thu Jan  8 19:28:33 2009
@@ -0,0 +1,21 @@
+<?xml version='1.0' encoding='utf-8'?>
+<address-book>
+    <contact type="individual">
+        <name>Zane Pasolini</name>
+        <address>999 W. Prince St.</address>
+        <city>New York</city>
+        <province>NY</province>
+        <postalcode>10013</postalcode>
+        <country>USA</country>
+        <telephone>+1 212 345 6789</telephone>
+    </contact>
+    <contact type="business">
+        <name>SAMOFIX d.o.o.</name>
+        <address>Ilica 47-2</address>
+        <city>Zagreb</city>
+        <province></province>
+        <postalcode>10000</postalcode>
+        <country>Croatia</country>
+        <telephone>+385 1 123 4567</telephone>
+    </contact>
+</address-book>

Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/data/addressbook.xml
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/data/addressbook.xml
------------------------------------------------------------------------------
    svn:mime-type = text/xml

Added: lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/framework/FileIndexer.py
URL: http://svn.apache.org/viewvc/lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/framework/FileIndexer.py?rev=732916&view=auto
==============================================================================
--- lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/framework/FileIndexer.py (added)
+++ lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/framework/FileIndexer.py Thu Jan  8 19:28:33 2009
@@ -0,0 +1,133 @@
+# ====================================================================
+# Copyright (c) 2004-2007 Open Source Applications Foundation.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions: 
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software. 
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+# ====================================================================
+#
+
+import os
+import lia.handlingtypes as handlingtypes
+
+from time import time
+from datetime import timedelta
+from lucene import IndexWriter, StandardAnalyzer
+
+from lia.util.ClassLoader import ClassLoader
+
+ #
+ # A File Indexer capable of recursively indexing a directory tree.
+ # Based on lia.meetlucene.Indexer, but handling more than plaintext.
+ #
+
+class FileIndexer(object):
+
+    def main(cls, argv):
+
+        if len(argv) != 3:
+            print "Usage: python FileIndexer.py <index dir> <data dir>"
+            return
+
+        indexDir = argv[1]
+        dataDir = argv[2]
+
+        propsFile = os.path.join(os.path.dirname(handlingtypes.__file__),
+                                 'framework', 'handler.properties')
+        input = file(propsFile)
+        props = {}
+        while True:
+            line = input.readline().strip()
+            if not line:
+                break
+            if line.startswith('#'):
+                continue
+            name, value = line.split('=')
+            props[name.strip()] = value.strip()
+        input.close()
+        cls.handlerProps = props
+
+        start = time()
+        numIndexed = cls.index(indexDir, dataDir)
+        duration = timedelta(seconds=time() - start)
+
+        print "Indexing %s files took %s" %(numIndexed, duration)
+
+    def index(cls, indexDir, dataDir):
+
+        if not (os.path.exists(dataDir) and os.path.isdir(dataDir)):
+            raise IOError, "%s does not exist or is not a directory" %(dataDir)
+
+        writer = IndexWriter(indexDir, StandardAnalyzer(), True)
+        writer.setUseCompoundFile(False)
+
+        numIndexed = cls.indexDirectory(writer, dataDir)
+        writer.optimize()
+        writer.close()
+
+        return numIndexed
+
+    def indexDirectory(cls, writer, dir):
+
+        count = 0
+        dirs = []
+
+        for name in os.listdir(dir):
+            path = os.path.join(dir, name)
+            if os.path.isfile(path):
+                doc = cls.indexFile(writer, path)
+                if doc is not None:
+                    count += 1
+            elif os.path.isdir(path) and not name.startswith('.'):
+                dirs.append(path)
+
+        for dir in dirs:
+            count += cls.indexDirectory(writer, dir)
+
+        return count
+
+    def indexFile(cls, writer, path):
+
+        name, ext = os.path.splitext(path)
+        if ext.startswith(os.path.extsep):
+            ext = ext[len(os.path.extsep):]
+
+        if ext:
+            handlerClassName = cls.handlerProps.get(ext, None)
+            if handlerClassName is None:
+                print "error indexing %s: no handler for %s files" %(path, ext)
+                return None
+
+            try:
+                handlerClass = ClassLoader.loadClass(handlerClassName)
+                handler = handlerClass()
+
+                doc = handler.indexFile(writer, path)
+                if doc is not None:
+                    print 'indexed', path
+
+                return doc
+            except SyntaxError:
+                raise
+            except Exception, e:
+                print 'error indexing %s: %s' %(path, e)
+                return None
+
+    main = classmethod(main)
+    index = classmethod(index)
+    indexDirectory = classmethod(indexDirectory)
+    indexFile = classmethod(indexFile)

Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/framework/FileIndexer.py
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/framework/FileIndexer.py
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/framework/__init__.py
URL: http://svn.apache.org/viewvc/lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/framework/__init__.py?rev=732916&view=auto
==============================================================================
--- lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/framework/__init__.py (added)
+++ lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/framework/__init__.py Thu Jan  8 19:28:33 2009
@@ -0,0 +1 @@
+# framework package

Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/framework/__init__.py
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/framework/__init__.py
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/framework/handler.properties
URL: http://svn.apache.org/viewvc/lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/framework/handler.properties?rev=732916&view=auto
==============================================================================
--- lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/framework/handler.properties (added)
+++ lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/framework/handler.properties Thu Jan  8 19:28:33 2009
@@ -0,0 +1,6 @@
+txt  = lia.handlingtypes.text.PlainTextHandler.PlainTextHandler
+html = lia.handlingtypes.html.HTMLHandler.HTMLHandler
+pdf  = lia.handlingtypes.pdf.PDFHandler.PDFHandler
+xml  = lia.handlingtypes.xml.DigesterXMLHandler.DigesterXMLHandler
+doc  = lia.handlingtypes.msdoc.AntiWordHandler.AntiWordHandler
+#rtf  = lia.handlingtypes.rtf.JavaBuiltInRTFHandler.JavaBuiltInRTFHandler

Added: lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/html/HTMLHandler.py
URL: http://svn.apache.org/viewvc/lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/html/HTMLHandler.py?rev=732916&view=auto
==============================================================================
--- lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/html/HTMLHandler.py (added)
+++ lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/html/HTMLHandler.py Thu Jan  8 19:28:33 2009
@@ -0,0 +1,47 @@
+# ====================================================================
+# Copyright (c) 2004-2007 Open Source Applications Foundation.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions: 
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software. 
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+# ====================================================================
+#
+
+import os, popen2
+
+from lucene import Document, Field, StringReader
+from lia.util.Streams import HTMLReader, InputStreamReader
+
+
+class HTMLHandler(object):
+
+    def indexFile(self, writer, path):
+
+        try:
+            file = open(path)
+            string = HTMLReader(InputStreamReader(file, 'utf-8')).read()
+            file.close()
+        except:
+            raise
+        else:
+            doc = Document()
+            doc.add(Field("contents", StringReader(string)))
+            doc.add(Field("filename", os.path.abspath(path),
+                          Field.Store.YES, Field.Index.UN_TOKENIZED))
+            writer.addDocument(doc)
+
+            return doc

Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/html/HTMLHandler.py
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/html/HTMLHandler.py
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/html/__init__.py
URL: http://svn.apache.org/viewvc/lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/html/__init__.py?rev=732916&view=auto
==============================================================================
--- lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/html/__init__.py (added)
+++ lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/html/__init__.py Thu Jan  8 19:28:33 2009
@@ -0,0 +1 @@
+# html package

Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/html/__init__.py
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/html/__init__.py
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/msdoc/AntiWordHandler.py
URL: http://svn.apache.org/viewvc/lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/msdoc/AntiWordHandler.py?rev=732916&view=auto
==============================================================================
--- lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/msdoc/AntiWordHandler.py (added)
+++ lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/msdoc/AntiWordHandler.py Thu Jan  8 19:28:33 2009
@@ -0,0 +1,51 @@
+# ====================================================================
+# Copyright (c) 2004-2007 Open Source Applications Foundation.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions: 
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software. 
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+# ====================================================================
+#
+
+import os, popen2
+
+from lucene import Document, Field, StringReader
+from lia.util.Streams import InputStreamReader
+
+
+class AntiWordHandler(object):
+
+    def indexFile(self, writer, path):
+
+        doc = Document()
+
+        try:
+            process = popen2.Popen4(["antiword", "-m", "UTF-8", path])
+            string = InputStreamReader(process.fromchild, 'utf-8').read()
+        except:
+            raise
+        else:
+            doc.add(Field("contents", StringReader(string)))
+            doc.add(Field("filename", os.path.abspath(path),
+                          Field.Store.YES, Field.Index.UN_TOKENIZED))
+            writer.addDocument(doc)
+
+            exitCode = process.wait()
+            if exitCode != 0:
+                raise RuntimeError, "antiword exit code %d" %(exitCode)
+
+            return doc

Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/msdoc/AntiWordHandler.py
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/msdoc/AntiWordHandler.py
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/msdoc/__init__.py
URL: http://svn.apache.org/viewvc/lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/msdoc/__init__.py?rev=732916&view=auto
==============================================================================
--- lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/msdoc/__init__.py (added)
+++ lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/msdoc/__init__.py Thu Jan  8 19:28:33 2009
@@ -0,0 +1 @@
+# msdoc package

Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/msdoc/__init__.py
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/msdoc/__init__.py
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/pdf/PDFHandler.py
URL: http://svn.apache.org/viewvc/lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/pdf/PDFHandler.py?rev=732916&view=auto
==============================================================================
--- lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/pdf/PDFHandler.py (added)
+++ lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/pdf/PDFHandler.py Thu Jan  8 19:28:33 2009
@@ -0,0 +1,68 @@
+# ====================================================================
+# Copyright (c) 2004-2007 Open Source Applications Foundation.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions: 
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software. 
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+# ====================================================================
+#
+
+import os, popen2
+
+from lucene import Document, Field, StringReader
+from lia.util.Streams import InputStreamReader
+
+
+class PDFHandler(object):
+
+    def indexFile(self, writer, path):
+
+        doc = Document()
+
+        try:
+            process = popen2.Popen4(["pdfinfo", "-enc", "UTF-8", path])
+        except:
+            raise
+        else:
+            while True:
+                line = process.fromchild.readline().strip()
+                if not line:
+                    break
+                name, value = line.split(':', 1)
+                doc.add(Field(name.strip(), value.strip(),
+                              Field.Store.YES, Field.Index.UN_TOKENIZED))
+
+            exitCode = process.wait()
+            if exitCode != 0:
+                raise RuntimeError, "pdfinfo exit code %d" %(exitCode)
+        
+        try:
+            process = popen2.Popen4(["pdftotext", "-enc", "UTF-8", path, "-"])
+            string = InputStreamReader(process.fromchild, 'utf-8').read()
+        except:
+            raise
+        else:
+            doc.add(Field("contents", StringReader(string)))
+            doc.add(Field("filename", os.path.abspath(path),
+                          Field.Store.YES, Field.Index.UN_TOKENIZED))
+            writer.addDocument(doc)
+
+            exitCode = process.wait()
+            if exitCode != 0:
+                raise RuntimeError, "pdftotext exit code %d" %(exitCode)
+
+            return doc

Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/pdf/PDFHandler.py
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/pdf/PDFHandler.py
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/pdf/__init__.py
URL: http://svn.apache.org/viewvc/lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/pdf/__init__.py?rev=732916&view=auto
==============================================================================
--- lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/pdf/__init__.py (added)
+++ lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/pdf/__init__.py Thu Jan  8 19:28:33 2009
@@ -0,0 +1 @@
+# pdf package

Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/pdf/__init__.py
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/pdf/__init__.py
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/text/PlainTextHandler.py
URL: http://svn.apache.org/viewvc/lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/text/PlainTextHandler.py?rev=732916&view=auto
==============================================================================
--- lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/text/PlainTextHandler.py (added)
+++ lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/text/PlainTextHandler.py Thu Jan  8 19:28:33 2009
@@ -0,0 +1,46 @@
+# ====================================================================
+# Copyright (c) 2004-2007 Open Source Applications Foundation.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions: 
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software. 
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+# ====================================================================
+#
+
+import os
+
+from lucene import Document, Field, \
+    InputStreamReader, FileInputStream, JavaError
+
+
+class PlainTextHandler(object):
+
+    def indexFile(self, writer, path):
+
+        try:
+            reader = InputStreamReader(FileInputStream(path), 'iso-8859-1')
+        except JavaError:
+            raise
+        else:
+            doc = Document()
+            doc.add(Field("contents", reader))
+            doc.add(Field("filename", os.path.abspath(path),
+                          Field.Store.YES, Field.Index.UN_TOKENIZED))
+            writer.addDocument(doc)
+            reader.close()
+
+            return doc

Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/text/PlainTextHandler.py
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/text/PlainTextHandler.py
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/text/__init__.py
URL: http://svn.apache.org/viewvc/lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/text/__init__.py?rev=732916&view=auto
==============================================================================
--- lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/text/__init__.py (added)
+++ lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/text/__init__.py Thu Jan  8 19:28:33 2009
@@ -0,0 +1 @@
+# text package

Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/text/__init__.py
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/text/__init__.py
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/xml/Digester.py
URL: http://svn.apache.org/viewvc/lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/xml/Digester.py?rev=732916&view=auto
==============================================================================
--- lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/xml/Digester.py (added)
+++ lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/xml/Digester.py Thu Jan  8 19:28:33 2009
@@ -0,0 +1,76 @@
+# ====================================================================
+# Copyright (c) 2004-2007 Open Source Applications Foundation.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions: 
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software. 
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+# ====================================================================
+#
+
+import xml.sax
+
+
+class Digester(xml.sax.ContentHandler):
+
+    attributes = {}
+    tags = {}
+
+    def addSetProperty(self, path, property, attribute=None):
+
+        if attribute is not None:
+            pairs = self.attributes.get(path)
+            if pairs is None:
+                self.attributes[path] = pairs = { attribute: property }
+            else:
+                pairs[property] = attribute
+
+        else:
+            self.tags[path] = property
+
+    def parse(self, input):
+
+        xml.sax.parse(input, self)
+        return self.properties
+    
+    def startDocument(self):
+
+        self.properties = {}
+        self.path = []
+
+    def startElement(self, tag, attrs):
+
+        self.path.append(tag)
+        pairs = self.attributes.get('/'.join(self.path))
+        if pairs is not None:
+            for name, value in attrs.items():
+                property = pairs.get(name)
+                if property is not None:
+                    self.properties[property] = value
+
+    def characters(self, data):
+
+        self.data = data.strip()
+
+    def endElement(self, tag):
+
+        if self.data:
+            property = self.tags.get('/'.join(self.path))
+            if property is not None:
+                self.properties[property] = self.data
+            self.data = None
+            
+        self.path.pop()

Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/xml/Digester.py
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/xml/Digester.py
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/xml/DigesterXMLHandler.py
URL: http://svn.apache.org/viewvc/lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/xml/DigesterXMLHandler.py?rev=732916&view=auto
==============================================================================
--- lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/xml/DigesterXMLHandler.py (added)
+++ lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/xml/DigesterXMLHandler.py Thu Jan  8 19:28:33 2009
@@ -0,0 +1,75 @@
+# ====================================================================
+# Copyright (c) 2004-2007 Open Source Applications Foundation.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions: 
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software. 
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+# ====================================================================
+#
+
+import os
+
+from lucene import Document, Field
+from lia.handlingtypes.xml.Digester import Digester
+
+
+class DigesterXMLHandler(object):
+
+    def __init__(self):
+
+        self.digester = digester = Digester()
+
+        digester.addSetProperty("address-book/contact", "type", "type")
+        digester.addSetProperty("address-book/contact/name", "name")
+        digester.addSetProperty("address-book/contact/address", "address")
+        digester.addSetProperty("address-book/contact/city", "city")
+        digester.addSetProperty("address-book/contact/province", "province")
+        digester.addSetProperty("address-book/contact/postalcode", "postalcode")
+        digester.addSetProperty("address-book/contact/country", "country")
+        digester.addSetProperty("address-book/contact/telephone", "telephone")
+
+    def indexFile(self, writer, path):
+
+        try:
+            file = open(path)
+        except IOError, e:
+            raise
+        else:
+            props = self.digester.parse(file)
+            doc = Document()
+            doc.add(Field("type", props['type'],
+                          Field.Store.YES, Field.Index.UN_TOKENIZED))
+            doc.add(Field("name", props['name'],
+                          Field.Store.YES, Field.Index.UN_TOKENIZED))
+            doc.add(Field("address", props['address'],
+                          Field.Store.YES, Field.Index.UN_TOKENIZED))
+            doc.add(Field("city", props['city'],
+                          Field.Store.YES, Field.Index.UN_TOKENIZED))
+            doc.add(Field("province", props['province'],
+                          Field.Store.YES, Field.Index.UN_TOKENIZED))
+            doc.add(Field("postalcode", props['postalcode'],
+                          Field.Store.YES, Field.Index.UN_TOKENIZED))
+            doc.add(Field("country", props['country'],
+                          Field.Store.YES, Field.Index.UN_TOKENIZED))
+            doc.add(Field("telephone", props['telephone'],
+                          Field.Store.YES, Field.Index.UN_TOKENIZED))
+            doc.add(Field("filename", os.path.abspath(path),
+                          Field.Store.YES, Field.Index.UN_TOKENIZED))
+            writer.addDocument(doc)
+            file.close()
+
+            return doc

Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/xml/DigesterXMLHandler.py
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/xml/DigesterXMLHandler.py
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/xml/__init__.py
URL: http://svn.apache.org/viewvc/lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/xml/__init__.py?rev=732916&view=auto
==============================================================================
--- lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/xml/__init__.py (added)
+++ lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/xml/__init__.py Thu Jan  8 19:28:33 2009
@@ -0,0 +1 @@
+# xml package

Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/xml/__init__.py
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/handlingtypes/xml/__init__.py
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: lucene/pylucene/trunk/samples/LuceneInAction/lia/indexing/BaseIndexingTestCase.py
URL: http://svn.apache.org/viewvc/lucene/pylucene/trunk/samples/LuceneInAction/lia/indexing/BaseIndexingTestCase.py?rev=732916&view=auto
==============================================================================
--- lucene/pylucene/trunk/samples/LuceneInAction/lia/indexing/BaseIndexingTestCase.py (added)
+++ lucene/pylucene/trunk/samples/LuceneInAction/lia/indexing/BaseIndexingTestCase.py Thu Jan  8 19:28:33 2009
@@ -0,0 +1,86 @@
+# ====================================================================
+# Copyright (c) 2004-2007 Open Source Applications Foundation.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions: 
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software. 
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+# ====================================================================
+#
+
+import os
+
+from unittest import TestCase
+
+from lucene import \
+     FSDirectory, System, \
+     Document, Field, SimpleAnalyzer, IndexWriter, IndexReader
+
+
+class BaseIndexingTestCase(TestCase):
+    keywords = ["1", "2"]                    # "id" field of each document
+    unindexed = ["Netherlands", "Italy"]     # "country" field
+    unstored = ["Amsterdam has lots of bridges",
+                "Venice has lots of canals"]     # "contents" field
+    text = ["Amsterdam", "Venice"]           # "city" field
+
+    def setUp(self):
+        # Open <java.io.tmpdir>/index-dir and index the fixture documents.
+        indexDir = os.path.join(System.getProperty('java.io.tmpdir', 'tmp'),
+                                'index-dir')
+        self.dir = FSDirectory.getDirectory(indexDir, True)
+        self.addDocuments(self.dir)
+
+    def addDocuments(self, dir):
+        # Write one document per keywords entry into dir, then optimize.
+        writer = IndexWriter(dir, self.getAnalyzer(), True)
+        writer.setUseCompoundFile(self.isCompound())
+        # the four lists are parallel: entry i of each describes document i
+        for i in xrange(len(self.keywords)):
+            doc = Document()
+            doc.add(Field("id", self.keywords[i],
+                          Field.Store.YES, Field.Index.UN_TOKENIZED))
+            doc.add(Field("country", self.unindexed[i],
+                          Field.Store.YES, Field.Index.NO))
+            doc.add(Field("contents", self.unstored[i],
+                          Field.Store.NO, Field.Index.TOKENIZED))
+            doc.add(Field("city", self.text[i],
+                          Field.Store.YES, Field.Index.TOKENIZED))
+            writer.addDocument(doc)
+
+        writer.optimize()
+        writer.close()
+
+    def getAnalyzer(self):
+        # Analyzer handed to every IndexWriter; subclasses may override.
+        return SimpleAnalyzer()
+
+    def isCompound(self):
+        # Value passed to setUseCompoundFile() when the fixture is built.
+        return True
+
+    def testIndexWriter(self):
+        # A writer on the fixture index counts one document per keyword.
+        writer = IndexWriter(self.dir, self.getAnalyzer(), False)
+        self.assertEqual(len(self.keywords), writer.docCount())
+        writer.close()
+
+    def testIndexReader(self):
+        # maxDoc and numDocs both equal the number of fixture documents.
+        reader = IndexReader.open(self.dir)
+        self.assertEqual(len(self.keywords), reader.maxDoc())
+        self.assertEqual(len(self.keywords), reader.numDocs())
+        reader.close()

Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/indexing/BaseIndexingTestCase.py
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/indexing/BaseIndexingTestCase.py
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: lucene/pylucene/trunk/samples/LuceneInAction/lia/indexing/CompoundVersusMultiFileIndexTest.py
URL: http://svn.apache.org/viewvc/lucene/pylucene/trunk/samples/LuceneInAction/lia/indexing/CompoundVersusMultiFileIndexTest.py?rev=732916&view=auto
==============================================================================
--- lucene/pylucene/trunk/samples/LuceneInAction/lia/indexing/CompoundVersusMultiFileIndexTest.py (added)
+++ lucene/pylucene/trunk/samples/LuceneInAction/lia/indexing/CompoundVersusMultiFileIndexTest.py Thu Jan  8 19:28:33 2009
@@ -0,0 +1,106 @@
+# ====================================================================
+# Copyright (c) 2004-2007 Open Source Applications Foundation.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions: 
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software. 
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+# ====================================================================
+#
+
+import os
+
+from unittest import TestCase
+from time import time
+from datetime import timedelta
+
+from lucene import \
+     IndexWriter, SimpleAnalyzer, FSDirectory, System, Document, Field
+
+
+class CompoundVersusMultiFileIndexTest(TestCase):
+
+    def __init__(self, *args):
+
+        super(CompoundVersusMultiFileIndexTest, self).__init__(*args)
+        self.docs = self.loadDocuments(5000, 10)
+
+    def setUp(self):
+
+        indexDir = os.path.join(System.getProperty("java.io.tmpdir", "tmp"),
+                                "index-dir")
+
+        cIndexDir = "%s-compound" %(indexDir)
+        mIndexDir = "%s-multi" %(indexDir)
+        self.rmdir(cIndexDir)
+        self.rmdir(mIndexDir)
+
+        self.cDir = FSDirectory.getDirectory(cIndexDir, True)
+        self.mDir = FSDirectory.getDirectory(mIndexDir, True)
+
+    def rmdir(self, dir):
+
+        for dir, dirnames, filenames in os.walk(dir):
+            for filename in filenames:
+                os.remove(os.path.join(dir, filename))
+            for dirname in dirnames:
+                os.rmdir(os.path.join(dir, dirname))
+
+    def testTiming(self):
+
+        cTiming = self.timeIndexWriter(self.cDir, True)
+        mTiming = self.timeIndexWriter(self.mDir, False)
+
+        print "Compound Time :", cTiming
+        print "Multi-file Time:", mTiming
+
+        self.assert_(cTiming > mTiming)
+
+    def timeIndexWriter(self, dir, isCompound):
+
+        start = time()
+        self.addDocuments(dir, isCompound)
+
+        return timedelta(seconds=time() - start)
+
+    def addDocuments(self, dir, isCompound):
+
+        writer = IndexWriter(dir, SimpleAnalyzer(), True)
+        writer.setUseCompoundFile(isCompound)
+
+        # change to adjust performance of indexing with FSDirectory
+        # writer.mergeFactor = writer.mergeFactor
+        # writer.maxMergeDocs = writer.maxMergeDocs
+        # writer.minMergeDocs = writer.minMergeDocs
+
+        for word in self.docs:
+            doc = Document()
+            doc.add(Field("keyword", word,
+                          Field.Store.YES, Field.Index.UN_TOKENIZED))
+            doc.add(Field("unindexed", word,
+                          Field.Store.YES, Field.Index.NO))
+            doc.add(Field("unstored", word,
+                          Field.Store.NO, Field.Index.TOKENIZED))
+            doc.add(Field("text", word,
+                          Field.Store.YES, Field.Index.TOKENIZED))
+            writer.addDocument(doc)
+
+        writer.optimize()
+        writer.close()
+
+    def loadDocuments(self, numDocs, wordsPerDoc):
+
+        return ["Bibamus " * wordsPerDoc] * numDocs

Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/indexing/CompoundVersusMultiFileIndexTest.py
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/indexing/CompoundVersusMultiFileIndexTest.py
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: lucene/pylucene/trunk/samples/LuceneInAction/lia/indexing/DocumentDeleteTest.py
URL: http://svn.apache.org/viewvc/lucene/pylucene/trunk/samples/LuceneInAction/lia/indexing/DocumentDeleteTest.py?rev=732916&view=auto
==============================================================================
--- lucene/pylucene/trunk/samples/LuceneInAction/lia/indexing/DocumentDeleteTest.py (added)
+++ lucene/pylucene/trunk/samples/LuceneInAction/lia/indexing/DocumentDeleteTest.py Thu Jan  8 19:28:33 2009
@@ -0,0 +1,70 @@
+# ====================================================================
+# Copyright (c) 2004-2007 Open Source Applications Foundation.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions: 
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software. 
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+# ====================================================================
+#
+
+from lucene import IndexWriter, IndexReader
+from lia.indexing.BaseIndexingTestCase import BaseIndexingTestCase
+
+
+class DocumentDeleteTest(BaseIndexingTestCase):
+
+    def testDeleteBeforeIndexMerge(self):
+        # Without optimize(), a deleted document still counts in maxDoc.
+        reader = IndexReader.open(self.dir)
+        self.assertEqual(2, reader.maxDoc())
+        self.assertEqual(2, reader.numDocs())
+        reader.deleteDocument(1)
+        # the deleting reader reports the deletion right away
+        self.assert_(reader.isDeleted(1))
+        self.assert_(reader.hasDeletions())
+        self.assertEqual(2, reader.maxDoc())
+        self.assertEqual(1, reader.numDocs())
+
+        reader.close()
+        # a freshly opened reader reports the same counts
+        reader = IndexReader.open(self.dir)
+
+        self.assertEqual(2, reader.maxDoc())
+        self.assertEqual(1, reader.numDocs())
+
+        reader.close()
+
+    def testDeleteAfterIndexMerge(self):
+        # After optimize(), the deleted document no longer counts in maxDoc.
+        reader = IndexReader.open(self.dir)
+        self.assertEqual(2, reader.maxDoc())
+        self.assertEqual(2, reader.numDocs())
+        reader.deleteDocument(1)
+        reader.close()
+        # reopen the existing index with a writer only to optimize it
+        writer = IndexWriter(self.dir, self.getAnalyzer(), False)
+        writer.optimize()
+        writer.close()
+
+        reader = IndexReader.open(self.dir)
+
+        self.assert_(not reader.isDeleted(1))
+        self.assert_(not reader.hasDeletions())
+        self.assertEqual(1, reader.maxDoc())
+        self.assertEqual(1, reader.numDocs())
+
+        reader.close()

Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/indexing/DocumentDeleteTest.py
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/indexing/DocumentDeleteTest.py
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: lucene/pylucene/trunk/samples/LuceneInAction/lia/indexing/DocumentUpdateTest.py
URL: http://svn.apache.org/viewvc/lucene/pylucene/trunk/samples/LuceneInAction/lia/indexing/DocumentUpdateTest.py?rev=732916&view=auto
==============================================================================
--- lucene/pylucene/trunk/samples/LuceneInAction/lia/indexing/DocumentUpdateTest.py (added)
+++ lucene/pylucene/trunk/samples/LuceneInAction/lia/indexing/DocumentUpdateTest.py Thu Jan  8 19:28:33 2009
@@ -0,0 +1,72 @@
+# ====================================================================
+# Copyright (c) 2004-2007 Open Source Applications Foundation.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions: 
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software. 
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+# ====================================================================
+#
+
+from lucene import \
+     IndexWriter, IndexReader, IndexSearcher, \
+     WhitespaceAnalyzer, Document, Field, Term, TermQuery
+
+from lia.indexing.BaseIndexingTestCase import BaseIndexingTestCase
+
+
+class DocumentUpdateTest(BaseIndexingTestCase):
+
+    def testUpdate(self):
+        # An update is done as: delete the old document, add its replacement.
+        self.assertEqual(1, self.getHitCount("city", "Amsterdam"))
+        # step 1: delete every document whose city term is "Amsterdam"
+        reader = IndexReader.open(self.dir)
+        reader.deleteDocuments(Term("city", "Amsterdam"))
+        reader.close()
+        # step 2: add the replacement document, reusing id "1"
+        writer = IndexWriter(self.dir, self.getAnalyzer(), False)
+        doc = Document()
+        doc.add(Field("id", "1", Field.Store.YES, Field.Index.UN_TOKENIZED))
+        doc.add(Field("country", "Russia",
+                      Field.Store.YES, Field.Index.NO))
+        doc.add(Field("contents", "St. Petersburg has lots of bridges",
+                      Field.Store.NO, Field.Index.TOKENIZED))
+        doc.add(Field("city", "St. Petersburg",
+                      Field.Store.YES, Field.Index.TOKENIZED))
+        writer.addDocument(doc)
+        writer.optimize()
+        writer.close()
+        # the old city is gone and the new one is searchable
+        self.assertEqual(0, self.getHitCount("city", "Amsterdam"))
+        self.assertEqual(1, self.getHitCount("city", "Petersburg"))
+
+
+    def getAnalyzer(self):
+        # NOTE(review): presumably so the capitalized Terms match as indexed
+        return WhitespaceAnalyzer()
+
+
+    def getHitCount(self, fieldName, searchString):
+        # Number of hits of a TermQuery for searchString in fieldName.
+        searcher = IndexSearcher(self.dir)
+        t = Term(fieldName, searchString)
+        query = TermQuery(t)
+        hits = searcher.search(query)
+        hitCount = hits.length()
+        searcher.close()
+
+        return hitCount

Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/indexing/DocumentUpdateTest.py
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/indexing/DocumentUpdateTest.py
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: lucene/pylucene/trunk/samples/LuceneInAction/lia/indexing/FS2RAMDirectoryTest.py
URL: http://svn.apache.org/viewvc/lucene/pylucene/trunk/samples/LuceneInAction/lia/indexing/FS2RAMDirectoryTest.py?rev=732916&view=auto
==============================================================================
--- lucene/pylucene/trunk/samples/LuceneInAction/lia/indexing/FS2RAMDirectoryTest.py (added)
+++ lucene/pylucene/trunk/samples/LuceneInAction/lia/indexing/FS2RAMDirectoryTest.py Thu Jan  8 19:28:33 2009
@@ -0,0 +1,44 @@
+# ====================================================================
+# Copyright (c) 2004-2007 Open Source Applications Foundation.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions: 
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software. 
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+# ====================================================================
+#
+
+import os
+
+from lia.indexing.BaseIndexingTestCase import BaseIndexingTestCase
+from lucene import IndexReader, RAMDirectory
+
+
+class FS2RAMDirectoryTest(BaseIndexingTestCase):
+
+    def testSlurp(self):
+
+        fsDirReader = IndexReader.open(self.dir)
+        self.assertEqual(len(self.keywords), fsDirReader.maxDoc())
+        self.assertEqual(len(self.keywords), fsDirReader.numDocs())
+
+        ramDir = RAMDirectory(self.dir)
+        ramDirReader = IndexReader.open(ramDir)
+        self.assertEqual(fsDirReader.maxDoc(), ramDirReader.maxDoc())
+        self.assertEqual(fsDirReader.numDocs(), ramDirReader.numDocs())
+
+        fsDirReader.close()
+        ramDir.close()

Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/indexing/FS2RAMDirectoryTest.py
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/indexing/FS2RAMDirectoryTest.py
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: lucene/pylucene/trunk/samples/LuceneInAction/lia/indexing/FSversusRAMDirectoryTest.py
URL: http://svn.apache.org/viewvc/lucene/pylucene/trunk/samples/LuceneInAction/lia/indexing/FSversusRAMDirectoryTest.py?rev=732916&view=auto
==============================================================================
--- lucene/pylucene/trunk/samples/LuceneInAction/lia/indexing/FSversusRAMDirectoryTest.py (added)
+++ lucene/pylucene/trunk/samples/LuceneInAction/lia/indexing/FSversusRAMDirectoryTest.py Thu Jan  8 19:28:33 2009
@@ -0,0 +1,94 @@
+# ====================================================================
+# Copyright (c) 2004-2007 Open Source Applications Foundation.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions: 
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software. 
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+# ====================================================================
+#
+
+import os
+
+from unittest import TestCase
+from time import time
+from datetime import timedelta
+
+from lucene import \
+     IndexWriter, SimpleAnalyzer, Document, Field, System, \
+     FSDirectory, RAMDirectory
+
+
+class FSversusRAMDirectoryTest(TestCase):
+
+    def __init__(self, *args):
+        # build the documents once per test instance, outside the timed code
+        super(FSversusRAMDirectoryTest, self).__init__(*args)
+        self.docs = self.loadDocuments(3000, 5)
+
+    def setUp(self):
+        # One in-memory directory and one at <java.io.tmpdir>/fs-index.
+        fsIndexDir = os.path.join(System.getProperty("java.io.tmpdir", "tmp"),
+                                  "fs-index")
+        self.ramDir = RAMDirectory()
+        self.fsDir = FSDirectory.getDirectory(fsIndexDir, True)
+
+    def testTiming(self):
+        # Time indexing the same documents into each directory.
+        ramTiming = self.timeIndexWriter(self.ramDir)
+        fsTiming = self.timeIndexWriter(self.fsDir)
+        # timings are only printed; the comparison below is disabled
+        #self.assert_(fsTiming > ramTiming)
+
+        print "RAMDirectory Time:", ramTiming
+        print "FSDirectory Time :", fsTiming
+
+    def timeIndexWriter(self, dir):
+        # Return the elapsed time of addDocuments() as a timedelta.
+        start = time()
+        self.addDocuments(dir)
+
+        return timedelta(seconds=time() - start)
+
+    def addDocuments(self, dir):
+        # Index every document in self.docs into dir, then optimize.
+        writer = IndexWriter(dir, SimpleAnalyzer(), True)
+
+        #
+        # change to adjust performance of indexing with FSDirectory
+        # writer.mergeFactor = writer.mergeFactor
+        # writer.maxMergeDocs = writer.maxMergeDocs
+        # writer.minMergeDocs = writer.minMergeDocs
+        #
+
+        for word in self.docs:
+            doc = Document()
+            doc.add(Field("keyword", word,
+                          Field.Store.YES, Field.Index.UN_TOKENIZED))
+            doc.add(Field("unindexed", word,
+                          Field.Store.YES, Field.Index.NO))
+            doc.add(Field("unstored", word,
+                          Field.Store.NO, Field.Index.TOKENIZED))
+            doc.add(Field("text", word,
+                          Field.Store.YES, Field.Index.TOKENIZED))
+            writer.addDocument(doc)
+
+        writer.optimize()
+        writer.close()
+
+    def loadDocuments(self, numDocs, wordsPerDoc):
+        # numDocs copies of one string: "Bibamus " repeated wordsPerDoc times
+        return ["Bibamus " * wordsPerDoc] * numDocs

Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/indexing/FSversusRAMDirectoryTest.py
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/indexing/FSversusRAMDirectoryTest.py
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: lucene/pylucene/trunk/samples/LuceneInAction/lia/indexing/FieldLengthTest.py
URL: http://svn.apache.org/viewvc/lucene/pylucene/trunk/samples/LuceneInAction/lia/indexing/FieldLengthTest.py?rev=732916&view=auto
==============================================================================
--- lucene/pylucene/trunk/samples/LuceneInAction/lia/indexing/FieldLengthTest.py (added)
+++ lucene/pylucene/trunk/samples/LuceneInAction/lia/indexing/FieldLengthTest.py Thu Jan  8 19:28:33 2009
@@ -0,0 +1,86 @@
+# ====================================================================
+# Copyright (c) 2004-2007 Open Source Applications Foundation.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions: 
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software. 
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+# ====================================================================
+#
+
+import os
+
+from unittest import TestCase
+from time import time
+from datetime import timedelta
+
+from lucene import \
+     IndexWriter, SimpleAnalyzer, Document, Field, System, \
+     Term, TermQuery, IndexSearcher, FSDirectory
+
+
+class FieldLengthTest(TestCase):
+    """Checks how IndexWriter.setMaxFieldLength() affects search results.
+
+    Two small documents are indexed into <java.io.tmpdir>/index-dir with
+    different maximum field lengths and the "contents" field is searched
+    afterwards.
+    """
+
+    # Parallel lists: entry i of every list belongs to document i.
+    keywords = ["1", "2"]
+    unindexed = ["Netherlands", "Italy"]
+    unstored = ["Amsterdam has lots of bridges",
+                "Venice has lots of canals"]
+    text = ["Amsterdam", "Venice"]
+
+    def setUp(self):
+        """Open self.dir on the "index-dir" directory under java.io.tmpdir.
+
+        "tmp" is used as the fallback when the java.io.tmpdir system
+        property is not set.
+        """
+
+        tmpDir = System.getProperty("java.io.tmpdir", "tmp")
+        self.dir = FSDirectory.getDirectory(os.path.join(tmpDir, "index-dir"),
+                                            True)
+
+    def testFieldSize(self):
+        """A small enough maximum field length hides the term "bridges".
+
+        With a limit of 10 the term is expected in exactly one document;
+        after re-indexing with a limit of 1 it is expected in none.
+        """
+
+        self.addDocuments(self.dir, 10)
+        self.assertEqual(1, self.getHitCount("contents", "bridges"))
+
+        self.addDocuments(self.dir, 1)
+        self.assertEqual(0, self.getHitCount("contents", "bridges"))
+
+    def getHitCount(self, fieldName, searchString):
+        """Return how many documents match searchString in fieldName.
+
+        The search is a TermQuery run through a temporary IndexSearcher
+        on self.dir, which is closed again before returning.
+        """
+
+        indexSearcher = IndexSearcher(self.dir)
+        results = indexSearcher.search(TermQuery(Term(fieldName,
+                                                      searchString)))
+        count = results.length()
+        indexSearcher.close()
+
+        return count
+
+    def addDocuments(self, dir, maxFieldLength):
+        """Index the class-level sample data into dir.
+
+        maxFieldLength is passed to IndexWriter.setMaxFieldLength() before
+        any document is added.  One document is written per entry of
+        self.keywords; afterwards the index is optimized and the writer
+        closed.
+        """
+
+        indexWriter = IndexWriter(dir, SimpleAnalyzer(), True)
+        indexWriter.setMaxFieldLength(maxFieldLength)
+
+        for i, keyword in enumerate(self.keywords):
+            document = Document()
+            document.add(Field("id", keyword,
+                               Field.Store.YES, Field.Index.UN_TOKENIZED))
+            document.add(Field("country", self.unindexed[i],
+                               Field.Store.YES, Field.Index.NO))
+            document.add(Field("contents", self.unstored[i],
+                               Field.Store.NO, Field.Index.TOKENIZED))
+            document.add(Field("city", self.text[i],
+                               Field.Store.YES, Field.Index.TOKENIZED))
+            indexWriter.addDocument(document)
+
+        indexWriter.optimize()
+        indexWriter.close()

Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/indexing/FieldLengthTest.py
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/indexing/FieldLengthTest.py
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: lucene/pylucene/trunk/samples/LuceneInAction/lia/indexing/IndexTuningDemo.py
URL: http://svn.apache.org/viewvc/lucene/pylucene/trunk/samples/LuceneInAction/lia/indexing/IndexTuningDemo.py?rev=732916&view=auto
==============================================================================
--- lucene/pylucene/trunk/samples/LuceneInAction/lia/indexing/IndexTuningDemo.py (added)
+++ lucene/pylucene/trunk/samples/LuceneInAction/lia/indexing/IndexTuningDemo.py Thu Jan  8 19:28:33 2009
@@ -0,0 +1,70 @@
+# ====================================================================
+# Copyright (c) 2004-2007 Open Source Applications Foundation.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions: 
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software. 
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+# ====================================================================
+#
+
+import os
+
+from time import time
+from datetime import timedelta
+
+from lucene import \
+     IndexWriter, SimpleAnalyzer, Document, Field, Term, FSDirectory, System
+
+
+class IndexTuningDemo(object):
+    """Command line demo timing indexing under different writer settings."""
+
+    def main(cls, argv):
+        """Index argv[1] one-field documents and print the elapsed time.
+
+        argv is expected to look like sys.argv:
+          argv[1]  number of documents to add
+          argv[2]  merge factor
+          argv[3]  max merge docs
+          argv[4]  max buffered docs
+
+        With fewer than five entries a usage message is printed and
+        nothing is indexed.  A non-numeric argument raises ValueError.
+        The index is written to <java.io.tmpdir>/index-dir.
+        """
+
+        if len(argv) < 5:
+            print "Usage: python IndexTuningDemo.py <numDocs> <mergeFactor> <maxMergeDocs> <maxBufferedDocs>"
+            return
+
+        # Parse every number before touching the index, so that a bad
+        # argument cannot leave a half-configured writer open behind it.
+        docsInIndex = int(argv[1])
+        mergeFactor = int(argv[2])
+        maxMergeDocs = int(argv[3])
+        maxBufferedDocs = int(argv[4])
+
+        # create an index called 'index-dir' in a temp directory
+        indexDir = os.path.join(System.getProperty('java.io.tmpdir', 'tmp'),
+                                'index-dir')
+        dir = FSDirectory.getDirectory(indexDir, True)
+        analyzer = SimpleAnalyzer()
+        writer = IndexWriter(dir, analyzer, True)
+
+        # The writer is closed even when configuring or indexing fails.
+        try:
+            # set variables that affect speed of indexing
+            writer.setMergeFactor(mergeFactor)
+            writer.setMaxMergeDocs(maxMergeDocs)
+            writer.setMaxBufferedDocs(maxBufferedDocs)
+            # writer.infoStream = System.out
+
+            print "Merge factor:  ", writer.getMergeFactor()
+            print "Max merge docs:", writer.getMaxMergeDocs()
+            print "Max buffered docs:", writer.getMaxBufferedDocs()
+
+            start = time()
+            for i in xrange(docsInIndex):
+                doc = Document()
+                doc.add(Field("fieldname", "Bibamus",
+                              Field.Store.YES, Field.Index.TOKENIZED))
+                writer.addDocument(doc)
+        finally:
+            writer.close()
+
+        # the reported time includes closing the writer
+        print "Time: ", timedelta(seconds=time() - start)
+
+    main = classmethod(main)
+
+
+if __name__ == "__main__":
+    import sys
+    IndexTuningDemo.main(sys.argv)

Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/indexing/IndexTuningDemo.py
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/indexing/IndexTuningDemo.py
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: lucene/pylucene/trunk/samples/LuceneInAction/lia/indexing/LockTest.py
URL: http://svn.apache.org/viewvc/lucene/pylucene/trunk/samples/LuceneInAction/lia/indexing/LockTest.py?rev=732916&view=auto
==============================================================================
--- lucene/pylucene/trunk/samples/LuceneInAction/lia/indexing/LockTest.py (added)
+++ lucene/pylucene/trunk/samples/LuceneInAction/lia/indexing/LockTest.py Thu Jan  8 19:28:33 2009
@@ -0,0 +1,73 @@
+# ====================================================================
+# Copyright (c) 2004-2007 Open Source Applications Foundation.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions: 
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software. 
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+# ====================================================================
+#
+
+import os
+
+from unittest import TestCase
+
+from lucene import VERSION, \
+     IndexWriter, IndexReader, SimpleAnalyzer, FSDirectory, System
+
+
+class LockTest(TestCase):
+    """Exercises index locking with two writers and with two readers.
+
+    Both tests work on an FSDirectory at <java.io.tmpdir>/index.
+    """
+
+    def setUp(self):
+        """Open self.dir on the "index" directory under java.io.tmpdir.
+
+        "tmp" is used as the fallback when the java.io.tmpdir system
+        property is not set.
+        """
+
+        indexDir = os.path.join(System.getProperty("java.io.tmpdir", "tmp"),
+                                "index")
+        self.dir = FSDirectory.getDirectory(indexDir, True)
+
+    def testWriteLock(self):
+        """Opening a second IndexWriter on the same directory must fail.
+
+        The check only runs when VERSION compares below '2.1.0'.
+        """
+
+        # NOTE(review): this is a plain string comparison, so versions
+        # are ordered lexicographically rather than numerically.
+        if VERSION < '2.1.0':
+            writer2 = None
+            gotException = False
+
+            # Failing to open the first writer is an error of the test
+            # set-up itself, so it is deliberately left unguarded.
+            writer1 = IndexWriter(self.dir, SimpleAnalyzer(), True)
+
+            try:
+                try:
+                    writer2 = IndexWriter(self.dir, SimpleAnalyzer(), True)
+                except Exception:
+                    # assumes the binding reports Java failures as
+                    # Exception subclasses -- TODO confirm
+                    gotException = True
+            finally:
+                writer1.close()
+                # do not leak a second writer that was opened unexpectedly
+                if writer2 is not None:
+                    writer2.close()
+
+            self.assert_(writer2 is None)
+            self.assert_(gotException)
+
+    def testCommitLock(self):
+        """Two IndexReaders can be opened on the same, freshly made index."""
+
+        reader1 = None
+        reader2 = None
+
+        try:
+            writer = IndexWriter(self.dir, SimpleAnalyzer(), True)
+            writer.close()
+
+            reader1 = IndexReader.open(self.dir)
+            reader2 = IndexReader.open(self.dir)
+        finally:
+            # Only close what was actually opened; calling close() on
+            # None would replace the original error by an AttributeError.
+            for reader in (reader1, reader2):
+                if reader is not None:
+                    reader.close()

Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/indexing/LockTest.py
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/indexing/LockTest.py
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: lucene/pylucene/trunk/samples/LuceneInAction/lia/indexing/VerboseIndexing.py
URL: http://svn.apache.org/viewvc/lucene/pylucene/trunk/samples/LuceneInAction/lia/indexing/VerboseIndexing.py?rev=732916&view=auto
==============================================================================
--- lucene/pylucene/trunk/samples/LuceneInAction/lia/indexing/VerboseIndexing.py (added)
+++ lucene/pylucene/trunk/samples/LuceneInAction/lia/indexing/VerboseIndexing.py Thu Jan  8 19:28:33 2009
@@ -0,0 +1,60 @@
+# ====================================================================
+# Copyright (c) 2004-2007 Open Source Applications Foundation.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions: 
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software. 
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+# ====================================================================
+#
+
+import os
+
+from lucene import \
+     FSDirectory, Document, Field, IndexWriter, SimpleAnalyzer, System
+
+
+class VerboseIndexing(object):
+    """Demo that indexes with the writer's info stream set to System.out."""
+
+    def main(cls, argv):
+        """Script entry point; argv is accepted but not used."""
+
+        VerboseIndexing().index()
+
+    main = classmethod(main)
+
+    def index(self):
+        """Write 100 identical documents to <java.io.tmpdir>/verbose-index.
+
+        Every document has a single stored, untokenized "keyword" field
+        with the value "goober".  The writer's info stream is set to
+        System.out beforehand; the index is optimized and the writer
+        closed at the end.
+        """
+
+        tmpDir = System.getProperty("java.io.tmpdir", "tmp")
+        directory = FSDirectory.getDirectory(
+            os.path.join(tmpDir, "verbose-index"), True)
+        indexWriter = IndexWriter(directory, SimpleAnalyzer(), True)
+
+        indexWriter.setInfoStream(System.out)
+
+        for count in xrange(100):
+            document = Document()
+            document.add(Field("keyword", "goober",
+                               Field.Store.YES, Field.Index.UN_TOKENIZED))
+            indexWriter.addDocument(document)
+
+        indexWriter.optimize()
+        indexWriter.close()
+
+
+if __name__ == "__main__":
+    # run the demo when this file is executed as a script
+    import sys
+    VerboseIndexing.main(sys.argv)

Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/indexing/VerboseIndexing.py
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/indexing/VerboseIndexing.py
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: lucene/pylucene/trunk/samples/LuceneInAction/lia/indexing/__init__.py
URL: http://svn.apache.org/viewvc/lucene/pylucene/trunk/samples/LuceneInAction/lia/indexing/__init__.py?rev=732916&view=auto
==============================================================================
--- lucene/pylucene/trunk/samples/LuceneInAction/lia/indexing/__init__.py (added)
+++ lucene/pylucene/trunk/samples/LuceneInAction/lia/indexing/__init__.py Thu Jan  8 19:28:33 2009
@@ -0,0 +1 @@
+# indexing package

Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/indexing/__init__.py
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/indexing/__init__.py
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: lucene/pylucene/trunk/samples/LuceneInAction/lia/meetlucene/Indexer.py
URL: http://svn.apache.org/viewvc/lucene/pylucene/trunk/samples/LuceneInAction/lia/meetlucene/Indexer.py?rev=732916&view=auto
==============================================================================
--- lucene/pylucene/trunk/samples/LuceneInAction/lia/meetlucene/Indexer.py (added)
+++ lucene/pylucene/trunk/samples/LuceneInAction/lia/meetlucene/Indexer.py Thu Jan  8 19:28:33 2009
@@ -0,0 +1,99 @@
+# ====================================================================
+# Copyright (c) 2004-2007 Open Source Applications Foundation.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions: 
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software. 
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+# ====================================================================
+#
+
+import os
+
+from time import time
+from datetime import timedelta
+from lucene import \
+    IndexWriter, StandardAnalyzer, Document, Field, \
+    InputStreamReader, FileInputStream
+
+
+class Indexer(object):
+    """Command line tool indexing all '.txt' files below a directory."""
+
+    def main(cls, argv):
+        """Index argv[2] into argv[1] and report the count and duration.
+
+        argv is expected to look like sys.argv: argv[1] is the index
+        directory, argv[2] the data directory.  With any other number of
+        entries a usage message is printed and nothing is indexed.
+        """
+
+        if len(argv) != 3:
+            print "Usage: python Indexer.py <index dir> <data dir>"
+
+        else:
+            indexDir = argv[1]
+            dataDir = argv[2]
+
+            start = time()
+            numIndexed = cls.index(indexDir, dataDir)
+            duration = timedelta(seconds=time() - start)
+
+            print "Indexing %s files took %s" %(numIndexed, duration)
+
+    def index(cls, indexDir, dataDir):
+        """Index dataDir into indexDir and return the document count.
+
+        Raises IOError when dataDir does not exist or is not a directory.
+        The count is writer.docCount(), read before the index is
+        optimized.  The writer is closed even when indexing fails.
+        """
+
+        if not (os.path.exists(dataDir) and os.path.isdir(dataDir)):
+            raise IOError, "%s does not exist or is not a directory" %(dataDir)
+
+        writer = IndexWriter(indexDir, StandardAnalyzer(), True)
+        try:
+            writer.setUseCompoundFile(False)
+
+            cls.indexDirectory(writer, dataDir)
+
+            numIndexed = writer.docCount()
+            writer.optimize()
+        finally:
+            # never leave the writer open behind a failed run
+            writer.close()
+
+        return numIndexed
+
+    def indexDirectory(cls, writer, dir):
+        """Recursively index every file below dir ending in '.txt'.
+
+        Sub-directories are descended into; entries that are neither a
+        regular file nor a directory are ignored.
+        """
+
+        for name in os.listdir(dir):
+            path = os.path.join(dir, name)
+            if os.path.isfile(path):
+                if path.endswith('.txt'):
+                    cls.indexFile(writer, path)
+            elif os.path.isdir(path):
+                cls.indexDirectory(writer, path)
+
+    def indexFile(cls, writer, path):
+        """Add the file at path to the index as one document.
+
+        The document gets a "contents" field fed from a reader that
+        decodes the file as iso-8859-1, and a stored, untokenized "path"
+        field holding the absolute path.  An IOError raised while opening
+        the file is printed and the file is skipped.
+        """
+
+        # NOTE(review): the Java stream classes may report a missing file
+        # through the binding's own exception type rather than IOError --
+        # confirm which exception is actually raised here.
+        try:
+            reader = InputStreamReader(FileInputStream(path), 'iso-8859-1')
+        except IOError, e:
+            print 'IOError while opening %s: %s' %(path, e)
+            return
+
+        try:
+            print 'Indexing', path
+            doc = Document()
+            doc.add(Field("contents", reader))
+            doc.add(Field("path", os.path.abspath(path),
+                          Field.Store.YES, Field.Index.UN_TOKENIZED))
+            writer.addDocument(doc)
+        finally:
+            # release the file even when adding the document fails
+            reader.close()
+
+    main = classmethod(main)
+    index = classmethod(index)
+    indexDirectory = classmethod(indexDirectory)
+    indexFile = classmethod(indexFile)
+
+
+if __name__ == "__main__":
+    import sys
+    Indexer.main(sys.argv)

Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/meetlucene/Indexer.py
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/meetlucene/Indexer.py
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: lucene/pylucene/trunk/samples/LuceneInAction/lia/meetlucene/Searcher.py
URL: http://svn.apache.org/viewvc/lucene/pylucene/trunk/samples/LuceneInAction/lia/meetlucene/Searcher.py?rev=732916&view=auto
==============================================================================
--- lucene/pylucene/trunk/samples/LuceneInAction/lia/meetlucene/Searcher.py (added)
+++ lucene/pylucene/trunk/samples/LuceneInAction/lia/meetlucene/Searcher.py Thu Jan  8 19:28:33 2009
@@ -0,0 +1,71 @@
+# ====================================================================
+# Copyright (c) 2004-2007 Open Source Applications Foundation.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions: 
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software. 
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+# ====================================================================
+#
+
+import os
+
+from time import time
+from datetime import timedelta
+
+from lucene import \
+     Document, IndexSearcher, FSDirectory, QueryParser, StandardAnalyzer, Hit
+
+
+class Searcher(object):
+    """Command line tool running a query against an existing index."""
+
+    def main(cls, argv):
+        """Search the index in argv[1] for the query string argv[2].
+
+        argv is expected to look like sys.argv.  With any other number of
+        entries a usage message is printed and no search is run.  Raises
+        IOError when argv[1] does not exist or is not a directory.
+        """
+
+        if len(argv) != 3:
+            print "Usage: python Searcher.py <index dir> <query>"
+
+        else:
+            indexDir = argv[1]
+            q = argv[2]
+
+            if not (os.path.exists(indexDir) and os.path.isdir(indexDir)):
+                raise IOError, "%s does not exist or is not a directory" %(indexDir)
+
+            cls.search(indexDir, q)
+
+    def search(cls, indexDir, q):
+        """Run q against the "contents" field and print the matches.
+
+        q is parsed by a QueryParser using a StandardAnalyzer.  A summary
+        line with the hit count and the time spent searching is printed,
+        followed by the "path" value of every matching document.  The
+        searcher is closed afterwards, also when parsing or searching
+        fails.
+        """
+
+        fsDir = FSDirectory.getDirectory(indexDir, False)
+        searcher = IndexSearcher(fsDir)
+
+        try:
+            query = QueryParser("contents", StandardAnalyzer()).parse(q)
+            start = time()
+            hits = searcher.search(query)
+            duration = timedelta(seconds=time() - start)
+
+            print "Found %d document(s) (in %s) that matched query '%s':" %(hits.length(), duration, q)
+
+            # The hits are printed inside the try block so that the
+            # searcher is still open while they are being read.
+            for hit in hits:
+                doc = Hit.cast_(hit).getDocument()
+                print doc["path"]
+        finally:
+            searcher.close()
+
+    main = classmethod(main)
+    search = classmethod(search)
+
+
+if __name__ == "__main__":
+    import sys
+    Searcher.main(sys.argv)

Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/meetlucene/Searcher.py
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/meetlucene/Searcher.py
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: lucene/pylucene/trunk/samples/LuceneInAction/lia/meetlucene/__init__.py
URL: http://svn.apache.org/viewvc/lucene/pylucene/trunk/samples/LuceneInAction/lia/meetlucene/__init__.py?rev=732916&view=auto
==============================================================================
--- lucene/pylucene/trunk/samples/LuceneInAction/lia/meetlucene/__init__.py (added)
+++ lucene/pylucene/trunk/samples/LuceneInAction/lia/meetlucene/__init__.py Thu Jan  8 19:28:33 2009
@@ -0,0 +1 @@
+# meetlucene package

Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/meetlucene/__init__.py
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: lucene/pylucene/trunk/samples/LuceneInAction/lia/meetlucene/__init__.py
------------------------------------------------------------------------------
    svn:mime-type = text/plain



Mime
View raw message