lucene-java-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From mikemcc...@apache.org
Subject svn commit: r600465 - in /lucene/java/trunk/src: java/org/apache/lucene/index/DocumentsWriter.java java/org/apache/lucene/index/IndexWriter.java test/org/apache/lucene/index/TestIndexWriter.java
Date Mon, 03 Dec 2007 10:09:11 GMT
Author: mikemccand
Date: Mon Dec  3 02:09:10 2007
New Revision: 600465

URL: http://svn.apache.org/viewvc?rev=600465&view=rev
Log:
LUCENE-1072: make sure on hitting a too-long term that IndexWriter is still usable

Modified:
    lucene/java/trunk/src/java/org/apache/lucene/index/DocumentsWriter.java
    lucene/java/trunk/src/java/org/apache/lucene/index/IndexWriter.java
    lucene/java/trunk/src/test/org/apache/lucene/index/TestIndexWriter.java

Modified: lucene/java/trunk/src/java/org/apache/lucene/index/DocumentsWriter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/DocumentsWriter.java?rev=600465&r1=600464&r2=600465&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/DocumentsWriter.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/DocumentsWriter.java Mon Dec  3 02:09:10 2007
@@ -354,7 +354,7 @@
         state.tvfLocal.reset();
         state.fdtLocal.reset();
       }
-
+      docStoreSegment = null;
       files = null;
 
     } finally {
@@ -518,6 +518,7 @@
     int numAllFieldData;
     FieldData[] fieldDataHash;            // Hash FieldData instances by field name
     int fieldDataHashMask;
+    int maxTermHit;                       // Set to > 0 if this doc has a too-large term
 
     boolean doFlushAfter;
 
@@ -608,6 +609,7 @@
       numStoredFields = 0;
       numFieldData = 0;
       numVectorFields = 0;
+      maxTermHit = 0;
 
       List docFields = doc.getFields();
       final int numDocFields = docFields.size();
@@ -1483,17 +1485,23 @@
             getPostings(postingsFreeList);
           }
 
-          // Pull next free Posting from free list
-          p = postingsFreeList[--postingsFreeCount];
-
           final int textLen1 = 1+tokenTextLen;
           if (textLen1 + charPool.byteUpto > CHAR_BLOCK_SIZE) {
-            if (textLen1 > CHAR_BLOCK_SIZE)
-              throw new IllegalArgumentException("term length " + tokenTextLen + " exceeds max term length " + (CHAR_BLOCK_SIZE-1));
+            if (textLen1 > CHAR_BLOCK_SIZE) {
+              maxTermHit = tokenTextLen;
+              // Just skip this term; we will throw an
+              // exception after processing all accepted
+              // terms in the doc
+              return;
+            }
             charPool.nextBuffer();
           }
           final char[] text = charPool.buffer;
           final int textUpto = charPool.byteUpto;
+
+          // Pull next free Posting from free list
+          p = postingsFreeList[--postingsFreeCount];
+
           p.textStart = textUpto + charPool.byteOffset;
           charPool.byteUpto += textLen1;
 
@@ -2181,26 +2189,28 @@
 
   /** Returns true if the caller (IndexWriter) should now
    * flush. */
-  boolean addDocument(Document doc, Analyzer analyzer)
+  int addDocument(Document doc, Analyzer analyzer)
     throws CorruptIndexException, IOException {
     return updateDocument(doc, analyzer, null);
   }
 
-  boolean updateDocument(Term t, Document doc, Analyzer analyzer)
+  int updateDocument(Term t, Document doc, Analyzer analyzer)
     throws CorruptIndexException, IOException {
     return updateDocument(doc, analyzer, t);
   }
 
-  boolean updateDocument(Document doc, Analyzer analyzer, Term delTerm)
+  int updateDocument(Document doc, Analyzer analyzer, Term delTerm)
     throws CorruptIndexException, IOException {
 
     // This call is synchronized but fast
     final ThreadState state = getThreadState(doc, delTerm);
     boolean success = false;
+    int maxTermHit;
     try {
       // This call is not synchronized and does all the work
       state.processDocument(analyzer);
       // This call synchronized but fast
+      maxTermHit = state.maxTermHit;
       finishDocument(state);
       success = true;
     } finally {
@@ -2209,7 +2219,11 @@
         abort();
       }
     }
-    return state.doFlushAfter || timeToFlushDeletes();
+
+    int status = maxTermHit<<1;
+    if (state.doFlushAfter || timeToFlushDeletes())
+      status += 1;
+    return status;
   }
 
   synchronized int getNumBufferedDeleteTerms() {

Modified: lucene/java/trunk/src/java/org/apache/lucene/index/IndexWriter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/IndexWriter.java?rev=600465&r1=600464&r2=600465&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/IndexWriter.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/IndexWriter.java Mon Dec  3 02:09:10 2007
@@ -1426,10 +1426,10 @@
    */
   public void addDocument(Document doc, Analyzer analyzer) throws CorruptIndexException, IOException {
     ensureOpen();
-    boolean doFlush = false;
+    int status = 0;
     boolean success = false;
     try {
-      doFlush = docWriter.addDocument(doc, analyzer);
+      status = docWriter.addDocument(doc, analyzer);
       success = true;
     } finally {
       if (!success) {
@@ -1446,8 +1446,9 @@
         }
       }
     }
-    if (doFlush)
+    if ((status & 1) != 0)
       flush(true, false);
+    checkMaxTermLength(status);
   }
 
   /**
@@ -1511,10 +1512,10 @@
   public void updateDocument(Term term, Document doc, Analyzer analyzer)
       throws CorruptIndexException, IOException {
     ensureOpen();
-    boolean doFlush = false;
+    int status = 0;
     boolean success = false;
     try {
-      doFlush = docWriter.updateDocument(term, doc, analyzer);
+      status = docWriter.updateDocument(term, doc, analyzer);
       success = true;
     } finally {
       if (!success) {
@@ -1531,8 +1532,17 @@
         }
       }
     }
-    if (doFlush)
+    if ((status & 1) != 0)
       flush(true, false);
+    checkMaxTermLength(status);
+  }
+
+  /** Throws IllegalArgumentException if the return status
+   *  from DocumentsWriter.{add,update}Document indicates
+   *  that a too-long term was encountered */
+  final private void checkMaxTermLength(int status) {
+    if (status > 1)
+      throw new IllegalArgumentException("at least one term (length " + (status>>1) + ") exceeds max term length " + (DocumentsWriter.CHAR_BLOCK_SIZE-1) + "; these terms were skipped");
   }
 
   // for test purpose

Modified: lucene/java/trunk/src/test/org/apache/lucene/index/TestIndexWriter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/test/org/apache/lucene/index/TestIndexWriter.java?rev=600465&r1=600464&r2=600465&view=diff
==============================================================================
--- lucene/java/trunk/src/test/org/apache/lucene/index/TestIndexWriter.java (original)
+++ lucene/java/trunk/src/test/org/apache/lucene/index/TestIndexWriter.java Mon Dec  3 02:09:10 2007
@@ -28,8 +28,6 @@
 import org.apache.lucene.analysis.standard.StandardAnalyzer;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
-import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.IndexWriter;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.Hits;
 import org.apache.lucene.search.TermQuery;
@@ -221,12 +219,8 @@
           methodName = "addIndexesNoOptimize(Directory[])";
         }
 
-        int cycleCount = 0;
-
         while(!done) {
 
-          cycleCount++;
-
           // Make a new dir that will enforce disk usage:
           MockRAMDirectory dir = new MockRAMDirectory(startDir);
           writer = new IndexWriter(dir, autoCommit, new WhitespaceAnalyzer(), false);
@@ -524,7 +518,7 @@
       String[] startFiles = dir.list();
       SegmentInfos infos = new SegmentInfos();
       infos.read(dir);
-      IndexFileDeleter d = new IndexFileDeleter(dir, new KeepOnlyLastCommitDeletionPolicy(), infos, null, null);
+      new IndexFileDeleter(dir, new KeepOnlyLastCommitDeletionPolicy(), infos, null, null);
       String[] endFiles = dir.list();
 
       Arrays.sort(startFiles);
@@ -543,17 +537,44 @@
       RAMDirectory dir = new RAMDirectory();
       IndexWriter writer  = new IndexWriter(dir, new StandardAnalyzer(), true);
 
-      char[] chars = new char[16384];
+      char[] chars = new char[16383];
       Arrays.fill(chars, 'x');
       Document doc = new Document();
-      String contents = "a b c " + new String(chars);
+      final String bigTerm = new String(chars);
+
+      // Max length term is 16383, so this contents produces
+      // a too-long term:
+      String contents = "abc xyz x" + bigTerm;
       doc.add(new Field("content", contents, Field.Store.NO, Field.Index.TOKENIZED));
       try {
         writer.addDocument(doc);
         fail("did not hit expected exception");
       } catch (IllegalArgumentException e) {
       }
+
+      // Make sure we can add another normal document
+      doc = new Document();
+      doc.add(new Field("content", "abc bbb ccc", Field.Store.NO, Field.Index.TOKENIZED));
+      writer.addDocument(doc);
       writer.close();
+
+      IndexReader reader = IndexReader.open(dir);
+      // Make sure all terms < max size were indexed
+      assertEquals(2, reader.docFreq(new Term("content", "abc")));
+      assertEquals(1, reader.docFreq(new Term("content", "bbb")));
+      reader.close();
+
+      // Make sure we can add a document with exactly the
+      // maximum length term, and search on that term:
+      doc = new Document();
+      doc.add(new Field("content", bigTerm, Field.Store.NO, Field.Index.TOKENIZED));
+      writer  = new IndexWriter(dir, new StandardAnalyzer());
+      writer.addDocument(doc);
+      writer.close();
+      reader = IndexReader.open(dir);
+      assertEquals(1, reader.docFreq(new Term("content", bigTerm)));
+      reader.close();
+
       dir.close();
     }
 
@@ -1342,7 +1363,6 @@
     public void testDiverseDocs() throws IOException {
       RAMDirectory dir = new RAMDirectory();      
       IndexWriter writer  = new IndexWriter(dir, new WhitespaceAnalyzer(), true);
-      long t0 = System.currentTimeMillis();
       writer.setRAMBufferSizeMB(0.5);
       Random rand = new Random(31415);
       for(int i=0;i<3;i++) {
@@ -1381,7 +1401,6 @@
       }
       writer.close();
 
-      long t1 = System.currentTimeMillis();
       IndexSearcher searcher = new IndexSearcher(dir);
       Hits hits = searcher.search(new TermQuery(new Term("field", "aaa")));
       assertEquals(300, hits.length());
@@ -1491,7 +1510,6 @@
         addDoc(writer);
       }
       writer.close();
-      IndexReader reader = IndexReader.open(dir);
       Term searchTerm = new Term("content", "aaa");        
       IndexSearcher searcher = new IndexSearcher(dir);
       Hits hits = searcher.search(new TermQuery(searchTerm));



Mime
View raw message