lucene-java-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From mikemcc...@apache.org
Subject svn commit: r627707 - in /lucene/java/trunk/src: java/org/apache/lucene/index/DocumentsWriter.java test/org/apache/lucene/search/TestTermVectors.java
Date Thu, 14 Feb 2008 11:22:10 GMT
Author: mikemccand
Date: Thu Feb 14 03:22:04 2008
New Revision: 627707

URL: http://svn.apache.org/viewvc?rev=627707&view=rev
Log:
LUCENE-1172: some small additional speedups for DocumentsWriter

Modified:
    lucene/java/trunk/src/java/org/apache/lucene/index/DocumentsWriter.java
    lucene/java/trunk/src/test/org/apache/lucene/search/TestTermVectors.java

Modified: lucene/java/trunk/src/java/org/apache/lucene/index/DocumentsWriter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/DocumentsWriter.java?rev=627707&r1=627706&r2=627707&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/DocumentsWriter.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/DocumentsWriter.java Thu Feb 14 03:22:04 2008
@@ -1020,6 +1020,59 @@
       quickSort(postings, left + 1, hi);
     }
 
+    void quickSort(FieldData[] array, int lo, int hi) {
+      if (lo >= hi)
+        return;
+
+      int mid = (lo + hi) >>> 1;
+
+      if (array[lo].compareTo(array[mid]) > 0) {
+        FieldData tmp = array[lo];
+        array[lo] = array[mid];
+        array[mid] = tmp;
+      }
+
+      if (array[mid].compareTo(array[hi]) > 0) {
+        FieldData tmp = array[mid];
+        array[mid] = array[hi];
+        array[hi] = tmp;
+
+        if (array[lo].compareTo(array[mid]) > 0) {
+          FieldData tmp2 = array[lo];
+          array[lo] = array[mid];
+          array[mid] = tmp2;
+        }
+      }
+
+      int left = lo + 1;
+      int right = hi - 1;
+
+      if (left >= right)
+        return;
+
+      FieldData partition = array[mid];
+
+      for (; ;) {
+        while (array[right].compareTo(partition) > 0)
+          --right;
+
+        while (left < right && array[left].compareTo(partition) <= 0)
+          ++left;
+
+        if (left < right) {
+          FieldData tmp = array[left];
+          array[left] = array[right];
+          array[right] = tmp;
+          --right;
+        } else {
+          break;
+        }
+      }
+
+      quickSort(array, lo, left);
+      quickSort(array, left + 1, hi);
+    }
+
     /** If there are fields we've seen but did not see again
      *  in the last run, then free them up.  Also reduce
      *  postings hash size. */
@@ -1098,6 +1151,7 @@
       throws IOException, AbortException {
 
       final int numFields = numFieldData;
+      assert clearLastVectorFieldName();
 
       assert 0 == fdtLocal.length();
 
@@ -1108,7 +1162,7 @@
         // sort the subset of fields that have vectors
         // enabled; we could save [small amount of] CPU
         // here.
-        Arrays.sort(fieldDataArray, 0, numFields);
+        quickSort(fieldDataArray, 0, numFields-1);
 
       // We process the document one field at a time
       for(int i=0;i<numFields;i++)
@@ -1116,10 +1170,6 @@
 
       if (maxTermPrefix != null && infoStream != null)
         infoStream.println("WARNING: document contains at least one immense term (longer than the max length " + MAX_TERM_LENGTH + "), all of which were skipped.  Please correct the analyzer to not produce such terms.  The prefix of the first immense term is: '" + maxTermPrefix + "...'"); 
-
-      if (ramBufferSize != IndexWriter.DISABLE_AUTO_FLUSH
-          && numBytesUsed > 0.95 * ramBufferSize)
-        balanceRAM();
     }
 
     final ByteBlockPool postingsPool = new ByteBlockPool();
@@ -1295,6 +1345,26 @@
       pos[posUpto++] = b;
     }
 
+    String lastVectorFieldName;
+
+    // Called only by assert
+    final boolean clearLastVectorFieldName() {
+      lastVectorFieldName = null;
+      return true;
+    }
+
+    // Called only by assert
+    final boolean vectorFieldsInOrder(FieldInfo fi) {
+      try {
+        if (lastVectorFieldName != null)
+          return lastVectorFieldName.compareTo(fi.name) < 0;
+        else
+          return true;
+      } finally {
+        lastVectorFieldName = fi.name;
+      }
+    }
+
     PostingVector[] postingsVectors = new PostingVector[1];
     int maxPostingsVectors;
 
@@ -1360,7 +1430,6 @@
         postingsHash = new Posting[postingsHashSize];
       }
 
-      /** So Arrays.sort can sort us. */
       public int compareTo(Object o) {
         return fieldInfo.name.compareTo(((FieldData) o).fieldInfo.name);
       }
@@ -1535,9 +1604,9 @@
 
       /** Only called when term vectors are enabled.  This
        *  is called the first time we see a given term for
-       *  each * document, to allocate a PostingVector
-       *  instance that * is used to record data needed to
-       *  write the posting * vectors. */
+       *  each document, to allocate a PostingVector
+       *  instance that is used to record data needed to
+       *  write the posting vectors. */
       private PostingVector addNewVector() {
 
         if (postingsVectorsUpto == postingsVectors.length) {
@@ -1837,6 +1906,7 @@
       void writeVectors(FieldInfo fieldInfo) throws IOException {
 
         assert fieldInfo.storeTermVector;
+        assert vectorFieldsInOrder(fieldInfo);
 
         vectorFieldNumbers[numVectorFields] = fieldInfo.number;
         vectorFieldPointers[numVectorFields] = tvfLocal.getFilePointer();
@@ -2586,6 +2656,10 @@
       return;
     }
 
+    if (ramBufferSize != IndexWriter.DISABLE_AUTO_FLUSH
+        && numBytesUsed >= ramBufferSize)
+      balanceRAM();
+
     // Now write the indexed document to the real files.
     if (nextWriteDocID == state.docID) {
       // It's my turn, so write everything now:
@@ -2649,7 +2723,7 @@
       out.writeByte(b);
   }
 
-  byte[] copyByteBuffer = new byte[4096];
+  final byte[] copyByteBuffer = new byte[4096];
 
   /** Copy numBytes from srcIn to destIn */
   void copyBytes(IndexInput srcIn, IndexOutput destIn, long numBytes) throws IOException {
@@ -3138,7 +3212,7 @@
    * the other two.  This method just frees allocations from
    * the pools once we are over-budget, which balances the
    * pools to match the current docs. */
-  private synchronized void balanceRAM() {
+  synchronized void balanceRAM() {
 
     if (ramBufferSize == IndexWriter.DISABLE_AUTO_FLUSH || bufferIsFull)
       return;

Modified: lucene/java/trunk/src/test/org/apache/lucene/search/TestTermVectors.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/test/org/apache/lucene/search/TestTermVectors.java?rev=627707&r1=627706&r2=627707&view=diff
==============================================================================
--- lucene/java/trunk/src/test/org/apache/lucene/search/TestTermVectors.java (original)
+++ lucene/java/trunk/src/test/org/apache/lucene/search/TestTermVectors.java Thu Feb 14 03:22:04 2008
@@ -23,7 +23,7 @@
 import org.apache.lucene.document.Field;
 import org.apache.lucene.index.*;
 import org.apache.lucene.store.Directory;
-import org.apache.lucene.store.RAMDirectory;
+import org.apache.lucene.store.MockRAMDirectory;
 import org.apache.lucene.util.English;
 
 import java.io.IOException;
@@ -34,7 +34,7 @@
 
 public class TestTermVectors extends LuceneTestCase {
   private IndexSearcher searcher;
-  private RAMDirectory directory = new RAMDirectory();
+  private Directory directory = new MockRAMDirectory();
   public TestTermVectors(String s) {
     super(s);
   }
@@ -91,6 +91,37 @@
     }
   }
   
+  public void testTermVectorsFieldOrder() throws IOException {
+    Directory dir = new MockRAMDirectory();
+    IndexWriter writer = new IndexWriter(dir, new SimpleAnalyzer(), true);
+    Document doc = new Document();
+    doc.add(new Field("c", "some content here", Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
+    doc.add(new Field("a", "some content here", Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
+    doc.add(new Field("b", "some content here", Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
+    doc.add(new Field("x", "some content here", Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
+    writer.addDocument(doc);
+    writer.close();
+    IndexReader reader = IndexReader.open(dir);
+    TermFreqVector[] v = reader.getTermFreqVectors(0);
+    assertEquals(4, v.length);
+    String[] expectedFields = new String[]{"a", "b", "c", "x"};
+    int[] expectedPositions = new int[]{1, 2, 0};
+    for(int i=0;i<v.length;i++) {
+      TermPositionVector posVec = (TermPositionVector) v[i];
+      assertEquals(expectedFields[i], posVec.getField());
+      String[] terms = posVec.getTerms();
+      assertEquals(3, terms.length);
+      assertEquals("content", terms[0]);
+      assertEquals("here", terms[1]);
+      assertEquals("some", terms[2]);
+      for(int j=0;j<3;j++) {
+        int[] positions = posVec.getTermPositions(j);
+        assertEquals(1, positions.length);
+        assertEquals(expectedPositions[j], positions[0]);
+      }
+    }
+  }
+
   public void testTermPositionVectors() {
     Query query = new TermQuery(new Term("field", "zero"));
     try {
@@ -198,7 +229,7 @@
     Document testDoc4 = new Document();
     setupDoc(testDoc4, test4);
         
-    Directory dir = new RAMDirectory();
+    Directory dir = new MockRAMDirectory();
     
     try {
       IndexWriter writer = new IndexWriter(dir, new SimpleAnalyzer(), true, 
@@ -310,10 +341,10 @@
   
   private void setupDoc(Document doc, String text)
   {
-    doc.add(new Field("field", text, Field.Store.YES,
-        Field.Index.TOKENIZED, Field.TermVector.YES));
     doc.add(new Field("field2", text, Field.Store.YES,
         Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
+    doc.add(new Field("field", text, Field.Store.YES,
+        Field.Index.TOKENIZED, Field.TermVector.YES));
     //System.out.println("Document: " + doc);
   }
 



Mime
View raw message