lucene-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From rm...@apache.org
Subject svn commit: r1200665 - in /lucene/dev/branches/lucene2621/lucene/src/java/org/apache/lucene: index/ index/codecs/ util/
Date Fri, 11 Nov 2011 00:42:54 GMT
Author: rmuir
Date: Fri Nov 11 00:42:54 2011
New Revision: 1200665

URL: http://svn.apache.org/viewvc?rev=1200665&view=rev
Log:
LUCENE-2621: move tv-writing out of indexwriter, finish TV writer codec api

Modified:
    lucene/dev/branches/lucene2621/lucene/src/java/org/apache/lucene/index/TermVectorsTermsWriter.java
    lucene/dev/branches/lucene2621/lucene/src/java/org/apache/lucene/index/TermVectorsTermsWriterPerField.java
    lucene/dev/branches/lucene2621/lucene/src/java/org/apache/lucene/index/codecs/DefaultTermVectorsWriter.java
    lucene/dev/branches/lucene2621/lucene/src/java/org/apache/lucene/index/codecs/TermVectorsWriter.java
    lucene/dev/branches/lucene2621/lucene/src/java/org/apache/lucene/util/StringHelper.java

Modified: lucene/dev/branches/lucene2621/lucene/src/java/org/apache/lucene/index/TermVectorsTermsWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene2621/lucene/src/java/org/apache/lucene/index/TermVectorsTermsWriter.java?rev=1200665&r1=1200664&r2=1200665&view=diff
==============================================================================
--- lucene/dev/branches/lucene2621/lucene/src/java/org/apache/lucene/index/TermVectorsTermsWriter.java
(original)
+++ lucene/dev/branches/lucene2621/lucene/src/java/org/apache/lucene/index/TermVectorsTermsWriter.java
Fri Nov 11 00:42:54 2011
@@ -20,11 +20,9 @@ package org.apache.lucene.index;
 import java.io.IOException;
 import java.util.Map;
 
-import org.apache.lucene.index.codecs.DefaultTermVectorsReader;
+import org.apache.lucene.index.codecs.TermVectorsWriter;
 import org.apache.lucene.store.FlushInfo;
 import org.apache.lucene.store.IOContext;
-import org.apache.lucene.store.IndexOutput;
-import org.apache.lucene.store.IOContext.Context;
 import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.IOUtils;
@@ -32,11 +30,9 @@ import org.apache.lucene.util.RamUsageEs
 
 final class TermVectorsTermsWriter extends TermsHashConsumer {
 
+  TermVectorsWriter writer;
   final DocumentsWriterPerThread docWriter;
   int freeCount;
-  IndexOutput tvx;
-  IndexOutput tvd;
-  IndexOutput tvf;
   int lastDocID;
 
   final DocumentsWriterPerThread.DocState docState;
@@ -53,16 +49,14 @@ final class TermVectorsTermsWriter exten
 
   @Override
   void flush(Map<FieldInfo, TermsHashConsumerPerField> fieldsToFlush, final SegmentWriteState
state) throws IOException {
-    if (tvx != null) {
+    if (writer != null) {
       // At least one doc in this run had term vectors enabled
       fill(state.numDocs);
       assert state.segmentName != null;
-      String idxName = IndexFileNames.segmentFileName(state.segmentName, "", IndexFileNames.VECTORS_INDEX_EXTENSION);
-      IOUtils.close(tvx, tvf, tvd);
-      tvx = tvd = tvf = null;
-      if (4+((long) state.numDocs)*16 != state.directory.fileLength(idxName)) {
-        throw new RuntimeException("after flush: tvx size mismatch: " + state.numDocs + "
docs vs " + state.directory.fileLength(idxName) + " length in bytes of " + idxName + " file
exists?=" + state.directory.fileExists(idxName));
-      }
+      writer.finish(state.numDocs);
+      // nocommit: I think we should try-finally?
+      IOUtils.close(writer);
+      writer = null;
 
       lastDocID = 0;
       hasVectors = false;
@@ -78,40 +72,16 @@ final class TermVectorsTermsWriter exten
   /** Fills in no-term-vectors for all docs we haven't seen
    *  since the last doc that had term vectors. */
   void fill(int docID) throws IOException {
-    if (lastDocID < docID) {
-      final long tvfPosition = tvf.getFilePointer();
-      while(lastDocID < docID) {
-        tvx.writeLong(tvd.getFilePointer());
-        tvd.writeVInt(0);
-        tvx.writeLong(tvfPosition);
-        lastDocID++;
-      }
+    while(lastDocID < docID) {
+      writer.startDocument(0);
+      lastDocID++;
     }
   }
 
   private final void initTermVectorsWriter() throws IOException {
-    if (tvx == null) {
-      boolean success = false;
-      try {
-        IOContext context = new IOContext(new FlushInfo(docWriter.getNumDocsInRAM(), docWriter.bytesUsed()));
-        // If we hit an exception while init'ing the term
-        // vector output files, we must abort this segment
-        // because those files will be in an unknown
-        // state:
-        tvx = docWriter.directory.createOutput(IndexFileNames.segmentFileName(docWriter.getSegment(),
"", IndexFileNames.VECTORS_INDEX_EXTENSION), context);
-        tvd = docWriter.directory.createOutput(IndexFileNames.segmentFileName(docWriter.getSegment(),
"", IndexFileNames.VECTORS_DOCUMENTS_EXTENSION), context);
-        tvf = docWriter.directory.createOutput(IndexFileNames.segmentFileName(docWriter.getSegment(),
"", IndexFileNames.VECTORS_FIELDS_EXTENSION), context);
-
-        tvx.writeInt(DefaultTermVectorsReader.FORMAT_CURRENT);
-        tvd.writeInt(DefaultTermVectorsReader.FORMAT_CURRENT);
-        tvf.writeInt(DefaultTermVectorsReader.FORMAT_CURRENT);
-        success = true;
-      } finally {
-        if (!success) {
-          IOUtils.closeWhileHandlingException(tvx, tvd, tvf);
-        }
-      }
-
+    if (writer == null) {
+      IOContext context = new IOContext(new FlushInfo(docWriter.getNumDocsInRAM(), docWriter.bytesUsed()));
+      writer = docWriter.codec.termVectorsFormat().vectorsWriter(docWriter.directory, docWriter.getSegment(),
context);
       lastDocID = 0;
     }
   }
@@ -130,23 +100,14 @@ final class TermVectorsTermsWriter exten
     fill(docState.docID);
 
     // Append term vectors to the real outputs:
-    tvx.writeLong(tvd.getFilePointer());
-    tvx.writeLong(tvf.getFilePointer());
-    tvd.writeVInt(numVectorFields);
-    if (numVectorFields > 0) {
-      for(int i=0;i<numVectorFields;i++) {
-        tvd.writeVInt(perFields[i].fieldInfo.number);
-      }
-      long lastPos = tvf.getFilePointer();
-      perFields[0].finishDocument();
-      for(int i=1;i<numVectorFields;i++) {
-        long pos = tvf.getFilePointer();
-        tvd.writeVLong(pos-lastPos);
-        lastPos = pos;
-        perFields[i].finishDocument();
-        // commit the termVectors once successful success - FI will otherwise reset them
-        perFields[i].fieldInfo.commitVectors();
-      }
+    writer.startDocument(numVectorFields);
+    for (int i = 0; i < numVectorFields; i++) {
+      perFields[i].finishDocument();
+      // nocommit: loop thru the fields and commit field info after all are successful?
+      // or commit each one after its done?
+      
+      // commit the termVectors once successful - FieldInfo will otherwise reset them
+      perFields[i].fieldInfo.commitVectors();
     }
 
     assert lastDocID == docState.docID: "lastDocID=" + lastDocID + " docState.docID=" + docState.docID;
@@ -161,29 +122,11 @@ final class TermVectorsTermsWriter exten
   @Override
   public void abort() {
     hasVectors = false;
-    try {
-      IOUtils.closeWhileHandlingException(tvx, tvd, tvf);
-    } catch (IOException e) {
-      // cannot happen since we suppress exceptions
-      throw new RuntimeException(e);
-    }
-    
-    try {
-      docWriter.directory.deleteFile(IndexFileNames.segmentFileName(docWriter.getSegment(),
"", IndexFileNames.VECTORS_INDEX_EXTENSION));
-    } catch (IOException ignored) {
-    }
-    
-    try {
-      docWriter.directory.deleteFile(IndexFileNames.segmentFileName(docWriter.getSegment(),
"", IndexFileNames.VECTORS_DOCUMENTS_EXTENSION));
-    } catch (IOException ignored) {
-    }
-    
-    try {
-      docWriter.directory.deleteFile(IndexFileNames.segmentFileName(docWriter.getSegment(),
"", IndexFileNames.VECTORS_FIELDS_EXTENSION));
-    } catch (IOException ignored) {
+
+    if (writer != null) {
+      writer.abort();
     }
-    
-    tvx = tvd = tvf = null;
+
     lastDocID = 0;
 
     reset();

Modified: lucene/dev/branches/lucene2621/lucene/src/java/org/apache/lucene/index/TermVectorsTermsWriterPerField.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene2621/lucene/src/java/org/apache/lucene/index/TermVectorsTermsWriterPerField.java?rev=1200665&r1=1200664&r2=1200665&view=diff
==============================================================================
--- lucene/dev/branches/lucene2621/lucene/src/java/org/apache/lucene/index/TermVectorsTermsWriterPerField.java
(original)
+++ lucene/dev/branches/lucene2621/lucene/src/java/org/apache/lucene/index/TermVectorsTermsWriterPerField.java
Fri Nov 11 00:42:54 2011
@@ -20,8 +20,7 @@ package org.apache.lucene.index;
 import java.io.IOException;
 
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
-import org.apache.lucene.index.codecs.DefaultTermVectorsReader;
-import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.index.codecs.TermVectorsWriter;
 import org.apache.lucene.util.ByteBlockPool;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.RamUsageEstimator;
@@ -118,24 +117,14 @@ final class TermVectorsTermsWriterPerFie
     assert termsWriter.vectorFieldsInOrder(fieldInfo);
 
     TermVectorsPostingsArray postings = (TermVectorsPostingsArray) termsHashPerField.postingsArray;
-    final IndexOutput tvf = termsWriter.tvf;
+    final TermVectorsWriter tv = termsWriter.writer;
 
     // TODO: we may want to make this sort in same order
     // as Codec's terms dict?
     final int[] termIDs = termsHashPerField.sortPostings(BytesRef.getUTF8SortedAsUnicodeComparator());
 
-    tvf.writeVInt(numPostings);
-    byte bits = 0x0;
-    if (doVectorPositions)
-      bits |= DefaultTermVectorsReader.STORE_POSITIONS_WITH_TERMVECTOR;
-    if (doVectorOffsets)
-      bits |= DefaultTermVectorsReader.STORE_OFFSET_WITH_TERMVECTOR;
-    tvf.writeByte(bits);
-
-    int lastLen = 0;
-    byte[] lastBytes = null;
-    int lastStart = 0;
-
+    tv.startField(fieldInfo, numPostings, doVectorPositions, doVectorOffsets);
+    
     final ByteSliceReader reader = termsWriter.vectorSliceReader;
     final ByteBlockPool termBytePool = termsHashPerField.termBytePool;
 
@@ -145,37 +134,26 @@ final class TermVectorsTermsWriterPerFie
 
       // Get BytesRef
       termBytePool.setBytesRef(flushTerm, postings.textStarts[termID]);
+      tv.startTerm(flushTerm, freq);
 
-      // Compute common byte prefix between last term and
-      // this term
-      int prefix = 0;
-      if (j > 0) {
-        while(prefix < lastLen && prefix < flushTerm.length) {
-          if (lastBytes[lastStart+prefix] != flushTerm.bytes[flushTerm.offset+prefix]) {
-            break;
-          }
-          prefix++;
-        }
-      }
-
-      lastLen = flushTerm.length;
-      lastBytes = flushTerm.bytes;
-      lastStart = flushTerm.offset;
-
-      final int suffix = flushTerm.length - prefix;
-      tvf.writeVInt(prefix);
-      tvf.writeVInt(suffix);
-      tvf.writeBytes(flushTerm.bytes, lastStart+prefix, suffix);
-      tvf.writeVInt(freq);
-
+      int accum = 0;
+      
       if (doVectorPositions) {
         termsHashPerField.initReader(reader, termID, 0);
-        reader.writeTo(tvf);
+        for (int i = 0; i < freq; i++) {
+          accum += reader.readVInt();
+          tv.addPosition(accum);
+        }
       }
 
+      accum = 0;
+      
       if (doVectorOffsets) {
         termsHashPerField.initReader(reader, termID, 1);
-        reader.writeTo(tvf);
+        for (int i = 0; i < freq; i++) {
+          accum += reader.readVInt();
+          tv.addOffset(accum, accum + reader.readVInt());
+        }
       }
     }
 

Modified: lucene/dev/branches/lucene2621/lucene/src/java/org/apache/lucene/index/codecs/DefaultTermVectorsWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene2621/lucene/src/java/org/apache/lucene/index/codecs/DefaultTermVectorsWriter.java?rev=1200665&r1=1200664&r2=1200665&view=diff
==============================================================================
--- lucene/dev/branches/lucene2621/lucene/src/java/org/apache/lucene/index/codecs/DefaultTermVectorsWriter.java
(original)
+++ lucene/dev/branches/lucene2621/lucene/src/java/org/apache/lucene/index/codecs/DefaultTermVectorsWriter.java
Fri Nov 11 00:42:54 2011
@@ -17,6 +17,7 @@ package org.apache.lucene.index.codecs;
  * limitations under the License.
  */
 
+import org.apache.lucene.index.FieldInfo;
 import org.apache.lucene.index.FieldInfos;
 import org.apache.lucene.index.IndexFileNames;
 import org.apache.lucene.index.MergePolicy.MergeAbortedException;
@@ -28,6 +29,7 @@ import org.apache.lucene.index.TermVecto
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.IOContext;
 import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.Bits;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.IOUtils;
@@ -179,11 +181,72 @@ public final class DefaultTermVectorsWri
   
   @Override
   public void startDocument(int numVectorFields) throws IOException {
+    this.numVectorFields = numVectorFields;
     tvx.writeLong(tvd.getFilePointer());
     tvx.writeLong(tvf.getFilePointer());
     tvd.writeVInt(numVectorFields);
+    fieldCount = 0;
+    fps = ArrayUtil.grow(fps, numVectorFields);
   }
   
+  private long fps[] = new long[10]; // pointers to the tvf before writing each field 
+  private int fieldCount = 0;        // number of fields we have written so far for this
document
+  private int numVectorFields = 0;   // total number of fields we will write for this document
+  
+  @Override
+  public void startField(FieldInfo info, int numTerms, boolean positions, boolean offsets)
throws IOException {
+    lastTerm.length = 0;
+    fps[fieldCount++] = tvf.getFilePointer();
+    tvd.writeVInt(info.number);
+    tvf.writeVInt(numTerms);
+    byte bits = 0x0;
+    if (positions)
+      bits |= DefaultTermVectorsReader.STORE_POSITIONS_WITH_TERMVECTOR;
+    if (offsets)
+      bits |= DefaultTermVectorsReader.STORE_OFFSET_WITH_TERMVECTOR;
+    tvf.writeByte(bits);
+    
+    assert fieldCount <= numVectorFields;
+    if (fieldCount == numVectorFields) {
+      // last field of the document
+      // this is crazy because the file format is crazy!
+      for (int i = 1; i < fieldCount; i++) {
+        tvd.writeVLong(fps[i] - fps[i-1]);
+      }
+    }
+  }
+  
+  private final BytesRef lastTerm = new BytesRef(10);
+
+  @Override
+  public void startTerm(BytesRef term, int freq) throws IOException {
+    final int prefix = StringHelper.bytesDifference(lastTerm.bytes, lastTerm.offset, lastTerm.length,

+                                                    term.bytes, term.offset, term.length);
+    final int suffix = term.length - prefix;
+    tvf.writeVInt(prefix);
+    tvf.writeVInt(suffix);
+    tvf.writeBytes(term.bytes, term.offset + prefix, suffix);
+    tvf.writeVInt(freq);
+    lastTerm.copy(term);
+    lastPosition = lastOffset = 0;
+  }
+
+  int lastPosition = 0;
+  int lastOffset = 0;
+  
+  @Override
+  public void addPosition(int position) throws IOException {
+    tvf.writeVInt(position - lastPosition);
+    lastPosition = position;
+  }
+
+  @Override
+  public void addOffset(int startOffset, int endOffset) throws IOException {
+    tvf.writeVInt(startOffset - lastOffset);
+    tvf.writeVInt(endOffset - startOffset);
+    lastOffset = startOffset;
+  }
+
   @Override
   public void abort() {
     try {

Modified: lucene/dev/branches/lucene2621/lucene/src/java/org/apache/lucene/index/codecs/TermVectorsWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene2621/lucene/src/java/org/apache/lucene/index/codecs/TermVectorsWriter.java?rev=1200665&r1=1200664&r2=1200665&view=diff
==============================================================================
--- lucene/dev/branches/lucene2621/lucene/src/java/org/apache/lucene/index/codecs/TermVectorsWriter.java
(original)
+++ lucene/dev/branches/lucene2621/lucene/src/java/org/apache/lucene/index/codecs/TermVectorsWriter.java
Fri Nov 11 00:42:54 2011
@@ -20,20 +20,33 @@ package org.apache.lucene.index.codecs;
 import java.io.Closeable;
 import java.io.IOException;
 
+import org.apache.lucene.index.FieldInfo;
 import org.apache.lucene.index.FieldInfos;
 import org.apache.lucene.index.MergeState;
 import org.apache.lucene.index.TermFreqVector;
 import org.apache.lucene.util.Bits;
+import org.apache.lucene.util.BytesRef;
 
 public abstract class TermVectorsWriter implements Closeable {
   
-  /** Called before writing the stored fields of the document.
-   *  XXX will be called <code>numVectorFields</code> times.
-   *  Note that if term vectors are enabled, this is called 
-   *  even if the document has no vector fields, in
-   *  this case <code>numVectorFields</code> will be zero. */
+  /** Called before writing the term vectors of the document.
+   *  {@link #startField(FieldInfo, int, boolean, boolean)} will be called 
+   *  <code>numVectorFields</code> times. Note that if term 
+   *  vectors are enabled, this is called even if the document 
+   *  has no vector fields, in this case <code>numVectorFields</code> 
+   *  will be zero. */
   public abstract void startDocument(int numVectorFields) throws IOException;
   
+  /** Called before writing the terms of the field.
+   *  {@link #startTerm(BytesRef, int)} will be called <code>numTerms</code> times. */
+  public abstract void startField(FieldInfo info, int numTerms, boolean positions, boolean
offsets) throws IOException;
+  
+  public abstract void startTerm(BytesRef term, int freq) throws IOException;
+  
+  public abstract void addPosition(int position) throws IOException;
+  
+  public abstract void addOffset(int startOffset, int endOffset) throws IOException;
+  
   /** Aborts writing entirely, implementation should remove
    *  any partially-written files, etc. */
   public abstract void abort();

Modified: lucene/dev/branches/lucene2621/lucene/src/java/org/apache/lucene/util/StringHelper.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene2621/lucene/src/java/org/apache/lucene/util/StringHelper.java?rev=1200665&r1=1200664&r2=1200665&view=diff
==============================================================================
--- lucene/dev/branches/lucene2621/lucene/src/java/org/apache/lucene/util/StringHelper.java
(original)
+++ lucene/dev/branches/lucene2621/lucene/src/java/org/apache/lucene/util/StringHelper.java
Fri Nov 11 00:42:54 2011
@@ -43,6 +43,22 @@ public abstract class StringHelper {
         return i;
     return len;
   }
+  
+  /**
+   * Compares two byte[] ranges, element by element, and returns the
+   * length of their common prefix (the number of leading bytes that match).
+   *
+   * @param bytes1 The first byte[] to compare
+   * @param off1 The offset into the first byte[]
+   * @param len1 The length of the first range
+   * @param bytes2 The second byte[] to compare
+   * @param off2 The offset into the second byte[]
+   * @param len2 The length of the second range
+   * @return The length of the common prefix.
+   */
+  public static int bytesDifference(byte[] bytes1, int off1, int len1, byte[] bytes2, int
off2, int len2) {
+    int len = len1 < len2 ? len1 : len2;
+    for (int i = 0; i < len; i++)
+      if (bytes1[i+off1] != bytes2[i+off2])
+        return i;
+    return len;
+  }
 
   private StringHelper() {
   }



Mime
View raw message