lucene-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From jpou...@apache.org
Subject svn commit: r1441760 - in /lucene/dev/branches/branch_4x: ./ lucene/ lucene/core/ lucene/core/src/java/org/apache/lucene/codecs/compressing/ lucene/core/src/java/org/apache/lucene/codecs/lucene41/ lucene/core/src/java/org/apache/lucene/codecs/lucene42/
Date Sat, 02 Feb 2013 14:56:23 GMT
Author: jpountz
Date: Sat Feb  2 14:56:22 2013
New Revision: 1441760

URL: http://svn.apache.org/viewvc?rev=1441760&view=rev
Log:
LUCENE-4733: Make CompressingTermVectorsFormat the new default term vectors format (merged
from r1441732).

Added:
    lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42TermVectorsFormat.java
      - copied unchanged from r1441732, lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42TermVectorsFormat.java
Modified:
    lucene/dev/branches/branch_4x/   (props changed)
    lucene/dev/branches/branch_4x/lucene/   (props changed)
    lucene/dev/branches/branch_4x/lucene/CHANGES.txt   (contents, props changed)
    lucene/dev/branches/branch_4x/lucene/core/   (props changed)
    lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsIndexReader.java
    lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsIndexWriter.java
    lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingTermVectorsFormat.java
    lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41StoredFieldsFormat.java
    lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42Codec.java

Modified: lucene/dev/branches/branch_4x/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/CHANGES.txt?rev=1441760&r1=1441759&r2=1441760&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/CHANGES.txt (original)
+++ lucene/dev/branches/branch_4x/lucene/CHANGES.txt Sat Feb  2 14:56:22 2013
@@ -54,6 +54,10 @@ Optimizations
 * LUCENE-4740: Don't track clones of MMapIndexInput if unmapping
   is disabled. This reduces GC overhead. (Kristofer Karlsson, Uwe Schindler)
 
+* LUCENE-4733: The default Lucene 4.2 codec now uses a more compact
+  TermVectorsFormat (Lucene42TermVectorsFormat) based on
+  CompressingTermVectorsFormat. (Adrien Grand)
+
 New Features
 
 * LUCENE-4686: New specialized DGapVInt8IntEncoder for facets (now the 

Modified: lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsIndexReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsIndexReader.java?rev=1441760&r1=1441759&r2=1441760&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsIndexReader.java (original)
+++ lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsIndexReader.java Sat Feb  2 14:56:22 2013
@@ -28,7 +28,11 @@ import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.packed.PackedInts;
 
-class CompressingStoredFieldsIndexReader implements Closeable, Cloneable {
+/**
+ * Random-access reader for {@link CompressingStoredFieldsIndexWriter}.
+ * @lucene.internal
+ */
+public final class CompressingStoredFieldsIndexReader implements Closeable, Cloneable {
 
   final IndexInput fieldsIndexIn;
 

Modified: lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsIndexWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsIndexWriter.java?rev=1441760&r1=1441759&r2=1441760&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsIndexWriter.java (original)
+++ lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsIndexWriter.java Sat Feb  2 14:56:22 2013
@@ -20,10 +20,54 @@ package org.apache.lucene.codecs.compres
 import java.io.Closeable;
 import java.io.IOException;
 
+import org.apache.lucene.codecs.Codec;
+import org.apache.lucene.codecs.CodecUtil;
+import org.apache.lucene.store.DataOutput;
 import org.apache.lucene.store.IndexOutput;
 import org.apache.lucene.util.packed.PackedInts;
 
-class CompressingStoredFieldsIndexWriter implements Closeable {
+/**
+ * Efficient index format for block-based {@link Codec}s.
+ * <p> This writer generates a file which can be loaded into memory using
+ * memory-efficient data structures to quickly locate the block that contains
+ * any document.
+ * <p>In order to have a compact in-memory representation, for every block of
+ * 1024 chunks, this index computes the average number of bytes per
+ * chunk and for every chunk, only stores the difference between<ul>
+ * <li>${chunk number} * ${average length of a chunk}</li>
+ * <li>and the actual start offset of the chunk</li></ul></p>
+ * <p>Data is written as follows:</p>
+ * <ul>
+ * <li>PackedIntsVersion, &lt;Block&gt;<sup>BlockCount</sup>, BlocksEndMarker</li>
+ * <li>PackedIntsVersion --&gt; {@link PackedInts#VERSION_CURRENT} as a {@link DataOutput#writeVInt VInt}</li>
+ * <li>BlocksEndMarker --&gt; <tt>0</tt> as a {@link DataOutput#writeVInt VInt}, this marks the end of blocks since blocks are not allowed to start with <tt>0</tt></li>
+ * <li>Block --&gt; BlockChunks, &lt;DocBases&gt;, &lt;StartPointers&gt;</li>
+ * <li>BlockChunks --&gt; a {@link DataOutput#writeVInt VInt} which is the number of chunks encoded in the block</li>
+ * <li>DocBases --&gt; DocBase, AvgChunkDocs, BitsPerDocBaseDelta, DocBaseDeltas</li>
+ * <li>DocBase --&gt; first document ID of the block of chunks, as a {@link DataOutput#writeVInt VInt}</li>
+ * <li>AvgChunkDocs --&gt; average number of documents in a single chunk, as a {@link DataOutput#writeVInt VInt}</li>
+ * <li>BitsPerDocBaseDelta --&gt; number of bits required to represent a delta from the average using <a href="https://developers.google.com/protocol-buffers/docs/encoding#types">ZigZag encoding</a></li>
+ * <li>DocBaseDeltas --&gt; {@link PackedInts packed} array of BlockChunks elements of BitsPerDocBaseDelta bits each, representing the deltas from the average doc base using <a href="https://developers.google.com/protocol-buffers/docs/encoding#types">ZigZag encoding</a>.</li>
+ * <li>StartPointers --&gt; StartPointerBase, AvgChunkSize, BitsPerStartPointerDelta, StartPointerDeltas</li>
+ * <li>StartPointerBase --&gt; the first start pointer of the block, as a {@link DataOutput#writeVLong VLong}</li>
+ * <li>AvgChunkSize --&gt; the average size of a chunk of compressed documents, as a {@link DataOutput#writeVLong VLong}</li>
+ * <li>BitsPerStartPointerDelta --&gt; number of bits required to represent a delta from the average using <a href="https://developers.google.com/protocol-buffers/docs/encoding#types">ZigZag encoding</a></li>
+ * <li>StartPointerDeltas --&gt; {@link PackedInts packed} array of BlockChunks elements of BitsPerStartPointerDelta bits each, representing the deltas from the average start pointer using <a href="https://developers.google.com/protocol-buffers/docs/encoding#types">ZigZag encoding</a></li>
+ * </ul>
+ * <p>Notes</p>
+ * <ul>
+ * <li>For any block, the doc base of the n-th chunk can be restored with
+ * <code>DocBase + AvgChunkDocs * n + DocBaseDeltas[n]</code>.</li>
+ * <li>For any block, the start pointer of the n-th chunk can be restored with
+ * <code>StartPointerBase + AvgChunkSize * n + StartPointerDeltas[n]</code>.</li>
+ * <li>Once data is loaded into memory, you can lookup the start pointer of any
+ * document by performing two binary searches: a first one based on the values
+ * of DocBase in order to find the right block, and then inside the block based
+ * on DocBaseDeltas (by reconstructing the doc bases for every chunk).</li>
+ * </ul>
+ * @lucene.internal
+ */
+public final class CompressingStoredFieldsIndexWriter implements Closeable {
   
   static final int BLOCK_SIZE = 1024; // number of chunks to serialize at once
 

Modified: lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingTermVectorsFormat.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingTermVectorsFormat.java?rev=1441760&r1=1441759&r2=1441760&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingTermVectorsFormat.java (original)
+++ lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingTermVectorsFormat.java Sat Feb  2 14:56:22 2013
@@ -34,7 +34,7 @@ import org.apache.lucene.store.IOContext
  * order to improve the compression ratio.
  * @lucene.experimental
  */
-public final class CompressingTermVectorsFormat extends TermVectorsFormat {
+public class CompressingTermVectorsFormat extends TermVectorsFormat {
 
   private final String formatName;
   private final String segmentSuffix;
@@ -79,7 +79,7 @@ public final class CompressingTermVector
   }
 
   @Override
-  public TermVectorsReader vectorsReader(Directory directory,
+  public final TermVectorsReader vectorsReader(Directory directory,
       SegmentInfo segmentInfo, FieldInfos fieldInfos, IOContext context)
       throws IOException {
     return new CompressingTermVectorsReader(directory, segmentInfo, segmentSuffix,
@@ -87,7 +87,7 @@ public final class CompressingTermVector
   }
 
   @Override
-  public TermVectorsWriter vectorsWriter(Directory directory,
+  public final TermVectorsWriter vectorsWriter(Directory directory,
       SegmentInfo segmentInfo, IOContext context) throws IOException {
     return new CompressingTermVectorsWriter(directory, segmentInfo, segmentSuffix,
         context, formatName, compressionMode, chunkSize);

Modified: lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41StoredFieldsFormat.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41StoredFieldsFormat.java?rev=1441760&r1=1441759&r2=1441760&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41StoredFieldsFormat.java (original)
+++ lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41StoredFieldsFormat.java Sat Feb  2 14:56:22 2013
@@ -20,6 +20,7 @@ package org.apache.lucene.codecs.lucene4
 import org.apache.lucene.codecs.CodecUtil;
 import org.apache.lucene.codecs.StoredFieldsFormat;
 import org.apache.lucene.codecs.compressing.CompressingStoredFieldsFormat;
+import org.apache.lucene.codecs.compressing.CompressingStoredFieldsIndexWriter;
 import org.apache.lucene.codecs.compressing.CompressionMode;
 import org.apache.lucene.codecs.lucene40.Lucene40StoredFieldsFormat;
 import org.apache.lucene.store.DataOutput;
@@ -50,7 +51,7 @@ import org.apache.lucene.util.packed.Pac
  * <a href="http://fastcompression.blogspot.fr/2011/05/lz4-explained.html">compression format</a>.</p>
  * <p>Here is a more detailed description of the field data file format:</p>
  * <ul>
- * <li>FieldData (.fdt) --&gt; &lt;Header&gt;, PackedIntsVersion, CompressionFormat, &lt;Chunk&gt;<sup>ChunkCount</sup></li>
+ * <li>FieldData (.fdt) --&gt; &lt;Header&gt;, PackedIntsVersion, &lt;Chunk&gt;<sup>ChunkCount</sup></li>
  * <li>Header --&gt; {@link CodecUtil#writeHeader CodecHeader}</li>
  * <li>PackedIntsVersion --&gt; {@link PackedInts#VERSION_CURRENT} as a {@link DataOutput#writeVInt VInt}</li>
  * <li>ChunkCount is not known in advance and is the number of chunks necessary to store all document of the segment</li>
@@ -95,43 +96,11 @@ import org.apache.lucene.util.packed.Pac
  * </ul>
  * </li>
  * <li><a name="field_index" id="field_index"></a>
- * <p>A fields index file (extension <tt>.fdx</tt>). The data stored in this
- * file is read to load an in-memory data-structure that can be used to locate
- * the start offset of a block containing any document in the fields data file.</p>
- * <p>In order to have a compact in-memory representation, for every block of
- * 1024 chunks, this stored fields index computes the average number of bytes per
- * chunk and for every chunk, only stores the difference between<ul>
- * <li>${chunk number} * ${average length of a chunk}</li>
- * <li>and the actual start offset of the chunk</li></ul></p>
- * <p>Data is written as follows:</p>
+ * <p>A fields index file (extension <tt>.fdx</tt>).</p>
  * <ul>
- * <li>FieldsIndex (.fdx) --&gt; &lt;Header&gt;, FieldsIndex, PackedIntsVersion, &lt;Block&gt;<sup>BlockCount</sup>, BlocksEndMarker</li>
+ * <li>FieldsIndex (.fdx) --&gt; &lt;Header&gt;, &lt;ChunkIndex&gt;</li>
  * <li>Header --&gt; {@link CodecUtil#writeHeader CodecHeader}</li>
- * <li>PackedIntsVersion --&gt; {@link PackedInts#VERSION_CURRENT} as a {@link DataOutput#writeVInt VInt}</li>
- * <li>BlocksEndMarker --&gt; <tt>0</tt> as a {@link DataOutput#writeVInt VInt}, this marks the end of blocks since blocks are not allowed to start with <tt>0</tt></li>
- * <li>Block --&gt; BlockChunks, &lt;DocBases&gt;, &lt;StartPointers&gt;</li>
- * <li>BlockChunks --&gt; a {@link DataOutput#writeVInt VInt} which is the number of chunks encoded in the block</li>
- * <li>DocBases --&gt; DocBase, AvgChunkDocs, BitsPerDocBaseDelta, DocBaseDeltas</li>
- * <li>DocBase --&gt; first document ID of the block of chunks, as a {@link DataOutput#writeVInt VInt}</li>
- * <li>AvgChunkDocs --&gt; average number of documents in a single chunk, as a {@link DataOutput#writeVInt VInt}</li>
- * <li>BitsPerDocBaseDelta --&gt; number of bits required to represent a delta from the average using <a href="https://developers.google.com/protocol-buffers/docs/encoding#types">ZigZag encoding</a></li>
- * <li>DocBaseDeltas --&gt; {@link PackedInts packed} array of BlockChunks elements of BitsPerDocBaseDelta bits each, representing the deltas from the average doc base using <a href="https://developers.google.com/protocol-buffers/docs/encoding#types">ZigZag encoding</a>.</li>
- * <li>StartPointers --&gt; StartPointerBase, AvgChunkSize, BitsPerStartPointerDelta, StartPointerDeltas</li>
- * <li>StartPointerBase --&gt; the first start pointer of the block, as a {@link DataOutput#writeVLong VLong}</li>
- * <li>AvgChunkSize --&gt; the average size of a chunk of compressed documents, as a {@link DataOutput#writeVLong VLong}</li>
- * <li>BitsPerStartPointerDelta --&gt; number of bits required to represent a delta from the average using <a href="https://developers.google.com/protocol-buffers/docs/encoding#types">ZigZag encoding</a></li>
- * <li>StartPointerDeltas --&gt; {@link PackedInts packed} array of BlockChunks elements of BitsPerStartPointerDelta bits each, representing the deltas from the average start pointer using <a href="https://developers.google.com/protocol-buffers/docs/encoding#types">ZigZag encoding</a></li>
- * </ul>
- * <p>Notes</p>
- * <ul>
- * <li>For any block, the doc base of the n-th chunk can be restored with
- * <code>DocBase + AvgChunkDocs * n + DocBaseDeltas[n]</code>.</li>
- * <li>For any block, the start pointer of the n-th chunk can be restored with
- * <code>StartPointerBase + AvgChunkSize * n + StartPointerDeltas[n]</code>.</li>
- * <li>Once data is loaded into memory, you can lookup the start pointer of any
- * document by performing two binary searches: a first one based on the values
- * of DocBase in order to find the right block, and then inside the block based
- * on DocBaseDeltas (by reconstructing the doc bases for every chunk).</li>
+ * <li>ChunkIndex: See {@link CompressingStoredFieldsIndexWriter}</li>
  * </ul>
  * </li>
  * </ol>

Modified: lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42Codec.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42Codec.java?rev=1441760&r1=1441759&r2=1441760&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42Codec.java (original)
+++ lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42Codec.java Sat Feb  2 14:56:22 2013
@@ -18,13 +18,13 @@ package org.apache.lucene.codecs.lucene4
  */
 
 import org.apache.lucene.codecs.Codec;
+import org.apache.lucene.codecs.DocValuesFormat;
 import org.apache.lucene.codecs.FieldInfosFormat;
 import org.apache.lucene.codecs.FilterCodec;
 import org.apache.lucene.codecs.LiveDocsFormat;
+import org.apache.lucene.codecs.NormsFormat;
 import org.apache.lucene.codecs.PostingsFormat;
 import org.apache.lucene.codecs.SegmentInfoFormat;
-import org.apache.lucene.codecs.DocValuesFormat;
-import org.apache.lucene.codecs.NormsFormat;
 import org.apache.lucene.codecs.StoredFieldsFormat;
 import org.apache.lucene.codecs.TermVectorsFormat;
 import org.apache.lucene.codecs.lucene40.Lucene40DocValuesFormat;
@@ -32,7 +32,6 @@ import org.apache.lucene.codecs.lucene40
 import org.apache.lucene.codecs.lucene40.Lucene40LiveDocsFormat;
 import org.apache.lucene.codecs.lucene40.Lucene40NormsFormat;
 import org.apache.lucene.codecs.lucene40.Lucene40SegmentInfoFormat;
-import org.apache.lucene.codecs.lucene40.Lucene40TermVectorsFormat;
 import org.apache.lucene.codecs.lucene41.Lucene41StoredFieldsFormat;
 import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat;
 
@@ -50,7 +49,7 @@ import org.apache.lucene.codecs.perfield
 // (it writes a minor version, etc).
 public class Lucene42Codec extends Codec {
   private final StoredFieldsFormat fieldsFormat = new Lucene41StoredFieldsFormat();
-  private final TermVectorsFormat vectorsFormat = new Lucene40TermVectorsFormat();
+  private final TermVectorsFormat vectorsFormat = new Lucene42TermVectorsFormat();
   private final FieldInfosFormat fieldInfosFormat = new Lucene40FieldInfosFormat();
   private final SegmentInfoFormat infosFormat = new Lucene40SegmentInfoFormat();
   private final LiveDocsFormat liveDocsFormat = new Lucene40LiveDocsFormat();



Mime
View raw message