lucene-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From rm...@apache.org
Subject svn commit: r1633993 [3/4] - in /lucene/dev/branches/branch_5x: ./ dev-tools/ lucene/ lucene/analysis/ lucene/backward-codecs/ lucene/backward-codecs/src/java/org/apache/lucene/codecs/blocktree/ lucene/backward-codecs/src/java/org/apache/lucene/codecs/...
Date Fri, 24 Oct 2014 03:29:37 GMT
Modified: lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingTermVectorsFormat.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingTermVectorsFormat.java?rev=1633993&r1=1633992&r2=1633993&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingTermVectorsFormat.java (original)
+++ lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingTermVectorsFormat.java Fri Oct 24 03:29:33 2014
@@ -46,7 +46,7 @@ public class CompressingTermVectorsForma
    * <p>
    * <code>formatName</code> is the name of the format. This name will be used
    * in the file formats to perform
-   * {@link CodecUtil#checkSegmentHeader codec header checks}.
+   * {@link CodecUtil#checkIndexHeader codec header checks}.
    * <p>
    * The <code>compressionMode</code> parameter allows you to choose between
    * compression algorithms that have various compression and decompression

Modified: lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingTermVectorsReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingTermVectorsReader.java?rev=1633993&r1=1633992&r2=1633993&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingTermVectorsReader.java (original)
+++ lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingTermVectorsReader.java Fri Oct 24 03:29:33 2014
@@ -114,8 +114,8 @@ public final class CompressingTermVector
       Throwable priorE = null;
       try {
         final String codecNameIdx = formatName + CODEC_SFX_IDX;
-        version = CodecUtil.checkSegmentHeader(input, codecNameIdx, VERSION_START, VERSION_CURRENT, si.getId(), segmentSuffix);
-        assert CodecUtil.segmentHeaderLength(codecNameIdx, segmentSuffix) == input.getFilePointer();
+        version = CodecUtil.checkIndexHeader(input, codecNameIdx, VERSION_START, VERSION_CURRENT, si.getId(), segmentSuffix);
+        assert CodecUtil.indexHeaderLength(codecNameIdx, segmentSuffix) == input.getFilePointer();
         indexReader = new CompressingStoredFieldsIndexReader(input, si);
         input.readVLong(); // the end of the data file
       } catch (Throwable exception) {
@@ -133,11 +133,11 @@ public final class CompressingTermVector
       final String vectorsStreamFN = IndexFileNames.segmentFileName(segment, segmentSuffix, VECTORS_EXTENSION);
       vectorsStream = d.openInput(vectorsStreamFN, context);
       final String codecNameDat = formatName + CODEC_SFX_DAT;
-      int version2 = CodecUtil.checkSegmentHeader(vectorsStream, codecNameDat, VERSION_START, VERSION_CURRENT, si.getId(), segmentSuffix);
+      int version2 = CodecUtil.checkIndexHeader(vectorsStream, codecNameDat, VERSION_START, VERSION_CURRENT, si.getId(), segmentSuffix);
       if (version != version2) {
         throw new CorruptIndexException("Version mismatch between stored fields index and data: " + version + " != " + version2, vectorsStream);
       }
-      assert CodecUtil.segmentHeaderLength(codecNameDat, segmentSuffix) == vectorsStream.getFilePointer();
+      assert CodecUtil.indexHeaderLength(codecNameDat, segmentSuffix) == vectorsStream.getFilePointer();
       
       long pos = vectorsStream.getFilePointer();
       // NOTE: data file is too costly to verify checksum against all the bytes on open,

Modified: lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingTermVectorsWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingTermVectorsWriter.java?rev=1633993&r1=1633992&r2=1633993&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingTermVectorsWriter.java (original)
+++ lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingTermVectorsWriter.java Fri Oct 24 03:29:33 2014
@@ -32,11 +32,8 @@ import org.apache.lucene.index.FieldInfo
 import org.apache.lucene.index.FieldInfos;
 import org.apache.lucene.index.Fields;
 import org.apache.lucene.index.IndexFileNames;
-import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.LeafReader;
 import org.apache.lucene.index.MergeState;
 import org.apache.lucene.index.SegmentInfo;
-import org.apache.lucene.index.SegmentReader;
 import org.apache.lucene.store.BufferedChecksumIndexInput;
 import org.apache.lucene.store.ChecksumIndexInput;
 import org.apache.lucene.store.DataInput;
@@ -231,10 +228,10 @@ public final class CompressingTermVector
 
       final String codecNameIdx = formatName + CODEC_SFX_IDX;
       final String codecNameDat = formatName + CODEC_SFX_DAT;
-      CodecUtil.writeSegmentHeader(indexStream, codecNameIdx, VERSION_CURRENT, si.getId(), segmentSuffix);
-      CodecUtil.writeSegmentHeader(vectorsStream, codecNameDat, VERSION_CURRENT, si.getId(), segmentSuffix);
-      assert CodecUtil.segmentHeaderLength(codecNameDat, segmentSuffix) == vectorsStream.getFilePointer();
-      assert CodecUtil.segmentHeaderLength(codecNameIdx, segmentSuffix) == indexStream.getFilePointer();
+      CodecUtil.writeIndexHeader(indexStream, codecNameIdx, VERSION_CURRENT, si.getId(), segmentSuffix);
+      CodecUtil.writeIndexHeader(vectorsStream, codecNameDat, VERSION_CURRENT, si.getId(), segmentSuffix);
+      assert CodecUtil.indexHeaderLength(codecNameDat, segmentSuffix) == vectorsStream.getFilePointer();
+      assert CodecUtil.indexHeaderLength(codecNameIdx, segmentSuffix) == indexStream.getFilePointer();
 
       indexWriter = new CompressingStoredFieldsIndexWriter(indexStream);
       indexStream = null;

Modified: lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50Codec.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50Codec.java?rev=1633993&r1=1633992&r2=1633993&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50Codec.java (original)
+++ lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50Codec.java Fri Oct 24 03:29:33 2014
@@ -106,7 +106,7 @@ public class Lucene50Codec extends Codec
   /** Returns the postings format that should be used for writing 
    *  new segments of <code>field</code>.
    *  
-   *  The default implementation always returns "Lucene41"
+   *  The default implementation always returns "Lucene50"
    */
   public PostingsFormat getPostingsFormatForField(String field) {
     return defaultFormat;
@@ -115,7 +115,7 @@ public class Lucene50Codec extends Codec
   /** Returns the docvalues format that should be used for writing 
    *  new segments of <code>field</code>.
    *  
-   *  The default implementation always returns "Lucene410"
+   *  The default implementation always returns "Lucene50"
    */
   public DocValuesFormat getDocValuesFormatForField(String field) {
     return defaultDVFormat;
@@ -126,8 +126,8 @@ public class Lucene50Codec extends Codec
     return docValuesFormat;
   }
 
-  private final PostingsFormat defaultFormat = PostingsFormat.forName("Lucene41");
-  private final DocValuesFormat defaultDVFormat = DocValuesFormat.forName("Lucene410");
+  private final PostingsFormat defaultFormat = PostingsFormat.forName("Lucene50");
+  private final DocValuesFormat defaultDVFormat = DocValuesFormat.forName("Lucene50");
 
   private final NormsFormat normsFormat = new Lucene50NormsFormat();
 

Modified: lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50CompoundFormat.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50CompoundFormat.java?rev=1633993&r1=1633992&r2=1633993&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50CompoundFormat.java (original)
+++ lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50CompoundFormat.java Fri Oct 24 03:29:33 2014
@@ -46,7 +46,7 @@ import org.apache.lucene.store.IndexOutp
  *   <li>Compound (.cfs) --&gt; Header, FileData <sup>FileCount</sup>, Footer</li>
  *   <li>Compound Entry Table (.cfe) --&gt; Header, FileCount, &lt;FileName,
  *       DataOffset, DataLength&gt; <sup>FileCount</sup></li>
- *   <li>Header --&gt; {@link CodecUtil#writeSegmentHeader SegmentHeader}</li>
+ *   <li>Header --&gt; {@link CodecUtil#writeIndexHeader IndexHeader}</li>
  *   <li>FileCount --&gt; {@link DataOutput#writeVInt VInt}</li>
  *   <li>DataOffset,DataLength,Checksum --&gt; {@link DataOutput#writeLong UInt64}</li>
  *   <li>FileName --&gt; {@link DataOutput#writeString String}</li>
@@ -79,8 +79,8 @@ public final class Lucene50CompoundForma
     
     try (IndexOutput data =    dir.createOutput(dataFile, context);
          IndexOutput entries = dir.createOutput(entriesFile, context)) {
-      CodecUtil.writeSegmentHeader(data,    DATA_CODEC, VERSION_CURRENT, si.getId(), "");
-      CodecUtil.writeSegmentHeader(entries, ENTRY_CODEC, VERSION_CURRENT, si.getId(), "");
+      CodecUtil.writeIndexHeader(data,    DATA_CODEC, VERSION_CURRENT, si.getId(), "");
+      CodecUtil.writeIndexHeader(entries, ENTRY_CODEC, VERSION_CURRENT, si.getId(), "");
       
       // write number of files
       entries.writeVInt(files.size());

Modified: lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50CompoundReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50CompoundReader.java?rev=1633993&r1=1633992&r2=1633993&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50CompoundReader.java (original)
+++ lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50CompoundReader.java Fri Oct 24 03:29:33 2014
@@ -71,7 +71,7 @@ final class Lucene50CompoundReader exten
     boolean success = false;
     handle = directory.openInput(dataFileName, context);
     try {
-      CodecUtil.checkSegmentHeader(handle, Lucene50CompoundFormat.DATA_CODEC, version, version, si.getId(), "");
+      CodecUtil.checkIndexHeader(handle, Lucene50CompoundFormat.DATA_CODEC, version, version, si.getId(), "");
       
       // NOTE: data file is too costly to verify checksum against all the bytes on open,
       // but for now we at least verify proper structure of the checksum footer: which looks
@@ -93,7 +93,7 @@ final class Lucene50CompoundReader exten
     try (ChecksumIndexInput entriesStream = dir.openChecksumInput(entriesFileName, IOContext.READONCE)) {
       Throwable priorE = null;
       try {
-        version = CodecUtil.checkSegmentHeader(entriesStream, Lucene50CompoundFormat.ENTRY_CODEC, 
+        version = CodecUtil.checkIndexHeader(entriesStream, Lucene50CompoundFormat.ENTRY_CODEC, 
                                                               Lucene50CompoundFormat.VERSION_START, 
                                                               Lucene50CompoundFormat.VERSION_CURRENT, segmentID, "");
         final int numEntries = entriesStream.readVInt();

Modified: lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50FieldInfosFormat.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50FieldInfosFormat.java?rev=1633993&r1=1633992&r2=1633993&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50FieldInfosFormat.java (original)
+++ lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50FieldInfosFormat.java Fri Oct 24 03:29:33 2014
@@ -46,7 +46,7 @@ import org.apache.lucene.store.IndexOutp
  * FieldBits,DocValuesBits,DocValuesGen,Attributes&gt; <sup>FieldsCount</sup>,Footer</p>
  * <p>Data types:
  * <ul>
- *   <li>Header --&gt; {@link CodecUtil#checkSegmentHeader SegmentHeader}</li>
+ *   <li>Header --&gt; {@link CodecUtil#checkIndexHeader IndexHeader}</li>
  *   <li>FieldsCount --&gt; {@link DataOutput#writeVInt VInt}</li>
  *   <li>FieldName --&gt; {@link DataOutput#writeString String}</li>
  *   <li>FieldBits, IndexOptions, DocValuesBits --&gt; {@link DataOutput#writeByte Byte}</li>
@@ -114,9 +114,9 @@ public final class Lucene50FieldInfosFor
       Throwable priorE = null;
       FieldInfo infos[] = null;
       try {
-        CodecUtil.checkSegmentHeader(input, CODEC_NAME, 
-                                     FORMAT_START, 
-                                     FORMAT_CURRENT,
+        CodecUtil.checkIndexHeader(input, Lucene50FieldInfosFormat.CODEC_NAME, 
+                                     Lucene50FieldInfosFormat.FORMAT_START, 
+                                     Lucene50FieldInfosFormat.FORMAT_CURRENT,
                                      segmentInfo.getId(), segmentSuffix);
         
         final int size = input.readVInt(); //read in the size
@@ -251,7 +251,7 @@ public final class Lucene50FieldInfosFor
   public void write(Directory directory, SegmentInfo segmentInfo, String segmentSuffix, FieldInfos infos, IOContext context) throws IOException {
     final String fileName = IndexFileNames.segmentFileName(segmentInfo.name, segmentSuffix, EXTENSION);
     try (IndexOutput output = directory.createOutput(fileName, context)) {
-      CodecUtil.writeSegmentHeader(output, CODEC_NAME, FORMAT_CURRENT, segmentInfo.getId(), segmentSuffix);
+      CodecUtil.writeIndexHeader(output, Lucene50FieldInfosFormat.CODEC_NAME, Lucene50FieldInfosFormat.FORMAT_CURRENT, segmentInfo.getId(), segmentSuffix);
       output.writeVInt(infos.size());
       for (FieldInfo fi : infos) {
         fi.checkConsistency();

Modified: lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50LiveDocsFormat.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50LiveDocsFormat.java?rev=1633993&r1=1633992&r2=1633993&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50LiveDocsFormat.java (original)
+++ lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50LiveDocsFormat.java Fri Oct 24 03:29:33 2014
@@ -41,10 +41,9 @@ import org.apache.lucene.util.MutableBit
  * deletions.</p>
  * <p>Although per-segment, this file is maintained exterior to compound segment
  * files.</p>
- * <p>Deletions (.liv) --&gt; SegmentHeader,Generation,Bits</p>
+ * <p>Deletions (.liv) --&gt; IndexHeader,Generation,Bits</p>
  * <ul>
- *   <li>SegmentHeader --&gt; {@link CodecUtil#writeSegmentHeader SegmentHeader}</li>
- *   <li>Generation --&gt; {@link DataOutput#writeLong Int64}
+ *   <li>SegmentHeader --&gt; {@link CodecUtil#writeIndexHeader IndexHeader}</li>
  *   <li>Bits --&gt; &lt;{@link DataOutput#writeLong Int64}&gt; <sup>LongCount</sup></li>
  * </ul>
  */
@@ -85,11 +84,8 @@ public final class Lucene50LiveDocsForma
     try (ChecksumIndexInput input = dir.openChecksumInput(name, context)) {
       Throwable priorE = null;
       try {
-        CodecUtil.checkSegmentHeader(input, CODEC_NAME, VERSION_START, VERSION_CURRENT, info.info.getId(), "");
-        long filegen = input.readLong();
-        if (gen != filegen) {
-          throw new CorruptIndexException("file mismatch, expected generation=" + gen + ", got=" + filegen, input);
-        }
+        CodecUtil.checkIndexHeader(input, CODEC_NAME, VERSION_START, VERSION_CURRENT, 
+                                     info.info.getId(), Long.toString(gen, Character.MAX_RADIX));
         long data[] = new long[FixedBitSet.bits2words(length)];
         for (int i = 0; i < data.length; i++) {
           data[i] = input.readLong();
@@ -120,8 +116,7 @@ public final class Lucene50LiveDocsForma
     }
     long data[] = fbs.getBits();
     try (IndexOutput output = dir.createOutput(name, context)) {
-      CodecUtil.writeSegmentHeader(output, CODEC_NAME, VERSION_CURRENT, info.info.getId(), "");
-      output.writeLong(gen);
+      CodecUtil.writeIndexHeader(output, CODEC_NAME, VERSION_CURRENT, info.info.getId(), Long.toString(gen, Character.MAX_RADIX));
       for (int i = 0; i < data.length; i++) {
         output.writeLong(data[i]);
       }

Modified: lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50NormsConsumer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50NormsConsumer.java?rev=1633993&r1=1633992&r2=1633993&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50NormsConsumer.java (original)
+++ lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50NormsConsumer.java Fri Oct 24 03:29:33 2014
@@ -47,6 +47,7 @@ class Lucene50NormsConsumer extends Norm
   static final byte CONST_COMPRESSED = 2;
   static final byte UNCOMPRESSED = 3;
   static final byte INDIRECT = 4;
+  static final byte PATCHED = 5;
   static final int BLOCK_SIZE = 1 << 14;
   
   // threshold for indirect encoding, computed as 1 - 1/log2(maxint)
@@ -61,10 +62,10 @@ class Lucene50NormsConsumer extends Norm
     try {
       String dataName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, dataExtension);
       data = state.directory.createOutput(dataName, state.context);
-      CodecUtil.writeSegmentHeader(data, dataCodec, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
+      CodecUtil.writeIndexHeader(data, dataCodec, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
       String metaName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, metaExtension);
       meta = state.directory.createOutput(metaName, state.context);
-      CodecUtil.writeSegmentHeader(meta, metaCodec, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
+      CodecUtil.writeIndexHeader(meta, metaCodec, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
       success = true;
     } finally {
       if (!success) {
@@ -82,6 +83,11 @@ class Lucene50NormsConsumer extends Norm
 
   @Override
   public void addNormsField(FieldInfo field, Iterable<Number> values) throws IOException {
+    writeNormsField(field, values, 0);
+  }
+  
+  private void writeNormsField(FieldInfo field, Iterable<Number> values, int level) throws IOException {
+    assert level <= 1; // we only "recurse" once in the indirect case
     meta.writeVInt(field.number);
     long minValue = Long.MAX_VALUE;
     long maxValue = Long.MIN_VALUE;
@@ -89,16 +95,12 @@ class Lucene50NormsConsumer extends Norm
     NormMap uniqueValues = new NormMap();
     
     int count = 0;
-    int missingCount = 0;
     
     for (Number nv : values) {
       if (nv == null) {
         throw new IllegalStateException("illegal norms data for field " + field.name + ", got null for value: " + count);
       }
       final long v = nv.longValue();
-      if (v == 0) {
-        missingCount++;
-      }
       
       minValue = Math.min(minValue, v);
       maxValue = Math.max(maxValue, v);
@@ -115,9 +117,15 @@ class Lucene50NormsConsumer extends Norm
     if (uniqueValues != null && uniqueValues.size == 1) {
       // 0 bpv
       addConstant(minValue);
-    } else if (count > 256 && missingCount > count * INDIRECT_THRESHOLD) {
-      // sparse encoding
-      addIndirect(field, values, count, missingCount);
+    } else if (level == 0 && count > 256 && uniqueValues != null && uniqueValues.maxFreq() > count * INDIRECT_THRESHOLD) {
+      long commonValue = uniqueValues.getDecodeTable()[uniqueValues.maxOrd()];
+      if (commonValue == 0) {
+        // if the common value is missing, don't waste RAM on a bitset, since we won't be searching those docs
+        addIndirect(field, values, count, uniqueValues);
+      } else {
+        // otherwise, write a sparse bitset, where 1 indicates 'uncommon value'.
+        addPatched(field, values, count, uniqueValues);
+      }
     } else if (uniqueValues != null) {
       // small number of unique values: this is the typical case:
       FormatAndBits compression = fastestFormatAndBits(uniqueValues.size-1);
@@ -200,10 +208,65 @@ class Lucene50NormsConsumer extends Norm
     writer.finish();
   }
   
-  private void addIndirect(FieldInfo field, final Iterable<Number> values, int count, int missingCount) throws IOException {
-    meta.writeVInt(count - missingCount);
+  // encodes only uncommon values in a sparse bitset
+  // access is constant time, and the common case is predictable
+  // exceptions nest either to CONST (if there are only 2 values), or INDIRECT (if there are > 2 values)
+  private void addPatched(FieldInfo field, final Iterable<Number> values, int count, NormMap uniqueValues) throws IOException {
+    final long decodeTable[] = uniqueValues.getDecodeTable();
+    int commonCount = uniqueValues.maxFreq();
+    final long commonValue = decodeTable[uniqueValues.maxOrd()];
+    
+    meta.writeVInt(count - commonCount);
+    meta.writeByte(PATCHED);
+    meta.writeLong(data.getFilePointer());
+    
+    // write docs with value
+    writeDocsWithValue(values, commonValue);
+    
+    // write exceptions: only two cases make sense
+    // bpv = 1 (folded into sparse bitset already)
+    // bpv > 1 (add indirect exception table)
+    meta.writeVInt(field.number);
+    if (uniqueValues.size == 2) {
+      // special case: implicit in bitset
+      int otherOrd = uniqueValues.maxOrd() == 0 ? 1 : 0;
+      addConstant(decodeTable[otherOrd]);
+    } else {
+      // exception table
+      addIndirect(field, values, count, uniqueValues);
+    }
+  }
+  
+  // encodes values as sparse array: keys[] and values[]
+  // access is log(N) where N = keys.length (slow!)
+  // so this is only appropriate as an exception table for patched, or when common value is 0 (wont be accessed by searching) 
+  private void addIndirect(FieldInfo field, final Iterable<Number> values, int count, NormMap uniqueValues) throws IOException {
+    int commonCount = uniqueValues.maxFreq();
+    final long commonValue = uniqueValues.getDecodeTable()[uniqueValues.maxOrd()];
+    
+    meta.writeVInt(count - commonCount);
     meta.writeByte(INDIRECT);
     meta.writeLong(data.getFilePointer());
+    
+    // write docs with value
+    writeDocsWithValue(values, commonValue);
+    
+    // write actual values
+    writeNormsField(field, new Iterable<Number>() {
+      @Override
+      public Iterator<Number> iterator() {
+        return new FilterIterator<Number,Number>(values.iterator()) {
+          @Override
+          protected boolean predicateFunction(Number value) {
+            return value.longValue() != commonValue;
+          }
+        };
+      }
+    }, 1);
+  }
+  
+  private void writeDocsWithValue(final Iterable<Number> values, long commonValue) throws IOException {
+    data.writeLong(commonValue);
     data.writeVInt(PackedInts.VERSION_CURRENT);
     data.writeVInt(BLOCK_SIZE);
     
@@ -212,25 +275,12 @@ class Lucene50NormsConsumer extends Norm
     int doc = 0;
     for (Number n : values) {
       long v = n.longValue();
-      if (v != 0) {
+      if (v != commonValue) {
         writer.add(doc);
       }
       doc++;
     }
     writer.finish();
-    
-    // write actual values
-    addNormsField(field, new Iterable<Number>() {
-      @Override
-      public Iterator<Number> iterator() {
-        return new FilterIterator<Number,Number>(values.iterator()) {
-          @Override
-          protected boolean predicateFunction(Number value) {
-            return value.longValue() != 0;
-          }
-        };
-      }
-    });
   }
   
   @Override
@@ -259,6 +309,7 @@ class Lucene50NormsConsumer extends Norm
   static class NormMap {
     // we use short: at most we will add 257 values to this map before its rejected as too big above.
     final short[] singleByteRange = new short[256];
+    final int[] freqs = new int[257];
     final Map<Long,Short> other = new HashMap<Long,Short>();
     int size;
     
@@ -273,18 +324,24 @@ class Lucene50NormsConsumer extends Norm
         int index = (int) (l + 128);
         short previous = singleByteRange[index];
         if (previous < 0) {
-          singleByteRange[index] = (short) size;
+          short slot = (short) size;
+          singleByteRange[index] = slot;
+          freqs[slot]++;
           size++;
           return true;
         } else {
+          freqs[previous]++;
           return false;
         }
       } else {
-        if (!other.containsKey(l)) {
+        Short previous = other.get(l);
+        if (previous == null) {
+          freqs[size]++;
           other.put(l, (short)size);
           size++;
           return true;
         } else {
+          freqs[previous]++;
           return false;
         }
       }
@@ -315,5 +372,35 @@ class Lucene50NormsConsumer extends Norm
       }
       return decode;
     }
+    
+    // TODO: if we need more complicated frequency-driven optos, maybe add 'finish' to this api
+    // and sort all ords by frequency. we could then lower BPV and waste a value to represent 'patched',
+    
+    /** retrieves frequency table for items (indexed by ordinal) */
+    public int[] getFreqs() {
+      return freqs;
+    }
+    
+    /** sugar: returns max value over getFreqs() */
+    public int maxFreq() {
+      int max = 0;
+      for (int i = 0; i < size; i++) {
+        max = Math.max(max, freqs[i]);
+      }
+      return max;
+    }
+    
+    /** sugar: returns ordinal with maxFreq() */
+    public int maxOrd() {
+      long max = 0;
+      int maxOrd = 0;
+      for (int i = 0; i < size; i++) {
+        if (freqs[i] > max) {
+          max = freqs[i];
+          maxOrd = i;
+        }
+      }
+      return maxOrd;
+    }
   }
 }

Modified: lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50NormsFormat.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50NormsFormat.java?rev=1633993&r1=1633992&r2=1633993&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50NormsFormat.java (original)
+++ lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50NormsFormat.java Fri Oct 24 03:29:33 2014
@@ -28,6 +28,7 @@ import org.apache.lucene.index.SegmentWr
 import org.apache.lucene.store.DataOutput;
 import org.apache.lucene.util.SmallFloat;
 import org.apache.lucene.util.packed.BlockPackedWriter;
+import org.apache.lucene.util.packed.MonotonicBlockPackedWriter;
 import org.apache.lucene.util.packed.PackedInts;
 
 /**
@@ -50,6 +51,9 @@ import org.apache.lucene.util.packed.Pac
  *    <li>Indirect: when norms are extremely sparse, missing values are omitted.
  *        Access to an individual value is slower, but missing norm values are never accessed
  *        by search code.
+ *    <li>Patched: when a single norm value dominates, a sparse bitset encodes docs with exceptions,
+ *        so that access to the common value is still very fast. outliers fall thru to an exception 
+ *        handling mechanism (Indirect or Constant).
  * </ul>
  * <p>
  * Files:
@@ -64,7 +68,7 @@ import org.apache.lucene.util.packed.Pac
  *      Norms data (.nvd)</p>
  *   <p>Norms metadata (.dvm) --&gt; Header,&lt;Entry&gt;<sup>NumFields</sup>,Footer</p>
  *   <ul>
- *     <li>Header --&gt; {@link CodecUtil#writeHeader CodecHeader}</li>
+ *     <li>Header --&gt; {@link CodecUtil#writeIndexHeader IndexHeader}</li>
  *     <li>Entry --&gt; FieldNumber,Type,Offset</li>
  *     <li>FieldNumber --&gt; {@link DataOutput#writeVInt vInt}</li>
  *     <li>Type --&gt; {@link DataOutput#writeByte Byte}</li>
@@ -81,20 +85,24 @@ import org.apache.lucene.util.packed.Pac
  *             a lookup table of unique values is written, followed by the ordinal for each document.
  *         <li>2 --&gt; constant. When there is a single value for the entire field.
  *         <li>3 --&gt; uncompressed: Values written as a simple byte[].
- *         <li>4 --&gt; indirect. Only documents with a value are written with a sparse encoding.
+ *         <li>4 --&gt; indirect. Only documents with a value are written with monotonic compression. a nested
+ *             entry for the same field will follow for the exception handler.
+ *         <li>5 --&gt; patched. Encoded the same as indirect.
  *      </ul>
  *   <li><a name="nvd" id="nvd"></a>
  *   <p>The Norms data or .nvd file.</p>
  *   <p>For each Norms field, this stores the actual per-document data (the heavy-lifting)</p>
- *   <p>Norms data (.nvd) --&gt; Header,&lt;Uncompressed | TableCompressed | DeltaCompressed&gt;<sup>NumFields</sup>,Footer</p>
+ *   <p>Norms data (.nvd) --&gt; Header,&lt;Uncompressed | TableCompressed | DeltaCompressed | MonotonicCompressed &gt;<sup>NumFields</sup>,Footer</p>
  *   <ul>
- *     <li>Header --&gt; {@link CodecUtil#writeHeader CodecHeader}</li>
+ *     <li>Header --&gt; {@link CodecUtil#writeIndexHeader IndexHeader}</li>
  *     <li>Uncompressed --&gt;  {@link DataOutput#writeByte Byte}<sup>maxDoc</sup></li>
  *     <li>TableCompressed --&gt; PackedIntsVersion,Table,BitPackedData</li>
  *     <li>Table --&gt; TableSize, {@link DataOutput#writeLong int64}<sup>TableSize</sup></li>
  *     <li>BitpackedData --&gt; {@link PackedInts}</li>
  *     <li>DeltaCompressed --&gt; PackedIntsVersion,BlockSize,DeltaCompressedData</li>
  *     <li>DeltaCompressedData --&gt; {@link BlockPackedWriter BlockPackedWriter(blockSize=16k)}</li>
+ *     <li>MonotonicCompressed --&gt; PackedIntsVersion,BlockSize,MonotonicCompressedData</li>
+ *     <li>MonotonicCompressedData --&gt; {@link MonotonicBlockPackedWriter MonotonicBlockPackedWriter(blockSize=16k)}</li>
  *     <li>PackedIntsVersion,BlockSize,TableSize --&gt; {@link DataOutput#writeVInt vInt}</li>
  *     <li>Footer --&gt; {@link CodecUtil#writeFooter CodecFooter}</li>
  *   </ul>

Modified: lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50NormsProducer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50NormsProducer.java?rev=1633993&r1=1633992&r2=1633993&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50NormsProducer.java (original)
+++ lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50NormsProducer.java Fri Oct 24 03:29:33 2014
@@ -37,6 +37,7 @@ import org.apache.lucene.util.Accountabl
 import org.apache.lucene.util.Accountables;
 import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.RamUsageEstimator;
+import org.apache.lucene.util.SparseFixedBitSet;
 import org.apache.lucene.util.packed.BlockPackedReader;
 import org.apache.lucene.util.packed.MonotonicBlockPackedReader;
 import org.apache.lucene.util.packed.PackedInts;
@@ -48,6 +49,7 @@ import static org.apache.lucene.codecs.l
 import static org.apache.lucene.codecs.lucene50.Lucene50NormsConsumer.TABLE_COMPRESSED;
 import static org.apache.lucene.codecs.lucene50.Lucene50NormsConsumer.UNCOMPRESSED;
 import static org.apache.lucene.codecs.lucene50.Lucene50NormsConsumer.INDIRECT;
+import static org.apache.lucene.codecs.lucene50.Lucene50NormsConsumer.PATCHED;
 
 /**
  * Reader for {@link Lucene50NormsFormat}
@@ -63,6 +65,7 @@ class Lucene50NormsProducer extends Norm
   
   private final AtomicLong ramBytesUsed;
   private final AtomicInteger activeCount = new AtomicInteger();
+  private final int maxDoc;
   
   private final boolean merging;
   
@@ -75,11 +78,13 @@ class Lucene50NormsProducer extends Norm
     instancesInfo.putAll(original.instancesInfo);
     ramBytesUsed = new AtomicLong(original.ramBytesUsed.get());
     activeCount.set(original.activeCount.get());
+    maxDoc = original.maxDoc;
     merging = true;
   }
     
   Lucene50NormsProducer(SegmentReadState state, String dataCodec, String dataExtension, String metaCodec, String metaExtension) throws IOException {
     merging = false;
+    maxDoc = state.segmentInfo.getDocCount();
     String metaName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, metaExtension);
     ramBytesUsed = new AtomicLong(RamUsageEstimator.shallowSizeOfInstance(getClass()));
     int version = -1;
@@ -88,7 +93,7 @@ class Lucene50NormsProducer extends Norm
     try (ChecksumIndexInput in = state.directory.openChecksumInput(metaName, state.context)) {
       Throwable priorE = null;
       try {
-        version = CodecUtil.checkSegmentHeader(in, metaCodec, VERSION_START, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
+        version = CodecUtil.checkIndexHeader(in, metaCodec, VERSION_START, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
         readFields(in, state.fieldInfos);
       } catch (Throwable exception) {
         priorE = exception;
@@ -101,7 +106,7 @@ class Lucene50NormsProducer extends Norm
     this.data = state.directory.openInput(dataName, state.context);
     boolean success = false;
     try {
-      final int version2 = CodecUtil.checkSegmentHeader(data, dataCodec, VERSION_START, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
+      final int version2 = CodecUtil.checkIndexHeader(data, dataCodec, VERSION_START, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
       if (version != version2) {
         throw new CorruptIndexException("Format versions mismatch: meta=" + version + ",data=" + version2, data);
       }
@@ -146,6 +151,7 @@ class Lucene50NormsProducer extends Norm
       case TABLE_COMPRESSED:
       case DELTA_COMPRESSED:
         break;
+      case PATCHED:
       case INDIRECT:
         if (meta.readVInt() != info.number) {
           throw new CorruptIndexException("indirect norms entry for field: " + info.name + " is corrupt", meta);
@@ -254,6 +260,7 @@ class Lucene50NormsProducer extends Norm
       }
       case INDIRECT: {
         data.seek(entry.offset);
+        final long common = data.readLong();
         int packedIntsVersion = data.readVInt();
         int blockSize = data.readVInt();
         final MonotonicBlockPackedReader live = MonotonicBlockPackedReader.of(data, packedIntsVersion, blockSize, entry.count, false);
@@ -279,7 +286,34 @@ class Lucene50NormsProducer extends Norm
                 return values.get(mid);
               }
             }
-            return 0;
+            return common;
+          }
+        };
+        break;
+      }
+      case PATCHED: {
+        data.seek(entry.offset);
+        final long common = data.readLong();
+        int packedIntsVersion = data.readVInt();
+        int blockSize = data.readVInt();
+        MonotonicBlockPackedReader live = MonotonicBlockPackedReader.of(data, packedIntsVersion, blockSize, entry.count, true);
+        final SparseFixedBitSet set = new SparseFixedBitSet(maxDoc);
+        for (int i = 0; i < live.size(); i++) {
+          int doc = (int) live.get(i);
+          set.set(doc);
+        }
+        LoadedNorms nestedInstance = loadNorms(entry.nested);
+        instance.ramBytesUsed = set.ramBytesUsed() + nestedInstance.ramBytesUsed;
+        instance.info = Accountables.namedAccountable("patched -> " + nestedInstance.info, instance.ramBytesUsed);
+        final NumericDocValues values = nestedInstance.norms;
+        instance.norms = new NumericDocValues() {
+          @Override
+          public long get(int docID) {
+            if (set.get(docID)) {
+              return values.get(docID);
+            } else {
+              return common;
+            }
           }
         };
         break;

Modified: lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50SegmentInfoFormat.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50SegmentInfoFormat.java?rev=1633993&r1=1633992&r2=1633993&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50SegmentInfoFormat.java (original)
+++ lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50SegmentInfoFormat.java Fri Oct 24 03:29:33 2014
@@ -34,7 +34,6 @@ import org.apache.lucene.store.Directory
 import org.apache.lucene.store.IOContext;
 import org.apache.lucene.store.IndexOutput;
 import org.apache.lucene.util.IOUtils;
-import org.apache.lucene.util.StringHelper;
 import org.apache.lucene.util.Version;
 
 /**
@@ -42,20 +41,19 @@ import org.apache.lucene.util.Version;
  * <p>
  * Files:
  * <ul>
- *   <li><tt>.si</tt>: Header, SegVersion, SegSize, IsCompoundFile, Diagnostics, Files, Id, Footer
+ *   <li><tt>.si</tt>: Header, SegVersion, SegSize, IsCompoundFile, Diagnostics, Files, Footer
  * </ul>
  * </p>
  * Data types:
  * <p>
  * <ul>
- *   <li>Header --&gt; {@link CodecUtil#writeHeader CodecHeader}</li>
+ *   <li>Header --&gt; {@link CodecUtil#writeIndexHeader IndexHeader}</li>
  *   <li>SegSize --&gt; {@link DataOutput#writeInt Int32}</li>
  *   <li>SegVersion --&gt; {@link DataOutput#writeString String}</li>
  *   <li>Files --&gt; {@link DataOutput#writeStringSet Set&lt;String&gt;}</li>
  *   <li>Diagnostics --&gt; {@link DataOutput#writeStringStringMap Map&lt;String,String&gt;}</li>
  *   <li>IsCompoundFile --&gt; {@link DataOutput#writeByte Int8}</li>
  *   <li>Footer --&gt; {@link CodecUtil#writeFooter CodecFooter}</li>
- *   <li>Id --&gt; {@link DataOutput#writeString String}</li>
  * </ul>
  * </p>
  * Field Descriptions:
@@ -84,15 +82,16 @@ public class Lucene50SegmentInfoFormat e
   }
   
   @Override
-  public SegmentInfo read(Directory dir, String segment, IOContext context) throws IOException {
+  public SegmentInfo read(Directory dir, String segment, byte[] segmentID, IOContext context) throws IOException {
     final String fileName = IndexFileNames.segmentFileName(segment, "", Lucene50SegmentInfoFormat.SI_EXTENSION);
     try (ChecksumIndexInput input = dir.openChecksumInput(fileName, context)) {
       Throwable priorE = null;
       SegmentInfo si = null;
       try {
-        CodecUtil.checkHeader(input, Lucene50SegmentInfoFormat.CODEC_NAME,
-                                     Lucene50SegmentInfoFormat.VERSION_START,
-                                     Lucene50SegmentInfoFormat.VERSION_CURRENT);
+        CodecUtil.checkIndexHeader(input, Lucene50SegmentInfoFormat.CODEC_NAME,
+                                          Lucene50SegmentInfoFormat.VERSION_START,
+                                          Lucene50SegmentInfoFormat.VERSION_CURRENT,
+                                          segmentID, "");
         final Version version = Version.fromBits(input.readInt(), input.readInt(), input.readInt());
         
         final int docCount = input.readInt();
@@ -103,10 +102,7 @@ public class Lucene50SegmentInfoFormat e
         final Map<String,String> diagnostics = input.readStringStringMap();
         final Set<String> files = input.readStringSet();
         
-        byte[] id = new byte[StringHelper.ID_LENGTH];
-        input.readBytes(id, 0, id.length);
-        
-        si = new SegmentInfo(dir, version, segment, docCount, isCompoundFile, null, diagnostics, id);
+        si = new SegmentInfo(dir, version, segment, docCount, isCompoundFile, null, diagnostics, segmentID);
         si.setFiles(files);
       } catch (Throwable exception) {
         priorE = exception;
@@ -124,7 +120,11 @@ public class Lucene50SegmentInfoFormat e
 
     boolean success = false;
     try (IndexOutput output = dir.createOutput(fileName, ioContext)) {
-      CodecUtil.writeHeader(output, Lucene50SegmentInfoFormat.CODEC_NAME, Lucene50SegmentInfoFormat.VERSION_CURRENT);
+      CodecUtil.writeIndexHeader(output, 
+                                   Lucene50SegmentInfoFormat.CODEC_NAME, 
+                                   Lucene50SegmentInfoFormat.VERSION_CURRENT,
+                                   si.getId(),
+                                   "");
       Version version = si.getVersion();
       if (version.major < 5) {
         throw new IllegalArgumentException("invalid major version: should be >= 5 but got: " + version.major + " segment=" + si);
@@ -145,11 +145,6 @@ public class Lucene50SegmentInfoFormat e
         }
       }
       output.writeStringSet(files);
-      byte[] id = si.getId();
-      if (id.length != StringHelper.ID_LENGTH) {
-        throw new IllegalArgumentException("invalid id, got=" + StringHelper.idToString(id));
-      }
-      output.writeBytes(id, 0, id.length);
       CodecUtil.writeFooter(output);
       success = true;
     } finally {

Modified: lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50StoredFieldsFormat.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50StoredFieldsFormat.java?rev=1633993&r1=1633992&r2=1633993&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50StoredFieldsFormat.java (original)
+++ lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50StoredFieldsFormat.java Fri Oct 24 03:29:33 2014
@@ -52,7 +52,7 @@ import org.apache.lucene.util.packed.Pac
  * <p>Here is a more detailed description of the field data file format:</p>
  * <ul>
  * <li>FieldData (.fdt) --&gt; &lt;Header&gt;, PackedIntsVersion, &lt;Chunk&gt;<sup>ChunkCount</sup></li>
- * <li>Header --&gt; {@link CodecUtil#writeSegmentHeader SegmentHeader}</li>
+ * <li>Header --&gt; {@link CodecUtil#writeIndexHeader IndexHeader}</li>
  * <li>PackedIntsVersion --&gt; {@link PackedInts#VERSION_CURRENT} as a {@link DataOutput#writeVInt VInt}</li>
  * <li>ChunkCount is not known in advance and is the number of chunks necessary to store all document of the segment</li>
  * <li>Chunk --&gt; DocBase, ChunkDocs, DocFieldCounts, DocLengths, &lt;CompressedDocs&gt;</li>
@@ -104,7 +104,7 @@ import org.apache.lucene.util.packed.Pac
  * <p>A fields index file (extension <tt>.fdx</tt>).</p>
  * <ul>
  * <li>FieldsIndex (.fdx) --&gt; &lt;Header&gt;, &lt;ChunkIndex&gt;</li>
- * <li>Header --&gt; {@link CodecUtil#writeSegmentHeader SegmentHeader}</li>
+ * <li>Header --&gt; {@link CodecUtil#writeIndexHeader IndexHeader}</li>
  * <li>ChunkIndex: See {@link CompressingStoredFieldsIndexWriter}</li>
  * </ul>
  * </li>

Modified: lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50TermVectorsFormat.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50TermVectorsFormat.java?rev=1633993&r1=1633992&r2=1633993&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50TermVectorsFormat.java (original)
+++ lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50TermVectorsFormat.java Fri Oct 24 03:29:33 2014
@@ -59,7 +59,7 @@ import org.apache.lucene.util.packed.Pac
  * <p>Here is a more detailed description of the field data file format:</p>
  * <ul>
  * <li>VectorData (.tvd) --&gt; &lt;Header&gt;, PackedIntsVersion, ChunkSize, &lt;Chunk&gt;<sup>ChunkCount</sup>, Footer</li>
- * <li>Header --&gt; {@link CodecUtil#writeSegmentHeader SegmentHeader}</li>
+ * <li>Header --&gt; {@link CodecUtil#writeIndexHeader IndexHeader}</li>
  * <li>PackedIntsVersion --&gt; {@link PackedInts#VERSION_CURRENT} as a {@link DataOutput#writeVInt VInt}</li>
  * <li>ChunkSize is the number of bytes of terms to accumulate before flushing, as a {@link DataOutput#writeVInt VInt}</li>
  * <li>ChunkCount is not known in advance and is the number of chunks necessary to store all document of the segment</li>
@@ -113,7 +113,7 @@ import org.apache.lucene.util.packed.Pac
  * <p>An index file (extension <tt>.tvx</tt>).</p>
  * <ul>
  * <li>VectorIndex (.tvx) --&gt; &lt;Header&gt;, &lt;ChunkIndex&gt;, Footer</li>
- * <li>Header --&gt; {@link CodecUtil#writeSegmentHeader SegmentHeader}</li>
+ * <li>Header --&gt; {@link CodecUtil#writeIndexHeader IndexHeader}</li>
  * <li>ChunkIndex: See {@link CompressingStoredFieldsIndexWriter}</li>
  * <li>Footer --&gt; {@link CodecUtil#writeFooter CodecFooter}</li>
  * </ul>

Modified: lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/lucene50/package.html
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/lucene50/package.html?rev=1633993&r1=1633992&r2=1633993&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/lucene50/package.html (original)
+++ lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/lucene50/package.html Fri Oct 24 03:29:33 2014
@@ -154,20 +154,20 @@ its title, url, or an identifier to acce
 returned for each hit when searching. This is keyed by document number.
 </li>
 <li>
-{@link org.apache.lucene.codecs.lucene41.Lucene41PostingsFormat Term dictionary}. 
+{@link org.apache.lucene.codecs.lucene50.Lucene50PostingsFormat Term dictionary}. 
 A dictionary containing all of the terms used in all of the
 indexed fields of all of the documents. The dictionary also contains the number
 of documents which contain the term, and pointers to the term's frequency and
 proximity data.
 </li>
 <li>
-{@link org.apache.lucene.codecs.lucene41.Lucene41PostingsFormat Term Frequency data}. 
+{@link org.apache.lucene.codecs.lucene50.Lucene50PostingsFormat Term Frequency data}. 
 For each term in the dictionary, the numbers of all the
 documents that contain that term, and the frequency of the term in that
 document, unless frequencies are omitted (IndexOptions.DOCS_ONLY)
 </li>
 <li>
-{@link org.apache.lucene.codecs.lucene41.Lucene41PostingsFormat Term Proximity data}. 
+{@link org.apache.lucene.codecs.lucene50.Lucene50PostingsFormat Term Proximity data}. 
 For each term in the dictionary, the positions that the
 term occurs in each document. Note that this will not exist if all fields in
 all documents omit position data.
@@ -185,7 +185,7 @@ term frequency. To add Term Vectors to y
 {@link org.apache.lucene.document.Field Field} constructors
 </li>
 <li>
-{@link org.apache.lucene.codecs.lucene410.Lucene410DocValuesFormat Per-document values}. 
+{@link org.apache.lucene.codecs.lucene50.Lucene50DocValuesFormat Per-document values}. 
 Like stored values, these are also keyed by document
 number, but are generally intended to be loaded into main memory for fast
 access. Whereas stored values are generally intended for summary results from
@@ -264,27 +264,27 @@ systems that frequently run out of file 
 <td>The stored fields for documents</td>
 </tr>
 <tr>
-<td>{@link org.apache.lucene.codecs.lucene41.Lucene41PostingsFormat Term Dictionary}</td>
+<td>{@link org.apache.lucene.codecs.lucene50.Lucene50PostingsFormat Term Dictionary}</td>
 <td>.tim</td>
 <td>The term dictionary, stores term info</td>
 </tr>
 <tr>
-<td>{@link org.apache.lucene.codecs.lucene41.Lucene41PostingsFormat Term Index}</td>
+<td>{@link org.apache.lucene.codecs.lucene50.Lucene50PostingsFormat Term Index}</td>
 <td>.tip</td>
 <td>The index into the Term Dictionary</td>
 </tr>
 <tr>
-<td>{@link org.apache.lucene.codecs.lucene41.Lucene41PostingsFormat Frequencies}</td>
+<td>{@link org.apache.lucene.codecs.lucene50.Lucene50PostingsFormat Frequencies}</td>
 <td>.doc</td>
 <td>Contains the list of docs which contain each term along with frequency</td>
 </tr>
 <tr>
-<td>{@link org.apache.lucene.codecs.lucene41.Lucene41PostingsFormat Positions}</td>
+<td>{@link org.apache.lucene.codecs.lucene50.Lucene50PostingsFormat Positions}</td>
 <td>.pos</td>
 <td>Stores position information about where a term occurs in the index</td>
 </tr>
 <tr>
-<td>{@link org.apache.lucene.codecs.lucene41.Lucene41PostingsFormat Payloads}</td>
+<td>{@link org.apache.lucene.codecs.lucene50.Lucene50PostingsFormat Payloads}</td>
 <td>.pay</td>
 <td>Stores additional per-position metadata information such as character offsets and user payloads</td>
 </tr>
@@ -294,7 +294,7 @@ systems that frequently run out of file 
 <td>Encodes length and boost factors for docs and fields</td>
 </tr>
 <tr>
-<td>{@link org.apache.lucene.codecs.lucene410.Lucene410DocValuesFormat Per-Document Values}</td>
+<td>{@link org.apache.lucene.codecs.lucene50.Lucene50DocValuesFormat Per-Document Values}</td>
 <td>.dvd, .dvm</td>
 <td>Encodes additional scoring factors or other per-document information.</td>
 </tr>

Modified: lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java?rev=1633993&r1=1633992&r2=1633993&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java (original)
+++ lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java Fri Oct 24 03:29:33 2014
@@ -32,8 +32,6 @@ import java.util.Map;
 
 import org.apache.lucene.codecs.Codec;
 import org.apache.lucene.codecs.PostingsFormat;
-import org.apache.lucene.codecs.blocktree.FieldReader;
-import org.apache.lucene.codecs.blocktree.Stats;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.index.CheckIndex.Status.DocValuesStatus;
 import org.apache.lucene.index.FieldInfo.IndexOptions;
@@ -279,7 +277,7 @@ public class CheckIndex implements Close
        *  tree terms dictionary (this is only set if the
        *  {@link PostingsFormat} for this segment uses block
        *  tree. */
-      public Map<String,Stats> blockTreeStats = null;
+      public Map<String,Object> blockTreeStats = null;
     }
 
     /**
@@ -454,7 +452,7 @@ public class CheckIndex implements Close
   public Status checkIndex(List<String> onlySegments) throws IOException {
     ensureOpen();
     NumberFormat nf = NumberFormat.getInstance(Locale.ROOT);
-    SegmentInfos sis = new SegmentInfos();
+    SegmentInfos sis = null;
     Status result = new Status();
     result.dir = dir;
     String[] files = dir.listAll();
@@ -465,7 +463,7 @@ public class CheckIndex implements Close
     try {
       // Do not use SegmentInfos.read(Directory) since the spooky
       // retrying it does is not necessary here (we hold the write lock):
-      sis.read(dir, lastSegmentsFile);
+      sis = SegmentInfos.readCommit(dir, lastSegmentsFile);
     } catch (Throwable t) {
       if (failFast) {
         IOUtils.reThrow(t);
@@ -1290,14 +1288,12 @@ public class CheckIndex implements Close
         // docs got deleted and then merged away):
         
       } else {
-        if (fieldTerms instanceof FieldReader) {
-          final Stats stats = ((FieldReader) fieldTerms).computeStats();
-          assert stats != null;
-          if (status.blockTreeStats == null) {
-            status.blockTreeStats = new HashMap<>();
-          }
-          status.blockTreeStats.put(field, stats);
+        final Object stats = fieldTerms.getStats();
+        assert stats != null;
+        if (status.blockTreeStats == null) {
+          status.blockTreeStats = new HashMap<>();
         }
+        status.blockTreeStats.put(field, stats);
         
         if (sumTotalTermFreq != 0) {
           final long v = fields.terms(field).getSumTotalTermFreq();
@@ -1424,7 +1420,7 @@ public class CheckIndex implements Close
     }
     
     if (verbose && status.blockTreeStats != null && infoStream != null && status.termCount > 0) {
-      for(Map.Entry<String,Stats> ent : status.blockTreeStats.entrySet()) {
+      for(Map.Entry<String, Object> ent : status.blockTreeStats.entrySet()) {
         infoStream.println("      field \"" + ent.getKey() + "\":");
         infoStream.println("      " + ent.getValue().toString().replace("\n", "\n      "));
       }

Modified: lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/index/DirectoryReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/index/DirectoryReader.java?rev=1633993&r1=1633992&r2=1633993&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/index/DirectoryReader.java (original)
+++ lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/index/DirectoryReader.java Fri Oct 24 03:29:33 2014
@@ -225,8 +225,7 @@ public abstract class DirectoryReader ex
 
     List<IndexCommit> commits = new ArrayList<>();
 
-    SegmentInfos latest = new SegmentInfos();
-    latest.read(dir);
+    SegmentInfos latest = SegmentInfos.readLatestCommit(dir);
     final long currentGen = latest.getGeneration();
 
     commits.add(new StandardDirectoryReader.ReaderCommit(latest, dir));
@@ -239,11 +238,11 @@ public abstract class DirectoryReader ex
           !fileName.equals(IndexFileNames.OLD_SEGMENTS_GEN) &&
           SegmentInfos.generationFromSegmentsFileName(fileName) < currentGen) {
 
-        SegmentInfos sis = new SegmentInfos();
+        SegmentInfos sis = null;
         try {
           // IOException allowed to throw there, in case
           // segments_N is corrupt
-          sis.read(dir, fileName);
+          sis = SegmentInfos.readCommit(dir, fileName);
         } catch (FileNotFoundException | NoSuchFileException fnfe) {
           // LUCENE-948: on NFS (and maybe others), if
           // you have writers switching back and forth
@@ -252,7 +251,6 @@ public abstract class DirectoryReader ex
           // file segments_X exists when in fact it
           // doesn't.  So, we catch this and handle it
           // as if the file does not exist
-          sis = null;
         }
 
         if (sis != null)

Modified: lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/index/FilterLeafReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/index/FilterLeafReader.java?rev=1633993&r1=1633992&r2=1633993&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/index/FilterLeafReader.java (original)
+++ lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/index/FilterLeafReader.java Fri Oct 24 03:29:33 2014
@@ -151,6 +151,11 @@ public class FilterLeafReader extends Le
     public boolean hasPayloads() {
       return in.hasPayloads();
     }
+
+    @Override
+    public Object getStats() throws IOException {
+      return in.getStats();
+    }
   }
 
   /** Base class for filtering {@link TermsEnum} implementations. */

Modified: lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/index/IndexFileDeleter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/index/IndexFileDeleter.java?rev=1633993&r1=1633992&r2=1633993&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/index/IndexFileDeleter.java (original)
+++ lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/index/IndexFileDeleter.java Fri Oct 24 03:29:33 2014
@@ -165,9 +165,9 @@ final class IndexFileDeleter implements 
             if (infoStream.isEnabled("IFD")) {
               infoStream.message("IFD", "init: load commit \"" + fileName + "\"");
             }
-            SegmentInfos sis = new SegmentInfos();
+            SegmentInfos sis = null;
             try {
-              sis.read(directory, fileName);
+              sis = SegmentInfos.readCommit(directory, fileName);
             } catch (FileNotFoundException | NoSuchFileException e) {
               // LUCENE-948: on NFS (and maybe others), if
               // you have writers switching back and forth
@@ -179,7 +179,6 @@ final class IndexFileDeleter implements 
               if (infoStream.isEnabled("IFD")) {
                 infoStream.message("IFD", "init: hit FileNotFoundException when loading commit \"" + fileName + "\"; skipping this commit point");
               }
-              sis = null;
             } catch (IOException e) {
               if (SegmentInfos.generationFromSegmentsFileName(fileName) <= currentGen && directory.fileLength(fileName) > 0) {
                 throw e;
@@ -187,7 +186,6 @@ final class IndexFileDeleter implements 
                 // Most likely we are opening an index that
                 // has an aborted "future" commit, so suppress
                 // exc in this case
-                sis = null;
               }
             }
             if (sis != null) {
@@ -215,9 +213,9 @@ final class IndexFileDeleter implements 
       // listing was stale (eg when index accessed via NFS
       // client with stale directory listing cache).  So we
       // try now to explicitly open this commit point:
-      SegmentInfos sis = new SegmentInfos();
+      SegmentInfos sis = null;
       try {
-        sis.read(directory, currentSegmentsFile);
+        sis = SegmentInfos.readCommit(directory, currentSegmentsFile);
       } catch (IOException e) {
         throw new CorruptIndexException("unable to read current segments_N file", currentSegmentsFile, e);
       }

Modified: lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java?rev=1633993&r1=1633992&r2=1633993&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java (original)
+++ lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java Fri Oct 24 03:29:33 2014
@@ -773,7 +773,6 @@ public class IndexWriter implements Clos
 
       // If index is too old, reading the segments will throw
       // IndexFormatTooOldException.
-      segmentInfos = new SegmentInfos();
 
       boolean initialIndexExists = true;
 
@@ -782,13 +781,17 @@ public class IndexWriter implements Clos
         // against an index that's currently open for
         // searching.  In this case we write the next
         // segments_N file with no segments:
+        SegmentInfos sis = null;
         try {
-          segmentInfos.read(directory);
-          segmentInfos.clear();
+          sis = SegmentInfos.readLatestCommit(directory);
+          sis.clear();
         } catch (IOException e) {
           // Likely this means it's a fresh directory
           initialIndexExists = false;
+          sis = new SegmentInfos();
         }
+        
+        segmentInfos = sis;
 
         // Record that we have a change (zero out all
         // segments) pending:
@@ -802,7 +805,7 @@ public class IndexWriter implements Clos
 
         // Do not use SegmentInfos.read(Directory) since the spooky
         // retrying it does is not necessary here (we hold the write lock):
-        segmentInfos.read(directory, lastSegmentsFile);
+        segmentInfos = SegmentInfos.readCommit(directory, lastSegmentsFile);
 
         IndexCommit commit = config.getIndexCommit();
         if (commit != null) {
@@ -813,8 +816,7 @@ public class IndexWriter implements Clos
           // points.
           if (commit.getDirectory() != directory)
             throw new IllegalArgumentException("IndexCommit's directory doesn't match my directory");
-          SegmentInfos oldInfos = new SegmentInfos();
-          oldInfos.read(directory, commit.getSegmentsFileName());
+          SegmentInfos oldInfos = SegmentInfos.readCommit(directory, commit.getSegmentsFileName());
           segmentInfos.replace(oldInfos);
           changed();
           if (infoStream.isEnabled("IW")) {
@@ -2401,8 +2403,7 @@ public class IndexWriter implements Clos
           if (infoStream.isEnabled("IW")) {
             infoStream.message("IW", "addIndexes: process directory " + dir);
           }
-          SegmentInfos sis = new SegmentInfos(); // read infos from dir
-          sis.read(dir);
+          SegmentInfos sis = SegmentInfos.readLatestCommit(dir); // read infos from dir
           totalDocCount += sis.totalDocCount();
 
           for (SegmentCommitInfo info : sis) {

Modified: lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/index/SegmentInfos.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/index/SegmentInfos.java?rev=1633993&r1=1633992&r2=1633993&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/index/SegmentInfos.java (original)
+++ lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/index/SegmentInfos.java Fri Oct 24 03:29:33 2014
@@ -60,18 +60,20 @@ import org.apache.lucene.util.StringHelp
  * Files:
  * <ul>
  * <li><tt>segments_N</tt>: Header, Version, NameCounter, SegCount, &lt;SegName,
- * SegCodec, DelGen, DeletionCount, FieldInfosGen, DocValuesGen,
+ * HasSegID, SegID, SegCodec, DelGen, DeletionCount, FieldInfosGen, DocValuesGen,
  * UpdatesFiles&gt;<sup>SegCount</sup>, CommitUserData, Footer
  * </ul>
  * </p>
  * Data types:
  * <p>
  * <ul>
- * <li>Header --&gt; {@link CodecUtil#writeHeader CodecHeader}</li>
- * <li>GenHeader, NameCounter, SegCount, DeletionCount --&gt;
+ * <li>Header --&gt; {@link CodecUtil#writeIndexHeader IndexHeader}</li>
+ * <li>NameCounter, SegCount, DeletionCount --&gt;
  * {@link DataOutput#writeInt Int32}</li>
  * <li>Generation, Version, DelGen, Checksum, FieldInfosGen, DocValuesGen --&gt;
  * {@link DataOutput#writeLong Int64}</li>
+ * <li>HasSegID --&gt; {@link DataOutput#writeByte Int8}</li>
+ * <li>SegID --&gt; {@link DataOutput#writeByte Int8<sup>ID_LENGTH</sup>}</li>
  * <li>SegName, SegCodec --&gt; {@link DataOutput#writeString String}</li>
  * <li>CommitUserData --&gt; {@link DataOutput#writeStringStringMap
  * Map&lt;String,String&gt;}</li>
@@ -94,6 +96,10 @@ import org.apache.lucene.util.StringHelp
  * <li>DeletionCount records the number of deleted documents in this segment.</li>
  * <li>SegCodec is the {@link Codec#getName() name} of the Codec that encoded
  * this segment.</li>
+ * <li>HasSegID is nonzero if the segment has an identifier. Otherwise, when it is 0
+ * the identifier is {@code null} and no SegID is written. Null only happens for Lucene
+ * 4.x segments referenced in commits.</li>
+ * <li>SegID is the identifier of the Codec that encoded this segment. </li>
  * <li>CommitUserData stores an optional user-supplied opaque
  * Map&lt;String,String&gt; that was passed to
  * {@link IndexWriter#setCommitData(java.util.Map)}.</li>
@@ -155,8 +161,8 @@ public final class SegmentInfos implemen
   private byte[] id;
 
   /** Sole constructor. Typically you call this and then
-   *  use {@link #read(Directory) or
-   *  #read(Directory,String)} to populate each {@link
+   *  use {@link #readLatestCommit(Directory) or
+   *  #readCommit(Directory,String)} to populate each {@link
    *  SegmentCommitInfo}.  Alternatively, you can add/remove your
    *  own {@link SegmentCommitInfo}s. */
   public SegmentInfos() {
@@ -245,21 +251,14 @@ public final class SegmentInfos implemen
       throw new IllegalArgumentException("fileName \"" + fileName + "\" is not a segments file");
     }
   }
-
-  /**
-   * Get the next pending_segments_N filename that will be written.
-   */
-  public String getNextPendingSegmentFileName() {
-    long nextGeneration;
-
+  
+  /** return generation of the next pending_segments_N that will be written */
+  private long getNextPendingGeneration() {
     if (generation == -1) {
-      nextGeneration = 1;
+      return 1;
     } else {
-      nextGeneration = generation+1;
+      return generation+1;
     }
-    return IndexFileNames.fileNameFromGeneration(IndexFileNames.PENDING_SEGMENTS,
-                                                 "",
-                                                 nextGeneration);
   }
 
   /** Since Lucene 5.0, every commit (segments_N) writes a unique id.  This will
@@ -277,18 +276,10 @@ public final class SegmentInfos implemen
    * @throws CorruptIndexException if the index is corrupt
    * @throws IOException if there is a low-level IO error
    */
-  public final void read(Directory directory, String segmentFileName) throws IOException {
-    boolean success = false;
-
-    // Clear any previous segments:
-    this.clear();
-
-    generation = generationFromSegmentsFileName(segmentFileName);
+  public static final SegmentInfos readCommit(Directory directory, String segmentFileName) throws IOException {
 
-    lastGeneration = generation;
-
-    ChecksumIndexInput input = directory.openChecksumInput(segmentFileName, IOContext.READ);
-    try {
+    long generation = generationFromSegmentsFileName(segmentFileName);
+    try (ChecksumIndexInput input = directory.openChecksumInput(segmentFileName, IOContext.READ)) {
       // NOTE: as long as we want to throw indexformattooold (vs corruptindexexception), we need
       // to read the magic ourselves.
       int magic = input.readInt();
@@ -297,17 +288,42 @@ public final class SegmentInfos implemen
       }
       // 4.0+
       int format = CodecUtil.checkHeaderNoMagic(input, "segments", VERSION_40, VERSION_50);
-      version = input.readLong();
-      counter = input.readInt();
+      // 5.0+
+      byte id[] = null;
+      if (format >= VERSION_50) {
+        id = new byte[StringHelper.ID_LENGTH];
+        input.readBytes(id, 0, id.length);
+        CodecUtil.checkIndexHeaderSuffix(input, Long.toString(generation, Character.MAX_RADIX));
+      }
+      
+      SegmentInfos infos = new SegmentInfos();
+      infos.id = id;
+      infos.generation = generation;
+      infos.lastGeneration = generation;
+      infos.version = input.readLong();
+      infos.counter = input.readInt();
       int numSegments = input.readInt();
       if (numSegments < 0) {
         throw new CorruptIndexException("invalid segment count: " + numSegments, input);
       }
       for (int seg = 0; seg < numSegments; seg++) {
         String segName = input.readString();
+        final byte segmentID[];
+        if (format >= VERSION_50) {
+          byte hasID = input.readByte();
+          if (hasID == 1) {
+            segmentID = new byte[StringHelper.ID_LENGTH];
+            input.readBytes(segmentID, 0, segmentID.length);
+          } else if (hasID == 0) {
+            segmentID = null; // 4.x segment, doesn't have an ID
+          } else {
+            throw new CorruptIndexException("invalid hasID byte, got: " + hasID, input);
+          }
+        } else {
+          segmentID = null;
+        }
         Codec codec = Codec.forName(input.readString());
-        //System.out.println("SIS.read seg=" + seg + " codec=" + codec);
-        SegmentInfo info = codec.segmentInfoFormat().read(directory, segName, IOContext.READ);
+        SegmentInfo info = codec.segmentInfoFormat().read(directory, segName, segmentID, IOContext.READ);
         info.setCodec(codec);
         long delGen = input.readLong();
         int delCount = input.readInt();
@@ -358,13 +374,9 @@ public final class SegmentInfos implemen
             siPerCommit.setDocValuesUpdatesFiles(dvUpdateFiles);
           }
         }
-        add(siPerCommit);
-      }
-      userData = input.readStringStringMap();
-      if (format >= VERSION_50) {
-        id = new byte[StringHelper.ID_LENGTH];
-        input.readBytes(id, 0, id.length);
+        infos.add(siPerCommit);
       }
+      infos.userData = input.readStringStringMap();
 
       if (format >= VERSION_48) {
         CodecUtil.checkFooter(input);
@@ -378,30 +390,17 @@ public final class SegmentInfos implemen
         CodecUtil.checkEOF(input);
       }
 
-      success = true;
-    } finally {
-      if (!success) {
-        // Clear any segment infos we had loaded so we
-        // have a clean slate on retry:
-        this.clear();
-        IOUtils.closeWhileHandlingException(input);
-      } else {
-        input.close();
-      }
+      return infos;
     }
   }
 
   /** Find the latest commit ({@code segments_N file}) and
    *  load all {@link SegmentCommitInfo}s. */
-  public final void read(Directory directory) throws IOException {
-    generation = lastGeneration = -1;
-
-    new FindSegmentsFile(directory) {
-
+  public static final SegmentInfos readLatestCommit(Directory directory) throws IOException {
+    return new FindSegmentsFile<SegmentInfos>(directory) {
       @Override
-      protected Object doBody(String segmentFileName) throws IOException {
-        read(directory, segmentFileName);
-        return null;
+      protected SegmentInfos doBody(String segmentFileName) throws IOException {
+        return readCommit(directory, segmentFileName);
       }
     }.run();
   }
@@ -412,27 +411,38 @@ public final class SegmentInfos implemen
 
   private void write(Directory directory) throws IOException {
 
-    String segmentFileName = getNextPendingSegmentFileName();
+    long nextGeneration = getNextPendingGeneration();
+    String segmentFileName = IndexFileNames.fileNameFromGeneration(IndexFileNames.PENDING_SEGMENTS,
+                                                                   "",
+                                                                   nextGeneration);
     
     // Always advance the generation on write:
-    if (generation == -1) {
-      generation = 1;
-    } else {
-      generation++;
-    }
+    generation = nextGeneration;
     
     IndexOutput segnOutput = null;
     boolean success = false;
 
     try {
       segnOutput = directory.createOutput(segmentFileName, IOContext.DEFAULT);
-      CodecUtil.writeHeader(segnOutput, "segments", VERSION_50);
+      CodecUtil.writeIndexHeader(segnOutput, "segments", VERSION_50, 
+                                   StringHelper.randomId(), Long.toString(nextGeneration, Character.MAX_RADIX));
       segnOutput.writeLong(version); 
       segnOutput.writeInt(counter); // write counter
       segnOutput.writeInt(size()); // write infos
       for (SegmentCommitInfo siPerCommit : this) {
         SegmentInfo si = siPerCommit.info;
         segnOutput.writeString(si.name);
+        byte segmentID[] = si.getId();
+        // TODO: remove this in lucene 6, we don't need to include 4.x segments in commits anymore
+        if (segmentID == null) {
+          segnOutput.writeByte((byte)0);
+        } else {
+          if (segmentID.length != StringHelper.ID_LENGTH) {
+            throw new IllegalStateException("cannot write segment: invalid id segment=" + si.name + "id=" + StringHelper.idToString(segmentID));
+          }
+          segnOutput.writeByte((byte)1);
+          segnOutput.writeBytes(segmentID, segmentID.length);
+        }
         segnOutput.writeString(si.getCodec().getName());
         segnOutput.writeLong(siPerCommit.getDelGen());
         int delCount = siPerCommit.getDelCount();
@@ -452,8 +462,6 @@ public final class SegmentInfos implemen
         assert si.dir == directory;
       }
       segnOutput.writeStringStringMap(userData);
-      byte[] id = StringHelper.randomId();
-      segnOutput.writeBytes(id, 0, id.length);
       CodecUtil.writeFooter(segnOutput);
       segnOutput.close();
       directory.sync(Collections.singleton(segmentFileName));
@@ -547,7 +555,7 @@ public final class SegmentInfos implemen
    * time, etc., it could have been deleted due to a writer
    * commit finishing.
    */
-  public abstract static class FindSegmentsFile {
+  public abstract static class FindSegmentsFile<T> {
 
     final Directory directory;
 
@@ -558,12 +566,12 @@ public final class SegmentInfos implemen
 
     /** Locate the most recent {@code segments} file and
      *  run {@link #doBody} on it. */
-    public Object run() throws IOException {
+    public T run() throws IOException {
       return run(null);
     }
     
     /** Run {@link #doBody} on the provided commit. */
-    public Object run(IndexCommit commit) throws IOException {
+    public T run(IndexCommit commit) throws IOException {
       if (commit != null) {
         if (directory != commit.getDirectory())
           throw new IOException("the specified commit does not match the specified Directory");
@@ -607,11 +615,11 @@ public final class SegmentInfos implemen
           String segmentFileName = IndexFileNames.fileNameFromGeneration(IndexFileNames.SEGMENTS, "", gen);
         
           try {
-            Object v = doBody(segmentFileName);
+            T t = doBody(segmentFileName);
             if (infoStream != null) {
               message("success on " + segmentFileName);
             }
-            return v;
+            return t;
           } catch (IOException err) {
             // Save the original root cause:
             if (exc == null) {
@@ -634,7 +642,7 @@ public final class SegmentInfos implemen
      * during the processing that could have been caused by
      * a writer committing.
      */
-    protected abstract Object doBody(String segmentFileName) throws IOException;
+    protected abstract T doBody(String segmentFileName) throws IOException;
   }
 
   // Carry over generation numbers from another SegmentInfos

Modified: lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/index/StandardDirectoryReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/index/StandardDirectoryReader.java?rev=1633993&r1=1633992&r2=1633993&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/index/StandardDirectoryReader.java (original)
+++ lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/index/StandardDirectoryReader.java Fri Oct 24 03:29:33 2014
@@ -47,11 +47,10 @@ final class StandardDirectoryReader exte
 
   /** called from DirectoryReader.open(...) methods */
   static DirectoryReader open(final Directory directory, final IndexCommit commit) throws IOException {
-    return (DirectoryReader) new SegmentInfos.FindSegmentsFile(directory) {
+    return new SegmentInfos.FindSegmentsFile<DirectoryReader>(directory) {
       @Override
-      protected Object doBody(String segmentFileName) throws IOException {
-        SegmentInfos sis = new SegmentInfos();
-        sis.read(directory, segmentFileName);
+      protected DirectoryReader doBody(String segmentFileName) throws IOException {
+        SegmentInfos sis = SegmentInfos.readCommit(directory, segmentFileName);
         final SegmentReader[] readers = new SegmentReader[sis.size()];
         for (int i = sis.size()-1; i >= 0; i--) {
           boolean success = false;
@@ -309,11 +308,10 @@ final class StandardDirectoryReader exte
   }
 
   private DirectoryReader doOpenFromCommit(IndexCommit commit) throws IOException {
-    return (DirectoryReader) new SegmentInfos.FindSegmentsFile(directory) {
+    return new SegmentInfos.FindSegmentsFile<DirectoryReader>(directory) {
       @Override
-      protected Object doBody(String segmentFileName) throws IOException {
-        final SegmentInfos infos = new SegmentInfos();
-        infos.read(directory, segmentFileName);
+      protected DirectoryReader doBody(String segmentFileName) throws IOException {
+        final SegmentInfos infos = SegmentInfos.readCommit(directory, segmentFileName);
         return doOpenIfChanged(infos);
       }
     }.run(commit);
@@ -338,8 +336,7 @@ final class StandardDirectoryReader exte
       // IndexWriter.prepareCommit has been called (but not
       // yet commit), then the reader will still see itself as
       // current:
-      SegmentInfos sis = new SegmentInfos();
-      sis.read(directory);
+      SegmentInfos sis = SegmentInfos.readLatestCommit(directory);
 
       // we loaded SegmentInfos from the directory
       return sis.getVersion() == segmentInfos.getVersion();

Modified: lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/index/Terms.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/index/Terms.java?rev=1633993&r1=1633992&r2=1633993&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/index/Terms.java (original)
+++ lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/index/Terms.java Fri Oct 24 03:29:33 2014
@@ -193,4 +193,18 @@ public abstract class Terms {
       scratch.grow(scratch.length());
     }
   }
+  
+  /** 
+   * Expert: returns additional information about this Terms instance
+   * for debugging purposes.
+   */
+  public Object getStats() throws IOException {
+    StringBuilder sb = new StringBuilder();
+    sb.append("impl=" + getClass().getSimpleName());
+    sb.append(",size=" + size());
+    sb.append(",docCount=" + getDocCount());
+    sb.append(",sumTotalTermFreq=" + getSumTotalTermFreq());
+    sb.append(",sumDocFreq=" + getSumDocFreq());
+    return sb.toString();
+  }
 }

Modified: lucene/dev/branches/branch_5x/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.DocValuesFormat
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.DocValuesFormat?rev=1633993&r1=1633992&r2=1633993&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.DocValuesFormat (original)
+++ lucene/dev/branches/branch_5x/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.DocValuesFormat Fri Oct 24 03:29:33 2014
@@ -13,4 +13,4 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 
-org.apache.lucene.codecs.lucene410.Lucene410DocValuesFormat
+org.apache.lucene.codecs.lucene50.Lucene50DocValuesFormat

Modified: lucene/dev/branches/branch_5x/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat?rev=1633993&r1=1633992&r2=1633993&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat (original)
+++ lucene/dev/branches/branch_5x/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat Fri Oct 24 03:29:33 2014
@@ -13,4 +13,4 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 
-org.apache.lucene.codecs.lucene41.Lucene41PostingsFormat
+org.apache.lucene.codecs.lucene50.Lucene50PostingsFormat

Copied: lucene/dev/branches/branch_5x/lucene/core/src/test/org/apache/lucene/codecs/lucene50/TestBlockPostingsFormat2.java (from r1633991, lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/codecs/lucene50/TestBlockPostingsFormat2.java)
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/core/src/test/org/apache/lucene/codecs/lucene50/TestBlockPostingsFormat2.java?p2=lucene/dev/branches/branch_5x/lucene/core/src/test/org/apache/lucene/codecs/lucene50/TestBlockPostingsFormat2.java&p1=lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/codecs/lucene50/TestBlockPostingsFormat2.java&r1=1633991&r2=1633993&rev=1633993&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/codecs/lucene50/TestBlockPostingsFormat2.java (original)
+++ lucene/dev/branches/branch_5x/lucene/core/src/test/org/apache/lucene/codecs/lucene50/TestBlockPostingsFormat2.java Fri Oct 24 03:29:33 2014
@@ -27,6 +27,7 @@ import org.apache.lucene.index.FieldInfo
 import org.apache.lucene.index.IndexWriter;
 import org.apache.lucene.index.IndexWriterConfig.OpenMode;
 import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.index.IndexableField;
 import org.apache.lucene.index.RandomIndexWriter;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.util.LuceneTestCase;
@@ -83,8 +84,8 @@ public class TestBlockPostingsFormat2 ex
   public void testDFBlockSize() throws Exception {
     Document doc = newDocument();
     for (int i = 0; i < Lucene50PostingsFormat.BLOCK_SIZE; i++) {
-      for (Field f : doc.getFields()) {
-        f.setStringValue(f.name() + " " + f.name() + "_2");
+      for (IndexableField f : doc.getFields()) {
+        ((Field)f).setStringValue(f.name() + " " + f.name() + "_2");
       }
       iw.addDocument(doc);
     }
@@ -94,8 +95,8 @@ public class TestBlockPostingsFormat2 ex
   public void testDFBlockSizeMultiple() throws Exception {
     Document doc = newDocument();
     for (int i = 0; i < Lucene50PostingsFormat.BLOCK_SIZE * 16; i++) {
-      for (Field f : doc.getFields()) {
-        f.setStringValue(f.name() + " " + f.name() + "_2");
+      for (IndexableField f : doc.getFields()) {
+        ((Field)f).setStringValue(f.name() + " " + f.name() + "_2");
       }
       iw.addDocument(doc);
     }
@@ -105,8 +106,8 @@ public class TestBlockPostingsFormat2 ex
   public void testTTFBlockSize() throws Exception {
     Document doc = newDocument();
     for (int i = 0; i < Lucene50PostingsFormat.BLOCK_SIZE/2; i++) {
-      for (Field f : doc.getFields()) {
-        f.setStringValue(f.name() + " " + f.name() + " " + f.name() + "_2 " + f.name() + "_2");
+      for (IndexableField f : doc.getFields()) {
+        ((Field)f).setStringValue(f.name() + " " + f.name() + " " + f.name() + "_2 " + f.name() + "_2");
       }
       iw.addDocument(doc);
     }
@@ -116,7 +117,7 @@ public class TestBlockPostingsFormat2 ex
   public void testTTFBlockSizeMultiple() throws Exception {
     Document doc = newDocument();
     for (int i = 0; i < Lucene50PostingsFormat.BLOCK_SIZE/2; i++) {
-      for (Field f : doc.getFields()) {
+      for (IndexableField f : doc.getFields()) {
         String proto = (f.name() + " " + f.name() + " " + f.name() + " " + f.name() + " " 
                        + f.name() + "_2 " + f.name() + "_2 " + f.name() + "_2 " + f.name() + "_2");
         StringBuilder val = new StringBuilder();
@@ -124,7 +125,7 @@ public class TestBlockPostingsFormat2 ex
           val.append(proto);
           val.append(" ");
         }
-        f.setStringValue(val.toString());
+        ((Field)f).setStringValue(val.toString());
       }
       iw.addDocument(doc);
     }

Modified: lucene/dev/branches/branch_5x/lucene/core/src/test/org/apache/lucene/codecs/lucene50/TestLucene50NormsFormat.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/core/src/test/org/apache/lucene/codecs/lucene50/TestLucene50NormsFormat.java?rev=1633993&r1=1633992&r2=1633993&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/core/src/test/org/apache/lucene/codecs/lucene50/TestLucene50NormsFormat.java (original)
+++ lucene/dev/branches/branch_5x/lucene/core/src/test/org/apache/lucene/codecs/lucene50/TestLucene50NormsFormat.java Fri Oct 24 03:29:33 2014
@@ -17,7 +17,13 @@ package org.apache.lucene.codecs.lucene5
  * limitations under the License.
  */
 
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+
 import org.apache.lucene.codecs.Codec;
+import org.apache.lucene.codecs.lucene50.Lucene50NormsConsumer.NormMap;
 import org.apache.lucene.index.BaseNormsFormatTestCase;
 import org.apache.lucene.util.TestUtil;
 
@@ -30,5 +36,92 @@ public class TestLucene50NormsFormat ext
   @Override
   protected Codec getCodec() {
     return codec;
-  } 
+  }
+  
+  // NormMap is rather complicated, doing domain encoding / tracking frequencies etc.
+  // test it directly some here...
+
+  public void testNormMapSimple() {
+    NormMap map = new NormMap();
+    map.add(10);
+    map.add(5);
+    map.add(4);
+    map.add(10);
+    assertEquals(3, map.size);
+    
+    // first come, first serve ord assignment
+    
+    // encode
+    assertEquals(0, map.getOrd(10));
+    assertEquals(1, map.getOrd(5));
+    assertEquals(2, map.getOrd(4));
+    
+    // decode
+    long decode[] = map.getDecodeTable();
+    assertEquals(10, decode[0]);
+    assertEquals(5, decode[1]);
+    assertEquals(4, decode[2]);
+    
+    // freqs
+    int freqs[] = map.getFreqs();
+    assertEquals(2, freqs[0]);
+    assertEquals(1, freqs[1]);
+    assertEquals(1, freqs[2]);
+    
+    assertEquals(2, map.maxFreq());
+  }
+  
+  public void testNormMapRandom() {
+    Map<Long,Integer> freqs = new HashMap<>();
+    Map<Long,Integer> ords = new HashMap<>();
+    
+    Set<Long> uniqueValuesSet = new HashSet<>();
+    int numUniqValues = TestUtil.nextInt(random(), 1, 256);
+    for (int i = 0; i < numUniqValues; i++) {
+      if (random().nextBoolean()) {
+        uniqueValuesSet.add(TestUtil.nextLong(random(), Long.MIN_VALUE, Long.MAX_VALUE));
+      } else {
+        uniqueValuesSet.add(TestUtil.nextLong(random(), Byte.MIN_VALUE, Byte.MAX_VALUE));
+      }
+    }
+    
+    Long uniqueValues[] = uniqueValuesSet.toArray(new Long[uniqueValuesSet.size()]);
+    
+    NormMap map = new NormMap();
+    int numdocs = TestUtil.nextInt(random(), 1, 100000);
+    for (int i = 0; i < numdocs; i++) {
+      long value = uniqueValues[random().nextInt(uniqueValues.length)];
+      // now add to both expected and actual
+      map.add(value);
+      
+      Integer ord = ords.get(value);
+      if (ord == null) {
+        ord = ords.size();
+        ords.put(value, ord);
+        freqs.put(value, 1);
+      } else {
+        freqs.put(value, freqs.get(value)+1);
+      }
+    }
+    
+    // value -> ord
+    assertEquals(ords.size(), map.size);
+    for (Map.Entry<Long,Integer> kv : ords.entrySet()) {
+      assertEquals(kv.getValue().intValue(), map.getOrd(kv.getKey()));
+    }
+    
+    // ord -> value
+    Map<Long,Integer> reversed = new HashMap<>();
+    long table[] = map.getDecodeTable();
+    for (int i = 0; i < map.size; i++) {
+      reversed.put(table[i], i);
+    }
+    assertEquals(ords, reversed);
+    
+    // freqs
+    int freqTable[] = map.getFreqs();
+    for (int i = 0; i < map.size; i++) {
+      assertEquals(freqs.get(table[i]).longValue(), freqTable[i]);
+    }
+  }
 }



Mime
View raw message