lucene-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From rjer...@apache.org
Subject svn commit: r1635856 - in /lucene/dev/branches/branch_5x: ./ lucene/ lucene/core/ lucene/core/src/java/org/apache/lucene/codecs/lucene50/ lucene/core/src/test/org/apache/lucene/codecs/lucene50/ lucene/test-framework/ lucene/test-framework/src/java/org/...
Date Fri, 31 Oct 2014 20:25:22 GMT
Author: rjernst
Date: Fri Oct 31 20:25:22 2014
New Revision: 1635856

URL: http://svn.apache.org/r1635856
Log:
LUCENE-6030: Add norms patched compression for a small number of common values (merged 1635854)

Modified:
    lucene/dev/branches/branch_5x/   (props changed)
    lucene/dev/branches/branch_5x/lucene/   (props changed)
    lucene/dev/branches/branch_5x/lucene/CHANGES.txt   (contents, props changed)
    lucene/dev/branches/branch_5x/lucene/core/   (props changed)
    lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50NormsConsumer.java
    lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50NormsFormat.java
    lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50NormsProducer.java
    lucene/dev/branches/branch_5x/lucene/core/src/test/org/apache/lucene/codecs/lucene50/TestLucene50NormsFormat.java
    lucene/dev/branches/branch_5x/lucene/test-framework/   (props changed)
    lucene/dev/branches/branch_5x/lucene/test-framework/src/java/org/apache/lucene/index/BaseNormsFormatTestCase.java

Modified: lucene/dev/branches/branch_5x/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/CHANGES.txt?rev=1635856&r1=1635855&r2=1635856&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/CHANGES.txt (original)
+++ lucene/dev/branches/branch_5x/lucene/CHANGES.txt Fri Oct 31 20:25:22 2014
@@ -236,6 +236,9 @@ Optimizations
 * LUCENE-6022: DocValuesDocIdSet checks live docs before doc values.
   (Adrien Grand)
 
+* LUCENE-6030: Add norms patched compression for a small number of common values
+  (Ryan Ernst)
+
 Build
 
 * LUCENE-5909: Smoke tester now has better command line parsing and

Modified: lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50NormsConsumer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50NormsConsumer.java?rev=1635856&r1=1635855&r2=1635856&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50NormsConsumer.java
(original)
+++ lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50NormsConsumer.java
Fri Oct 31 20:25:22 2014
@@ -19,9 +19,7 @@ package org.apache.lucene.codecs.lucene5
 
 import java.io.IOException;
 import java.util.Arrays;
-import java.util.HashMap;
 import java.util.Iterator;
-import java.util.Map;
 
 import org.apache.lucene.codecs.CodecUtil;
 import org.apache.lucene.codecs.NormsConsumer;
@@ -31,6 +29,7 @@ import org.apache.lucene.index.SegmentWr
 import org.apache.lucene.store.IndexOutput;
 import org.apache.lucene.util.FilterIterator;
 import org.apache.lucene.util.IOUtils;
+import org.apache.lucene.util.InPlaceMergeSorter;
 import org.apache.lucene.util.packed.BlockPackedWriter;
 import org.apache.lucene.util.packed.MonotonicBlockPackedWriter;
 import org.apache.lucene.util.packed.PackedInts;
@@ -47,7 +46,8 @@ class Lucene50NormsConsumer extends Norm
   static final byte CONST_COMPRESSED = 2;
   static final byte UNCOMPRESSED = 3;
   static final byte INDIRECT = 4;
-  static final byte PATCHED = 5;
+  static final byte PATCHED_BITSET = 5;
+  static final byte PATCHED_TABLE = 6;
   static final int BLOCK_SIZE = 1 << 14;
   
   // threshold for indirect encoding, computed as 1 - 1/log2(maxint)
@@ -89,11 +89,7 @@ class Lucene50NormsConsumer extends Norm
   private void writeNormsField(FieldInfo field, Iterable<Number> values, int level)
throws IOException {
     assert level <= 1; // we only "recurse" once in the indirect case
     meta.writeVInt(field.number);
-    long minValue = Long.MAX_VALUE;
-    long maxValue = Long.MIN_VALUE;
-    // TODO: more efficient?
     NormMap uniqueValues = new NormMap();
-    
     int count = 0;
     
     for (Number nv : values) {
@@ -101,45 +97,76 @@ class Lucene50NormsConsumer extends Norm
         throw new IllegalStateException("illegal norms data for field " + field.name + ",
got null for value: " + count);
       }
       final long v = nv.longValue();
-      
-      minValue = Math.min(minValue, v);
-      maxValue = Math.max(maxValue, v);
-      
+
       if (uniqueValues != null) {
-        if (uniqueValues.add(v)) {
-          if (uniqueValues.size > 256) {
-            uniqueValues = null;
+        if (v >= Byte.MIN_VALUE && v <= Byte.MAX_VALUE) {
+          if (uniqueValues.add((byte) v)) {
+            if (uniqueValues.size > 256) {
+              uniqueValues = null;
+            }
           }
+        } else {
+          // anything outside an 8 bit float comes from a custom scorer, which is an extreme
edge case
+          uniqueValues = null;
         }
       }
       count++;
     }
-    if (uniqueValues != null && uniqueValues.size == 1) {
+
+    if (uniqueValues == null) {
+      addDeltaCompressed(values, count);
+    } else if (uniqueValues.size == 1) {
       // 0 bpv
-      addConstant(minValue);
-    } else if (level == 0 && count > 256 && uniqueValues != null &&
uniqueValues.maxFreq() > count * INDIRECT_THRESHOLD) {
-      long commonValue = uniqueValues.getDecodeTable()[uniqueValues.maxOrd()];
-      if (commonValue == 0) {
-        // if the common value is missing, don't waste RAM on a bitset, since we won't be
searching those docs
-        addIndirect(field, values, count, uniqueValues);
-      } else {
-        // otherwise, write a sparse bitset, where 1 indicates 'uncommon value'.
-        addPatched(field, values, count, uniqueValues);
-      }
-    } else if (uniqueValues != null) {
-      // small number of unique values: this is the typical case:
-      FormatAndBits compression = fastestFormatAndBits(uniqueValues.size-1);
+      addConstant(uniqueValues.values[0]);
+    } else {
+      // small number of unique values: this is the typical case
+      uniqueValues.optimizeOrdinals();
       
-      if (compression.bitsPerValue == 8 && minValue >= Byte.MIN_VALUE &&
maxValue <= Byte.MAX_VALUE) {
-        addUncompressed(values, count);
+      int numCommonValues = -1;
+      int commonValuesCount = 0;
+      if (level == 0 && count > 256) {
+        float threshold_count = count * INDIRECT_THRESHOLD;
+        if (uniqueValues.freqs[0] > threshold_count) {
+          numCommonValues = 1;
+        } else if ((commonValuesCount = sum(uniqueValues.freqs, 0, 3)) > threshold_count
&& uniqueValues.size > 4) {
+          numCommonValues = 3;
+        } else if ((commonValuesCount = sum(uniqueValues.freqs, 0, 15)) > threshold_count
&& uniqueValues.size > 16) {
+          numCommonValues = 15;
+        }
+      }
+
+      if (numCommonValues == -1) {
+        // no pattern in values, just find the most efficient way to pack the values
+        FormatAndBits compression = fastestFormatAndBits(uniqueValues.size - 1);
+        if (compression.bitsPerValue == 8) {
+          addUncompressed(values, count);
+        } else {
+          addTableCompressed(values, compression, count, uniqueValues);
+        }
+        
+      } else if (numCommonValues == 1) {
+        byte commonValue = uniqueValues.values[0];
+        if (commonValue == 0) {
+          // if the common value is missing, don't waste RAM on a bitset, since we won't
be searching those docs
+          addIndirect(field, values, count, uniqueValues, 0);
+        } else {
+          // otherwise, write a sparse bitset, where 1 indicates 'uncommon value'.
+          addPatchedBitset(field, values, count, uniqueValues);
+        }
       } else {
-        addTableCompressed(values, compression, count, uniqueValues);
+        addPatchedTable(field, values, numCommonValues, commonValuesCount, count, uniqueValues);
       }
-    } else {
-      addDeltaCompressed(values, count);
     }
   }
   
+  private int sum(int[] freqs, int start, int end) {
+    int accum = 0;
+    for (int i = start; i < end; ++i) {
+      accum += freqs[i];
+    }
+    return accum;
+  }
+  
   private FormatAndBits fastestFormatAndBits(int max) {
     // we only use bpv=1,2,4,8     
     PackedInts.Format format = PackedInts.Format.PACKED_SINGLE_BLOCK;
@@ -152,18 +179,18 @@ class Lucene50NormsConsumer extends Norm
     return new FormatAndBits(format, bitsPerValue);
   }
   
-  private void addConstant(long constant) throws IOException {
+  private void addConstant(byte constant) throws IOException {
     meta.writeVInt(0);
     meta.writeByte(CONST_COMPRESSED);
     meta.writeLong(constant);
   }
-  
+
   private void addUncompressed(Iterable<Number> values, int count) throws IOException
{
     meta.writeVInt(count);
     meta.writeByte(UNCOMPRESSED); // uncompressed byte[]
     meta.writeLong(data.getFilePointer());
     for (Number nv : values) {
-      data.writeByte((byte) nv.longValue());
+      data.writeByte(nv.byteValue());
     }
   }
   
@@ -171,25 +198,28 @@ class Lucene50NormsConsumer extends Norm
     meta.writeVInt(count);
     meta.writeByte(TABLE_COMPRESSED); // table-compressed
     meta.writeLong(data.getFilePointer());
-    data.writeVInt(PackedInts.VERSION_CURRENT);
-    
-    long[] decode = uniqueValues.getDecodeTable();
-    // upgrade to power of two sized array
-    int size = 1 << compression.bitsPerValue;
-    data.writeVInt(size);
-    for (int i = 0; i < decode.length; i++) {
-      data.writeLong(decode[i]);
-    }
-    for (int i = decode.length; i < size; i++) {
-      data.writeLong(0);
-    }
 
+    writeTable(values, compression, count, uniqueValues, uniqueValues.size);
+  }
+
+  private void writeTable(Iterable<Number> values, FormatAndBits compression, int count,
NormMap uniqueValues, int numOrds) throws IOException {
+    data.writeVInt(PackedInts.VERSION_CURRENT);
     data.writeVInt(compression.format.getId());
     data.writeVInt(compression.bitsPerValue);
+    
+    data.writeVInt(numOrds);
+    for (int i = 0; i < numOrds; i++) {
+      data.writeByte(uniqueValues.values[i]);
+    }
 
     final PackedInts.Writer writer = PackedInts.getWriterNoHeader(data, compression.format,
count, compression.bitsPerValue, PackedInts.DEFAULT_BUFFER_SIZE);
     for(Number nv : values) {
-      writer.add(uniqueValues.getOrd(nv.longValue()));
+      int ord = uniqueValues.ord(nv.byteValue());
+      if (ord < numOrds) {
+        writer.add(ord);
+      } else {
+        writer.add(numOrds); // collapses all ords >= numOrds into a single value
+      }
     }
     writer.finish();
   }
@@ -211,17 +241,15 @@ class Lucene50NormsConsumer extends Norm
   // encodes only uncommon values in a sparse bitset
   // access is constant time, and the common case is predictable
   // exceptions nest either to CONST (if there are only 2 values), or INDIRECT (if there
are > 2 values)
-  private void addPatched(FieldInfo field, final Iterable<Number> values, int count,
NormMap uniqueValues) throws IOException {
-    final long decodeTable[] = uniqueValues.getDecodeTable();
-    int commonCount = uniqueValues.maxFreq();
-    final long commonValue = decodeTable[uniqueValues.maxOrd()];
+  private void addPatchedBitset(FieldInfo field, final Iterable<Number> values, int
count, NormMap uniqueValues) throws IOException {
+    int commonCount = uniqueValues.freqs[0];
     
     meta.writeVInt(count - commonCount);
-    meta.writeByte(PATCHED);
+    meta.writeByte(PATCHED_BITSET);
     meta.writeLong(data.getFilePointer());
     
     // write docs with value
-    writeDocsWithValue(values, commonValue);
+    writeDocsWithValue(values, uniqueValues, 0);
     
     // write exceptions: only two cases make sense
     // bpv = 1 (folded into sparse bitset already)
@@ -229,44 +257,58 @@ class Lucene50NormsConsumer extends Norm
     meta.writeVInt(field.number);
     if (uniqueValues.size == 2) {
       // special case: implicit in bitset
-      int otherOrd = uniqueValues.maxOrd() == 0 ? 1 : 0;
-      addConstant(decodeTable[otherOrd]);
+      addConstant(uniqueValues.values[1]);
     } else {
       // exception table
-      addIndirect(field, values, count, uniqueValues);
+      addIndirect(field, values, count, uniqueValues, 0);
     }
   }
+
+  // encodes common values in a table, and the rest of the values as exceptions using INDIRECT.
+  // the exceptions should not be accessed very often, since the values are uncommon
+  private void addPatchedTable(FieldInfo field, final Iterable<Number> values, final
int numCommonValues, int commonValuesCount, int count, final NormMap uniqueValues) throws
IOException {
+    meta.writeVInt(count);
+    meta.writeByte(PATCHED_TABLE);
+    meta.writeLong(data.getFilePointer());
+
+    assert numCommonValues == 3 || numCommonValues == 15;
+    FormatAndBits compression = fastestFormatAndBits(numCommonValues);
+    
+    writeTable(values, compression, count, uniqueValues, numCommonValues);
+
+    meta.writeVInt(field.number);
+    addIndirect(field, values, count - commonValuesCount, uniqueValues, numCommonValues);
+  }
   
   // encodes values as sparse array: keys[] and values[]
   // access is log(N) where N = keys.length (slow!)
   // so this is only appropriate as an exception table for patched, or when common value
is 0 (wont be accessed by searching) 
-  private void addIndirect(FieldInfo field, final Iterable<Number> values, int count,
NormMap uniqueValues) throws IOException {
-    int commonCount = uniqueValues.maxFreq();
-    final long commonValue = uniqueValues.getDecodeTable()[uniqueValues.maxOrd()];
+  private void addIndirect(FieldInfo field, final Iterable<Number> values, int count,
final NormMap uniqueValues, final int minOrd) throws IOException {
+    int commonCount = uniqueValues.freqs[minOrd];
     
     meta.writeVInt(count - commonCount);
     meta.writeByte(INDIRECT);
     meta.writeLong(data.getFilePointer());
     
     // write docs with value
-    writeDocsWithValue(values, commonValue);
+    writeDocsWithValue(values, uniqueValues, minOrd);
     
     // write actual values
     writeNormsField(field, new Iterable<Number>() {
       @Override
       public Iterator<Number> iterator() {
-        return new FilterIterator<Number,Number>(values.iterator()) {
+        return new FilterIterator<Number, Number>(values.iterator()) {
           @Override
           protected boolean predicateFunction(Number value) {
-            return value.longValue() != commonValue;
+            return uniqueValues.ord(value.byteValue()) > minOrd;
           }
         };
       }
     }, 1);
   }
   
-  private void writeDocsWithValue(final Iterable<Number> values, long commonValue)
throws IOException {
-    data.writeLong(commonValue);
+  private void writeDocsWithValue(final Iterable<Number> values, NormMap uniqueValues,
int minOrd) throws IOException {
+    data.writeLong(uniqueValues.values[minOrd]);
     data.writeVInt(PackedInts.VERSION_CURRENT);
     data.writeVInt(BLOCK_SIZE);
     
@@ -274,8 +316,8 @@ class Lucene50NormsConsumer extends Norm
     final MonotonicBlockPackedWriter writer = new MonotonicBlockPackedWriter(data, BLOCK_SIZE);
     int doc = 0;
     for (Number n : values) {
-      long v = n.longValue();
-      if (v != commonValue) {
+      int ord = uniqueValues.ord(n.byteValue());
+      if (ord > minOrd) {
         writer.add(doc);
       }
       doc++;
@@ -304,103 +346,63 @@ class Lucene50NormsConsumer extends Norm
       meta = data = null;
     }
   }
-  
+
   // specialized deduplication of long->ord for norms: 99.99999% of the time this will
be a single-byte range.
   static class NormMap {
     // we use short: at most we will add 257 values to this map before its rejected as too
big above.
-    final short[] singleByteRange = new short[256];
+    private final short[] ords = new short[256];
     final int[] freqs = new int[257];
-    final Map<Long,Short> other = new HashMap<Long,Short>();
+    final byte[] values = new byte[257];
     int size;
-    
+
     {
-      Arrays.fill(singleByteRange, (short)-1);
+      Arrays.fill(ords, (short)-1);
     }
 
-    /** adds an item to the mapping. returns true if actually added */
-    public boolean add(long l) {
+    // adds an item to the mapping. returns true if actually added
+    public boolean add(byte l) {
       assert size <= 256; // once we add > 256 values, we nullify the map in addNumericField
and don't use this strategy
-      if (l >= Byte.MIN_VALUE && l <= Byte.MAX_VALUE) {
-        int index = (int) (l + 128);
-        short previous = singleByteRange[index];
-        if (previous < 0) {
-          short slot = (short) size;
-          singleByteRange[index] = slot;
-          freqs[slot]++;
-          size++;
-          return true;
-        } else {
-          freqs[previous]++;
-          return false;
-        }
+      int index = (int)l + 128;
+      short previous = ords[index];
+      if (previous < 0) {
+        short slot = (short)size;
+        ords[index] = slot;
+        freqs[slot]++;
+        values[slot] = l;
+        size++;
+        return true;
       } else {
-        Short previous = other.get(l);
-        if (previous == null) {
-          freqs[size]++;
-          other.put(l, (short)size);
-          size++;
-          return true;
-        } else {
-          freqs[previous]++;
-          return false;
-        }
+        freqs[previous]++;
+        return false;
       }
     }
-    
-    /** gets the ordinal for a previously added item */
-    public int getOrd(long l) {
-      if (l >= Byte.MIN_VALUE && l <= Byte.MAX_VALUE) {
-        int index = (int) (l + 128);
-        return singleByteRange[index];
-      } else {
-        // NPE if something is screwed up
-        return other.get(l);
-      }
+
+    public int ord(byte value) {
+      return ords[(int)value + 128];
     }
-    
-    /** retrieves the ordinal table for previously added items */
-    public long[] getDecodeTable() {
-      long decode[] = new long[size];
-      for (int i = 0; i < singleByteRange.length; i++) {
-        short s = singleByteRange[i];
-        if (s >= 0) {
-          decode[s] = i - 128;
+
+    // reassign ordinals so higher frequencies have lower ordinals
+    public void optimizeOrdinals() {
+      new InPlaceMergeSorter() {
+        @Override
+        protected int compare(int i, int j) {
+          return freqs[j] - freqs[i]; // sort descending
         }
-      }
-      for (Map.Entry<Long,Short> entry : other.entrySet()) {
-        decode[entry.getValue()] = entry.getKey();
-      }
-      return decode;
-    }
-    
-    // TODO: if we need more complicated frequency-driven optos, maybe add 'finish' to this
api
-    // and sort all ords by frequency. we could then lower BPV and waste a value to represent
'patched',
-    
-    /** retrieves frequency table for items (indexed by ordinal) */
-    public int[] getFreqs() {
-      return freqs;
-    }
-    
-    /** sugar: returns max value over getFreqs() */
-    public int maxFreq() {
-      int max = 0;
-      for (int i = 0; i < size; i++) {
-        max = Math.max(max, freqs[i]);
-      }
-      return max;
-    }
-    
-    /** sugar: returns ordinal with maxFreq() */
-    public int maxOrd() {
-      long max = 0;
-      int maxOrd = 0;
-      for (int i = 0; i < size; i++) {
-        if (freqs[i] > max) {
-          max = freqs[i];
-          maxOrd = i;
+        @Override
+        protected void swap(int i, int j) {
+          // swap ordinal i with ordinal j
+          ords[(int)values[i] + 128] = (short)j;
+          ords[(int)values[j] + 128] = (short)i;
+
+          int tmpFreq = freqs[i];
+          byte tmpValue = values[i];
+          freqs[i] = freqs[j];
+          values[i] = values[j];
+          freqs[j] = tmpFreq;
+          values[j] = tmpValue;
         }
-      }
-      return maxOrd;
+      }.sort(0, size);
     }
   }
+
 }

Modified: lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50NormsFormat.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50NormsFormat.java?rev=1635856&r1=1635855&r2=1635856&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50NormsFormat.java
(original)
+++ lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50NormsFormat.java
Fri Oct 31 20:25:22 2014
@@ -51,9 +51,12 @@ import org.apache.lucene.util.packed.Pac
  *    <li>Indirect: when norms are extremely sparse, missing values are omitted.
  *        Access to an individual value is slower, but missing norm values are never accessed
  *        by search code.
- *    <li>Patched: when a single norm value dominates, a sparse bitset encodes docs
with exceptions,
- *        so that access to the common value is still very fast. outliers fall thru to an
exception 
- *        handling mechanism (Indirect or Constant).
+ *    <li>Patched bitset: when a single norm value dominates, a sparse bitset encodes
docs
+ *        with exceptions, so that access to the common value is still very fast. outliers
+ *        fall through to an exception handling mechanism (Indirect or Constant).
+ *    <li>Patched table: when a small number of norm values dominate, a table is used
for the
+ *        common values to allow fast access. less common values fall through to an exception
+ *        handling mechanism (Indirect).
  * </ul>
  * <p>
  * Files:
@@ -87,7 +90,9 @@ import org.apache.lucene.util.packed.Pac
  *         <li>3 --&gt; uncompressed: Values written as a simple byte[].
  *         <li>4 --&gt; indirect. Only documents with a value are written with
monotonic compression. a nested
  *             entry for the same field will follow for the exception handler.
- *         <li>5 --&gt; patched. Encoded the same as indirect.
+ *         <li>5 --&gt; patched bitset. Encoded the same as indirect.
+ *         <li>6 --&gt; patched table. Documents with very common values are written
with a lookup table.
+ *             Other values are written using a nested indirect.
  *      </ul>
  *   <li><a name="nvd" id="nvd"></a>
  *   <p>The Norms data or .nvd file.</p>

Modified: lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50NormsProducer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50NormsProducer.java?rev=1635856&r1=1635855&r2=1635856&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50NormsProducer.java
(original)
+++ lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50NormsProducer.java
Fri Oct 31 20:25:22 2014
@@ -49,7 +49,8 @@ import static org.apache.lucene.codecs.l
 import static org.apache.lucene.codecs.lucene50.Lucene50NormsConsumer.TABLE_COMPRESSED;
 import static org.apache.lucene.codecs.lucene50.Lucene50NormsConsumer.UNCOMPRESSED;
 import static org.apache.lucene.codecs.lucene50.Lucene50NormsConsumer.INDIRECT;
-import static org.apache.lucene.codecs.lucene50.Lucene50NormsConsumer.PATCHED;
+import static org.apache.lucene.codecs.lucene50.Lucene50NormsConsumer.PATCHED_BITSET;
+import static org.apache.lucene.codecs.lucene50.Lucene50NormsConsumer.PATCHED_TABLE;
 
 /**
  * Reader for {@link Lucene50NormsFormat}
@@ -151,7 +152,8 @@ class Lucene50NormsProducer extends Norm
       case TABLE_COMPRESSED:
       case DELTA_COMPRESSED:
         break;
-      case PATCHED:
+      case PATCHED_BITSET:
+      case PATCHED_TABLE:
       case INDIRECT:
         if (meta.readVInt() != info.number) {
           throw new CorruptIndexException("indirect norms entry for field: " + info.name
+ " is corrupt", meta);
@@ -237,16 +239,22 @@ class Lucene50NormsProducer extends Norm
       case TABLE_COMPRESSED: {
         data.seek(entry.offset);
         int packedIntsVersion = data.readVInt();
-        int size = data.readVInt();
-        if (size > 256) {
-          throw new CorruptIndexException("TABLE_COMPRESSED cannot have more than 256 distinct
values, got=" + size, data);
-        }
-        final long decode[] = new long[size];
-        for (int i = 0; i < decode.length; i++) {
-          decode[i] = data.readLong();
-        }
         final int formatID = data.readVInt();
         final int bitsPerValue = data.readVInt();
+        
+        if (bitsPerValue != 1 && bitsPerValue != 2 && bitsPerValue != 4)
{
+          throw new CorruptIndexException("TABLE_COMPRESSED only supports bpv=1, bpv=2 and
bpv=4, got=" + bitsPerValue, data);
+        }
+        int size = 1 << bitsPerValue;
+        final byte decode[] = new byte[size];
+        final int ordsSize = data.readVInt();
+        for (int i = 0; i < ordsSize; ++i) {
+          decode[i] = data.readByte();
+        }
+        for (int i = ordsSize; i < size; ++i) {
+          decode[i] = 0;
+        }
+
         final PackedInts.Reader ordsReader = PackedInts.getReaderNoHeader(data, PackedInts.Format.byId(formatID),
packedIntsVersion, entry.count, bitsPerValue);
         instance.info = Accountables.namedAccountable("table compressed", ordsReader);
         instance.ramBytesUsed = RamUsageEstimator.sizeOf(decode) + ordsReader.ramBytesUsed();
@@ -291,7 +299,7 @@ class Lucene50NormsProducer extends Norm
         };
         break;
       }
-      case PATCHED: {
+      case PATCHED_BITSET: {
         data.seek(entry.offset);
         final long common = data.readLong();
         int packedIntsVersion = data.readVInt();
@@ -304,7 +312,7 @@ class Lucene50NormsProducer extends Norm
         }
         LoadedNorms nestedInstance = loadNorms(entry.nested);
         instance.ramBytesUsed = set.ramBytesUsed() + nestedInstance.ramBytesUsed;
-        instance.info = Accountables.namedAccountable("patched -> " + nestedInstance.info,
instance.ramBytesUsed);
+        instance.info = Accountables.namedAccountable("patched bitset -> " + nestedInstance.info,
instance.ramBytesUsed);
         final NumericDocValues values = nestedInstance.norms;
         instance.norms = new NumericDocValues() {
           @Override
@@ -318,6 +326,42 @@ class Lucene50NormsProducer extends Norm
         };
         break;
       }
+      case PATCHED_TABLE: {
+        data.seek(entry.offset);
+        int packedIntsVersion = data.readVInt();
+        final int formatID = data.readVInt();
+        final int bitsPerValue = data.readVInt();
+
+        if (bitsPerValue != 2 && bitsPerValue != 4) {
+          throw new CorruptIndexException("PATCHED_TABLE only supports bpv=2 and bpv=4, got="
+ bitsPerValue, data);
+        }
+        final int size = 1 << bitsPerValue;
+        final int ordsSize = data.readVInt();
+        final byte decode[] = new byte[ordsSize];
+        assert ordsSize + 1 == size;
+        for (int i = 0; i < ordsSize; ++i) {
+          decode[i] = data.readByte();
+        }
+        
+        final PackedInts.Reader ordsReader = PackedInts.getReaderNoHeader(data, PackedInts.Format.byId(formatID),
packedIntsVersion, entry.count, bitsPerValue);
+        final LoadedNorms nestedInstance = loadNorms(entry.nested);
+        instance.ramBytesUsed = RamUsageEstimator.sizeOf(decode) + ordsReader.ramBytesUsed()
+ nestedInstance.ramBytesUsed;
+        instance.info = Accountables.namedAccountable("patched table -> " + nestedInstance.info,
instance.ramBytesUsed);
+        final NumericDocValues values = nestedInstance.norms;
+        instance.norms = new NumericDocValues() {
+          @Override
+          public long get(int docID) {
+            int ord = (int)ordsReader.get(docID);
+            try {
+              // doing a try/catch here eliminates a seemingly unavoidable branch in hotspot...
+              return decode[ord];
+            } catch (IndexOutOfBoundsException e) {
+              return values.get(docID);
+            }
+          }
+        };
+        break;
+      }
       default:
         throw new AssertionError();
     }

Modified: lucene/dev/branches/branch_5x/lucene/core/src/test/org/apache/lucene/codecs/lucene50/TestLucene50NormsFormat.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/core/src/test/org/apache/lucene/codecs/lucene50/TestLucene50NormsFormat.java?rev=1635856&r1=1635855&r2=1635856&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/core/src/test/org/apache/lucene/codecs/lucene50/TestLucene50NormsFormat.java
(original)
+++ lucene/dev/branches/branch_5x/lucene/core/src/test/org/apache/lucene/codecs/lucene50/TestLucene50NormsFormat.java
Fri Oct 31 20:25:22 2014
@@ -43,85 +43,88 @@ public class TestLucene50NormsFormat ext
 
   public void testNormMapSimple() {
     NormMap map = new NormMap();
-    map.add(10);
-    map.add(5);
-    map.add(4);
-    map.add(10);
+    map.add((byte)4);
+    map.add((byte) 10);
+    map.add((byte) 5);
+    map.add((byte)10);
     assertEquals(3, map.size);
     
     // first come, first serve ord assignment
-    
-    // encode
-    assertEquals(0, map.getOrd(10));
-    assertEquals(1, map.getOrd(5));
-    assertEquals(2, map.getOrd(4));
-    
-    // decode
-    long decode[] = map.getDecodeTable();
-    assertEquals(10, decode[0]);
-    assertEquals(5, decode[1]);
-    assertEquals(4, decode[2]);
-    
-    // freqs
-    int freqs[] = map.getFreqs();
-    assertEquals(2, freqs[0]);
-    assertEquals(1, freqs[1]);
-    assertEquals(1, freqs[2]);
-    
-    assertEquals(2, map.maxFreq());
+    assertEquals(0, map.ord((byte) 4));
+    assertEquals(1, map.ord((byte) 10));
+    assertEquals(2, map.ord((byte) 5));
+    
+    assertEquals(4, map.values[0]);
+    assertEquals(10, map.values[1]);
+    assertEquals(5, map.values[2]);
+    
+    assertEquals(1, map.freqs[0]);
+    assertEquals(2, map.freqs[1]);
+    assertEquals(1, map.freqs[2]);
+
+    // optimizing reorders the ordinals
+    map.optimizeOrdinals();
+    assertEquals(0, map.ord((byte)10));
+    assertEquals(1, map.ord((byte)4));
+    assertEquals(2, map.ord((byte)5));
+
+    assertEquals(10, map.values[0]);
+    assertEquals(4, map.values[1]);
+    assertEquals(5, map.values[2]);
+
+    assertEquals(2, map.freqs[0]);
+    assertEquals(1, map.freqs[1]);
+    assertEquals(1, map.freqs[2]);
   }
   
   public void testNormMapRandom() {
-    Map<Long,Integer> freqs = new HashMap<>();
-    Map<Long,Integer> ords = new HashMap<>();
-    
-    Set<Long> uniqueValuesSet = new HashSet<>();
+
+    Set<Byte> uniqueValuesSet = new HashSet<>();
     int numUniqValues = TestUtil.nextInt(random(), 1, 256);
     for (int i = 0; i < numUniqValues; i++) {
-      if (random().nextBoolean()) {
-        uniqueValuesSet.add(TestUtil.nextLong(random(), Long.MIN_VALUE, Long.MAX_VALUE));
-      } else {
-        uniqueValuesSet.add(TestUtil.nextLong(random(), Byte.MIN_VALUE, Byte.MAX_VALUE));
-      }
+      uniqueValuesSet.add(Byte.valueOf((byte)TestUtil.nextInt(random(), Byte.MIN_VALUE, Byte.MAX_VALUE)));
     }
-    
-    Long uniqueValues[] = uniqueValuesSet.toArray(new Long[uniqueValuesSet.size()]);
-    
+    Byte uniqueValues[] = uniqueValuesSet.toArray(new Byte[uniqueValuesSet.size()]);
+
+    Map<Byte,Integer> freqs = new HashMap<>();
     NormMap map = new NormMap();
     int numdocs = TestUtil.nextInt(random(), 1, 100000);
     for (int i = 0; i < numdocs; i++) {
-      long value = uniqueValues[random().nextInt(uniqueValues.length)];
+      byte value = uniqueValues[random().nextInt(uniqueValues.length)];
       // now add to both expected and actual
       map.add(value);
-      
-      Integer ord = ords.get(value);
-      if (ord == null) {
-        ord = ords.size();
-        ords.put(value, ord);
-        freqs.put(value, 1);
+      if (freqs.containsKey(value)) {
+        freqs.put(value, freqs.get(value) + 1);
       } else {
-        freqs.put(value, freqs.get(value)+1);
+        freqs.put(value, 1);
       }
     }
-    
-    // value -> ord
-    assertEquals(ords.size(), map.size);
-    for (Map.Entry<Long,Integer> kv : ords.entrySet()) {
-      assertEquals(kv.getValue().intValue(), map.getOrd(kv.getKey()));
+
+    assertEquals(freqs.size(), map.size);
+    for (Map.Entry<Byte,Integer> kv : freqs.entrySet()) {
+      byte value = kv.getKey();
+      int freq = kv.getValue();
+      int ord = map.ord(value);
+      assertEquals(freq, map.freqs[ord]);
+      assertEquals(value, map.values[ord]);
     }
-    
-    // ord -> value
-    Map<Long,Integer> reversed = new HashMap<>();
-    long table[] = map.getDecodeTable();
-    for (int i = 0; i < map.size; i++) {
-      reversed.put(table[i], i);
+
+    // optimizing should reorder ordinals from greatest to least frequency
+    map.optimizeOrdinals();
+    // recheck consistency
+    assertEquals(freqs.size(), map.size);
+    for (Map.Entry<Byte,Integer> kv : freqs.entrySet()) {
+      byte value = kv.getKey();
+      int freq = kv.getValue();
+      int ord = map.ord(value);
+      assertEquals(freq, map.freqs[ord]);
+      assertEquals(value, map.values[ord]);
     }
-    assertEquals(ords, reversed);
-    
-    // freqs
-    int freqTable[] = map.getFreqs();
-    for (int i = 0; i < map.size; i++) {
-      assertEquals(freqs.get(table[i]).longValue(), freqTable[i]);
+    // also check descending freq
+    int prevFreq = map.freqs[0];
+    for (int i = 1; i < map.size; ++i) {
+      assertTrue(prevFreq >= map.freqs[i]);
+      prevFreq = map.freqs[i];
     }
   }
 }

Modified: lucene/dev/branches/branch_5x/lucene/test-framework/src/java/org/apache/lucene/index/BaseNormsFormatTestCase.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/test-framework/src/java/org/apache/lucene/index/BaseNormsFormatTestCase.java?rev=1635856&r1=1635855&r2=1635856&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/test-framework/src/java/org/apache/lucene/index/BaseNormsFormatTestCase.java
(original)
+++ lucene/dev/branches/branch_5x/lucene/test-framework/src/java/org/apache/lucene/index/BaseNormsFormatTestCase.java
Fri Oct 31 20:25:22 2014
@@ -22,6 +22,7 @@ import java.util.ArrayList;
 import java.util.List;
 import java.util.Random;
 
+import com.carrotsearch.randomizedtesting.annotations.Seed;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.MockAnalyzer;
 import org.apache.lucene.analysis.MockTokenizer;
@@ -43,6 +44,7 @@ import org.apache.lucene.util.TestUtil;
  * test passes, then all Lucene/Solr tests should also pass.  Ie,
  * if there is some bug in a given NormsFormat that this
  * test fails to catch then this test needs to be improved! */
+@Seed(value = "AD2222476BCB8800")
 public abstract class BaseNormsFormatTestCase extends BaseIndexFileFormatTestCase {
   
   public void testByteRange() throws Exception {
@@ -182,6 +184,32 @@ public abstract class BaseNormsFormatTes
     }
   }
   
+  public void testNCommon() throws Exception {
+    final int iterations = atLeast(1);
+    final Random r = random();
+    for (int i = 0; i < iterations; ++i) {
+      // 16 is 4 bpv, the max before we jump to 8bpv
+      for (int n = 2; n < 16; ++n) {
+        final int N = n;
+        final long[] commonValues = new long[N];
+        for (int j = 0; j < N; ++j) {
+          commonValues[j] = TestUtil.nextLong(r, Byte.MIN_VALUE, Byte.MAX_VALUE);
+        }
+        final int numOtherValues = TestUtil.nextInt(r, 2, 256 - N);
+        final long[] otherValues = new long[numOtherValues];
+        for (int j = 0; j < numOtherValues; ++j) {
+          otherValues[j] = TestUtil.nextLong(r, Byte.MIN_VALUE, Byte.MAX_VALUE);
+        }
+        doTestNormsVersusStoredFields(new LongProducer() {
+          @Override
+          long next() {
+            return r.nextInt(100) == 0 ? otherValues[r.nextInt(numOtherValues - 1)] : commonValues[r.nextInt(N
- 1)];
+          }
+        });
+      }
+    }
+  }
+  
   private void doTestNormsVersusStoredFields(LongProducer longs) throws Exception {
     int numDocs = atLeast(500);
     long norms[] = new long[numDocs];
@@ -226,7 +254,7 @@ public abstract class BaseNormsFormatTes
       NumericDocValues docValues = r.getNormValues("stored");
       for (int i = 0; i < r.maxDoc(); i++) {
         long storedValue = Long.parseLong(r.document(i).get("stored"));
-        assertEquals(storedValue, docValues.get(i));
+        assertEquals("doc " + i, storedValue, docValues.get(i));
       }
     }
     ir.close();



Mime
View raw message