incubator-blur-commits mailing list archives

From twilli...@apache.org
Subject [4/7] git commit: Updating the Blur codec to allow for completely off-heap (disk-based) doc values.
Date Thu, 18 Sep 2014 01:58:15 GMT
Updating the Blur codec to allow for completely off-heap (disk-based) doc values.


Project: http://git-wip-us.apache.org/repos/asf/incubator-blur/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-blur/commit/435519ab
Tree: http://git-wip-us.apache.org/repos/asf/incubator-blur/tree/435519ab
Diff: http://git-wip-us.apache.org/repos/asf/incubator-blur/diff/435519ab

Branch: refs/heads/blur-374
Commit: 435519abe00f42fc32aa72ce25b861343f212367
Parents: 8d019a4
Author: Aaron McCurry <amccurry@gmail.com>
Authored: Tue Sep 16 17:19:00 2014 -0400
Committer: Aaron McCurry <amccurry@gmail.com>
Committed: Tue Sep 16 17:19:00 2014 -0400

----------------------------------------------------------------------
 .../manager/writer/BlurIndexSimpleWriter.java   |   4 +-
 .../blur/utils/TableShardCountCollapser.java    |   4 +-
 .../mapreduce/lib/GenericBlurRecordWriter.java  |   4 +-
 .../org/apache/blur/filter/FilterCacheTest.java |   4 +-
 .../apache/blur/lucene/codec/Blur021Codec.java  |   1 +
 .../apache/blur/lucene/codec/Blur022Codec.java  |   1 +
 .../apache/blur/lucene/codec/Blur024Codec.java  | 156 ++++++++
 .../lucene/codec/DiskDocValuesConsumer.java     | 169 ++++++++
 .../blur/lucene/codec/DiskDocValuesFormat.java  |  62 +++
 .../lucene/codec/DiskDocValuesProducer.java     | 387 +++++++++++++++++++
 .../services/org.apache.lucene.codecs.Codec     |   3 +-
 .../org.apache.lucene.codecs.DocValuesFormat    |  17 +
 .../blur/lucene/codec/Blur024CodecTest.java     | 231 +++++++++++
 13 files changed, 1034 insertions(+), 9 deletions(-)
----------------------------------------------------------------------
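
For context, a minimal sketch (not part of this commit) of how an index writer picks up the new codec once this change lands; the directory, analyzer, and Lucene version are illustrative stand-ins for what the Blur writers changed below actually use:

    import org.apache.blur.lucene.codec.Blur024Codec;
    import org.apache.lucene.analysis.core.KeywordAnalyzer;
    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.index.IndexWriterConfig;
    import org.apache.lucene.store.RAMDirectory;
    import org.apache.lucene.util.Version;

    public class CodecUsageSketch {
      public static void main(String[] args) throws Exception {
        // Swap the codec on the writer config, as BlurIndexSimpleWriter,
        // TableShardCountCollapser and GenericBlurRecordWriter now do.
        IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_43, new KeywordAnalyzer());
        conf.setCodec(new Blur024Codec()); // defaults: 1 << 14 chunk size, CompressionMode.FAST
        IndexWriter writer = new IndexWriter(new RAMDirectory(), conf);
        writer.close();
      }
    }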


http://git-wip-us.apache.org/repos/asf/incubator-blur/blob/435519ab/blur-core/src/main/java/org/apache/blur/manager/writer/BlurIndexSimpleWriter.java
----------------------------------------------------------------------
diff --git a/blur-core/src/main/java/org/apache/blur/manager/writer/BlurIndexSimpleWriter.java b/blur-core/src/main/java/org/apache/blur/manager/writer/BlurIndexSimpleWriter.java
index 166ba28..fd9f22c 100644
--- a/blur-core/src/main/java/org/apache/blur/manager/writer/BlurIndexSimpleWriter.java
+++ b/blur-core/src/main/java/org/apache/blur/manager/writer/BlurIndexSimpleWriter.java
@@ -40,7 +40,7 @@ import org.apache.blur.index.ExitableReader;
 import org.apache.blur.index.IndexDeletionPolicyReader;
 import org.apache.blur.log.Log;
 import org.apache.blur.log.LogFactory;
-import org.apache.blur.lucene.codec.Blur022Codec;
+import org.apache.blur.lucene.codec.Blur024Codec;
 import org.apache.blur.lucene.warmup.TraceableDirectory;
 import org.apache.blur.manager.indexserver.BlurIndexWarmup;
 import org.apache.blur.server.IndexSearcherClosable;
@@ -101,7 +101,7 @@ public class BlurIndexSimpleWriter extends BlurIndex {
     Analyzer analyzer = _fieldManager.getAnalyzerForIndex();
     _conf = new IndexWriterConfig(LUCENE_VERSION, analyzer);
     _conf.setWriteLockTimeout(TimeUnit.MINUTES.toMillis(5));
-    _conf.setCodec(new Blur022Codec(_tableContext.getBlurConfiguration()));
+    _conf.setCodec(new Blur024Codec(_tableContext.getBlurConfiguration()));
     _conf.setSimilarity(_tableContext.getSimilarity());
     _conf.setMergedSegmentWarmer(new BlurIndexReaderWarmer(shardContext, _isClosed, indexWarmup));
     TieredMergePolicy mergePolicy = (TieredMergePolicy) _conf.getMergePolicy();

http://git-wip-us.apache.org/repos/asf/incubator-blur/blob/435519ab/blur-core/src/main/java/org/apache/blur/utils/TableShardCountCollapser.java
----------------------------------------------------------------------
diff --git a/blur-core/src/main/java/org/apache/blur/utils/TableShardCountCollapser.java b/blur-core/src/main/java/org/apache/blur/utils/TableShardCountCollapser.java
index e8ebd76..adbb8f8 100644
--- a/blur-core/src/main/java/org/apache/blur/utils/TableShardCountCollapser.java
+++ b/blur-core/src/main/java/org/apache/blur/utils/TableShardCountCollapser.java
@@ -24,7 +24,7 @@ import java.util.List;
 import java.util.SortedSet;
 import java.util.TreeSet;
 
-import org.apache.blur.lucene.codec.Blur022Codec;
+import org.apache.blur.lucene.codec.Blur024Codec;
 import org.apache.blur.store.hdfs.HdfsDirectory;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.conf.Configured;
@@ -131,7 +131,7 @@ public class TableShardCountCollapser extends Configured implements Tool {
     for (int i = 0; i < newShardCount; i++) {
       System.out.println("Base Index [" + paths[i] + "]");
       IndexWriterConfig lconf = new IndexWriterConfig(LUCENE_VERSION, new KeywordAnalyzer());
-      lconf.setCodec(new Blur022Codec());
+      lconf.setCodec(new Blur024Codec());
       HdfsDirectory dir = new HdfsDirectory(getConf(), paths[i]);
       IndexWriter indexWriter = new IndexWriter(dir, lconf);
       Directory[] dirs = new Directory[numberOfShardsToMergePerPass - 1];

http://git-wip-us.apache.org/repos/asf/incubator-blur/blob/435519ab/blur-mapred-hadoop1/src/main/java/org/apache/blur/mapreduce/lib/GenericBlurRecordWriter.java
----------------------------------------------------------------------
diff --git a/blur-mapred-hadoop1/src/main/java/org/apache/blur/mapreduce/lib/GenericBlurRecordWriter.java b/blur-mapred-hadoop1/src/main/java/org/apache/blur/mapreduce/lib/GenericBlurRecordWriter.java
index 0a62e37..4bd311b 100644
--- a/blur-mapred-hadoop1/src/main/java/org/apache/blur/mapreduce/lib/GenericBlurRecordWriter.java
+++ b/blur-mapred-hadoop1/src/main/java/org/apache/blur/mapreduce/lib/GenericBlurRecordWriter.java
@@ -26,7 +26,7 @@ import org.apache.blur.analysis.FieldManager;
 import org.apache.blur.log.Log;
 import org.apache.blur.log.LogFactory;
 import org.apache.blur.lucene.LuceneVersionConstant;
-import org.apache.blur.lucene.codec.Blur022Codec;
+import org.apache.blur.lucene.codec.Blur024Codec;
 import org.apache.blur.mapreduce.lib.BlurMutate.MUTATE_TYPE;
 import org.apache.blur.server.TableContext;
 import org.apache.blur.store.hdfs.HdfsDirectory;
@@ -118,7 +118,7 @@ public class GenericBlurRecordWriter {
     Analyzer analyzer = _fieldManager.getAnalyzerForIndex();
 
     _conf = new IndexWriterConfig(LuceneVersionConstant.LUCENE_VERSION, analyzer);
-    _conf.setCodec(new Blur022Codec());
+    _conf.setCodec(new Blur024Codec());
     _conf.setSimilarity(tableContext.getSimilarity());
     TieredMergePolicy mergePolicy = (TieredMergePolicy) _conf.getMergePolicy();
     mergePolicy.setUseCompoundFile(false);

http://git-wip-us.apache.org/repos/asf/incubator-blur/blob/435519ab/blur-query/src/test/java/org/apache/blur/filter/FilterCacheTest.java
----------------------------------------------------------------------
diff --git a/blur-query/src/test/java/org/apache/blur/filter/FilterCacheTest.java b/blur-query/src/test/java/org/apache/blur/filter/FilterCacheTest.java
index 7acadb8..bca2bef 100644
--- a/blur-query/src/test/java/org/apache/blur/filter/FilterCacheTest.java
+++ b/blur-query/src/test/java/org/apache/blur/filter/FilterCacheTest.java
@@ -22,7 +22,7 @@ import java.io.IOException;
 import java.util.Arrays;
 import java.util.TreeSet;
 
-import org.apache.blur.lucene.codec.Blur022Codec;
+import org.apache.blur.lucene.codec.Blur024Codec;
 import org.apache.lucene.analysis.core.KeywordAnalyzer;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field.Store;
@@ -112,7 +112,7 @@ public class FilterCacheTest {
 
   private void writeDocs(FilterCache filterCache, RAMDirectory directory) throws IOException {
     IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_43, new KeywordAnalyzer());
-    conf.setCodec(new Blur022Codec());
+    conf.setCodec(new Blur024Codec());
     IndexWriter indexWriter = new IndexWriter(directory, conf);
     int count = 10000;
     addDocs(indexWriter, count);

http://git-wip-us.apache.org/repos/asf/incubator-blur/blob/435519ab/blur-store/src/main/java/org/apache/blur/lucene/codec/Blur021Codec.java
----------------------------------------------------------------------
diff --git a/blur-store/src/main/java/org/apache/blur/lucene/codec/Blur021Codec.java b/blur-store/src/main/java/org/apache/blur/lucene/codec/Blur021Codec.java
index b2f4caa..33f8bcd 100644
--- a/blur-store/src/main/java/org/apache/blur/lucene/codec/Blur021Codec.java
+++ b/blur-store/src/main/java/org/apache/blur/lucene/codec/Blur021Codec.java
@@ -33,6 +33,7 @@ import org.apache.lucene.codecs.lucene42.Lucene42TermVectorsFormat;
 import org.apache.lucene.codecs.perfield.PerFieldDocValuesFormat;
 import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat;
 
+@Deprecated
 public class Blur021Codec extends Codec {
   private final StoredFieldsFormat fieldsFormat = new Lucene41StoredFieldsFormat();
   private final TermVectorsFormat vectorsFormat = new Lucene42TermVectorsFormat();

http://git-wip-us.apache.org/repos/asf/incubator-blur/blob/435519ab/blur-store/src/main/java/org/apache/blur/lucene/codec/Blur022Codec.java
----------------------------------------------------------------------
diff --git a/blur-store/src/main/java/org/apache/blur/lucene/codec/Blur022Codec.java b/blur-store/src/main/java/org/apache/blur/lucene/codec/Blur022Codec.java
index b1e8330..b4828b9 100644
--- a/blur-store/src/main/java/org/apache/blur/lucene/codec/Blur022Codec.java
+++ b/blur-store/src/main/java/org/apache/blur/lucene/codec/Blur022Codec.java
@@ -39,6 +39,7 @@ import org.apache.lucene.codecs.lucene42.Lucene42TermVectorsFormat;
 import org.apache.lucene.codecs.perfield.PerFieldDocValuesFormat;
 import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat;
 
+@Deprecated
 public class Blur022Codec extends Codec {
 
   private final StoredFieldsFormat fieldsFormat;

http://git-wip-us.apache.org/repos/asf/incubator-blur/blob/435519ab/blur-store/src/main/java/org/apache/blur/lucene/codec/Blur024Codec.java
----------------------------------------------------------------------
diff --git a/blur-store/src/main/java/org/apache/blur/lucene/codec/Blur024Codec.java b/blur-store/src/main/java/org/apache/blur/lucene/codec/Blur024Codec.java
new file mode 100644
index 0000000..c70f709
--- /dev/null
+++ b/blur-store/src/main/java/org/apache/blur/lucene/codec/Blur024Codec.java
@@ -0,0 +1,156 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.blur.lucene.codec;
+
+import static org.apache.blur.utils.BlurConstants.BLUR_SHARD_INDEX_CHUNKSIZE;
+import static org.apache.blur.utils.BlurConstants.BLUR_SHARD_INDEX_COMPRESSIONMODE;
+import static org.apache.blur.utils.BlurConstants.FAST;
+import static org.apache.blur.utils.BlurConstants.FAST_DECOMPRESSION;
+import static org.apache.blur.utils.BlurConstants.HIGH_COMPRESSION;
+
+import org.apache.blur.BlurConfiguration;
+import org.apache.lucene.codecs.Codec;
+import org.apache.lucene.codecs.DocValuesFormat;
+import org.apache.lucene.codecs.FieldInfosFormat;
+import org.apache.lucene.codecs.LiveDocsFormat;
+import org.apache.lucene.codecs.NormsFormat;
+import org.apache.lucene.codecs.PostingsFormat;
+import org.apache.lucene.codecs.SegmentInfoFormat;
+import org.apache.lucene.codecs.StoredFieldsFormat;
+import org.apache.lucene.codecs.TermVectorsFormat;
+import org.apache.lucene.codecs.compressing.CompressionMode;
+import org.apache.lucene.codecs.lucene42.Lucene42FieldInfosFormat;
+import org.apache.lucene.codecs.lucene42.Lucene42NormsFormat;
+import org.apache.lucene.codecs.lucene42.Lucene42TermVectorsFormat;
+import org.apache.lucene.codecs.perfield.PerFieldDocValuesFormat;
+import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat;
+
+public class Blur024Codec extends Codec {
+
+  private final StoredFieldsFormat fieldsFormat;
+  private final TermVectorsFormat vectorsFormat = new Lucene42TermVectorsFormat();
+  private final FieldInfosFormat fieldInfosFormat = new Lucene42FieldInfosFormat();
+  private final SegmentInfoFormat infosFormat;
+  private final LiveDocsFormat liveDocsFormat = new Blur021LiveDocsFormat();
+
+  private final PostingsFormat postingsFormat = new PerFieldPostingsFormat() {
+    @Override
+    public PostingsFormat getPostingsFormatForField(String field) {
+      return Blur024Codec.this.getPostingsFormatForField(field);
+    }
+  };
+
+  private final DocValuesFormat docValuesFormat = new PerFieldDocValuesFormat() {
+    @Override
+    public DocValuesFormat getDocValuesFormatForField(String field) {
+      return Blur024Codec.this.getDocValuesFormatForField(field);
+    }
+  };
+
+  public Blur024Codec() {
+    this(1 << 14, CompressionMode.FAST);
+  }
+
+  public Blur024Codec(int chunkSize, CompressionMode compressionMode) {
+    super("Blur024");
+    infosFormat = new Blur022SegmentInfoFormat(chunkSize, compressionMode);
+    fieldsFormat = new Blur022StoredFieldsFormat(chunkSize, compressionMode);
+  }
+
+  public Blur024Codec(BlurConfiguration configuration) {
+    this(configuration.getInt(BLUR_SHARD_INDEX_CHUNKSIZE, 1 << 14), getCompressionMode(configuration));
+  }
+
+  private static CompressionMode getCompressionMode(BlurConfiguration configuration) {
+    String type = configuration.get(BLUR_SHARD_INDEX_COMPRESSIONMODE, FAST);
+    if (HIGH_COMPRESSION.equals(type)) {
+      return CompressionMode.HIGH_COMPRESSION;
+    } else if (FAST.equals(type)) {
+      return CompressionMode.FAST;
+    } else if (FAST_DECOMPRESSION.equals(type)) {
+      return CompressionMode.FAST_DECOMPRESSION;
+    } else {
+      throw new IllegalArgumentException("blur.shard.index.compressionmode=" + type
+          + " not supported.  Valid entries are [FAST,FAST_DECOMPRESSION,HIGH_COMPRESSION]");
+    }
+  }
+
+  @Override
+  public final StoredFieldsFormat storedFieldsFormat() {
+    return fieldsFormat;
+  }
+
+  @Override
+  public final TermVectorsFormat termVectorsFormat() {
+    return vectorsFormat;
+  }
+
+  @Override
+  public final PostingsFormat postingsFormat() {
+    return postingsFormat;
+  }
+
+  @Override
+  public final FieldInfosFormat fieldInfosFormat() {
+    return fieldInfosFormat;
+  }
+
+  @Override
+  public final SegmentInfoFormat segmentInfoFormat() {
+    return infosFormat;
+  }
+
+  @Override
+  public final LiveDocsFormat liveDocsFormat() {
+    return liveDocsFormat;
+  }
+
+  /**
+   * Returns the postings format that should be used for writing new segments of
+   * <code>field</code>.
+   * 
+   * The default implementation always returns "Lucene41"
+   */
+  public PostingsFormat getPostingsFormatForField(String field) {
+    return defaultFormat;
+  }
+
+  /**
+   * Returns the docvalues format that should be used for writing new segments
+   * of <code>field</code>.
+   * 
+   * The default implementation always returns "Lucene42"
+   */
+  public DocValuesFormat getDocValuesFormatForField(String field) {
+    return defaultDVFormat;
+  }
+
+  @Override
+  public final DocValuesFormat docValuesFormat() {
+    return docValuesFormat;
+  }
+
+  private final PostingsFormat defaultFormat = PostingsFormat.forName("Lucene41");
+  private final DocValuesFormat defaultDVFormat = DocValuesFormat.forName("Blur024Disk");
+
+  private final NormsFormat normsFormat = new Lucene42NormsFormat();
+
+  @Override
+  public final NormsFormat normsFormat() {
+    return normsFormat;
+  }
+}
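
The per-field hooks above mean a deployment can keep most fields on the new disk-based format while routing selected fields elsewhere. A hedged sketch, assuming a hypothetical "hot_field" that should stay on the heap-resident Lucene 4.2 format (neither the field name nor the subclass is part of this commit):

    import org.apache.blur.lucene.codec.Blur024Codec;
    import org.apache.lucene.codecs.DocValuesFormat;

    public class MostlyDiskCodec extends Blur024Codec {
      // Heap-resident format for a single hypothetical hot field.
      private final DocValuesFormat heapFormat = DocValuesFormat.forName("Lucene42");

      @Override
      public DocValuesFormat getDocValuesFormatForField(String field) {
        if ("hot_field".equals(field)) {
          return heapFormat;
        }
        // Everything else uses the commit's default, DocValuesFormat.forName("Blur024Disk").
        return super.getDocValuesFormatForField(field);
      }
    }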

http://git-wip-us.apache.org/repos/asf/incubator-blur/blob/435519ab/blur-store/src/main/java/org/apache/blur/lucene/codec/DiskDocValuesConsumer.java
----------------------------------------------------------------------
diff --git a/blur-store/src/main/java/org/apache/blur/lucene/codec/DiskDocValuesConsumer.java b/blur-store/src/main/java/org/apache/blur/lucene/codec/DiskDocValuesConsumer.java
new file mode 100644
index 0000000..43d42e4
--- /dev/null
+++ b/blur-store/src/main/java/org/apache/blur/lucene/codec/DiskDocValuesConsumer.java
@@ -0,0 +1,169 @@
+package org.apache.blur.lucene.codec;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.codecs.CodecUtil;
+import org.apache.lucene.codecs.DocValuesConsumer;
+import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.index.IndexFileNames;
+import org.apache.lucene.index.SegmentWriteState;
+import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.IOUtils;
+import org.apache.lucene.util.packed.BlockPackedWriter;
+import org.apache.lucene.util.packed.MonotonicBlockPackedWriter;
+import org.apache.lucene.util.packed.PackedInts;
+
+/** writer for {@link DiskDocValuesFormat} */
+public class DiskDocValuesConsumer extends DocValuesConsumer {
+
+  static final int BLOCK_SIZE = 16384;
+
+  final IndexOutput data, meta;
+  final int maxDoc;
+  
+  public DiskDocValuesConsumer(SegmentWriteState state, String dataCodec, String dataExtension, String metaCodec, String metaExtension) throws IOException {
+    boolean success = false;
+    try {
+      String dataName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, dataExtension);
+      data = state.directory.createOutput(dataName, state.context);
+      CodecUtil.writeHeader(data, dataCodec, DiskDocValuesFormat.VERSION_CURRENT);
+      String metaName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, metaExtension);
+      meta = state.directory.createOutput(metaName, state.context);
+      CodecUtil.writeHeader(meta, metaCodec, DiskDocValuesFormat.VERSION_CURRENT);
+      maxDoc = state.segmentInfo.getDocCount();
+      success = true;
+    } finally {
+      if (!success) {
+        IOUtils.closeWhileHandlingException(this);
+      }
+    }
+  }
+  
+  @Override
+  public void addNumericField(FieldInfo field, Iterable<Number> values) throws IOException {
+    long count = 0;
+    for (@SuppressWarnings("unused") Number nv : values) {
+      ++count;
+    }
+
+    meta.writeVInt(field.number);
+    meta.writeByte(DiskDocValuesFormat.NUMERIC);
+    meta.writeVInt(PackedInts.VERSION_CURRENT);
+    meta.writeLong(data.getFilePointer());
+    meta.writeVLong(count);
+    meta.writeVInt(BLOCK_SIZE);
+
+    final BlockPackedWriter writer = new BlockPackedWriter(data, BLOCK_SIZE);
+    for (Number nv : values) {
+      writer.add(nv.longValue());
+    }
+    writer.finish();
+  }
+
+  @Override
+  public void addBinaryField(FieldInfo field, final Iterable<BytesRef> values) throws IOException {
+    // write the byte[] data
+    meta.writeVInt(field.number);
+    meta.writeByte(DiskDocValuesFormat.BINARY);
+    int minLength = Integer.MAX_VALUE;
+    int maxLength = Integer.MIN_VALUE;
+    final long startFP = data.getFilePointer();
+    long count = 0;
+    for(BytesRef v : values) {
+      minLength = Math.min(minLength, v.length);
+      maxLength = Math.max(maxLength, v.length);
+      data.writeBytes(v.bytes, v.offset, v.length);
+      count++;
+    }
+    meta.writeVInt(minLength);
+    meta.writeVInt(maxLength);
+    meta.writeVLong(count);
+    meta.writeLong(startFP);
+    
+    // if minLength == maxLength, its a fixed-length byte[], we are done (the addresses are implicit)
+    // otherwise, we need to record the length fields...
+    if (minLength != maxLength) {
+      meta.writeLong(data.getFilePointer());
+      meta.writeVInt(PackedInts.VERSION_CURRENT);
+      meta.writeVInt(BLOCK_SIZE);
+
+      final MonotonicBlockPackedWriter writer = new MonotonicBlockPackedWriter(data, BLOCK_SIZE);
+      long addr = 0;
+      for (BytesRef v : values) {
+        addr += v.length;
+        writer.add(addr);
+      }
+      writer.finish();
+    }
+  }
+
+  @Override
+  public void addSortedField(FieldInfo field, Iterable<BytesRef> values, Iterable<Number> docToOrd) throws IOException {
+    meta.writeVInt(field.number);
+    meta.writeByte(DiskDocValuesFormat.SORTED);
+    addBinaryField(field, values);
+    addNumericField(field, docToOrd);
+  }
+  
+  @Override
+  public void addSortedSetField(FieldInfo field, Iterable<BytesRef> values, Iterable<Number> docToOrdCount, Iterable<Number> ords) throws IOException {
+    meta.writeVInt(field.number);
+    meta.writeByte(DiskDocValuesFormat.SORTED_SET);
+    // write the ord -> byte[] as a binary field
+    addBinaryField(field, values);
+    // write the stream of ords as a numeric field
+    // NOTE: we could return an iterator that delta-encodes these within a doc
+    addNumericField(field, ords);
+    
+    // write the doc -> ord count as a absolute index to the stream
+    meta.writeVInt(field.number);
+    meta.writeByte(DiskDocValuesFormat.NUMERIC);
+    meta.writeVInt(PackedInts.VERSION_CURRENT);
+    meta.writeLong(data.getFilePointer());
+    meta.writeVLong(maxDoc);
+    meta.writeVInt(BLOCK_SIZE);
+
+    final MonotonicBlockPackedWriter writer = new MonotonicBlockPackedWriter(data, BLOCK_SIZE);
+    long addr = 0;
+    for (Number v : docToOrdCount) {
+      addr += v.longValue();
+      writer.add(addr);
+    }
+    writer.finish();
+  }
+
+  @Override
+  public void close() throws IOException {
+    boolean success = false;
+    try {
+      if (meta != null) {
+        meta.writeVInt(-1); // write EOF marker
+      }
+      success = true;
+    } finally {
+      if (success) {
+        IOUtils.close(data, meta);
+      } else {
+        IOUtils.closeWhileHandlingException(data, meta);
+      }
+    }
+  }
+}
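
To see the consumer in action it is enough to index a document that carries doc values under the new codec; when the segment flushes, the per-field format delegates to DiskDocValuesConsumer, which writes the values into a data stream plus a small metadata stream. A minimal sketch, assuming an in-memory directory (file extensions are governed by DiskDocValuesFormat below):

    import org.apache.blur.lucene.codec.Blur024Codec;
    import org.apache.lucene.analysis.core.KeywordAnalyzer;
    import org.apache.lucene.document.BinaryDocValuesField;
    import org.apache.lucene.document.Document;
    import org.apache.lucene.document.NumericDocValuesField;
    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.index.IndexWriterConfig;
    import org.apache.lucene.store.RAMDirectory;
    import org.apache.lucene.util.BytesRef;
    import org.apache.lucene.util.Version;

    public class ConsumerSketch {
      public static void main(String[] args) throws Exception {
        IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_43, new KeywordAnalyzer());
        conf.setCodec(new Blur024Codec());
        IndexWriter writer = new IndexWriter(new RAMDirectory(), conf);

        Document doc = new Document();
        // Longs are block-packed by addNumericField.
        doc.add(new NumericDocValuesField("count", 42L));
        // byte[] values are written raw by addBinaryField; variable lengths also
        // get a monotonic block-packed address stream.
        doc.add(new BinaryDocValuesField("payload", new BytesRef("some bytes")));
        writer.addDocument(doc);

        writer.close(); // flush/commit drives the doc values consumer for the segment
      }
    }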

http://git-wip-us.apache.org/repos/asf/incubator-blur/blob/435519ab/blur-store/src/main/java/org/apache/blur/lucene/codec/DiskDocValuesFormat.java
----------------------------------------------------------------------
diff --git a/blur-store/src/main/java/org/apache/blur/lucene/codec/DiskDocValuesFormat.java b/blur-store/src/main/java/org/apache/blur/lucene/codec/DiskDocValuesFormat.java
new file mode 100644
index 0000000..ea7750e
--- /dev/null
+++ b/blur-store/src/main/java/org/apache/blur/lucene/codec/DiskDocValuesFormat.java
@@ -0,0 +1,62 @@
+package org.apache.blur.lucene.codec;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.codecs.DocValuesConsumer;
+import org.apache.lucene.codecs.DocValuesProducer;
+import org.apache.lucene.codecs.DocValuesFormat;
+import org.apache.lucene.index.SegmentReadState;
+import org.apache.lucene.index.SegmentWriteState;
+
+/**
+ * DocValues format that keeps most things on disk.
+ * <p>
+ * Things like ordinals and disk offsets are loaded into ram,
+ * for single-seek access to all the types.
+ * <p>
+ * @lucene.experimental
+ */
+public final class DiskDocValuesFormat extends DocValuesFormat {
+
+  public DiskDocValuesFormat() {
+    super("Blur024Disk");
+  }
+
+  @Override
+  public DocValuesConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
+    return new DiskDocValuesConsumer(state, DATA_CODEC, DATA_EXTENSION, META_CODEC, META_EXTENSION);
+  }
+
+  @Override
+  public DocValuesProducer fieldsProducer(SegmentReadState state) throws IOException {
+    return new DiskDocValuesProducer(state, DATA_CODEC, DATA_EXTENSION, META_CODEC, META_EXTENSION);
+  }
+  
+  public static final String DATA_CODEC = "DiskDocValuesData";
+  public static final String DATA_EXTENSION = "dvdd";
+  public static final String META_CODEC = "DiskDocValuesMetadata";
+  public static final String META_EXTENSION = "dvdm";
+  public static final int VERSION_START = 0;
+  public static final int VERSION_CURRENT = VERSION_START;
+  public static final byte NUMERIC = 0;
+  public static final byte BINARY = 1;
+  public static final byte SORTED = 2;
+  public static final byte SORTED_SET = 3;
+}
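
Because the format names itself "Blur024Disk", it can be resolved through Lucene's SPI once the services file added later in this commit is on the classpath. A small sketch:

    import org.apache.lucene.codecs.DocValuesFormat;

    public class FormatLookupSketch {
      public static void main(String[] args) {
        // Name must match the one passed to super("Blur024Disk") above and the
        // META-INF/services/org.apache.lucene.codecs.DocValuesFormat entry.
        DocValuesFormat format = DocValuesFormat.forName("Blur024Disk");
        System.out.println(format.getName()); // "Blur024Disk"
      }
    }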

http://git-wip-us.apache.org/repos/asf/incubator-blur/blob/435519ab/blur-store/src/main/java/org/apache/blur/lucene/codec/DiskDocValuesProducer.java
----------------------------------------------------------------------
diff --git a/blur-store/src/main/java/org/apache/blur/lucene/codec/DiskDocValuesProducer.java b/blur-store/src/main/java/org/apache/blur/lucene/codec/DiskDocValuesProducer.java
new file mode 100644
index 0000000..6a52fc2
--- /dev/null
+++ b/blur-store/src/main/java/org/apache/blur/lucene/codec/DiskDocValuesProducer.java
@@ -0,0 +1,387 @@
+package org.apache.blur.lucene.codec;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.lucene.codecs.CodecUtil;
+import org.apache.lucene.codecs.DocValuesProducer;
+import org.apache.lucene.index.BinaryDocValues;
+import org.apache.lucene.index.CorruptIndexException;
+import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.index.FieldInfos;
+import org.apache.lucene.index.IndexFileNames;
+import org.apache.lucene.index.NumericDocValues;
+import org.apache.lucene.index.SegmentReadState;
+import org.apache.lucene.index.SortedDocValues;
+import org.apache.lucene.index.SortedSetDocValues;
+import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.IOUtils;
+import org.apache.lucene.util.packed.BlockPackedReader;
+import org.apache.lucene.util.packed.MonotonicBlockPackedReader;
+
+class DiskDocValuesProducer extends DocValuesProducer {
+  private final Map<Integer,NumericEntry> numerics;
+  private final Map<Integer,BinaryEntry> binaries;
+  private final Map<Integer,NumericEntry> ords;
+  private final Map<Integer,NumericEntry> ordIndexes;
+  private final IndexInput data;
+
+  // memory-resident structures
+  private final Map<Integer,BlockPackedReader> ordinalInstances = new HashMap<Integer,BlockPackedReader>();
+  private final Map<Integer,MonotonicBlockPackedReader> addressInstances = new HashMap<Integer,MonotonicBlockPackedReader>();
+  private final Map<Integer,MonotonicBlockPackedReader> ordIndexInstances = new HashMap<Integer,MonotonicBlockPackedReader>();
+  
+  DiskDocValuesProducer(SegmentReadState state, String dataCodec, String dataExtension, String metaCodec, String metaExtension) throws IOException {
+    String metaName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, metaExtension);
+    // read in the entries from the metadata file.
+    IndexInput in = state.directory.openInput(metaName, state.context);
+    boolean success = false;
+    try {
+      CodecUtil.checkHeader(in, metaCodec, 
+                                DiskDocValuesFormat.VERSION_START,
+                                DiskDocValuesFormat.VERSION_START);
+      numerics = new HashMap<Integer,NumericEntry>();
+      ords = new HashMap<Integer,NumericEntry>();
+      ordIndexes = new HashMap<Integer,NumericEntry>();
+      binaries = new HashMap<Integer,BinaryEntry>();
+      readFields(in, state.fieldInfos);
+      success = true;
+    } finally {
+      if (success) {
+        IOUtils.close(in);
+      } else {
+        IOUtils.closeWhileHandlingException(in);
+      }
+    }
+    
+    String dataName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, dataExtension);
+    data = state.directory.openInput(dataName, state.context);
+    CodecUtil.checkHeader(data, dataCodec, 
+                                DiskDocValuesFormat.VERSION_START,
+                                DiskDocValuesFormat.VERSION_START);
+  }
+  
+  private void readFields(IndexInput meta, FieldInfos infos) throws IOException {
+    int fieldNumber = meta.readVInt();
+    while (fieldNumber != -1) {
+      byte type = meta.readByte();
+      if (type == DiskDocValuesFormat.NUMERIC) {
+        numerics.put(fieldNumber, readNumericEntry(meta));
+      } else if (type == DiskDocValuesFormat.BINARY) {
+        BinaryEntry b = readBinaryEntry(meta);
+        binaries.put(fieldNumber, b);
+      } else if (type == DiskDocValuesFormat.SORTED) {
+        // sorted = binary + numeric
+        if (meta.readVInt() != fieldNumber) {
+          throw new CorruptIndexException("sorted entry for field: " + fieldNumber + " is corrupt");
+        }
+        if (meta.readByte() != DiskDocValuesFormat.BINARY) {
+          throw new CorruptIndexException("sorted entry for field: " + fieldNumber + " is corrupt");
+        }
+        BinaryEntry b = readBinaryEntry(meta);
+        binaries.put(fieldNumber, b);
+        
+        if (meta.readVInt() != fieldNumber) {
+          throw new CorruptIndexException("sorted entry for field: " + fieldNumber + " is corrupt");
+        }
+        if (meta.readByte() != DiskDocValuesFormat.NUMERIC) {
+          throw new CorruptIndexException("sorted entry for field: " + fieldNumber + " is corrupt");
+        }
+        NumericEntry n = readNumericEntry(meta);
+        ords.put(fieldNumber, n);
+      } else if (type == DiskDocValuesFormat.SORTED_SET) {
+        // sortedset = binary + numeric + ordIndex
+        if (meta.readVInt() != fieldNumber) {
+          throw new CorruptIndexException("sortedset entry for field: " + fieldNumber + " is corrupt");
+        }
+        if (meta.readByte() != DiskDocValuesFormat.BINARY) {
+          throw new CorruptIndexException("sortedset entry for field: " + fieldNumber + " is corrupt");
+        }
+        BinaryEntry b = readBinaryEntry(meta);
+        binaries.put(fieldNumber, b);
+        
+        if (meta.readVInt() != fieldNumber) {
+          throw new CorruptIndexException("sortedset entry for field: " + fieldNumber + " is corrupt");
+        }
+        if (meta.readByte() != DiskDocValuesFormat.NUMERIC) {
+          throw new CorruptIndexException("sortedset entry for field: " + fieldNumber + " is corrupt");
+        }
+        NumericEntry n1 = readNumericEntry(meta);
+        ords.put(fieldNumber, n1);
+        
+        if (meta.readVInt() != fieldNumber) {
+          throw new CorruptIndexException("sortedset entry for field: " + fieldNumber + " is corrupt");
+        }
+        if (meta.readByte() != DiskDocValuesFormat.NUMERIC) {
+          throw new CorruptIndexException("sortedset entry for field: " + fieldNumber + " is corrupt");
+        }
+        NumericEntry n2 = readNumericEntry(meta);
+        ordIndexes.put(fieldNumber, n2);
+      } else {
+        throw new CorruptIndexException("invalid type: " + type + ", resource=" + meta);
+      }
+      fieldNumber = meta.readVInt();
+    }
+  }
+  
+  static NumericEntry readNumericEntry(IndexInput meta) throws IOException {
+    NumericEntry entry = new NumericEntry();
+    entry.packedIntsVersion = meta.readVInt();
+    entry.offset = meta.readLong();
+    entry.count = meta.readVLong();
+    entry.blockSize = meta.readVInt();
+    return entry;
+  }
+  
+  static BinaryEntry readBinaryEntry(IndexInput meta) throws IOException {
+    BinaryEntry entry = new BinaryEntry();
+    entry.minLength = meta.readVInt();
+    entry.maxLength = meta.readVInt();
+    entry.count = meta.readVLong();
+    entry.offset = meta.readLong();
+    if (entry.minLength != entry.maxLength) {
+      entry.addressesOffset = meta.readLong();
+      entry.packedIntsVersion = meta.readVInt();
+      entry.blockSize = meta.readVInt();
+    }
+    return entry;
+  }
+
+  @Override
+  public NumericDocValues getNumeric(FieldInfo field) throws IOException {
+    NumericEntry entry = numerics.get(field.number);
+    return getNumeric(entry);
+  }
+  
+  LongNumericDocValues getNumeric(NumericEntry entry) throws IOException {
+    final IndexInput data = this.data.clone();
+    data.seek(entry.offset);
+
+    final BlockPackedReader reader = new BlockPackedReader(data, entry.packedIntsVersion, entry.blockSize, entry.count, true);
+    return new LongNumericDocValues() {
+      @Override
+      public long get(long id) {
+        return reader.get(id);
+      }
+    };
+  }
+
+  @Override
+  public BinaryDocValues getBinary(FieldInfo field) throws IOException {
+    BinaryEntry bytes = binaries.get(field.number);
+    if (bytes.minLength == bytes.maxLength) {
+      return getFixedBinary(field, bytes);
+    } else {
+      return getVariableBinary(field, bytes);
+    }
+  }
+  
+  private BinaryDocValues getFixedBinary(FieldInfo field, final BinaryEntry bytes) {
+    final IndexInput data = this.data.clone();
+
+    return new LongBinaryDocValues() {
+      @Override
+      public void get(long id, BytesRef result) {
+        long address = bytes.offset + id * bytes.maxLength;
+        try {
+          data.seek(address);
+          // NOTE: we could have one buffer, but various consumers (e.g. FieldComparatorSource) 
+          // assume "they" own the bytes after calling this!
+          final byte[] buffer = new byte[bytes.maxLength];
+          data.readBytes(buffer, 0, buffer.length);
+          result.bytes = buffer;
+          result.offset = 0;
+          result.length = buffer.length;
+        } catch (IOException e) {
+          throw new RuntimeException(e);
+        }
+      }
+    };
+  }
+  
+  private BinaryDocValues getVariableBinary(FieldInfo field, final BinaryEntry bytes) throws IOException {
+    final IndexInput data = this.data.clone();
+    
+    final MonotonicBlockPackedReader addresses;
+    synchronized (addressInstances) {
+      MonotonicBlockPackedReader addrInstance = addressInstances.get(field.number);
+      if (addrInstance == null) {
+        data.seek(bytes.addressesOffset);
+        addrInstance = new MonotonicBlockPackedReader(data, bytes.packedIntsVersion, bytes.blockSize, bytes.count, true);
+        addressInstances.put(field.number, addrInstance);
+      }
+      addresses = addrInstance;
+    }
+
+    return new LongBinaryDocValues() {
+      @Override
+      public void get(long id, BytesRef result) {
+        long startAddress = bytes.offset + (id == 0 ? 0 : addresses.get(id-1));
+        long endAddress = bytes.offset + addresses.get(id);
+        int length = (int) (endAddress - startAddress);
+        try {
+          data.seek(startAddress);
+          // NOTE: we could have one buffer, but various consumers (e.g. FieldComparatorSource) 
+          // assume "they" own the bytes after calling this!
+          final byte[] buffer = new byte[length];
+          data.readBytes(buffer, 0, buffer.length);
+          result.bytes = buffer;
+          result.offset = 0;
+          result.length = length;
+        } catch (IOException e) {
+          throw new RuntimeException(e);
+        }
+      }
+    };
+  }
+
+  @Override
+  public SortedDocValues getSorted(FieldInfo field) throws IOException {
+    final int valueCount = (int) binaries.get(field.number).count;
+    final BinaryDocValues binary = getBinary(field);
+    final BlockPackedReader ordinals;
+    synchronized (ordinalInstances) {
+      BlockPackedReader ordsInstance = ordinalInstances.get(field.number);
+      if (ordsInstance == null) {
+        NumericEntry entry = ords.get(field.number);
+        IndexInput data = this.data.clone();
+        data.seek(entry.offset);
+        ordsInstance = new BlockPackedReader(data, entry.packedIntsVersion, entry.blockSize, entry.count, true);
+        ordinalInstances.put(field.number, ordsInstance);
+      }
+      ordinals = ordsInstance;
+    }
+    return new SortedDocValues() {
+
+      @Override
+      public int getOrd(int docID) {
+        return (int) ordinals.get(docID);
+      }
+
+      @Override
+      public void lookupOrd(int ord, BytesRef result) {
+        binary.get(ord, result);
+      }
+
+      @Override
+      public int getValueCount() {
+        return valueCount;
+      }
+    };
+  }
+
+  @Override
+  public SortedSetDocValues getSortedSet(FieldInfo field) throws IOException {
+    final long valueCount = binaries.get(field.number).count;
+    // we keep the byte[]s and list of ords on disk, these could be large
+    final LongBinaryDocValues binary = (LongBinaryDocValues) getBinary(field);
+    final LongNumericDocValues ordinals = getNumeric(ords.get(field.number));
+    // but the addresses to the ord stream are in RAM
+    final MonotonicBlockPackedReader ordIndex;
+    synchronized (ordIndexInstances) {
+      MonotonicBlockPackedReader ordIndexInstance = ordIndexInstances.get(field.number);
+      if (ordIndexInstance == null) {
+        NumericEntry entry = ordIndexes.get(field.number);
+        IndexInput data = this.data.clone();
+        data.seek(entry.offset);
+        ordIndexInstance = new MonotonicBlockPackedReader(data, entry.packedIntsVersion, entry.blockSize, entry.count, true);
+        ordIndexInstances.put(field.number, ordIndexInstance);
+      }
+      ordIndex = ordIndexInstance;
+    }
+    
+    return new SortedSetDocValues() {
+      long offset;
+      long endOffset;
+      
+      @Override
+      public long nextOrd() {
+        if (offset == endOffset) {
+          return NO_MORE_ORDS;
+        } else {
+          long ord = ordinals.get(offset);
+          offset++;
+          return ord;
+        }
+      }
+
+      @Override
+      public void setDocument(int docID) {
+        offset = (docID == 0 ? 0 : ordIndex.get(docID-1));
+        endOffset = ordIndex.get(docID);
+      }
+
+      @Override
+      public void lookupOrd(long ord, BytesRef result) {
+        binary.get(ord, result);
+      }
+
+      @Override
+      public long getValueCount() {
+        return valueCount;
+      }
+    };
+  }
+
+  @Override
+  public void close() throws IOException {
+    data.close();
+  }
+  
+  static class NumericEntry {
+    long offset;
+
+    int packedIntsVersion;
+    long count;
+    int blockSize;
+  }
+  
+  static class BinaryEntry {
+    long offset;
+
+    long count;
+    int minLength;
+    int maxLength;
+    long addressesOffset;
+    int packedIntsVersion;
+    int blockSize;
+  }
+  
+  // internally we compose complex dv (sorted/sortedset) from other ones
+  static abstract class LongNumericDocValues extends NumericDocValues {
+    @Override
+    public final long get(int docID) {
+      return get((long) docID);
+    }
+    
+    abstract long get(long id);
+  }
+  
+  static abstract class LongBinaryDocValues extends BinaryDocValues {
+    @Override
+    public final void get(int docID, BytesRef result) {
+      get((long)docID, result);
+    }
+    
+    abstract void get(long id, BytesRef Result);
+  }
+}
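
Readers never touch this producer directly; it sits behind the normal doc values API. A hedged sketch of consuming a SORTED_SET field written with the new format ("tags" is a hypothetical field name, and the index is assumed to already exist):

    import org.apache.lucene.index.AtomicReader;
    import org.apache.lucene.index.DirectoryReader;
    import org.apache.lucene.index.SlowCompositeReaderWrapper;
    import org.apache.lucene.index.SortedSetDocValues;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.util.BytesRef;

    public class ProducerReadSketch {
      static void printTags(Directory dir, int docId) throws Exception {
        DirectoryReader reader = DirectoryReader.open(dir);
        AtomicReader atomic = SlowCompositeReaderWrapper.wrap(reader);
        SortedSetDocValues dv = atomic.getSortedSetDocValues("tags"); // null if the field has no doc values
        BytesRef scratch = new BytesRef();
        dv.setDocument(docId); // offsets come from the RAM-resident ord index
        for (long ord = dv.nextOrd(); ord != SortedSetDocValues.NO_MORE_ORDS; ord = dv.nextOrd()) {
          dv.lookupOrd(ord, scratch); // bytes are read from the on-disk binary stream
          System.out.println(scratch.utf8ToString());
        }
        reader.close();
      }
    }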

http://git-wip-us.apache.org/repos/asf/incubator-blur/blob/435519ab/blur-store/src/main/resources/META-INF/services/org.apache.lucene.codecs.Codec
----------------------------------------------------------------------
diff --git a/blur-store/src/main/resources/META-INF/services/org.apache.lucene.codecs.Codec b/blur-store/src/main/resources/META-INF/services/org.apache.lucene.codecs.Codec
index 8fb3dea..b238c22 100644
--- a/blur-store/src/main/resources/META-INF/services/org.apache.lucene.codecs.Codec
+++ b/blur-store/src/main/resources/META-INF/services/org.apache.lucene.codecs.Codec
@@ -14,4 +14,5 @@
 #  limitations under the License.
 
 org.apache.blur.lucene.codec.Blur021Codec
-org.apache.blur.lucene.codec.Blur022Codec
\ No newline at end of file
+org.apache.blur.lucene.codec.Blur022Codec
+org.apache.blur.lucene.codec.Blur024Codec
\ No newline at end of file
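
With the codec registered here, Lucene can resolve the name stored in each segment back to the implementation when reading. A one-line sketch:

    import org.apache.lucene.codecs.Codec;

    public class CodecLookupSketch {
      public static void main(String[] args) {
        // Requires the services entry above to be on the classpath.
        Codec codec = Codec.forName("Blur024");
        System.out.println(codec.getName()); // "Blur024"
      }
    }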

http://git-wip-us.apache.org/repos/asf/incubator-blur/blob/435519ab/blur-store/src/main/resources/META-INF/services/org.apache.lucene.codecs.DocValuesFormat
----------------------------------------------------------------------
diff --git a/blur-store/src/main/resources/META-INF/services/org.apache.lucene.codecs.DocValuesFormat b/blur-store/src/main/resources/META-INF/services/org.apache.lucene.codecs.DocValuesFormat
new file mode 100644
index 0000000..02bd2c5
--- /dev/null
+++ b/blur-store/src/main/resources/META-INF/services/org.apache.lucene.codecs.DocValuesFormat
@@ -0,0 +1,17 @@
+#  Licensed to the Apache Software Foundation (ASF) under one or more
+#  contributor license agreements.  See the NOTICE file distributed with
+#  this work for additional information regarding copyright ownership.
+#  The ASF licenses this file to You under the Apache License, Version 2.0
+#  (the "License"); you may not use this file except in compliance with
+#  the License.  You may obtain a copy of the License at
+#
+#       http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
+org.apache.blur.lucene.codec.DiskDocValuesFormat
+

http://git-wip-us.apache.org/repos/asf/incubator-blur/blob/435519ab/blur-store/src/test/java/org/apache/blur/lucene/codec/Blur024CodecTest.java
----------------------------------------------------------------------
diff --git a/blur-store/src/test/java/org/apache/blur/lucene/codec/Blur024CodecTest.java b/blur-store/src/test/java/org/apache/blur/lucene/codec/Blur024CodecTest.java
new file mode 100644
index 0000000..e2f329d
--- /dev/null
+++ b/blur-store/src/test/java/org/apache/blur/lucene/codec/Blur024CodecTest.java
@@ -0,0 +1,231 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.blur.lucene.codec;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+import java.io.IOException;
+import java.util.Random;
+
+import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
+import org.apache.lucene.codecs.compressing.CompressionMode;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field.Store;
+import org.apache.lucene.document.SortedDocValuesField;
+import org.apache.lucene.document.StringField;
+import org.apache.lucene.document.TextField;
+import org.apache.lucene.index.AtomicReader;
+import org.apache.lucene.index.AtomicReaderContext;
+import org.apache.lucene.index.DirectoryReader;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.index.IndexableField;
+import org.apache.lucene.index.SortedDocValues;
+import org.apache.lucene.store.RAMDirectory;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.Version;
+import org.junit.Test;
+
+public class Blur024CodecTest {
+
+  private static final int WORDS = 10000;
+
+  @Test
+  public void testDocValuesFormat() throws IOException {
+    RAMDirectory directory = new RAMDirectory();
+    IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_43, new WhitespaceAnalyzer(Version.LUCENE_43));
+    conf.setCodec(new Blur024Codec());
+    IndexWriter writer = new IndexWriter(directory, conf);
+
+    Document doc = new Document();
+    doc.add(new StringField("f", "v", Store.YES));
+    doc.add(new SortedDocValuesField("f", new BytesRef("v")));
+    writer.addDocument(doc);
+
+    writer.close();
+
+    DirectoryReader reader = DirectoryReader.open(directory);
+    AtomicReaderContext context = reader.leaves().get(0);
+    AtomicReader atomicReader = context.reader();
+    SortedDocValues sortedDocValues = atomicReader.getSortedDocValues("f");
+    assertTrue(sortedDocValues.getClass().getName().startsWith(DiskDocValuesProducer.class.getName()));
+
+    reader.close();
+  }
+
+  @Test
+  public void testLargeDocs() throws IOException {
+    Random random = new Random();
+    Iterable<? extends IndexableField> doc = getLargeDoc(random);
+    RAMDirectory directory = new RAMDirectory();
+    IndexWriterConfig conf1 = new IndexWriterConfig(Version.LUCENE_43, new WhitespaceAnalyzer(Version.LUCENE_43));
+    conf1.setCodec(new Blur024Codec());
+    IndexWriter writer1 = new IndexWriter(directory, conf1);
+    writer1.addDocument(doc);
+    writer1.close();
+
+    DirectoryReader reader1 = DirectoryReader.open(directory);
+    int numDocs1 = reader1.numDocs();
+    assertEquals(1, numDocs1);
+
+    // for (int i = 0; i < numDocs1; i++) {
+    // System.out.println(reader1.document(i));
+    // }
+
+    IndexWriterConfig conf2 = new IndexWriterConfig(Version.LUCENE_43, new WhitespaceAnalyzer(Version.LUCENE_43));
+    conf2.setCodec(new Blur024Codec(1 << 16, CompressionMode.HIGH_COMPRESSION));
+    IndexWriter writer2 = new IndexWriter(directory, conf2);
+    writer2.addDocument(doc);
+    writer2.close();
+
+    DirectoryReader reader2 = DirectoryReader.open(directory);
+    int numDocs2 = reader2.numDocs();
+    assertEquals(2, numDocs2);
+
+    for (int i = 0; i < 2; i++) {
+
+      long t1 = System.nanoTime();
+      Document document1 = reader1.document(0);
+      long t2 = System.nanoTime();
+      Document document2 = reader2.document(1);
+      long t3 = System.nanoTime();
+
+      System.out.println((t3 - t2) / 1000000.0);
+      System.out.println((t2 - t1) / 1000000.0);
+
+      System.out.println("doc1 " + document1.hashCode());
+      System.out.println("doc2 " + document2.hashCode());
+    }
+
+    // for (int i = 0; i < numDocs2; i++) {
+    // System.out.println(reader2.document(i));
+    // }
+
+    // long fileLength = directory.fileLength("_0.fdt");
+
+    for (String name : directory.listAll()) {
+      if (name.endsWith(".fdt")) {
+        System.out.println(name);
+        System.out.println(directory.fileLength(name));
+      }
+    }
+
+  }
+
+  @Test
+  public void testSmallDocs() throws IOException {
+
+    RAMDirectory directory = new RAMDirectory();
+    IndexWriterConfig conf1 = new IndexWriterConfig(Version.LUCENE_43, new WhitespaceAnalyzer(Version.LUCENE_43));
+    conf1.setCodec(new Blur024Codec());
+    Random random1 = new Random(1);
+    IndexWriter writer1 = new IndexWriter(directory, conf1);
+    for (int i = 0; i < 1000; i++) {
+      writer1.addDocument(getSmallDoc(random1));
+    }
+    writer1.close();
+
+    DirectoryReader reader1 = DirectoryReader.open(directory);
+    int numDocs1 = reader1.numDocs();
+    assertEquals(1000, numDocs1);
+
+    // for (int i = 0; i < numDocs1; i++) {
+    // System.out.println(reader1.document(i));
+    // }
+
+    IndexWriterConfig conf2 = new IndexWriterConfig(Version.LUCENE_43, new WhitespaceAnalyzer(Version.LUCENE_43));
+    conf2.setCodec(new Blur024Codec(1 << 16, CompressionMode.HIGH_COMPRESSION));
+    Random random2 = new Random(1);
+    IndexWriter writer2 = new IndexWriter(directory, conf2);
+    for (int i = 0; i < 1000; i++) {
+      writer2.addDocument(getSmallDoc(random2));
+    }
+    writer2.close();
+
+    DirectoryReader reader2 = DirectoryReader.open(directory);
+    int numDocs2 = reader2.numDocs();
+    assertEquals(2000, numDocs2);
+
+    for (int i = 0; i < 2; i++) {
+
+      long t1 = System.nanoTime();
+      long hash1 = 0;
+      long hash2 = 0;
+      for (int d = 0; d < 1000; d++) {
+        Document document1 = reader1.document(d);
+        hash1 += document1.hashCode();
+      }
+      long t2 = System.nanoTime();
+      for (int d = 0; d < 1000; d++) {
+        Document document2 = reader2.document(d + 1000);
+        hash2 += document2.hashCode();
+      }
+      long t3 = System.nanoTime();
+
+      System.out.println((t3 - t2) / 1000000.0);
+      System.out.println((t2 - t1) / 1000000.0);
+
+      System.out.println("doc1 " + hash1);
+      System.out.println("doc2 " + hash2);
+    }
+
+    // for (int i = 0; i < numDocs2; i++) {
+    // System.out.println(reader2.document(i));
+    // }
+
+    // long fileLength = directory.fileLength("_0.fdt");
+
+    for (String name : directory.listAll()) {
+      if (name.endsWith(".fdt")) {
+        System.out.println(name);
+        System.out.println(directory.fileLength(name));
+      }
+    }
+  }
+
+  private Iterable<? extends IndexableField> getSmallDoc(Random random) {
+    Document document = new Document();
+    document.add(new StringField("word", getWord(random), Store.YES));
+    return document;
+  }
+
+  private Iterable<? extends IndexableField> getLargeDoc(Random random) {
+    Document document = new Document();
+    String body = getBody(random);
+    // System.out.println("body size [" + body.length() + "]");
+    document.add(new TextField("body", body, Store.YES));
+    return document;
+  }
+
+  private String getBody(Random random) {
+    StringBuilder builder = new StringBuilder();
+    for (int i = 0; i < WORDS; i++) {
+      builder.append(getWord(random)).append(' ');
+    }
+    return builder.toString();
+  }
+
+  private String getWord(Random random) {
+    StringBuilder builder = new StringBuilder();
+    for (int i = 0; i < 20; i++) {
+      builder.append((char) (random.nextInt(26) + 'a'));
+    }
+    return builder.toString();
+  }
+
+}

