lucene-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From mikemcc...@apache.org
Subject lucene-solr:branch_6x: LUCENE-7142: BKDWriter wasn't splitting correctly with long ords; improve tests so we sometimes long ords even for small number of points
Date Sun, 27 Mar 2016 09:50:30 GMT
Repository: lucene-solr
Updated Branches:
  refs/heads/branch_6x 9f9c2f74b -> 28a433581


LUCENE-7142: BKDWriter wasn't splitting correctly with long ords; improve tests so we sometimes
long ords even for small number of points


Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/28a43358
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/28a43358
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/28a43358

Branch: refs/heads/branch_6x
Commit: 28a4335813df24e3db8269efe60347c2275e0b03
Parents: 9f9c2f7
Author: Mike McCandless <mikemccand@apache.org>
Authored: Sun Mar 27 05:52:00 2016 -0400
Committer: Mike McCandless <mikemccand@apache.org>
Committed: Sun Mar 27 05:52:00 2016 -0400

----------------------------------------------------------------------
 .../org/apache/lucene/util/bkd/BKDWriter.java   | 27 ++++++++---
 .../lucene/util/bkd/OfflinePointReader.java     | 11 +++--
 .../apache/lucene/util/bkd/Test2BBKDPoints.java |  6 ++-
 .../org/apache/lucene/util/bkd/TestBKD.java     | 51 +++++++++++++++++++-
 .../org/apache/lucene/index/RandomCodec.java    | 24 ++++++++-
 5 files changed, 104 insertions(+), 15 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/28a43358/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java
index 33f72e4..3cf32a9 100644
--- a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java
@@ -128,21 +128,31 @@ public class BKDWriter implements Closeable {
   protected long pointCount;
 
   /** true if we have so many values that we must write ords using long (8 bytes) instead
of int (4 bytes) */
-  private final boolean longOrds;
+  protected final boolean longOrds;
 
   /** An upper bound on how many points the caller will add (includes deletions) */
   private final long totalPointCount;
 
   /** True if every document has at most one value.  We specialize this case by not bothering
to store the ord since it's redundant with docID.  */
-  private final boolean singleValuePerDoc;
+  protected final boolean singleValuePerDoc;
+
+  /** How much heap OfflineSorter is allowed to use */ 
+  protected final OfflineSorter.BufferSize offlineSorterBufferMB;
+
+  /** How much heap OfflineSorter is allowed to use */ 
+  protected final int offlineSorterMaxTempFiles;
 
   private final int maxDoc;
 
-  public BKDWriter(int maxDoc, Directory tempDir, String tempFileNamePrefix, int numDims,
int bytesPerDim, long totalPointCount, boolean singleValuePerDoc) throws IOException {
-    this(maxDoc, tempDir, tempFileNamePrefix, numDims, bytesPerDim, DEFAULT_MAX_POINTS_IN_LEAF_NODE,
DEFAULT_MAX_MB_SORT_IN_HEAP, totalPointCount, singleValuePerDoc);
+  public BKDWriter(int maxDoc, Directory tempDir, String tempFileNamePrefix, int numDims,
int bytesPerDim,
+                   int maxPointsInLeafNode, double maxMBSortInHeap, long totalPointCount,
boolean singleValuePerDoc) throws IOException {
+    this(maxDoc, tempDir, tempFileNamePrefix, numDims, bytesPerDim, maxPointsInLeafNode,
maxMBSortInHeap, totalPointCount, singleValuePerDoc,
+         totalPointCount > Integer.MAX_VALUE, Math.max(1, (long) maxMBSortInHeap), OfflineSorter.MAX_TEMPFILES);
   }
 
-  public BKDWriter(int maxDoc, Directory tempDir, String tempFileNamePrefix, int numDims,
int bytesPerDim, int maxPointsInLeafNode, double maxMBSortInHeap, long totalPointCount, boolean
singleValuePerDoc) throws IOException {
+  protected BKDWriter(int maxDoc, Directory tempDir, String tempFileNamePrefix, int numDims,
int bytesPerDim,
+                      int maxPointsInLeafNode, double maxMBSortInHeap, long totalPointCount,
+                      boolean singleValuePerDoc, boolean longOrds, long offlineSorterBufferMB,
int offlineSorterMaxTempFiles) throws IOException {
     verifyParams(numDims, maxPointsInLeafNode, maxMBSortInHeap, totalPointCount);
     // We use tracking dir to deal with removing files on exception, so each place that
     // creates temp files doesn't need crazy try/finally/sucess logic:
@@ -153,6 +163,8 @@ public class BKDWriter implements Closeable {
     this.bytesPerDim = bytesPerDim;
     this.totalPointCount = totalPointCount;
     this.maxDoc = maxDoc;
+    this.offlineSorterBufferMB = OfflineSorter.BufferSize.megabytes(offlineSorterBufferMB);
+    this.offlineSorterMaxTempFiles = offlineSorterMaxTempFiles;
     docsSeen = new FixedBitSet(maxDoc);
     packedBytesLength = numDims * bytesPerDim;
 
@@ -166,7 +178,8 @@ public class BKDWriter implements Closeable {
     maxPackedValue = new byte[packedBytesLength];
 
     // If we may have more than 1+Integer.MAX_VALUE values, then we must encode ords with
long (8 bytes), else we can use int (4 bytes).
-    longOrds = totalPointCount > Integer.MAX_VALUE;
+    this.longOrds = longOrds;
+
     this.singleValuePerDoc = singleValuePerDoc;
 
     // dimensional values (numDims * bytesPerDim) + ord (int or long) + docID (int)
@@ -735,7 +748,7 @@ public class BKDWriter implements Closeable {
       // TODO: this is sort of sneaky way to get the final OfflinePointWriter from OfflineSorter:
       IndexOutput[] lastWriter = new IndexOutput[1];
 
-      OfflineSorter sorter = new OfflineSorter(tempDir, tempFileNamePrefix + "_bkd" + dim,
cmp, OfflineSorter.BufferSize.megabytes(Math.max(1, (long) maxMBSortInHeap)), OfflineSorter.MAX_TEMPFILES)
{
+      OfflineSorter sorter = new OfflineSorter(tempDir, tempFileNamePrefix + "_bkd" + dim,
cmp, offlineSorterBufferMB, offlineSorterMaxTempFiles) {
 
           /** We write/read fixed-byte-width file that {@link OfflinePointReader} can read.
*/
           @Override

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/28a43358/lucene/core/src/java/org/apache/lucene/util/bkd/OfflinePointReader.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/util/bkd/OfflinePointReader.java b/lucene/core/src/java/org/apache/lucene/util/bkd/OfflinePointReader.java
index 20faa9e..17758c0 100644
--- a/lucene/core/src/java/org/apache/lucene/util/bkd/OfflinePointReader.java
+++ b/lucene/core/src/java/org/apache/lucene/util/bkd/OfflinePointReader.java
@@ -198,12 +198,15 @@ final class OfflinePointReader extends PointReader {
       in.readBytes(buffer, 0, buffer.length);
 
       long ord;
-      if (singleValuePerDoc == false) {
-        ord = readInt(buffer, packedBytesLength+Integer.BYTES);
-      } else if (longOrds) {
+      if (longOrds) {
+        // A long ord, after the docID:
         ord = readLong(buffer, packedBytesLength+Integer.BYTES);
-      } else {
+      } else if (singleValuePerDoc) {
+        // docID is the ord:
         ord = readInt(buffer, packedBytesLength);
+      } else {
+        // An int ord, after the docID:
+        ord = readInt(buffer, packedBytesLength+Integer.BYTES);
       }
 
       if (rightTree.get(ord)) {

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/28a43358/lucene/core/src/test/org/apache/lucene/util/bkd/Test2BBKDPoints.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/test/org/apache/lucene/util/bkd/Test2BBKDPoints.java b/lucene/core/src/test/org/apache/lucene/util/bkd/Test2BBKDPoints.java
index ffcbcf8..af2e463 100644
--- a/lucene/core/src/test/org/apache/lucene/util/bkd/Test2BBKDPoints.java
+++ b/lucene/core/src/test/org/apache/lucene/util/bkd/Test2BBKDPoints.java
@@ -40,7 +40,8 @@ public class Test2BBKDPoints extends LuceneTestCase {
 
     final int numDocs = (Integer.MAX_VALUE / 26) + 100;
 
-    BKDWriter w = new BKDWriter(numDocs, dir, "_0", 1, Long.BYTES, 26L * numDocs, false);
+    BKDWriter w = new BKDWriter(numDocs, dir, "_0", 1, Long.BYTES,
+                                BKDWriter.DEFAULT_MAX_POINTS_IN_LEAF_NODE, BKDWriter.DEFAULT_MAX_MB_SORT_IN_HEAP,
26L * numDocs, false);
     int counter = 0;
     byte[] packedBytes = new byte[Long.BYTES];
     for (int docID = 0; docID < numDocs; docID++) {
@@ -73,7 +74,8 @@ public class Test2BBKDPoints extends LuceneTestCase {
 
     final int numDocs = (Integer.MAX_VALUE / 26) + 100;
 
-    BKDWriter w = new BKDWriter(numDocs, dir, "_0", 2, Long.BYTES, 26L * numDocs, false);
+    BKDWriter w = new BKDWriter(numDocs, dir, "_0", 2, Long.BYTES,
+                                BKDWriter.DEFAULT_MAX_POINTS_IN_LEAF_NODE, BKDWriter.DEFAULT_MAX_MB_SORT_IN_HEAP,
26L * numDocs, false);
     int counter = 0;
     byte[] packedBytes = new byte[2*Long.BYTES];
     for (int docID = 0; docID < numDocs; docID++) {

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/28a43358/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKD.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKD.java b/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKD.java
index 18fbacf..38b3fb5 100644
--- a/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKD.java
+++ b/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKD.java
@@ -891,7 +891,7 @@ public class TestBKD extends LuceneTestCase {
   public void testTieBreakOrder() throws Exception {
     try (Directory dir = newDirectory()) {
       int numDocs = 10000;
-      BKDWriter w = new BKDWriter(numDocs+1, dir, "tmp", 1, 4, 2, 0.01f, numDocs, true);
+      BKDWriter w = new BKDWriter(numDocs+1, dir, "tmp", 1, Integer.BYTES, 2, 0.01f, numDocs,
true);
       for(int i=0;i<numDocs;i++) {
         w.add(new byte[Integer.BYTES], i);
       }
@@ -925,4 +925,53 @@ public class TestBKD extends LuceneTestCase {
       in.close();
     }
   }
+
+  public void test2DLongOrdsOffline() throws Exception {
+    try (Directory dir = newDirectory()) {
+      int numDocs = 100000;
+      boolean singleValuePerDoc = false;
+      boolean longOrds = true;
+      int offlineSorterMaxTempFiles = TestUtil.nextInt(random(), 2, 20);
+      BKDWriter w = new BKDWriter(numDocs+1, dir, "tmp", 2, Integer.BYTES, 2, 0.01f, numDocs,
+                                  singleValuePerDoc, longOrds, 1, offlineSorterMaxTempFiles);
+      byte[] buffer = new byte[2*Integer.BYTES];
+      for(int i=0;i<numDocs;i++) {
+        random().nextBytes(buffer);
+        w.add(buffer, i);
+      }
+
+      IndexOutput out = dir.createOutput("bkd", IOContext.DEFAULT);
+      long fp = w.finish(out);
+      out.close();
+
+      IndexInput in = dir.openInput("bkd", IOContext.DEFAULT);
+      in.seek(fp);
+      BKDReader r = new BKDReader(in);
+      int[] count = new int[1];
+      r.intersect(new IntersectVisitor() {
+
+          @Override
+          public void visit(int docID) {
+            count[0]++;
+          }
+
+          @Override
+          public void visit(int docID, byte[] packedValue) {
+            visit(docID);
+          }
+
+          @Override
+          public Relation compare(byte[] minPacked, byte[] maxPacked) {
+            if (random().nextInt(7) == 1) {
+              return Relation.CELL_CROSSES_QUERY;
+            } else {
+              return Relation.CELL_INSIDE_QUERY;
+            }
+          }
+      });
+      assertEquals(numDocs, count[0]);
+      in.close();
+    }
+  }
+
 }

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/28a43358/lucene/test-framework/src/java/org/apache/lucene/index/RandomCodec.java
----------------------------------------------------------------------
diff --git a/lucene/test-framework/src/java/org/apache/lucene/index/RandomCodec.java b/lucene/test-framework/src/java/org/apache/lucene/index/RandomCodec.java
index af0aade..73db88b 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/index/RandomCodec.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/index/RandomCodec.java
@@ -277,10 +277,32 @@ public class RandomCodec extends AssertingCodec {
     public RandomlySplittingBKDWriter(int maxDoc, Directory tempDir, String tempFileNamePrefix,
int numDims,
                                       int bytesPerDim, int maxPointsInLeafNode, double maxMBSortInHeap,
                                       long totalPointCount, boolean singleValuePerDoc, int
randomSeed) throws IOException {
-      super(maxDoc, tempDir, tempFileNamePrefix, numDims, bytesPerDim, maxPointsInLeafNode,
maxMBSortInHeap, totalPointCount, singleValuePerDoc);
+      super(maxDoc, tempDir, tempFileNamePrefix, numDims, bytesPerDim, maxPointsInLeafNode,
maxMBSortInHeap, totalPointCount,
+            getRandomSingleValuePerDoc(singleValuePerDoc, randomSeed),
+            getRandomLongOrds(totalPointCount, singleValuePerDoc, randomSeed),
+            getRandomOfflineSorterBufferMB(randomSeed),
+            getRandomOfflineSorterMaxTempFiles(randomSeed));
       this.random = new Random(randomSeed);
     }
 
+    private static boolean getRandomSingleValuePerDoc(boolean singleValuePerDoc, int randomSeed)
{
+      // If we are single valued, sometimes pretend we aren't:
+      return singleValuePerDoc && (new Random(randomSeed).nextBoolean());
+    }
+
+    private static boolean getRandomLongOrds(long totalPointCount, boolean singleValuePerDoc,
int randomSeed) {
+      // Always use long ords if we have too many points, but sometimes randomly use it anyway
when singleValuePerDoc is false:
+      return totalPointCount > Integer.MAX_VALUE || (getRandomSingleValuePerDoc(singleValuePerDoc,
randomSeed) == false && new Random(randomSeed).nextBoolean());
+    }
+
+    private static long getRandomOfflineSorterBufferMB(int randomSeed) {
+      return TestUtil.nextInt(new Random(randomSeed), 1, 8);
+    }
+
+    private static int getRandomOfflineSorterMaxTempFiles(int randomSeed) {
+      return TestUtil.nextInt(new Random(randomSeed), 2, 20);
+    }
+
     @Override
     protected int split(byte[] minPackedValue, byte[] maxPackedValue) {
       // BKD normally defaults by the widest dimension, to try to make as squarish cells
as possible, but we just pick a random one ;)


Mime
View raw message