lucene-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From jpou...@apache.org
Subject [2/2] lucene-solr:branch_6x: LUCENE-7258: Speed up DocIdSetBuilder allocation.
Date Mon, 23 May 2016 07:28:45 GMT
LUCENE-7258: Speed up DocIdSetBuilder allocation.


Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/bcc4e870
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/bcc4e870
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/bcc4e870

Branch: refs/heads/branch_6x
Commit: bcc4e8709e8de3ae9688304be32a2b4b39bc0c03
Parents: 978885c
Author: Adrien Grand <jpountz@gmail.com>
Authored: Mon May 23 09:27:11 2016 +0200
Committer: Adrien Grand <jpountz@gmail.com>
Committed: Mon May 23 09:28:08 2016 +0200

----------------------------------------------------------------------
 lucene/CHANGES.txt                              |   5 +-
 .../org/apache/lucene/util/DocIdSetBuilder.java | 247 ++++++++++++-------
 2 files changed, 165 insertions(+), 87 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/bcc4e870/lucene/CHANGES.txt
----------------------------------------------------------------------
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index fcf7213..981a041 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -75,8 +75,9 @@ Optimizations
 * LUCENE-7237: LRUQueryCache now prefers returning an uncached Scorer than
   waiting on a lock. (Adrien Grand)
 
-* LUCENE-7261, LUCENE-7262, LUCENE-7264: Speed up DocIdSetBuilder (which is used
-  by TermsQuery, multi-term queries and point queries). (Adrien Grand)
+* LUCENE-7261, LUCENE-7262, LUCENE-7264, LUCENE-7258: Speed up DocIdSetBuilder
+  (which is used by TermsQuery, multi-term queries and several point queries).
+  (Adrien Grand, Jeff Wartes, David Smiley)
 
 Bug Fixes
 

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/bcc4e870/lucene/core/src/java/org/apache/lucene/util/DocIdSetBuilder.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/util/DocIdSetBuilder.java b/lucene/core/src/java/org/apache/lucene/util/DocIdSetBuilder.java
index 0301ba8..e56e211 100644
--- a/lucene/core/src/java/org/apache/lucene/util/DocIdSetBuilder.java
+++ b/lucene/core/src/java/org/apache/lucene/util/DocIdSetBuilder.java
@@ -17,7 +17,9 @@
 package org.apache.lucene.util;
 
 import java.io.IOException;
+import java.util.ArrayList;
 import java.util.Arrays;
+import java.util.List;
 
 import org.apache.lucene.index.PointValues;
 import org.apache.lucene.index.Terms;
@@ -56,13 +58,32 @@ public final class DocIdSetBuilder {
     }
   }
 
-  private class BufferAdder extends BulkAdder {
+  private static class Buffer {
+    int[] array;
+    int length;
+
+    Buffer(int length) {
+      this.array = new int[length];
+      this.length = 0;
+    }
+
+    Buffer(int[] array, int length) {
+      this.array = array;
+      this.length = length;
+    }
+  }
+
+  private static class BufferAdder extends BulkAdder {
+    final Buffer buffer;
+
+    BufferAdder(Buffer buffer) {
+      this.buffer = buffer;
+    }
 
     @Override
     public void add(int doc) {
-      buffer[bufferSize++] = doc;
+      buffer.array[buffer.length++] = doc;
     }
-
   }
 
   private final int maxDoc;
@@ -71,13 +92,13 @@ public final class DocIdSetBuilder {
   final boolean multivalued;
   final double numValuesPerDoc;
 
-  private int[] buffer;
-  private int bufferSize;
+  private List<Buffer> buffers = new ArrayList<>();
+  private int totalAllocated; // accumulated size of the allocated buffers
 
   private FixedBitSet bitSet;
 
   private long counter = -1;
-  private BulkAdder adder = new BufferAdder();
+  private BulkAdder adder;
 
   /**
    * Create a builder that can contain doc IDs between {@code 0} and {@code maxDoc}.
@@ -118,67 +139,30 @@ public final class DocIdSetBuilder {
     // of using a full bitset even for quite sparse data
     this.threshold = maxDoc >>> 7;
 
-    this.buffer = new int[0];
-    this.bufferSize = 0;
     this.bitSet = null;
   }
 
-  private void upgradeToBitSet() {
-    assert bitSet == null;
-    bitSet = new FixedBitSet(maxDoc);
-    for (int i = 0; i < bufferSize; ++i) {
-      bitSet.set(buffer[i]);
-    }
-    counter = this.bufferSize;
-    this.buffer = null;
-    this.bufferSize = 0;
-    this.adder = new FixedBitSetAdder(bitSet);
-  }
-
-  /** Grows the buffer to at least minSize, but never larger than threshold. */
-  private void growBuffer(int minSize) {
-    assert minSize < threshold;
-    if (buffer.length < minSize) {
-      int nextSize = Math.min(threshold, ArrayUtil.oversize(minSize, Integer.BYTES));
-      buffer = Arrays.copyOf(buffer, nextSize);
-    }
-  }
-
   /**
    * Add the content of the provided {@link DocIdSetIterator} to this builder.
    * NOTE: if you need to build a {@link DocIdSet} out of a single
    * {@link DocIdSetIterator}, you should rather use {@link RoaringDocIdSet.Builder}.
    */
   public void add(DocIdSetIterator iter) throws IOException {
-    grow((int) Math.min(Integer.MAX_VALUE, iter.cost()));
-
     if (bitSet != null) {
       bitSet.or(iter);
-    } else {
-      while (true) {
-        assert buffer.length <= threshold;
-        final int end = buffer.length;
-        for (int i = bufferSize; i < end; ++i) {
-          final int doc = iter.nextDoc();
-          if (doc == DocIdSetIterator.NO_MORE_DOCS) {
-            bufferSize = i;
-            return;
-          }
-          buffer[bufferSize++] = doc;
-        }
-        bufferSize = end;
-
-        if (bufferSize + 1 >= threshold) {
-          break;
-        }
-
-        growBuffer(bufferSize+1);
-      }
-
-      upgradeToBitSet();
-      for (int doc = iter.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = iter.nextDoc())
{
-        bitSet.set(doc);
+      return;
+    }
+    int cost = (int) Math.min(Integer.MAX_VALUE, iter.cost());
+    BulkAdder adder = grow(cost);
+    for (int i = 0; i < cost; ++i) {
+      int doc = iter.nextDoc();
+      if (doc == DocIdSetIterator.NO_MORE_DOCS) {
+        return;
       }
+      adder.add(doc);
+    }
+    for (int doc = iter.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = iter.nextDoc())
{
+      grow(1).add(doc);
     }
   }
 
@@ -188,9 +172,8 @@ public final class DocIdSetBuilder {
    */
   public BulkAdder grow(int numDocs) {
     if (bitSet == null) {
-      final long newLength = (long) bufferSize + numDocs;
-      if (newLength < threshold) {
-        growBuffer((int) newLength);
+      if ((long) totalAllocated + numDocs <= threshold) {
+        ensureBufferCapacity(numDocs);
       } else {
         upgradeToBitSet();
         counter += numDocs;
@@ -201,28 +184,68 @@ public final class DocIdSetBuilder {
     return adder;
   }
 
-  private static int dedup(int[] arr, int length) {
-    if (length == 0) {
-      return 0;
+  private void ensureBufferCapacity(int numDocs) {
+    if (buffers.isEmpty()) {
+      addBuffer(additionalCapacity(numDocs));
+      return;
     }
-    int l = 1;
-    int previous = arr[0];
-    for (int i = 1; i < length; ++i) {
-      final int value = arr[i];
-      assert value >= previous;
-      if (value != previous) {
-        arr[l++] = value;
-        previous = value;
-      }
+
+    Buffer current = buffers.get(buffers.size() - 1);
+    if (current.array.length - current.length >= numDocs) {
+      // current buffer is large enough
+      return;
+    }
+    if (current.length < current.array.length - (current.array.length >>> 3))
{
+      // current buffer is less than 7/8 full, resize rather than waste space
+      growBuffer(current, additionalCapacity(numDocs));
+    } else {
+      addBuffer(additionalCapacity(numDocs));
     }
-    return l;
   }
 
-  private static boolean noDups(int[] a, int len) {
-    for (int i = 1; i < len; ++i) {
-      assert a[i-1] < a[i];
+  private int additionalCapacity(int numDocs) {
+    // exponential growth: the new array has a size equal to the sum of what
+    // has been allocated so far
+    int c = totalAllocated;
+    // but is also >= numDocs + 1 so that we can store the next batch of docs
+    // (plus an empty slot so that we are more likely to reuse the array in build())
+    c = Math.max(numDocs + 1, c);
+    // avoid cold starts
+    c = Math.max(32, c);
+    // do not go beyond the threshold
+    c = Math.min(threshold - totalAllocated, c);
+    return c;
+  }
+
+  private Buffer addBuffer(int len) {
+    Buffer buffer = new Buffer(len);
+    buffers.add(buffer);
+    adder = new BufferAdder(buffer);
+    totalAllocated += buffer.array.length;
+    return buffer;
+  }
+
+  private void growBuffer(Buffer buffer, int additionalCapacity) {
+    buffer.array = Arrays.copyOf(buffer.array, buffer.array.length + additionalCapacity);
+    totalAllocated += additionalCapacity;
+  }
+
+  private void upgradeToBitSet() {
+    assert bitSet == null;
+    FixedBitSet bitSet = new FixedBitSet(maxDoc);
+    long counter = 0;
+    for (Buffer buffer : buffers) {
+      int[] array = buffer.array;
+      int length = buffer.length;
+      counter += length;
+      for (int i = 0; i < length; ++i) {
+        bitSet.set(array[i]);
+      }
     }
-    return true;
+    this.bitSet = bitSet;
+    this.counter = counter;
+    this.buffers = null;
+    this.adder = new FixedBitSetAdder(bitSet);
   }
 
   /**
@@ -235,25 +258,79 @@ public final class DocIdSetBuilder {
         final long cost = Math.round(counter / numValuesPerDoc);
         return new BitDocIdSet(bitSet, cost);
       } else {
+        Buffer concatenated = concat(buffers);
         LSBRadixSorter sorter = new LSBRadixSorter();
-        sorter.sort(PackedInts.bitsRequired(maxDoc - 1), buffer, bufferSize);
+        sorter.sort(PackedInts.bitsRequired(maxDoc - 1), concatenated.array, concatenated.length);
         final int l;
         if (multivalued) {
-          l = dedup(buffer, bufferSize);
+          l = dedup(concatenated.array, concatenated.length);
         } else {
-          assert noDups(buffer, bufferSize);
-          l = bufferSize;
+          assert noDups(concatenated.array, concatenated.length);
+          l = concatenated.length;
         }
-        assert l <= bufferSize;
-        buffer = ArrayUtil.grow(buffer, l + 1);
-        buffer[l] = DocIdSetIterator.NO_MORE_DOCS;
-        return new IntArrayDocIdSet(buffer, l);
+        assert l <= concatenated.length;
+        concatenated.array[l] = DocIdSetIterator.NO_MORE_DOCS;
+        return new IntArrayDocIdSet(concatenated.array, l);
       }
     } finally {
-      this.buffer = null;
-      this.bufferSize = 0;
+      this.buffers = null;
       this.bitSet = null;
     }
   }
 
+  /**
+   * Concatenate the buffers in any order, leaving at least one empty slot in
+   * the end
+   * NOTE: this method might reuse one of the arrays
+   */
+  private static Buffer concat(List<Buffer> buffers) {
+    int totalLength = 0;
+    Buffer largestBuffer = null;
+    for (Buffer buffer : buffers) {
+      totalLength += buffer.length;
+      if (largestBuffer == null || buffer.array.length > largestBuffer.array.length) {
+        largestBuffer = buffer;
+      }
+    }
+    if (largestBuffer == null) {
+      return new Buffer(1);
+    }
+    int[] docs = largestBuffer.array;
+    if (docs.length < totalLength + 1) {
+      docs = Arrays.copyOf(docs, totalLength + 1);
+    }
+    totalLength = largestBuffer.length;
+    for (Buffer buffer : buffers) {
+      if (buffer != largestBuffer) {
+        System.arraycopy(buffer.array, 0, docs, totalLength, buffer.length);
+        totalLength += buffer.length;
+      }
+    }
+    return new Buffer(docs, totalLength);
+  }
+
+  private static int dedup(int[] arr, int length) {
+    if (length == 0) {
+      return 0;
+    }
+    int l = 1;
+    int previous = arr[0];
+    for (int i = 1; i < length; ++i) {
+      final int value = arr[i];
+      assert value >= previous;
+      if (value != previous) {
+        arr[l++] = value;
+        previous = value;
+      }
+    }
+    return l;
+  }
+
+  private static boolean noDups(int[] a, int len) {
+    for (int i = 1; i < len; ++i) {
+      assert a[i-1] < a[i];
+    }
+    return true;
+  }
+
 }


Mime
View raw message