It uses the median of the first, middle and last values as a pivot and + * falls back to a heap sort when the number of recursion levels exceeds + * {@code 2 lg(n)}, as a consequence it runs in linear time on average and in + * {@code n log(n)} time in the worst case.

+ * @lucene.internal */ +public abstract class IntroSelector extends Selector { + + @Override + public final void select(int from, int to, int k) { + checkArgs(from, to, k); + final int maxDepth = 2 * MathUtil.log(to - from, 2); + quickSelect(from, to, k, maxDepth); + } + + // heap sort + void slowSelect(int from, int to, int k) { + new Sorter() { + + @Override + protected void swap(int i, int j) { + IntroSelector.this.swap(i, j); + } + + @Override + protected int compare(int i, int j) { + return IntroSelector.this.compare(i, j); + } + + public void sort(int from, int to) { + heapSort(from, to); + } + }.sort(from, to); + } + + private void quickSelect(int from, int to, int k, int maxDepth) { + assert from <= k; + assert k < to; + if (to - from == 1) { + return; + } + if (--maxDepth < 0) { + slowSelect(from, to, k); + return; + } + + final int mid = (from + to) >>> 1; + // heuristic: we use the median of the values at from, to-1 and mid as a pivot + if (compare(from, to - 1) > 0) { + swap(from, to - 1); + } + if (compare(to - 1, mid) > 0) { + swap(to - 1, mid); + if (compare(from, to - 1) > 0) { + swap(from, to - 1); + } + } + + setPivot(to - 1); + + int left = from + 1; + int right = to - 2; + + for (;;) { + while (comparePivot(left) > 0) { + ++left; + } + + while (left < right && comparePivot(right) <= 0) { + --right; + } + + if (left < right) { + swap(left, right); + --right; + } else { + break; + } + } + swap(left, to - 1); + + if (left == k) { + return; + } else if (left < k) { + quickSelect(left + 1, to, k, maxDepth); + } else { + quickSelect(from, left, k, maxDepth); + } + } + + /** Compare entries found in slots `i` and `j`. + * The contract for the returned value is the same as + * {@link Comparator#compare(Object, Object)}. */ + protected int compare(int i, int j) { + setPivot(i); + return comparePivot(j); + } + + /** Save the value at slot `i` so that it can later be used as a + * pivot, see {@link #comparePivot(int)}. */ + protected abstract void setPivot(int i); + + /** Compare the pivot with the slot at `j`, similarly to + * {@link #compare(int, int) compare(i, j)}. */ + protected abstract int comparePivot(int j); +} http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/60975d2d/lucene/core/src/java/org/apache/lucene/util/IntroSorter.java ---------------------------------------------------------------------- diff --git a/lucene/core/src/java/org/apache/lucene/util/IntroSorter.java b/lucene/core/src/java/org/apache/lucene/util/IntroSorter.java index 26f7e37..83d118d 100644 --- a/lucene/core/src/java/org/apache/lucene/util/IntroSorter.java +++ b/lucene/core/src/java/org/apache/lucene/util/IntroSorter.java @@ -16,7 +16,6 @@ */ package org.apache.lucene.util; - /** * {@link Sorter} implementation based on a variant of the quicksort algorithm * called introsort: when @@ -91,4 +90,10 @@ public abstract class IntroSorter extends Sorter { /** Compare the pivot with the slot at `j`, similarly to * {@link #compare(int, int) compare(i, j)}. */ protected abstract int comparePivot(int j); + + @Override + protected int compare(int i, int j) { + setPivot(i); + return comparePivot(j); + } } http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/60975d2d/lucene/core/src/java/org/apache/lucene/util/RadixSelector.java ---------------------------------------------------------------------- diff --git a/lucene/core/src/java/org/apache/lucene/util/RadixSelector.java b/lucene/core/src/java/org/apache/lucene/util/RadixSelector.java new file mode 100644 index 0000000..543204a --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/util/RadixSelector.java @@ -0,0 +1,202 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.util; + +import java.util.Arrays; + +/** Radix selector. + *

This implementation works similarly to a MSB radix sort except that it + * only recurses into the sub partition that contains the desired value. + * @lucene.internal */ +public abstract class RadixSelector extends Selector { + + // after that many levels of recursion we fall back to introselect anyway + // this is used as a protection against the fact that radix sort performs + // worse when there are long common prefixes (probably because of cache + // locality) + private static final int LEVEL_THRESHOLD = 8; + // size of histograms: 256 + 1 to indicate that the string is finished + private static final int HISTOGRAM_SIZE = 257; + // buckets below this size will be sorted with introselect + private static final int LENGTH_THRESHOLD = 100; + + // we store one histogram per recursion level + private final int[] histogram = new int[HISTOGRAM_SIZE]; + + private final int maxLength; + + /** + * Sole constructor. + * @param maxLength the maximum length of keys, pass {@link Integer#MAX_VALUE} if unknown. + */ + protected RadixSelector(int maxLength) { + this.maxLength = maxLength; + } + + /** Return the k-th byte of the entry at index {@code i}, or {@code -1} if + * its length is less than or equal to {@code k}. This may only be called + * with a value of {@code i} between {@code 0} included and + * {@code maxLength} excluded. */ + protected abstract int byteAt(int i, int k); + + /** Get a fall-back selector which may assume that the first {@code d} bytes + * of all compared strings are equal. This fallback selector is used when + * the range becomes narrow or when the maximum level of recursion has + * been exceeded. */ + protected Selector getFallbackSelector(int d) { + return new IntroSelector() { + @Override + protected void swap(int i, int j) { + RadixSelector.this.swap(i, j); + } + + @Override + protected int compare(int i, int j) { + for (int o = d; o < maxLength; ++o) { + final int b1 = byteAt(i, o); + final int b2 = byteAt(j, o); + if (b1 != b2) { + return b1 - b2; + } else if (b1 == -1) { + break; + } + } + return 0; + } + + @Override + protected void setPivot(int i) { + pivot.setLength(0); + for (int o = d; o < maxLength; ++o) { + final int b = byteAt(i, o); + if (b == -1) { + break; + } + pivot.append((byte) b); + } + } + + @Override + protected int comparePivot(int j) { + for (int o = 0; o < pivot.length(); ++o) { + final int b1 = pivot.byteAt(o) & 0xff; + final int b2 = byteAt(j, d + o); + if (b1 != b2) { + return b1 - b2; + } + } + if (d + pivot.length() == maxLength) { + return 0; + } + return -1 - byteAt(j, d + pivot.length()); + } + + private final BytesRefBuilder pivot = new BytesRefBuilder(); + }; + } + + @Override + public void select(int from, int to, int k) { + checkArgs(from, to, k); + select(from, to, k, 0); + } + + private void select(int from, int to, int k, int d) { + if (to - from <= LENGTH_THRESHOLD || d >= LEVEL_THRESHOLD) { + getFallbackSelector(d).select(from, to, k); + } else { + radixSelect(from, to, k, d); + } + } + + private void radixSelect(int from, int to, int k, int d) { + final int[] histogram = this.histogram; + Arrays.fill(histogram, 0); + + buildHistogram(from, to, d, histogram); + + int bucketFrom = from; + for (int bucket = 0; bucket < HISTOGRAM_SIZE; ++bucket) { + final int bucketTo = bucketFrom + histogram[bucket]; + + if (bucketTo > k) { + partition(from, to, bucket, bucketFrom, bucketTo, d); + + if (bucket != 0 && d + 1 < maxLength) { + // all elements in bucket 0 are equal so we only need to recurse if bucket != 0 + select(bucketFrom, bucketTo, k, d + 1); + } + return; + } + bucketFrom = bucketTo; + } + throw new AssertionError("Unreachable code"); + } + + /** Return a number for the k-th character between 0 and {@link #HISTOGRAM_SIZE}. */ + private int getBucket(int i, int k) { + return byteAt(i, k) + 1; + } + + /** Build a histogram of the number of values per {@link #getBucket(int, int) bucket}. */ + private int[] buildHistogram(int from, int to, int k, int[] histogram) { + for (int i = from; i < to; ++i) { + histogram[getBucket(i, k)]++; + } + return histogram; + } + + /** Reorder elements so that all of them that fall into {@code bucket} are + * between offsets {@code bucketFrom} and {@code bucketTo}. */ + private void partition(int from, int to, int bucket, int bucketFrom, int bucketTo, int d) { + int left = from; + int right = to - 1; + + int slot = bucketFrom; + + for (;;) { + int leftBucket = getBucket(left, d); + int rightBucket = getBucket(right, d); + + while (leftBucket <= bucket && left < bucketFrom) { + if (leftBucket == bucket) { + swap(left, slot++); + } else { + ++left; + } + leftBucket = getBucket(left, d); + } + + while (rightBucket >= bucket && right >= bucketTo) { + if (rightBucket == bucket) { + swap(right, slot++); + } else { + --right; + } + rightBucket = getBucket(right, d); + } + + if (left < bucketFrom && right >= bucketTo) { + swap(left++, right--); + } else { + assert left == bucketFrom; + assert right == bucketTo - 1; + break; + } + } + } +} http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/60975d2d/lucene/core/src/java/org/apache/lucene/util/Selector.java ---------------------------------------------------------------------- diff --git a/lucene/core/src/java/org/apache/lucene/util/Selector.java b/lucene/core/src/java/org/apache/lucene/util/Selector.java new file mode 100644 index 0000000..8c17ce4 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/util/Selector.java @@ -0,0 +1,41 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.util; + +/** An implementation of a selection algorithm, ie. computing the k-th greatest + * value from a collection. */ +public abstract class Selector { + + /** Reorder elements so that the element at position {@code k} is the same + * as if all elements were sorted and all other elements are partitioned + * around it: {@code [from, k)} only contains elements that are less than + * or equal to {@code k} and {@code (k, to)} only contains elements that + * are greater than or equal to {@code k}. */ + public abstract void select(int from, int to, int k); + + void checkArgs(int from, int to, int k) { + if (k < from) { + throw new IllegalArgumentException("k must be >= from"); + } + if (k >= to) { + throw new IllegalArgumentException("k must be < to"); + } + } + + /** Swap values at slots `i` and `j`. */ + protected abstract void swap(int i, int j); +} http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/60975d2d/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java ---------------------------------------------------------------------- diff --git a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java index 97651e4..d0d7dca 100644 --- a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java +++ b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java @@ -25,6 +25,7 @@ import java.util.List; import java.util.function.IntFunction; import org.apache.lucene.codecs.CodecUtil; +import org.apache.lucene.codecs.MutablePointsReader; import org.apache.lucene.index.MergeState; import org.apache.lucene.index.PointValues.IntersectVisitor; import org.apache.lucene.index.PointValues.Relation; @@ -67,7 +68,7 @@ import org.apache.lucene.util.StringHelper; *

* See this paper for details. * - *

This consumes heap during writing: it allocates a `LongBitSet(numPoints)`, + *

This consumes heap during writing: it allocates a `LongBitSet(numPoints)`, * and then uses up to the specified {@code maxMBSortInHeap} heap space for writing. * *