From: pxiong@apache.org
To: commits@hive.apache.org
Date: Sat, 15 Jul 2017 08:07:00 -0000
Subject: [21/24] hive git commit: HIVE-16996: Add HLL as an alternative to FM sketch to compute stats (Pengcheng Xiong, reviewed by Ashutosh Chauhan, Prasanth Jayachandran)

http://git-wip-us.apache.org/repos/asf/hive/blob/b883d313/metastore/src/java/org/apache/hadoop/hive/metastore/NumDistinctValueEstimator.java
----------------------------------------------------------------------
diff --git a/metastore/src/java/org/apache/hadoop/hive/metastore/NumDistinctValueEstimator.java b/metastore/src/java/org/apache/hadoop/hive/metastore/NumDistinctValueEstimator.java
deleted file mode 100644
index 92f9a84..0000000
--- a/metastore/src/java/org/apache/hadoop/hive/metastore/NumDistinctValueEstimator.java
+++ /dev/null
@@ -1,367 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.hadoop.hive.metastore;
-
-import java.util.Random;
-
-import javolution.util.FastBitSet;
-
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
-import org.apache.hadoop.hive.common.type.HiveDecimal;
-import org.apache.hadoop.io.Text;
-
-/*
- * https://en.wikipedia.org/wiki/Flajolet%E2%80%93Martin_algorithm
- * This class implements the Flajolet-Martin algorithm, which approximates the number of
- * distinct elements in a stream in a single pass, with space consumption that is logarithmic
- * in the maximum possible number of distinct elements. The algorithm was introduced by
- * Philippe Flajolet and G. Nigel Martin in their 1984 paper "Probabilistic Counting
- * Algorithms for Data Base Applications". It was later refined in "LogLog counting of large
- * cardinalities" by Marianne Durand and Philippe Flajolet, and in "HyperLogLog: The analysis
- * of a near-optimal cardinality estimation algorithm" by Philippe Flajolet et al.
- */
-
-/*
- * The algorithm works like this:
- * (1) Set the number of bit vectors, i.e., numBitVectors, based on the precision.
- * (2) For each bit vector, generate a hash value of the long value and mod it by
- *     2^bitVectorSize - 1 (addToEstimator).
- * (3) Set the index (addToEstimator).
- * (4) Take the average of the index over all the bit vectors to get the estimated NDV
- *     (estimateNumDistinctValues).
- */
-public class NumDistinctValueEstimator {
-
-  static final Log LOG = LogFactory.getLog(NumDistinctValueEstimator.class.getName());
-
-  /* We want a, b, x to come from a finite field of size 0 to k, where k is a prime number.
-   * 2^p - 1 is prime for p = 31, hence bitVectorSize has to be 31. Pick k to be 2^p - 1.
-   * If a, b, x didn't come from a finite field, a*x1 + b mod k and a*x2 + b mod k would not
-   * be pairwise independent. As a consequence, the hash values would not distribute uniformly
-   * over 0 to 2^p - 1, introducing errors into the estimates.
-   */
-  private static final int BIT_VECTOR_SIZE = 31;
-  private final int numBitVectors;
-
-  // Refer to Flajolet-Martin'86 for the value of phi
-  private static final double PHI = 0.77351;
-
-  private final int[] a;
-  private final int[] b;
-  private final FastBitSet[] bitVector;
-
-  private final Random aValue;
-  private final Random bValue;
-
-  /* Create a new distinctValueEstimator
-   */
-  public NumDistinctValueEstimator(int numBitVectors) {
-    this.numBitVectors = numBitVectors;
-    bitVector = new FastBitSet[numBitVectors];
-    for (int i = 0; i < numBitVectors; i++) {
-      bitVector[i] = new FastBitSet(BIT_VECTOR_SIZE);
-    }
-
-    a = new int[numBitVectors];
-    b = new int[numBitVectors];
-
-    /* Use a large prime number as a seed to the random number generator.
-     * Java's random number generator is a linear congruential generator with the recurrence
-     *
-     *   X(n+1) = (a * X(n) + c) mod m
-     *
-     * where X0 is the seed. Java's implementation uses m = 2^48. This is problematic because
-     * 2^48 is not prime, so the numbers from 0 to m don't form a finite field, and any given
-     * X(n) and X(n+1) may not be pairwise independent.
-     *
-     * However, empirically, passing prime numbers as seeds works better than passing composite
-     * numbers. Ideally Java's Random would pick m such that m is prime.
-     */
-    aValue = new Random(99397);
-    bValue = new Random(9876413);
-
-    for (int i = 0; i < numBitVectors; i++) {
-      int randVal;
-      /* a and b shouldn't be even; if a and b are even, then none of the values
-       * will set bit 0, introducing errors into the estimate. Both a and b can be even
-       * 25% of the time, so 25% of the bit vectors could be inaccurate. To avoid this,
-       * always pick odd values for a and b.
-       */
-      do {
-        randVal = aValue.nextInt();
-      } while (randVal % 2 == 0);
-
-      a[i] = randVal;
-
-      do {
-        randVal = bValue.nextInt();
-      } while (randVal % 2 == 0);
-
-      b[i] = randVal;
-
-      if (a[i] < 0) {
-        a[i] = a[i] + (1 << BIT_VECTOR_SIZE - 1);
-      }
-
-      if (b[i] < 0) {
-        b[i] = b[i] + (1 << BIT_VECTOR_SIZE - 1);
-      }
-    }
-  }
-
-  public NumDistinctValueEstimator(String s, int numBitVectors) {
-    this.numBitVectors = numBitVectors;
-    FastBitSet bitVectorDeser[] = deserialize(s, numBitVectors);
-    bitVector = new FastBitSet[numBitVectors];
-    for (int i = 0; i < numBitVectors; i++) {
[... several methods of the deleted file were lost in the archived message ...]
-      if (c >= '0' && c <= '9') {
-        String t = new String();
-        t = t + c;
-        c = s.charAt(i);
-        i = i + 1;
-
-        while (c != ',' && c != '}') {
-          t = t + c;
-          c = s.charAt(i);
-          i = i + 1;
-        }
-
-        int bitIndex = Integer.parseInt(t);
-        assert(bitIndex >= 0);
-        assert(vectorIndex < numBitVectors);
-        b[vectorIndex].set(bitIndex);
-        if (c == '}') {
-          vectorIndex = vectorIndex + 1;
-        }
-      }
-    }
-    return b;
-  }
-
-  private int generateHash(long v, int hashNum) {
-    int mod = (1 <<
[...]
-        hash = hash >> 1;
-      }
-
-      // Set bitvector[index] := 1
-      bitVector[i].set(index);
-    }
-  }
-
-  public void addToEstimatorPCSA(long v) {
-    int hash = generateHashForPCSA(v);
-    int rho = hash / numBitVectors;
-    int index;
-
-    // Find the index of the least significant bit that is 1
-    for (index = 0; index <
[...]
-      rho = rho >> 1;
-    }
-
-    // Set bitvector[index] := 1
-    bitVector[hash % numBitVectors].set(index);
-  }
-
-  public void addToEstimator(double d) {
-    int v = new Double(d).hashCode();
-    addToEstimator(v);
-  }
-
-  public void addToEstimatorPCSA(double d) {
-    int v = new Double(d).hashCode();
-    addToEstimatorPCSA(v);
-  }
-
-  public void addToEstimator(HiveDecimal decimal) {
-    int v = decimal.hashCode();
-    addToEstimator(v);
-  }
-
-  public void addToEstimatorPCSA(HiveDecimal decimal) {
-    int v = decimal.hashCode();
-    addToEstimatorPCSA(v);
-  }
-
-  public void mergeEstimators(NumDistinctValueEstimator o) {
-    // Bitwise OR the bitvector with the bitvector in the agg buffer
-    for (int i = 0; i <
[... remainder of the deleted file and the StatsCache.java diff header were lost in the archived message ...]
         new CacheLoader<StatsCacheKey, AggrStats>() {
           @Override
           public AggrStats load(StatsCacheKey key) throws Exception {
-            int numBitVectors = HiveStatsUtils.getNumBitVectorsForNDVEstimation(conf);
             boolean useDensityFunctionForNDVEstimation = HiveConf.getBoolVar(conf,
                 HiveConf.ConfVars.HIVE_METASTORE_STATS_NDV_DENSITY_FUNCTION);
             HBaseReadWrite hrw = HBaseReadWrite.getInstance();
             AggrStats aggrStats = hrw.getAggregatedStats(key.hashed);
@@ -101,7 +100,7 @@ class StatsCache {
       if (aggregator == null) {
         aggregator = ColumnStatsAggregatorFactory.getColumnStatsAggregator(css.iterator()
             .next().getStatsObj().iterator().next().getStatsData().getSetField(),
-            numBitVectors, useDensityFunctionForNDVEstimation);
+            useDensityFunctionForNDVEstimation);
       }
       ColumnStatisticsObj statsObj = aggregator
           .aggregate(key.colName, key.partNames, css);
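For orientation: the class deleted above is the FM (Flajolet-Martin) sketch that HIVE-16996 turns into one of two pluggable estimators. The following compact rendering of the same idea is an illustration only, not the deleted Hive code: it uses java.util.BitSet instead of javolution's FastBitSet, folds steps (2)-(4) of the class comment into one file, and all names in it are invented.

import java.util.BitSet;
import java.util.Random;

public class FmSketchDemo {
  private static final int BIT_VECTOR_SIZE = 31;
  private static final double PHI = 0.77351;   // Flajolet-Martin'86 correction constant

  private final int numVectors;
  private final BitSet[] vectors;
  private final long[] a, b;

  public FmSketchDemo(int numVectors) {
    this.numVectors = numVectors;
    this.vectors = new BitSet[numVectors];
    this.a = new long[numVectors];
    this.b = new long[numVectors];
    Random r = new Random(99397);              // prime seed, as in the deleted class
    for (int i = 0; i < numVectors; i++) {
      vectors[i] = new BitSet(BIT_VECTOR_SIZE);
      a[i] = r.nextInt() | 1;                  // keep a and b odd so bit 0 can be set
      b[i] = r.nextInt() | 1;
    }
  }

  public void add(long v) {
    long mod = (1L << BIT_VECTOR_SIZE) - 1;    // 2^31 - 1, a prime
    for (int i = 0; i < numVectors; i++) {
      long hash = (a[i] * v + b[i]) % mod;
      if (hash < 0) {
        hash += mod;                           // hash must land in 0 .. 2^p - 1
      }
      // record the position of the least significant set bit of the hash
      int index = (hash == 0) ? BIT_VECTOR_SIZE - 1 : Long.numberOfTrailingZeros(hash);
      vectors[i].set(index);
    }
  }

  public long estimate() {
    // step (4): average, over all vectors, the lowest unset bit position R,
    // then estimate NDV as 2^avg(R) / PHI
    double sum = 0;
    for (BitSet bs : vectors) {
      sum += bs.nextClearBit(0);
    }
    return (long) (Math.pow(2, sum / numVectors) / PHI);
  }

  public static void main(String[] args) {
    FmSketchDemo fm = new FmSketchDemo(16);
    for (long v = 0; v < 10000; v++) {
      fm.add(v % 1000);                        // 1000 distinct values
    }
    System.out.println("estimated NDV: " + fm.estimate());
  }
}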
http://git-wip-us.apache.org/repos/asf/hive/blob/b883d313/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/ColumnStatsAggregator.java
----------------------------------------------------------------------
diff --git a/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/ColumnStatsAggregator.java b/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/ColumnStatsAggregator.java
index 31955b4..29a0539 100644
--- a/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/ColumnStatsAggregator.java
+++ b/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/ColumnStatsAggregator.java
@@ -26,7 +26,6 @@ import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj;
 import org.apache.hadoop.hive.metastore.api.MetaException;
 
 public abstract class ColumnStatsAggregator {
-  public int numBitVectors;
   public boolean useDensityFunctionForNDVEstimation;
 
   public abstract ColumnStatisticsObj aggregate(String colName, List<String> partNames,

http://git-wip-us.apache.org/repos/asf/hive/blob/b883d313/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/ColumnStatsAggregatorFactory.java
----------------------------------------------------------------------
diff --git a/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/ColumnStatsAggregatorFactory.java b/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/ColumnStatsAggregatorFactory.java
index daf8569..568bf06 100644
--- a/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/ColumnStatsAggregatorFactory.java
+++ b/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/ColumnStatsAggregatorFactory.java
@@ -34,7 +34,7 @@ public class ColumnStatsAggregatorFactory {
   private ColumnStatsAggregatorFactory() {
   }
 
-  public static ColumnStatsAggregator getColumnStatsAggregator(_Fields type, int numBitVectors, boolean useDensityFunctionForNDVEstimation) {
+  public static ColumnStatsAggregator getColumnStatsAggregator(_Fields type, boolean useDensityFunctionForNDVEstimation) {
     ColumnStatsAggregator agg;
     switch (type) {
     case BOOLEAN_STATS:
@@ -58,7 +58,6 @@ public class ColumnStatsAggregatorFactory {
     default:
       throw new RuntimeException("Woh, bad.  Unknown stats type " + type.toString());
     }
-    agg.numBitVectors = numBitVectors;
     agg.useDensityFunctionForNDVEstimation = useDensityFunctionForNDVEstimation;
     return agg;
   }

http://git-wip-us.apache.org/repos/asf/hive/blob/b883d313/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/DecimalColumnStatsAggregator.java
----------------------------------------------------------------------
diff --git a/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/DecimalColumnStatsAggregator.java b/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/DecimalColumnStatsAggregator.java
index 36b2c9c..8eb64e0 100644
--- a/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/DecimalColumnStatsAggregator.java
+++ b/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/DecimalColumnStatsAggregator.java
@@ -26,7 +26,8 @@ import java.util.LinkedList;
 import java.util.List;
 import java.util.Map;
 
-import org.apache.hadoop.hive.metastore.NumDistinctValueEstimator;
+import org.apache.hadoop.hive.common.ndv.NumDistinctValueEstimator;
+import org.apache.hadoop.hive.common.ndv.NumDistinctValueEstimatorFactory;
 import org.apache.hadoop.hive.metastore.StatObjectConverter;
 import org.apache.hadoop.hive.metastore.api.ColumnStatistics;
 import org.apache.hadoop.hive.metastore.api.ColumnStatisticsData;
@@ -46,7 +47,7 @@ public class DecimalColumnStatsAggregator extends ColumnStatsAggregator implements
     // check if all the ColumnStatisticsObjs contain stats and all the ndv are
     // bitvectors
     boolean doAllPartitionContainStats = partNames.size() == css.size();
-    boolean isNDVBitVectorSet = true;
+    NumDistinctValueEstimator ndvEstimator = null;
     String colType = null;
     for (ColumnStatistics cs : css) {
       if (cs.getStatsObjSize() != 1) {
@@ -60,22 +61,36 @@
         statsObj = ColumnStatsAggregatorFactory.newColumnStaticsObj(colName, colType, cso
             .getStatsData().getSetField());
       }
-      if (numBitVectors <= 0 || !cso.getStatsData().getDecimalStats().isSetBitVectors()
+      if (!cso.getStatsData().getDecimalStats().isSetBitVectors()
           || cso.getStatsData().getDecimalStats().getBitVectors().length() == 0) {
-        isNDVBitVectorSet = false;
+        ndvEstimator = null;
         break;
+      } else {
+        // check if all of the bit vectors can merge
+        NumDistinctValueEstimator estimator = NumDistinctValueEstimatorFactory
+            .getNumDistinctValueEstimator(cso.getStatsData().getDecimalStats().getBitVectors());
+        if (ndvEstimator == null) {
+          ndvEstimator = estimator;
+        } else {
+          if (ndvEstimator.canMerge(estimator)) {
+            continue;
+          } else {
+            ndvEstimator = null;
+            break;
+          }
+        }
       }
     }
+    if (ndvEstimator != null) {
+      ndvEstimator = NumDistinctValueEstimatorFactory
+          .getEmptyNumDistinctValueEstimator(ndvEstimator);
+    }
     ColumnStatisticsData columnStatisticsData = new ColumnStatisticsData();
     if (doAllPartitionContainStats || css.size() < 2) {
       DecimalColumnStatsData aggregateData = null;
       long lowerBound = 0;
       long higherBound = 0;
       double densityAvgSum = 0.0;
-      NumDistinctValueEstimator ndvEstimator = null;
-      if (isNDVBitVectorSet) {
-        ndvEstimator = new NumDistinctValueEstimator(numBitVectors);
-      }
       for (ColumnStatistics cs : css) {
         ColumnStatisticsObj cso = cs.getStatsObjIterator().next();
         DecimalColumnStatsData newData = cso.getStatsData().getDecimalStats();
@@ -85,9 +100,9 @@
           densityAvgSum += (HBaseUtils.getDoubleValue(newData.getHighValue()) - HBaseUtils
              .getDoubleValue(newData.getLowValue())) / newData.getNumDVs();
         }
-        if (isNDVBitVectorSet) {
-          ndvEstimator.mergeEstimators(new NumDistinctValueEstimator(newData.getBitVectors(),
-              ndvEstimator.getnumBitVectors()));
+        if (ndvEstimator != null) {
+          ndvEstimator.mergeEstimators(NumDistinctValueEstimatorFactory
+              .getNumDistinctValueEstimator(newData.getBitVectors()));
         }
         if (aggregateData == null) {
           aggregateData = newData.deepCopy();
@@ -108,7 +123,7 @@
           aggregateData.setNumDVs(Math.max(aggregateData.getNumDVs(), newData.getNumDVs()));
         }
       }
-      if (isNDVBitVectorSet) {
+      if (ndvEstimator != null) {
         // if all the ColumnStatisticsObjs contain bitvectors, we do not need to
         // use uniform distribution assumption because we can merge bitvectors
         // to get a good estimation.
@@ -145,7 +160,7 @@
       // while we scan the css, we also get the densityAvg, lowerbound and
       // higerbound when useDensityFunctionForNDVEstimation is true.
       double densityAvgSum = 0.0;
-      if (!isNDVBitVectorSet) {
+      if (ndvEstimator == null) {
         // if not every partition uses bitvector for ndv, we just fall back to
         // the traditional extrapolation methods.
         for (ColumnStatistics cs : css) {
@@ -162,7 +177,6 @@
       } else {
         // we first merge all the adjacent bitvectors that we could merge and
         // derive new partition names and index.
-        NumDistinctValueEstimator ndvEstimator = new NumDistinctValueEstimator(numBitVectors);
         StringBuilder pseudoPartName = new StringBuilder();
         double pseudoIndexSum = 0;
         int length = 0;
@@ -191,6 +205,7 @@
             pseudoPartName = new StringBuilder();
             pseudoIndexSum = 0;
             length = 0;
+            ndvEstimator = NumDistinctValueEstimatorFactory.getEmptyNumDistinctValueEstimator(ndvEstimator);
           }
           aggregateData = null;
         }
@@ -216,8 +231,8 @@
           }
           aggregateData.setNumNulls(aggregateData.getNumNulls() + newData.getNumNulls());
         }
-        ndvEstimator.mergeEstimators(new NumDistinctValueEstimator(newData.getBitVectors(),
-            ndvEstimator.getnumBitVectors()));
+        ndvEstimator.mergeEstimators(NumDistinctValueEstimatorFactory
+            .getNumDistinctValueEstimator(newData.getBitVectors()));
       }
       if (length > 0) {
         // we have to set ndv
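The Decimal aggregator above (and the Double, Long, and String aggregators that follow) all repeat the same handshake: deserialize each partition's sketch through the factory, verify pairwise mergeability with canMerge(), then restart from an empty estimator of the same type. Distilled into one hypothetical helper (the method name is invented; the API calls are the ones this patch introduces):

import java.util.List;

import org.apache.hadoop.hive.common.ndv.NumDistinctValueEstimator;
import org.apache.hadoop.hive.common.ndv.NumDistinctValueEstimatorFactory;

public class NdvMergeCheck {
  /**
   * Returns an empty estimator compatible with every serialized sketch, or null
   * if any sketch is missing or the sketches cannot be merged (for example an
   * FM sketch mixed with an HLL, or mismatched parameters).
   */
  static NumDistinctValueEstimator compatibleEstimator(List<String> serializedSketches) {
    NumDistinctValueEstimator ndvEstimator = null;
    for (String bitVectors : serializedSketches) {
      if (bitVectors == null || bitVectors.length() == 0) {
        return null;                     // a partition without a sketch: fall back
      }
      NumDistinctValueEstimator estimator =
          NumDistinctValueEstimatorFactory.getNumDistinctValueEstimator(bitVectors);
      if (ndvEstimator == null) {
        ndvEstimator = estimator;
      } else if (!ndvEstimator.canMerge(estimator)) {
        return null;                     // incompatible sketch types or parameters
      }
    }
    // start from a blank sketch of the same type before merging partitions in
    return ndvEstimator == null ? null
        : NumDistinctValueEstimatorFactory.getEmptyNumDistinctValueEstimator(ndvEstimator);
  }
}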
http://git-wip-us.apache.org/repos/asf/hive/blob/b883d313/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/DoubleColumnStatsAggregator.java
----------------------------------------------------------------------
diff --git a/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/DoubleColumnStatsAggregator.java b/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/DoubleColumnStatsAggregator.java
index a88ef84..b6b8612 100644
--- a/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/DoubleColumnStatsAggregator.java
+++ b/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/DoubleColumnStatsAggregator.java
@@ -26,7 +26,8 @@ import java.util.LinkedList;
 import java.util.List;
 import java.util.Map;
 
-import org.apache.hadoop.hive.metastore.NumDistinctValueEstimator;
+import org.apache.hadoop.hive.common.ndv.NumDistinctValueEstimator;
+import org.apache.hadoop.hive.common.ndv.NumDistinctValueEstimatorFactory;
 import org.apache.hadoop.hive.metastore.api.ColumnStatistics;
 import org.apache.hadoop.hive.metastore.api.ColumnStatisticsData;
 import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj;
@@ -44,7 +45,7 @@ public class DoubleColumnStatsAggregator extends ColumnStatsAggregator implements
     // check if all the ColumnStatisticsObjs contain stats and all the ndv are
     // bitvectors
     boolean doAllPartitionContainStats = partNames.size() == css.size();
-    boolean isNDVBitVectorSet = true;
+    NumDistinctValueEstimator ndvEstimator = null;
     String colType = null;
     for (ColumnStatistics cs : css) {
       if (cs.getStatsObjSize() != 1) {
@@ -58,22 +59,36 @@
         statsObj = ColumnStatsAggregatorFactory.newColumnStaticsObj(colName, colType, cso
             .getStatsData().getSetField());
       }
-      if (numBitVectors <= 0 || !cso.getStatsData().getDoubleStats().isSetBitVectors()
+      if (!cso.getStatsData().getDoubleStats().isSetBitVectors()
          || cso.getStatsData().getDoubleStats().getBitVectors().length() == 0) {
-        isNDVBitVectorSet = false;
+        ndvEstimator = null;
         break;
+      } else {
+        // check if all of the bit vectors can merge
+        NumDistinctValueEstimator estimator = NumDistinctValueEstimatorFactory
+            .getNumDistinctValueEstimator(cso.getStatsData().getDoubleStats().getBitVectors());
+        if (ndvEstimator == null) {
+          ndvEstimator = estimator;
+        } else {
+          if (ndvEstimator.canMerge(estimator)) {
+            continue;
+          } else {
+            ndvEstimator = null;
+            break;
+          }
+        }
       }
     }
+    if (ndvEstimator != null) {
+      ndvEstimator = NumDistinctValueEstimatorFactory
+          .getEmptyNumDistinctValueEstimator(ndvEstimator);
+    }
     ColumnStatisticsData columnStatisticsData = new ColumnStatisticsData();
     if (doAllPartitionContainStats || css.size() < 2) {
       DoubleColumnStatsData aggregateData = null;
       long lowerBound = 0;
       long higherBound = 0;
       double densityAvgSum = 0.0;
-      NumDistinctValueEstimator ndvEstimator = null;
-      if (isNDVBitVectorSet) {
-        ndvEstimator = new NumDistinctValueEstimator(numBitVectors);
-      }
       for (ColumnStatistics cs : css) {
         ColumnStatisticsObj cso = cs.getStatsObjIterator().next();
         DoubleColumnStatsData newData = cso.getStatsData().getDoubleStats();
@@ -82,9 +97,9 @@
           higherBound += newData.getNumDVs();
           densityAvgSum += (newData.getHighValue() - newData.getLowValue()) / newData.getNumDVs();
         }
-        if (isNDVBitVectorSet) {
-          ndvEstimator.mergeEstimators(new NumDistinctValueEstimator(newData.getBitVectors(),
-              ndvEstimator.getnumBitVectors()));
+        if (ndvEstimator != null) {
+          ndvEstimator.mergeEstimators(NumDistinctValueEstimatorFactory
+              .getNumDistinctValueEstimator(newData.getBitVectors()));
         }
         if (aggregateData == null) {
           aggregateData = newData.deepCopy();
@@ -96,7 +111,7 @@
           aggregateData.setNumDVs(Math.max(aggregateData.getNumDVs(), newData.getNumDVs()));
         }
       }
-      if (isNDVBitVectorSet) {
+      if (ndvEstimator != null) {
        // if all the ColumnStatisticsObjs contain bitvectors, we do not need to
        // use uniform distribution assumption because we can merge bitvectors
        // to get a good estimation.
@@ -132,7 +147,7 @@
       // while we scan the css, we also get the densityAvg, lowerbound and
       // higerbound when useDensityFunctionForNDVEstimation is true.
       double densityAvgSum = 0.0;
-      if (!isNDVBitVectorSet) {
+      if (ndvEstimator == null) {
         // if not every partition uses bitvector for ndv, we just fall back to
         // the traditional extrapolation methods.
         for (ColumnStatistics cs : css) {
@@ -148,7 +163,6 @@
       } else {
         // we first merge all the adjacent bitvectors that we could merge and
         // derive new partition names and index.
-        NumDistinctValueEstimator ndvEstimator = new NumDistinctValueEstimator(numBitVectors);
         StringBuilder pseudoPartName = new StringBuilder();
         double pseudoIndexSum = 0;
         int length = 0;
@@ -176,6 +190,7 @@
             pseudoPartName = new StringBuilder();
             pseudoIndexSum = 0;
             length = 0;
+            ndvEstimator = NumDistinctValueEstimatorFactory.getEmptyNumDistinctValueEstimator(ndvEstimator);
           }
           aggregateData = null;
         }
@@ -192,8 +207,8 @@
               newData.getHighValue()));
           aggregateData.setNumNulls(aggregateData.getNumNulls() + newData.getNumNulls());
         }
-        ndvEstimator.mergeEstimators(new NumDistinctValueEstimator(newData.getBitVectors(),
-            ndvEstimator.getnumBitVectors()));
+        ndvEstimator.mergeEstimators(NumDistinctValueEstimatorFactory
+            .getNumDistinctValueEstimator(newData.getBitVectors()));
       }
       if (length > 0) {
         // we have to set ndv
http://git-wip-us.apache.org/repos/asf/hive/blob/b883d313/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/LongColumnStatsAggregator.java
----------------------------------------------------------------------
diff --git a/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/LongColumnStatsAggregator.java b/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/LongColumnStatsAggregator.java
index 8ac6561..2da6f60 100644
--- a/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/LongColumnStatsAggregator.java
+++ b/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/LongColumnStatsAggregator.java
@@ -26,7 +26,8 @@ import java.util.LinkedList;
 import java.util.List;
 import java.util.Map;
 
-import org.apache.hadoop.hive.metastore.NumDistinctValueEstimator;
+import org.apache.hadoop.hive.common.ndv.NumDistinctValueEstimator;
+import org.apache.hadoop.hive.common.ndv.NumDistinctValueEstimatorFactory;
 import org.apache.hadoop.hive.metastore.api.ColumnStatistics;
 import org.apache.hadoop.hive.metastore.api.ColumnStatisticsData;
 import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj;
@@ -44,7 +45,7 @@ public class LongColumnStatsAggregator extends ColumnStatsAggregator implements
     // check if all the ColumnStatisticsObjs contain stats and all the ndv are
     // bitvectors
     boolean doAllPartitionContainStats = partNames.size() == css.size();
-    boolean isNDVBitVectorSet = true;
+    NumDistinctValueEstimator ndvEstimator = null;
     String colType = null;
     for (ColumnStatistics cs : css) {
       if (cs.getStatsObjSize() != 1) {
@@ -58,22 +59,36 @@
         statsObj = ColumnStatsAggregatorFactory.newColumnStaticsObj(colName, colType, cso
             .getStatsData().getSetField());
       }
-      if (numBitVectors <= 0 || !cso.getStatsData().getLongStats().isSetBitVectors()
+      if (!cso.getStatsData().getLongStats().isSetBitVectors()
          || cso.getStatsData().getLongStats().getBitVectors().length() == 0) {
-        isNDVBitVectorSet = false;
+        ndvEstimator = null;
         break;
+      } else {
+        // check if all of the bit vectors can merge
+        NumDistinctValueEstimator estimator = NumDistinctValueEstimatorFactory
+            .getNumDistinctValueEstimator(cso.getStatsData().getLongStats().getBitVectors());
+        if (ndvEstimator == null) {
+          ndvEstimator = estimator;
+        } else {
+          if (ndvEstimator.canMerge(estimator)) {
+            continue;
+          } else {
+            ndvEstimator = null;
+            break;
+          }
+        }
       }
     }
+    if (ndvEstimator != null) {
+      ndvEstimator = NumDistinctValueEstimatorFactory
+          .getEmptyNumDistinctValueEstimator(ndvEstimator);
+    }
     ColumnStatisticsData columnStatisticsData = new ColumnStatisticsData();
     if (doAllPartitionContainStats || css.size() < 2) {
       LongColumnStatsData aggregateData = null;
       long lowerBound = 0;
       long higherBound = 0;
       double densityAvgSum = 0.0;
-      NumDistinctValueEstimator ndvEstimator = null;
-      if (isNDVBitVectorSet) {
-        ndvEstimator = new NumDistinctValueEstimator(numBitVectors);
-      }
       for (ColumnStatistics cs : css) {
         ColumnStatisticsObj cso = cs.getStatsObjIterator().next();
         LongColumnStatsData newData = cso.getStatsData().getLongStats();
@@ -82,9 +97,9 @@
           higherBound += newData.getNumDVs();
           densityAvgSum += (newData.getHighValue() - newData.getLowValue()) / newData.getNumDVs();
         }
-        if (isNDVBitVectorSet) {
-          ndvEstimator.mergeEstimators(new NumDistinctValueEstimator(newData.getBitVectors(),
-              ndvEstimator.getnumBitVectors()));
+        if (ndvEstimator != null) {
+          ndvEstimator.mergeEstimators(NumDistinctValueEstimatorFactory
+              .getNumDistinctValueEstimator(newData.getBitVectors()));
         }
         if (aggregateData == null) {
           aggregateData = newData.deepCopy();
@@ -96,7 +111,7 @@
           aggregateData.setNumDVs(Math.max(aggregateData.getNumDVs(), newData.getNumDVs()));
         }
       }
-      if (isNDVBitVectorSet) {
+      if (ndvEstimator != null) {
         // if all the ColumnStatisticsObjs contain bitvectors, we do not need to
         // use uniform distribution assumption because we can merge bitvectors
         // to get a good estimation.
@@ -132,7 +147,7 @@
       // while we scan the css, we also get the densityAvg, lowerbound and
       // higerbound when useDensityFunctionForNDVEstimation is true.
       double densityAvgSum = 0.0;
-      if (!isNDVBitVectorSet) {
+      if (ndvEstimator == null) {
         // if not every partition uses bitvector for ndv, we just fall back to
         // the traditional extrapolation methods.
         for (ColumnStatistics cs : css) {
@@ -148,7 +163,6 @@
       } else {
         // we first merge all the adjacent bitvectors that we could merge and
         // derive new partition names and index.
-        NumDistinctValueEstimator ndvEstimator = new NumDistinctValueEstimator(numBitVectors);
         StringBuilder pseudoPartName = new StringBuilder();
         double pseudoIndexSum = 0;
         int length = 0;
@@ -176,6 +190,7 @@
             pseudoPartName = new StringBuilder();
             pseudoIndexSum = 0;
             length = 0;
+            ndvEstimator = NumDistinctValueEstimatorFactory.getEmptyNumDistinctValueEstimator(ndvEstimator);
           }
           aggregateData = null;
         }
@@ -192,8 +207,8 @@
               newData.getHighValue()));
           aggregateData.setNumNulls(aggregateData.getNumNulls() + newData.getNumNulls());
         }
-        ndvEstimator.mergeEstimators(new NumDistinctValueEstimator(newData.getBitVectors(),
-            ndvEstimator.getnumBitVectors()));
+        ndvEstimator.mergeEstimators(NumDistinctValueEstimatorFactory
+            .getNumDistinctValueEstimator(newData.getBitVectors()));
       }
       if (length > 0) {
         // we have to set ndv
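The extrapolation branch shared by these aggregators merges runs of adjacent, mutually mergeable partition sketches into "pseudo partitions" before extrapolating. A simplified sketch of just the run-splitting idea (names invented; it ignores the pseudo partition names and index bookkeeping the real code maintains):

import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.hive.common.ndv.NumDistinctValueEstimator;
import org.apache.hadoop.hive.common.ndv.NumDistinctValueEstimatorFactory;

public class PseudoPartitionNdv {
  /** NDV estimates for maximal runs of adjacent partitions whose sketches merge. */
  static List<Long> ndvPerPseudoPartition(List<String> sketchesInPartitionOrder) {
    List<Long> ndvs = new ArrayList<>();
    NumDistinctValueEstimator run = null;
    for (String s : sketchesInPartitionOrder) {
      NumDistinctValueEstimator est =
          NumDistinctValueEstimatorFactory.getNumDistinctValueEstimator(s);
      if (run == null) {
        run = NumDistinctValueEstimatorFactory.getEmptyNumDistinctValueEstimator(est);
      } else if (!run.canMerge(est)) {
        // close the current pseudo partition and start a new one
        ndvs.add(run.estimateNumDistinctValues());
        run = NumDistinctValueEstimatorFactory.getEmptyNumDistinctValueEstimator(est);
      }
      run.mergeEstimators(est);
    }
    if (run != null) {
      ndvs.add(run.estimateNumDistinctValues());
    }
    return ndvs;
  }
}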
http://git-wip-us.apache.org/repos/asf/hive/blob/b883d313/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/StringColumnStatsAggregator.java
----------------------------------------------------------------------
diff --git a/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/StringColumnStatsAggregator.java b/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/StringColumnStatsAggregator.java
index 2aa4046..83c6c54 100644
--- a/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/StringColumnStatsAggregator.java
+++ b/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/StringColumnStatsAggregator.java
@@ -21,7 +21,8 @@ package org.apache.hadoop.hive.metastore.hbase.stats;
 
 import java.util.List;
 
-import org.apache.hadoop.hive.metastore.NumDistinctValueEstimator;
+import org.apache.hadoop.hive.common.ndv.NumDistinctValueEstimator;
+import org.apache.hadoop.hive.common.ndv.NumDistinctValueEstimatorFactory;
 import org.apache.hadoop.hive.metastore.api.ColumnStatistics;
 import org.apache.hadoop.hive.metastore.api.ColumnStatisticsData;
 import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj;
@@ -39,7 +40,7 @@ public class StringColumnStatsAggregator extends ColumnStatsAggregator {
     // bitvectors. Only when both of the conditions are true, we merge bit
     // vectors. Otherwise, just use the maximum function.
     boolean doAllPartitionContainStats = partNames.size() == css.size();
-    boolean isNDVBitVectorSet = true;
+    NumDistinctValueEstimator ndvEstimator = null;
     String colType = null;
     for (ColumnStatistics cs : css) {
       if (cs.getStatsObjSize() != 1) {
@@ -53,21 +54,37 @@
         statsObj = ColumnStatsAggregatorFactory.newColumnStaticsObj(colName, colType, cso
             .getStatsData().getSetField());
       }
-      if (numBitVectors <= 0 || !cso.getStatsData().getStringStats().isSetBitVectors()
+      if (!cso.getStatsData().getStringStats().isSetBitVectors()
          || cso.getStatsData().getStringStats().getBitVectors().length() == 0) {
-        isNDVBitVectorSet = false;
+        ndvEstimator = null;
         break;
+      } else {
+        // check if all of the bit vectors can merge
+        NumDistinctValueEstimator estimator = NumDistinctValueEstimatorFactory
+            .getNumDistinctValueEstimator(cso.getStatsData().getStringStats().getBitVectors());
+        if (ndvEstimator == null) {
+          ndvEstimator = estimator;
+        } else {
+          if (ndvEstimator.canMerge(estimator)) {
+            continue;
+          } else {
+            ndvEstimator = null;
+            break;
+          }
+        }
       }
     }
+    if (ndvEstimator != null) {
+      ndvEstimator = NumDistinctValueEstimatorFactory.getEmptyNumDistinctValueEstimator(ndvEstimator);
+    }
     ColumnStatisticsData columnStatisticsData = new ColumnStatisticsData();
-    if (doAllPartitionContainStats && isNDVBitVectorSet) {
+    if (doAllPartitionContainStats && ndvEstimator != null) {
       StringColumnStatsData aggregateData = null;
-      NumDistinctValueEstimator ndvEstimator = new NumDistinctValueEstimator(numBitVectors);
       for (ColumnStatistics cs : css) {
         ColumnStatisticsObj cso = cs.getStatsObjIterator().next();
         StringColumnStatsData newData = cso.getStatsData().getStringStats();
-        ndvEstimator.mergeEstimators(new NumDistinctValueEstimator(newData.getBitVectors(),
-            ndvEstimator.getnumBitVectors()));
+        ndvEstimator.mergeEstimators(NumDistinctValueEstimatorFactory
+            .getNumDistinctValueEstimator(newData.getBitVectors()));
         if (aggregateData == null) {
           aggregateData = newData.deepCopy();
         } else {

http://git-wip-us.apache.org/repos/asf/hive/blob/b883d313/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/ColumnStatsMerger.java
----------------------------------------------------------------------
diff --git a/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/ColumnStatsMerger.java b/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/ColumnStatsMerger.java
index 33c7e3e..d3051a2 100644
--- a/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/ColumnStatsMerger.java
+++ b/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/ColumnStatsMerger.java
@@ -19,7 +19,6 @@
 package org.apache.hadoop.hive.metastore.hbase.stats.merge;
 
-import org.apache.hadoop.hive.metastore.NumDistinctValueEstimator;
 import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -27,8 +26,6 @@ import org.slf4j.LoggerFactory;
 public abstract class ColumnStatsMerger {
   protected final Logger LOG = LoggerFactory.getLogger(ColumnStatsMerger.class.getName());
 
-  NumDistinctValueEstimator ndvEstimator = null;
-
   public abstract void merge(ColumnStatisticsObj aggregateColStats, ColumnStatisticsObj newColStats);
 }
http://git-wip-us.apache.org/repos/asf/hive/blob/b883d313/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/ColumnStatsMergerFactory.java
----------------------------------------------------------------------
diff --git a/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/ColumnStatsMergerFactory.java b/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/ColumnStatsMergerFactory.java
index fe890e4..c013ba5 100644
--- a/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/ColumnStatsMergerFactory.java
+++ b/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/ColumnStatsMergerFactory.java
@@ -20,7 +20,8 @@ package org.apache.hadoop.hive.metastore.hbase.stats.merge;
 
 import org.apache.commons.lang.StringUtils;
-import org.apache.hadoop.hive.metastore.NumDistinctValueEstimator;
+import org.apache.hadoop.hive.common.ndv.NumDistinctValueEstimatorFactory;
+import org.apache.hadoop.hive.common.ndv.hll.HyperLogLog;
 import org.apache.hadoop.hive.metastore.api.BinaryColumnStatsData;
 import org.apache.hadoop.hive.metastore.api.BooleanColumnStatsData;
 import org.apache.hadoop.hive.metastore.api.ColumnStatisticsData;
@@ -37,15 +38,6 @@ public class ColumnStatsMergerFactory {
   private ColumnStatsMergerFactory() {
   }
 
-  // we depend on the toString() method for javolution.util.FastCollection.
-  private static int countNumBitVectors(String s) {
-    if (s != null) {
-      return StringUtils.countMatches(s, "{");
-    } else {
-      return 0;
-    }
-  }
-
   public static ColumnStatsMerger getColumnStatsMerger(ColumnStatisticsObj statsObjNew,
       ColumnStatisticsObj statsObjOld) {
     ColumnStatsMerger agg;
@@ -53,30 +45,20 @@
     _Fields typeOld = statsObjOld.getStatsData().getSetField();
     // make sure that they have the same type
     typeNew = typeNew == typeOld ? typeNew : null;
-    int numBitVectors = 0;
     switch (typeNew) {
     case BOOLEAN_STATS:
       agg = new BooleanColumnStatsMerger();
       break;
     case LONG_STATS: {
       agg = new LongColumnStatsMerger();
-      int nbvNew = countNumBitVectors(statsObjNew.getStatsData().getLongStats().getBitVectors());
-      int nbvOld = countNumBitVectors(statsObjOld.getStatsData().getLongStats().getBitVectors());
-      numBitVectors = nbvNew == nbvOld ? nbvNew : 0;
       break;
     }
     case DOUBLE_STATS: {
       agg = new DoubleColumnStatsMerger();
-      int nbvNew = countNumBitVectors(statsObjNew.getStatsData().getDoubleStats().getBitVectors());
-      int nbvOld = countNumBitVectors(statsObjOld.getStatsData().getDoubleStats().getBitVectors());
-      numBitVectors = nbvNew == nbvOld ? nbvNew : 0;
       break;
     }
     case STRING_STATS: {
       agg = new StringColumnStatsMerger();
-      int nbvNew = countNumBitVectors(statsObjNew.getStatsData().getStringStats().getBitVectors());
-      int nbvOld = countNumBitVectors(statsObjOld.getStatsData().getStringStats().getBitVectors());
-      numBitVectors = nbvNew == nbvOld ? nbvNew : 0;
       break;
     }
     case BINARY_STATS:
@@ -84,24 +66,15 @@
       break;
     case DECIMAL_STATS: {
       agg = new DecimalColumnStatsMerger();
-      int nbvNew = countNumBitVectors(statsObjNew.getStatsData().getDecimalStats().getBitVectors());
-      int nbvOld = countNumBitVectors(statsObjOld.getStatsData().getDecimalStats().getBitVectors());
-      numBitVectors = nbvNew == nbvOld ? nbvNew : 0;
       break;
     }
     case DATE_STATS: {
       agg = new DateColumnStatsMerger();
-      int nbvNew = countNumBitVectors(statsObjNew.getStatsData().getDateStats().getBitVectors());
-      int nbvOld = countNumBitVectors(statsObjOld.getStatsData().getDateStats().getBitVectors());
-      numBitVectors = nbvNew == nbvOld ? nbvNew : 0;
       break;
     }
     default:
       throw new IllegalArgumentException("Unknown stats type " + typeNew.toString());
     }
-    if (numBitVectors > 0) {
-      agg.ndvEstimator = new NumDistinctValueEstimator(numBitVectors);
-    }
     return agg;
   }
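The other half of HIVE-16996, not visible in this part of the patch, is the new org.apache.hadoop.hive.common.ndv.hll.HyperLogLog estimator imported above. As a rough intuition for why it can replace the FM sketch, here is a minimal generic HLL; this is an illustration of the algorithm, not Hive's implementation, and it omits HLL's small- and large-range corrections. The mix() helper is a stand-in hash (the splitmix64 finalizer), not anything Hive uses.

public class TinyHll {
  private final int p;            // 2^p registers
  private final byte[] registers; // each tracks the max leading-zero run seen

  public TinyHll(int p) {
    this.p = p;
    this.registers = new byte[1 << p];
  }

  public void add(long hash) {
    int idx = (int) (hash >>> (64 - p));   // first p bits pick the register
    long rest = hash << p;                 // remaining bits
    int rank = Long.numberOfLeadingZeros(rest) + 1;
    if (rank > registers[idx]) {
      registers[idx] = (byte) rank;
    }
  }

  public long estimate() {
    int m = registers.length;
    double sum = 0;
    for (byte r : registers) {
      sum += Math.pow(2, -r);              // harmonic mean of 2^register
    }
    double alpha = 0.7213 / (1 + 1.079 / m);  // bias correction for large m
    return (long) (alpha * m * m / sum);
  }

  private static long mix(long z) {        // splitmix64 finalizer as a stand-in hash
    z += 0x9e3779b97f4a7c15L;
    z = (z ^ (z >>> 30)) * 0xbf58476d1ce4e5b9L;
    z = (z ^ (z >>> 27)) * 0x94d049bb133111ebL;
    return z ^ (z >>> 31);
  }

  public static void main(String[] args) {
    TinyHll hll = new TinyHll(14);
    for (long v = 0; v < 100000; v++) {
      hll.add(mix(v % 10000));             // 10000 distinct values
    }
    System.out.println("estimated NDV: " + hll.estimate());
  }
}

Unlike the FM bit vectors, HLL registers of the same width always merge by taking the per-register maximum, which is what makes the canMerge()/mergeEstimators() contract above workable across both sketch types.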
http://git-wip-us.apache.org/repos/asf/hive/blob/b883d313/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/DateColumnStatsMerger.java
----------------------------------------------------------------------
diff --git a/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/DateColumnStatsMerger.java b/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/DateColumnStatsMerger.java
index 3179b23..e899bfe 100644
--- a/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/DateColumnStatsMerger.java
+++ b/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/DateColumnStatsMerger.java
@@ -19,7 +19,8 @@
 package org.apache.hadoop.hive.metastore.hbase.stats.merge;
 
-import org.apache.hadoop.hive.metastore.NumDistinctValueEstimator;
+import org.apache.hadoop.hive.common.ndv.NumDistinctValueEstimator;
+import org.apache.hadoop.hive.common.ndv.NumDistinctValueEstimatorFactory;
 import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj;
 import org.apache.hadoop.hive.metastore.api.Date;
 import org.apache.hadoop.hive.metastore.api.DateColumnStatsData;
@@ -29,27 +30,32 @@ public class DateColumnStatsMerger extends ColumnStatsMerger {
   public void merge(ColumnStatisticsObj aggregateColStats, ColumnStatisticsObj newColStats) {
     DateColumnStatsData aggregateData = aggregateColStats.getStatsData().getDateStats();
     DateColumnStatsData newData = newColStats.getStatsData().getDateStats();
-    Date lowValue =
-        aggregateData.getLowValue().compareTo(newData.getLowValue()) < 0 ? aggregateData
-            .getLowValue() : newData.getLowValue();
+    Date lowValue = aggregateData.getLowValue().compareTo(newData.getLowValue()) < 0 ? aggregateData
+        .getLowValue() : newData.getLowValue();
     aggregateData.setLowValue(lowValue);
-    Date highValue =
-        aggregateData.getHighValue().compareTo(newData.getHighValue()) >= 0 ? aggregateData
-            .getHighValue() : newData.getHighValue();
+    Date highValue = aggregateData.getHighValue().compareTo(newData.getHighValue()) >= 0 ? aggregateData
+        .getHighValue() : newData.getHighValue();
     aggregateData.setHighValue(highValue);
     aggregateData.setNumNulls(aggregateData.getNumNulls() + newData.getNumNulls());
-    if (ndvEstimator == null || !newData.isSetBitVectors() || newData.getBitVectors().length() == 0) {
+    if (!aggregateData.isSetBitVectors() || aggregateData.getBitVectors().length() == 0
+        || !newData.isSetBitVectors() || newData.getBitVectors().length() == 0) {
       aggregateData.setNumDVs(Math.max(aggregateData.getNumDVs(), newData.getNumDVs()));
     } else {
-      ndvEstimator.mergeEstimators(new NumDistinctValueEstimator(aggregateData.getBitVectors(),
-          ndvEstimator.getnumBitVectors()));
-      ndvEstimator.mergeEstimators(new NumDistinctValueEstimator(newData.getBitVectors(),
-          ndvEstimator.getnumBitVectors()));
-      long ndv = ndvEstimator.estimateNumDistinctValues();
+      NumDistinctValueEstimator oldEst = NumDistinctValueEstimatorFactory
+          .getNumDistinctValueEstimator(aggregateData.getBitVectors());
+      NumDistinctValueEstimator newEst = NumDistinctValueEstimatorFactory
+          .getNumDistinctValueEstimator(newData.getBitVectors());
+      long ndv = -1;
+      if (oldEst.canMerge(newEst)) {
+        oldEst.mergeEstimators(newEst);
+        ndv = oldEst.estimateNumDistinctValues();
+        aggregateData.setBitVectors(oldEst.serialize());
+      } else {
+        ndv = Math.max(aggregateData.getNumDVs(), newData.getNumDVs());
+      }
       LOG.debug("Use bitvector to merge column " + aggregateColStats.getColName() + "'s ndvs of "
           + aggregateData.getNumDVs() + " and " + newData.getNumDVs() + " to be " + ndv);
       aggregateData.setNumDVs(ndv);
-      aggregateData.setBitVectors(ndvEstimator.serialize().toString());
     }
   }
 }
http://git-wip-us.apache.org/repos/asf/hive/blob/b883d313/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/DecimalColumnStatsMerger.java
----------------------------------------------------------------------
diff --git a/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/DecimalColumnStatsMerger.java b/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/DecimalColumnStatsMerger.java
index c13add9..4099ffc 100644
--- a/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/DecimalColumnStatsMerger.java
+++ b/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/DecimalColumnStatsMerger.java
@@ -19,7 +19,8 @@
 package org.apache.hadoop.hive.metastore.hbase.stats.merge;
 
-import org.apache.hadoop.hive.metastore.NumDistinctValueEstimator;
+import org.apache.hadoop.hive.common.ndv.NumDistinctValueEstimator;
+import org.apache.hadoop.hive.common.ndv.NumDistinctValueEstimatorFactory;
 import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj;
 import org.apache.hadoop.hive.metastore.api.Decimal;
 import org.apache.hadoop.hive.metastore.api.DecimalColumnStatsData;
@@ -38,18 +39,25 @@
         .getHighValue() : newData.getHighValue();
     aggregateData.setHighValue(highValue);
     aggregateData.setNumNulls(aggregateData.getNumNulls() + newData.getNumNulls());
-    if (ndvEstimator == null || !newData.isSetBitVectors() || newData.getBitVectors().length() == 0) {
+    if (!aggregateData.isSetBitVectors() || aggregateData.getBitVectors().length() == 0
+        || !newData.isSetBitVectors() || newData.getBitVectors().length() == 0) {
       aggregateData.setNumDVs(Math.max(aggregateData.getNumDVs(), newData.getNumDVs()));
     } else {
-      ndvEstimator.mergeEstimators(new NumDistinctValueEstimator(aggregateData.getBitVectors(),
-          ndvEstimator.getnumBitVectors()));
-      ndvEstimator.mergeEstimators(new NumDistinctValueEstimator(newData.getBitVectors(),
-          ndvEstimator.getnumBitVectors()));
-      long ndv = ndvEstimator.estimateNumDistinctValues();
+      NumDistinctValueEstimator oldEst = NumDistinctValueEstimatorFactory
+          .getNumDistinctValueEstimator(aggregateData.getBitVectors());
+      NumDistinctValueEstimator newEst = NumDistinctValueEstimatorFactory
+          .getNumDistinctValueEstimator(newData.getBitVectors());
+      long ndv = -1;
+      if (oldEst.canMerge(newEst)) {
+        oldEst.mergeEstimators(newEst);
+        ndv = oldEst.estimateNumDistinctValues();
+        aggregateData.setBitVectors(oldEst.serialize());
+      } else {
+        ndv = Math.max(aggregateData.getNumDVs(), newData.getNumDVs());
+      }
       LOG.debug("Use bitvector to merge column " + aggregateColStats.getColName() + "'s ndvs of "
           + aggregateData.getNumDVs() + " and " + newData.getNumDVs() + " to be " + ndv);
       aggregateData.setNumDVs(ndv);
-      aggregateData.setBitVectors(ndvEstimator.serialize().toString());
     }
   }
 }
http://git-wip-us.apache.org/repos/asf/hive/blob/b883d313/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/DoubleColumnStatsMerger.java
----------------------------------------------------------------------
diff --git a/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/DoubleColumnStatsMerger.java b/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/DoubleColumnStatsMerger.java
index fbdba24..1691fc9 100644
--- a/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/DoubleColumnStatsMerger.java
+++ b/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/DoubleColumnStatsMerger.java
@@ -19,7 +19,8 @@
 package org.apache.hadoop.hive.metastore.hbase.stats.merge;
 
-import org.apache.hadoop.hive.metastore.NumDistinctValueEstimator;
+import org.apache.hadoop.hive.common.ndv.NumDistinctValueEstimator;
+import org.apache.hadoop.hive.common.ndv.NumDistinctValueEstimatorFactory;
 import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj;
 import org.apache.hadoop.hive.metastore.api.DoubleColumnStatsData;
@@ -31,18 +32,25 @@ public class DoubleColumnStatsMerger extends ColumnStatsMerger {
     aggregateData.setLowValue(Math.min(aggregateData.getLowValue(), newData.getLowValue()));
     aggregateData.setHighValue(Math.max(aggregateData.getHighValue(), newData.getHighValue()));
     aggregateData.setNumNulls(aggregateData.getNumNulls() + newData.getNumNulls());
-    if (ndvEstimator == null || !newData.isSetBitVectors() || newData.getBitVectors().length() == 0) {
+    if (!aggregateData.isSetBitVectors() || aggregateData.getBitVectors().length() == 0
+        || !newData.isSetBitVectors() || newData.getBitVectors().length() == 0) {
      aggregateData.setNumDVs(Math.max(aggregateData.getNumDVs(), newData.getNumDVs()));
     } else {
-      ndvEstimator.mergeEstimators(new NumDistinctValueEstimator(aggregateData.getBitVectors(),
-          ndvEstimator.getnumBitVectors()));
-      ndvEstimator.mergeEstimators(new NumDistinctValueEstimator(newData.getBitVectors(),
-          ndvEstimator.getnumBitVectors()));
-      long ndv = ndvEstimator.estimateNumDistinctValues();
+      NumDistinctValueEstimator oldEst = NumDistinctValueEstimatorFactory
+          .getNumDistinctValueEstimator(aggregateData.getBitVectors());
+      NumDistinctValueEstimator newEst = NumDistinctValueEstimatorFactory
+          .getNumDistinctValueEstimator(newData.getBitVectors());
+      long ndv = -1;
+      if (oldEst.canMerge(newEst)) {
+        oldEst.mergeEstimators(newEst);
+        ndv = oldEst.estimateNumDistinctValues();
+        aggregateData.setBitVectors(oldEst.serialize());
+      } else {
+        ndv = Math.max(aggregateData.getNumDVs(), newData.getNumDVs());
+      }
       LOG.debug("Use bitvector to merge column " + aggregateColStats.getColName() + "'s ndvs of "
           + aggregateData.getNumDVs() + " and " + newData.getNumDVs() + " to be " + ndv);
       aggregateData.setNumDVs(ndv);
-      aggregateData.setBitVectors(ndvEstimator.serialize().toString());
     }
   }
 }

http://git-wip-us.apache.org/repos/asf/hive/blob/b883d313/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/LongColumnStatsMerger.java
----------------------------------------------------------------------
diff --git a/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/LongColumnStatsMerger.java b/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/LongColumnStatsMerger.java
index ac65590..361af35 100644
--- a/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/LongColumnStatsMerger.java
+++ b/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/LongColumnStatsMerger.java
@@ -19,7 +19,8 @@
 package org.apache.hadoop.hive.metastore.hbase.stats.merge;
 
-import org.apache.hadoop.hive.metastore.NumDistinctValueEstimator;
+import org.apache.hadoop.hive.common.ndv.NumDistinctValueEstimator;
+import org.apache.hadoop.hive.common.ndv.NumDistinctValueEstimatorFactory;
 import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj;
 import org.apache.hadoop.hive.metastore.api.LongColumnStatsData;
@@ -31,18 +32,25 @@ public class LongColumnStatsMerger extends ColumnStatsMerger {
     aggregateData.setLowValue(Math.min(aggregateData.getLowValue(), newData.getLowValue()));
     aggregateData.setHighValue(Math.max(aggregateData.getHighValue(), newData.getHighValue()));
     aggregateData.setNumNulls(aggregateData.getNumNulls() + newData.getNumNulls());
-    if (ndvEstimator == null || !newData.isSetBitVectors() || newData.getBitVectors().length() == 0) {
+    if (!aggregateData.isSetBitVectors() || aggregateData.getBitVectors().length() == 0
+        || !newData.isSetBitVectors() || newData.getBitVectors().length() == 0) {
       aggregateData.setNumDVs(Math.max(aggregateData.getNumDVs(), newData.getNumDVs()));
     } else {
-      ndvEstimator.mergeEstimators(new NumDistinctValueEstimator(aggregateData.getBitVectors(),
-          ndvEstimator.getnumBitVectors()));
-      ndvEstimator.mergeEstimators(new NumDistinctValueEstimator(newData.getBitVectors(),
-          ndvEstimator.getnumBitVectors()));
-      long ndv = ndvEstimator.estimateNumDistinctValues();
+      NumDistinctValueEstimator oldEst = NumDistinctValueEstimatorFactory
+          .getNumDistinctValueEstimator(aggregateData.getBitVectors());
+      NumDistinctValueEstimator newEst = NumDistinctValueEstimatorFactory
+          .getNumDistinctValueEstimator(newData.getBitVectors());
+      long ndv = -1;
+      if (oldEst.canMerge(newEst)) {
+        oldEst.mergeEstimators(newEst);
+        ndv = oldEst.estimateNumDistinctValues();
+        aggregateData.setBitVectors(oldEst.serialize());
+      } else {
+        ndv = Math.max(aggregateData.getNumDVs(), newData.getNumDVs());
+      }
       LOG.debug("Use bitvector to merge column " + aggregateColStats.getColName() + "'s ndvs of "
           + aggregateData.getNumDVs() + " and " + newData.getNumDVs() + " to be " + ndv);
       aggregateData.setNumDVs(ndv);
-      aggregateData.setBitVectors(ndvEstimator.serialize().toString());
     }
   }
 }
http://git-wip-us.apache.org/repos/asf/hive/blob/b883d313/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/StringColumnStatsMerger.java
----------------------------------------------------------------------
diff --git a/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/StringColumnStatsMerger.java b/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/StringColumnStatsMerger.java
index 4158747..8e28f90 100644
--- a/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/StringColumnStatsMerger.java
+++ b/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/StringColumnStatsMerger.java
@@ -19,10 +19,10 @@
 package org.apache.hadoop.hive.metastore.hbase.stats.merge;
 
-import org.apache.hadoop.hive.metastore.NumDistinctValueEstimator;
+import org.apache.hadoop.hive.common.ndv.NumDistinctValueEstimator;
+import org.apache.hadoop.hive.common.ndv.NumDistinctValueEstimatorFactory;
 import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj;
 import org.apache.hadoop.hive.metastore.api.StringColumnStatsData;
-import org.apache.parquet.Log;
 
 public class StringColumnStatsMerger extends ColumnStatsMerger {
   @Override
@@ -32,18 +32,25 @@ public class StringColumnStatsMerger extends ColumnStatsMerger {
     aggregateData.setMaxColLen(Math.max(aggregateData.getMaxColLen(), newData.getMaxColLen()));
     aggregateData.setAvgColLen(Math.max(aggregateData.getAvgColLen(), newData.getAvgColLen()));
     aggregateData.setNumNulls(aggregateData.getNumNulls() + newData.getNumNulls());
-    if (ndvEstimator == null || !newData.isSetBitVectors() || newData.getBitVectors().length() == 0) {
+    if (!aggregateData.isSetBitVectors() || aggregateData.getBitVectors().length() == 0
+        || !newData.isSetBitVectors() || newData.getBitVectors().length() == 0) {
       aggregateData.setNumDVs(Math.max(aggregateData.getNumDVs(), newData.getNumDVs()));
     } else {
-      ndvEstimator.mergeEstimators(new NumDistinctValueEstimator(aggregateData.getBitVectors(),
-          ndvEstimator.getnumBitVectors()));
-      ndvEstimator.mergeEstimators(new NumDistinctValueEstimator(newData.getBitVectors(),
-          ndvEstimator.getnumBitVectors()));
-      long ndv = ndvEstimator.estimateNumDistinctValues();
+      NumDistinctValueEstimator oldEst = NumDistinctValueEstimatorFactory
+          .getNumDistinctValueEstimator(aggregateData.getBitVectors());
+      NumDistinctValueEstimator newEst = NumDistinctValueEstimatorFactory
+          .getNumDistinctValueEstimator(newData.getBitVectors());
+      long ndv = -1;
+      if (oldEst.canMerge(newEst)) {
+        oldEst.mergeEstimators(newEst);
+        ndv = oldEst.estimateNumDistinctValues();
+        aggregateData.setBitVectors(oldEst.serialize());
+      } else {
+        ndv = Math.max(aggregateData.getNumDVs(), newData.getNumDVs());
+      }
       LOG.debug("Use bitvector to merge column " + aggregateColStats.getColName() + "'s ndvs of "
           + aggregateData.getNumDVs() + " and " + newData.getNumDVs() + " to be " + ndv);
       aggregateData.setNumDVs(ndv);
-      aggregateData.setBitVectors(ndvEstimator.serialize().toString());
     }
   }
 }
http://git-wip-us.apache.org/repos/asf/hive/blob/b883d313/metastore/src/test/org/apache/hadoop/hive/metastore/hbase/TestHBaseAggregateStatsNDVUniformDist.java
----------------------------------------------------------------------
diff --git a/metastore/src/test/org/apache/hadoop/hive/metastore/hbase/TestHBaseAggregateStatsNDVUniformDist.java b/metastore/src/test/org/apache/hadoop/hive/metastore/hbase/TestHBaseAggregateStatsNDVUniformDist.java
index 87b1ac8..74e1669 100644
--- a/metastore/src/test/org/apache/hadoop/hive/metastore/hbase/TestHBaseAggregateStatsNDVUniformDist.java
+++ b/metastore/src/test/org/apache/hadoop/hive/metastore/hbase/TestHBaseAggregateStatsNDVUniformDist.java
@@ -60,7 +60,7 @@ public class TestHBaseAggregateStatsNDVUniformDist {
   private HBaseStore store;
   SortedMap rows = new TreeMap<>();
 
-  // NDV will be 3 for bitVectors[0] and 12 for bitVectors[1]
+  // NDV will be 3 for bitVectors[0] and 1 for bitVectors[1]
   String bitVectors[] = {
       "{0, 4, 5, 7}{0, 1}{0, 1, 2}{0, 1, 4}{0}{0, 2}{0, 3}{0, 2, 3, 4}{0, 1, 4}{0, 1}{0}{0, 1, 3, 8}{0, 2}{0, 2}{0, 9}{0, 1, 4}",
       "{1, 2}{1, 2}{1, 2}{1, 2}{1, 2}{1, 2}{1, 2}{1, 2}{1, 2}{1, 2}{1, 2}{1, 2}{1, 2}{1, 2}{1, 2}{1, 2}" };
@@ -278,7 +278,7 @@
         Assert.assertEquals(1010, lcsd.getHighValue(), 0.01);
         Assert.assertEquals(-1010, lcsd.getLowValue(), 0.01);
         Assert.assertEquals(45, lcsd.getNumNulls());
-        Assert.assertEquals(12, lcsd.getNumDVs());
+        Assert.assertEquals(3, lcsd.getNumDVs());
       }
     };
     List partNames = new ArrayList<>();
@@ -422,7 +422,7 @@
         Assert.assertEquals(1010, lcsd.getHighValue(), 0.01);
         Assert.assertEquals(-1010, lcsd.getLowValue(), 0.01);
         Assert.assertEquals(40, lcsd.getNumNulls());
-        Assert.assertEquals(12, lcsd.getNumDVs());
+        Assert.assertEquals(3, lcsd.getNumDVs());
       }
     };
     List partNames = new ArrayList<>();
@@ -494,7 +494,7 @@
         Assert.assertEquals(1010, HBaseUtils.getDoubleValue(lcsd.getHighValue()), 0.01);
         Assert.assertEquals(-1010, HBaseUtils.getDoubleValue(lcsd.getLowValue()), 0.01);
         Assert.assertEquals(40, lcsd.getNumNulls());
-        Assert.assertEquals(12, lcsd.getNumDVs());
+        Assert.assertEquals(3, lcsd.getNumDVs());
       }
     };
     List partNames = new ArrayList<>();
@@ -566,7 +566,7 @@
         Assert.assertEquals(1010, lcsd.getHighValue(), 0.01);
         Assert.assertEquals(-1010, lcsd.getLowValue(), 0.01);
         Assert.assertEquals(40, lcsd.getNumNulls());
-        Assert.assertEquals(12, lcsd.getNumDVs());
+        Assert.assertEquals(3, lcsd.getNumDVs());
       }
     };
     List partNames = new ArrayList<>();
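The bitVectors strings in this test use the FM estimator's serialized form: one {..} group per bit vector, listing that vector's set bit positions. Assuming the estimate follows step (4) of the deleted class's comment (2 to the power of the average lowest-unset-bit index, divided by PHI = 0.77351, truncated to long), the corrected expectation can be reproduced by hand; for bitVectors[0] the average lowest-unset-bit index is 24/16 = 1.5, giving floor(2^1.5 / 0.77351) = 3. A small decoder (names invented) that recomputes it:

import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class FmStringEstimate {
  public static long estimate(String serialized) {
    Pattern group = Pattern.compile("\\{([^}]*)\\}");   // one {..} per bit vector
    Matcher m = group.matcher(serialized);
    double sumOfLowestClearBit = 0;
    int numVectors = 0;
    while (m.find()) {
      boolean[] bits = new boolean[64];
      for (String idx : m.group(1).split(",\\s*")) {
        bits[Integer.parseInt(idx.trim())] = true;      // mark the set positions
      }
      int r = 0;
      while (bits[r]) {
        r++;                                            // lowest position never set
      }
      sumOfLowestClearBit += r;
      numVectors++;
    }
    return (long) (Math.pow(2, sumOfLowestClearBit / numVectors) / 0.77351);
  }

  public static void main(String[] args) {
    String v0 = "{0, 4, 5, 7}{0, 1}{0, 1, 2}{0, 1, 4}{0}{0, 2}{0, 3}{0, 2, 3, 4}"
        + "{0, 1, 4}{0, 1}{0}{0, 1, 3, 8}{0, 2}{0, 2}{0, 9}{0, 1, 4}";
    System.out.println(estimate(v0));  // prints 3
  }
}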
-import org.apache.hadoop.hive.ql.session.OperationLog.LoggingLevel; import org.apache.hadoop.hive.ql.session.SessionState; import org.apache.hadoop.hive.ql.session.SessionState.LogHelper; import org.apache.hadoop.hive.serde.serdeConstants; @@ -246,7 +245,7 @@ public class ColumnStatsSemanticAnalyzer extends SemanticAnalyzer { return colName.replaceAll("`", "``"); } - private String genRewrittenQuery(List<String> colNames, int numBitVectors, Map<String, String> partSpec, + private String genRewrittenQuery(List<String> colNames, HiveConf conf, Map<String, String> partSpec, boolean isPartitionStats) throws SemanticException{ StringBuilder rewrittenQueryBuilder = new StringBuilder("select "); String rewrittenQuery; @@ -255,11 +254,20 @@ public class ColumnStatsSemanticAnalyzer extends SemanticAnalyzer { if (i > 0) { rewrittenQueryBuilder.append(" , "); } + String func = HiveConf.getVar(conf, HiveConf.ConfVars.HIVE_STATS_NDV_ALGO).toLowerCase(); rewrittenQueryBuilder.append("compute_stats(`"); rewrittenQueryBuilder.append(escapeBackTicks(colNames.get(i))); - rewrittenQueryBuilder.append("` , "); - rewrittenQueryBuilder.append(numBitVectors); - rewrittenQueryBuilder.append(" )"); + rewrittenQueryBuilder.append("`, '" + func + "'"); + if (func.equals("fm")) { + int numBitVectors = 0; + try { + numBitVectors = HiveStatsUtils.getNumBitVectorsForNDVEstimation(conf); + } catch (Exception e) { + throw new SemanticException(e.getMessage()); + } + rewrittenQueryBuilder.append(", " + numBitVectors); + } + rewrittenQueryBuilder.append(")"); } if (isPartitionStats) { @@ -377,13 +385,7 @@ public class ColumnStatsSemanticAnalyzer extends SemanticAnalyzer { isTableLevel = true; } colType = getColumnTypes(colNames); - int numBitVectors; - try { - numBitVectors = HiveStatsUtils.getNumBitVectorsForNDVEstimation(conf); - } catch (Exception e) { - throw new SemanticException(e.getMessage()); - } - rewrittenQuery = genRewrittenQuery(colNames, numBitVectors, partSpec, isPartitionStats); + rewrittenQuery = genRewrittenQuery(colNames, conf, partSpec, isPartitionStats); rewrittenTree = genRewrittenTree(rewrittenQuery); } else { // Not an analyze table column compute statistics statement - don't do any rewrites @@ -447,13 +449,7 @@ public class ColumnStatsSemanticAnalyzer extends SemanticAnalyzer { isTableLevel = true; } colType = getColumnTypes(colNames); - int numBitVectors = 0; - try { - numBitVectors = HiveStatsUtils.getNumBitVectorsForNDVEstimation(conf); - } catch (Exception e) { - throw new SemanticException(e.getMessage()); - } - rewrittenQuery = genRewrittenQuery(colNames, numBitVectors, partSpec, isPartitionStats); + rewrittenQuery = genRewrittenQuery(colNames, conf, partSpec, isPartitionStats); rewrittenTree = genRewrittenTree(rewrittenQuery); context.analyzeRewrite = new AnalyzeRewriteContext(); http://git-wip-us.apache.org/repos/asf/hive/blob/b883d313/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java b/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java index 76f7dae..3b9ab41 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java @@ -1640,60 +1640,6 @@ public class StatsUtils { } } - public static int getNumBitVectorsForNDVEstimation(HiveConf conf) throws SemanticException { - int numBitVectors; - float percentageError = HiveConf.getFloatVar(conf, HiveConf.ConfVars.HIVE_STATS_NDV_ERROR); - - if (percentageError < 0.0) {
- throw new SemanticException("hive.stats.ndv.error can't be negative"); - } else if (percentageError <= 2.4) { - numBitVectors = 1024; - LOG.info("Lowest error achievable is 2.4% but error requested is " + percentageError + "%"); - LOG.info("Choosing 1024 bit vectors.."); - } else if (percentageError <= 3.4 ) { - numBitVectors = 1024; - LOG.info("Error requested is " + percentageError + "%"); - LOG.info("Choosing 1024 bit vectors.."); - } else if (percentageError <= 4.8) { - numBitVectors = 512; - LOG.info("Error requested is " + percentageError + "%"); - LOG.info("Choosing 512 bit vectors.."); - } else if (percentageError <= 6.8) { - numBitVectors = 256; - LOG.info("Error requested is " + percentageError + "%"); - LOG.info("Choosing 256 bit vectors.."); - } else if (percentageError <= 9.7) { - numBitVectors = 128; - LOG.info("Error requested is " + percentageError + "%"); - LOG.info("Choosing 128 bit vectors.."); - } else if (percentageError <= 13.8) { - numBitVectors = 64; - LOG.info("Error requested is " + percentageError + "%"); - LOG.info("Choosing 64 bit vectors.."); - } else if (percentageError <= 19.6) { - numBitVectors = 32; - LOG.info("Error requested is " + percentageError + "%"); - LOG.info("Choosing 32 bit vectors.."); - } else if (percentageError <= 28.2) { - numBitVectors = 16; - LOG.info("Error requested is " + percentageError + "%"); - LOG.info("Choosing 16 bit vectors.."); - } else if (percentageError <= 40.9) { - numBitVectors = 8; - LOG.info("Error requested is " + percentageError + "%"); - LOG.info("Choosing 8 bit vectors.."); - } else if (percentageError <= 61.0) { - numBitVectors = 4; - LOG.info("Error requested is " + percentageError + "%"); - LOG.info("Choosing 4 bit vectors.."); - } else { - numBitVectors = 2; - LOG.info("Error requested is " + percentageError + "%"); - LOG.info("Choosing 2 bit vectors.."); - } - return numBitVectors; - } - public static boolean hasDiscreteRange(ColStatistics colStat) { if (colStat.getRange() != null) { TypeInfo colType = TypeInfoUtils.getTypeInfoFromTypeString(colStat.getColumnType()); http://git-wip-us.apache.org/repos/asf/hive/blob/b883d313/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/DecimalNumDistinctValueEstimator.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/DecimalNumDistinctValueEstimator.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/DecimalNumDistinctValueEstimator.java deleted file mode 100644 index a05906e..0000000 --- a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/DecimalNumDistinctValueEstimator.java +++ /dev/null @@ -1,42 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.hadoop.hive.ql.udf.generic; - -import org.apache.hadoop.hive.common.type.HiveDecimal; - -public class DecimalNumDistinctValueEstimator extends NumDistinctValueEstimator { - - public DecimalNumDistinctValueEstimator(int numBitVectors) { - super(numBitVectors); - } - - public DecimalNumDistinctValueEstimator(String s, int numBitVectors) { - super(s, numBitVectors); - } - - public void addToEstimator(HiveDecimal decimal) { - int v = decimal.hashCode(); - super.addToEstimator(v); - } - - public void addToEstimatorPCSA(HiveDecimal decimal) { - int v = decimal.hashCode(); - super.addToEstimatorPCSA(v); - } -} http://git-wip-us.apache.org/repos/asf/hive/blob/b883d313/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/DoubleNumDistinctValueEstimator.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/DoubleNumDistinctValueEstimator.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/DoubleNumDistinctValueEstimator.java deleted file mode 100644 index e76fc74..0000000 --- a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/DoubleNumDistinctValueEstimator.java +++ /dev/null @@ -1,39 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.hadoop.hive.ql.udf.generic; - -public class DoubleNumDistinctValueEstimator extends NumDistinctValueEstimator { - - public DoubleNumDistinctValueEstimator(int numBitVectors) { - super(numBitVectors); - } - - public DoubleNumDistinctValueEstimator(String s, int numVectors) { - super(s, numVectors); - } - - public void addToEstimator(double d) { - int v = new Double(d).hashCode(); - super.addToEstimator(v); - } - - public void addToEstimatorPCSA(double d) { - int v = new Double(d).hashCode(); - super.addToEstimatorPCSA(v); - } -}
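The StringColumnStatsMerger change above is the merge-or-fallback pattern this patch applies to the column-stats mergers: each serialized bit-vector string is rehydrated through NumDistinctValueEstimatorFactory, and the two estimators are combined only when canMerge() reports they are compatible sketches; otherwise the aggregate NDV degrades to the max of the two inputs. A minimal sketch of the pattern, with Sketch and its methods as hypothetical stand-ins rather than Hive's actual classes:

    // Merge-or-fallback: union the sketches when compatible, else take the max NDV.
    // Sketch, deserialize, mergeWith and estimate are illustrative stand-ins.
    static long mergeNdv(String aggBits, String newBits, long aggNdv, long newNdv) {
      Sketch oldEst = Sketch.deserialize(aggBits);
      Sketch newEst = Sketch.deserialize(newBits);
      if (oldEst.canMerge(newEst)) {   // same sketch family and size
        oldEst.mergeWith(newEst);      // OR the FM bit vectors, or max the HLL registers
        return oldEst.estimate();      // approximates the NDV of the union
      }
      return Math.max(aggNdv, newNdv); // incompatible sketches, e.g. FM vs. HLL
    }

Carrying the sketches matters because max(ndv1, ndv2) undercounts whenever partitions hold different values; the merged sketch instead approximates the size of the union of both value sets.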
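In ColumnStatsSemanticAnalyzer above, genRewrittenQuery now receives the HiveConf so the rewritten ANALYZE query names the NDV algorithm explicitly: hive.stats.ndv.algo selects 'hll' or 'fm', and only the FM branch still derives a bit-vector count from hive.stats.ndv.error. A self-contained sketch of the per-column fragment the rewrite emits (the column name and the count 16 are illustrative values):

    // Build one compute_stats(...) call of the rewritten query.
    // func mirrors hive.stats.ndv.algo; numBitVectors is only used for "fm".
    static String computeStatsCall(String col, String func, int numBitVectors) {
      StringBuilder sb = new StringBuilder("compute_stats(`" + col + "`, '" + func + "'");
      if (func.equals("fm")) {
        sb.append(", ").append(numBitVectors); // FM must be sized up front
      }
      return sb.append(")").toString();
    }
    // computeStatsCall("id", "hll", 0)  -> compute_stats(`id`, 'hll')
    // computeStatsCall("id", "fm", 16)  -> compute_stats(`id`, 'fm', 16)

No size is passed for 'hll' because, in this patch, the HyperLogLog sketch is created with its default sizing rather than from the requested error.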
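The block deleted from StatsUtils is the FM sizing ladder; the analyzer reaches the same logic through HiveStatsUtils.getNumBitVectorsForNDVEstimation instead. The mapping itself is unchanged: since the FM estimate's standard error shrinks roughly as one over the square root of the number of bit vectors, the requested hive.stats.ndv.error percentage picks the smallest supported power-of-two vector count that meets it, capped at 1024 vectors (2.4% is the lowest achievable error) and bottoming out at 2 vectors above 61%. A compact restatement of the removed ladder:

    // Condensed form of the removed getNumBitVectorsForNDVEstimation table.
    // Assumes pctError >= 0; the original throws SemanticException for negatives.
    static int numBitVectorsFor(float pctError) {
      float[] maxErr = {3.4f, 4.8f, 6.8f, 9.7f, 13.8f, 19.6f, 28.2f, 40.9f, 61.0f};
      int[] vectors  = {1024,  512,  256,  128,    64,    32,    16,     8,     4};
      for (int i = 0; i < maxErr.length; i++) {
        if (pctError <= maxErr[i]) {
          return vectors[i];
        }
      }
      return 2; // anything looser than 61% error
    }

The original's separate <= 2.4 and <= 3.4 branches both returned 1024, the first merely logging that 2.4% is the floor, so they fold into a single entry here.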