hive-commits mailing list archives

From pxi...@apache.org
Subject hive git commit: HIVE-16274: Support tuning of NDV of columns using lower/upper bounds (Pengcheng Xiong, reviewed by Jason Dere)
Date Sun, 26 Mar 2017 00:31:18 GMT
Repository: hive
Updated Branches:
  refs/heads/branch-2 d65a24919 -> e356de8e6


HIVE-16274: Support tuning of NDV of columns using lower/upper bounds (Pengcheng Xiong, reviewed by Jason Dere)


Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/e356de8e
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/e356de8e
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/e356de8e

Branch: refs/heads/branch-2
Commit: e356de8e68011ab58ad7799a7ce70970e781fa27
Parents: d65a249
Author: Pengcheng Xiong <pxiong@apache.org>
Authored: Sat Mar 25 17:28:01 2017 -0700
Committer: Pengcheng Xiong <pxiong@apache.org>
Committed: Sat Mar 25 17:32:20 2017 -0700

----------------------------------------------------------------------
 .../org/apache/hadoop/hive/conf/HiveConf.java   |   5 +
 .../hive/metastore/MetaStoreDirectSql.java      |  22 +-
 .../hadoop/hive/metastore/ObjectStore.java      |   7 +-
 .../hive/metastore/StatObjectConverter.java     |  42 ++--
 .../test/queries/clientpositive/tunable_ndv.q   |  64 ++++++
 .../results/clientpositive/tunable_ndv.q.out    | 220 +++++++++++++++++++
 6 files changed, 332 insertions(+), 28 deletions(-)
----------------------------------------------------------------------
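
In short, this patch threads a new metastore setting, hive.metastore.stats.ndv.tuner (a float defaulting to 0.0), from HiveConf through ObjectStore into MetaStoreDirectSql and StatObjectConverter, where it selects a point between the lower bound (the maximum per-partition NDV) and the higher bound (the sum of per-partition NDVs) when aggregating column statistics across partitions. For example, with two partitions whose NDVs are 2 and 5, the bounds are 5 and 7; a tuner of 0.0 yields 5, 0.5 yields 6, and 1.0 yields 7, before the range capping described below.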


http://git-wip-us.apache.org/repos/asf/hive/blob/e356de8e/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
----------------------------------------------------------------------
diff --git a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
index 6e16200..1984a8d 100644
--- a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
+++ b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
@@ -272,6 +272,7 @@ public class HiveConf extends Configuration {
       HiveConf.ConfVars.HIVE_TXN_HEARTBEAT_THREADPOOL_SIZE,
       HiveConf.ConfVars.HIVE_TXN_MAX_OPEN_BATCH,
       HiveConf.ConfVars.HIVE_TXN_RETRYABLE_SQLEX_REGEX,
+      HiveConf.ConfVars.HIVE_METASTORE_STATS_NDV_TUNER,
       HiveConf.ConfVars.HIVE_METASTORE_STATS_NDV_DENSITY_FUNCTION,
       HiveConf.ConfVars.METASTORE_AGGREGATE_STATS_CACHE_ENABLED,
       HiveConf.ConfVars.METASTORE_AGGREGATE_STATS_CACHE_SIZE,
@@ -1683,6 +1684,10 @@ public class HiveConf extends Configuration {
     HIVE_STATS_NDV_ERROR("hive.stats.ndv.error", (float)20.0,
         "Standard error expressed in percentage. Provides a tradeoff between accuracy and
compute cost. \n" +
         "A lower value for error indicates higher accuracy and a higher compute cost."),
+    HIVE_METASTORE_STATS_NDV_TUNER("hive.metastore.stats.ndv.tuner", (float)0.0,
+         "Provides a tunable parameter between the lower bound and the higher bound of ndv for aggregate ndv across all the partitions. \n" +
+         "The lower bound is equal to the maximum of ndv of all the partitions. The higher bound is equal to the sum of ndv of all the partitions.\n" +
+         "Its value should be between 0.0 (i.e., choose lower bound) and 1.0 (i.e., choose higher bound)"),
    HIVE_METASTORE_STATS_NDV_DENSITY_FUNCTION("hive.metastore.stats.ndv.densityfunction", false,
        "Whether to use density function to estimate the NDV for the whole table based on the NDV of partitions"),
     HIVE_STATS_KEY_PREFIX("hive.stats.key.prefix", "", "", true), // internal usage only
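
A minimal Java sketch of the interpolation the new setting controls (the helper name and signature are illustrative, not part of the patch):

    // ndvTuner = 0.0 picks the lower bound (max per-partition NDV);
    // ndvTuner = 1.0 picks the higher bound (sum of per-partition NDVs);
    // values in between interpolate linearly, as in the patch below.
    static long tunedNdv(long lowerBound, long higherBound, double ndvTuner) {
      return (long) (lowerBound + (higherBound - lowerBound) * ndvTuner);
    }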

http://git-wip-us.apache.org/repos/asf/hive/blob/e356de8e/metastore/src/java/org/apache/hadoop/hive/metastore/MetaStoreDirectSql.java
----------------------------------------------------------------------
diff --git a/metastore/src/java/org/apache/hadoop/hive/metastore/MetaStoreDirectSql.java b/metastore/src/java/org/apache/hadoop/hive/metastore/MetaStoreDirectSql.java
index 85a6d0d..3ee1fee9b 100644
--- a/metastore/src/java/org/apache/hadoop/hive/metastore/MetaStoreDirectSql.java
+++ b/metastore/src/java/org/apache/hadoop/hive/metastore/MetaStoreDirectSql.java
@@ -1197,7 +1197,7 @@ class MetaStoreDirectSql {
   }
 
   public AggrStats aggrColStatsForPartitions(String dbName, String tableName,
-      List<String> partNames, List<String> colNames, boolean useDensityFunctionForNDVEstimation)
+      List<String> partNames, List<String> colNames, boolean useDensityFunctionForNDVEstimation,
double  ndvTuner)
       throws MetaException {
     if (colNames.isEmpty() || partNames.isEmpty()) {
       LOG.debug("Columns is empty or partNames is empty : Short-circuiting stats eval");
@@ -1232,7 +1232,7 @@ class MetaStoreDirectSql {
           // Read aggregated stats for one column
           colStatsAggrFromDB =
               columnStatisticsObjForPartitions(dbName, tableName, partNames, colNamesForDB,
-                  partsFound, useDensityFunctionForNDVEstimation);
+                  partsFound, useDensityFunctionForNDVEstimation, ndvTuner);
           if (!colStatsAggrFromDB.isEmpty()) {
             ColumnStatisticsObj colStatsAggr = colStatsAggrFromDB.get(0);
             colStatsList.add(colStatsAggr);
@@ -1245,7 +1245,7 @@ class MetaStoreDirectSql {
       partsFound = partsFoundForPartitions(dbName, tableName, partNames, colNames);
       colStatsList =
           columnStatisticsObjForPartitions(dbName, tableName, partNames, colNames, partsFound,
-              useDensityFunctionForNDVEstimation);
+              useDensityFunctionForNDVEstimation, ndvTuner);
     }
     LOG.info("useDensityFunctionForNDVEstimation = " + useDensityFunctionForNDVEstimation
         + "\npartsFound = " + partsFound + "\nColumnStatisticsObj = "
@@ -1308,14 +1308,14 @@ class MetaStoreDirectSql {
 
   private List<ColumnStatisticsObj> columnStatisticsObjForPartitions(final String dbName,
    final String tableName, final List<String> partNames, List<String> colNames, long partsFound,
-    final boolean useDensityFunctionForNDVEstimation) throws MetaException {
+    final boolean useDensityFunctionForNDVEstimation, final double ndvTuner) throws MetaException {
     final boolean areAllPartsFound = (partsFound == partNames.size());
     return runBatched(colNames, new Batchable<String, ColumnStatisticsObj>() {
       public List<ColumnStatisticsObj> run(final List<String> inputColNames)
throws MetaException {
         return runBatched(partNames, new Batchable<String, ColumnStatisticsObj>() {
           public List<ColumnStatisticsObj> run(List<String> inputPartNames) throws
MetaException {
             return columnStatisticsObjForPartitionsBatch(dbName, tableName, inputPartNames,
-                inputColNames, areAllPartsFound, useDensityFunctionForNDVEstimation);
+                inputColNames, areAllPartsFound, useDensityFunctionForNDVEstimation, ndvTuner);
           }
         });
       }
@@ -1325,7 +1325,7 @@ class MetaStoreDirectSql {
   /** Should be called with the list short enough to not trip up Oracle/etc. */
   private List<ColumnStatisticsObj> columnStatisticsObjForPartitionsBatch(String dbName,
       String tableName, List<String> partNames, List<String> colNames, boolean
areAllPartsFound,
-      boolean useDensityFunctionForNDVEstimation) throws MetaException {
+      boolean useDensityFunctionForNDVEstimation, double ndvTuner) throws MetaException {
     // TODO: all the extrapolation logic should be moved out of this class,
     // only mechanical data retrieval should remain here.
     String commonPrefix = "select \"COLUMN_NAME\", \"COLUMN_TYPE\", "
@@ -1377,7 +1377,7 @@ class MetaStoreDirectSql {
       List<Object[]> list = ensureList(qResult);
       List<ColumnStatisticsObj> colStats = new ArrayList<ColumnStatisticsObj>(list.size());
       for (Object[] row : list) {
-        colStats.add(prepareCSObjWithAdjustedNDV(row, 0, useDensityFunctionForNDVEstimation));
+        colStats.add(prepareCSObjWithAdjustedNDV(row, 0, useDensityFunctionForNDVEstimation, ndvTuner));
         Deadline.checkTimeout();
       }
       query.closeAll();
@@ -1436,7 +1436,7 @@ class MetaStoreDirectSql {
         }
         list = ensureList(qResult);
         for (Object[] row : list) {
-          colStats.add(prepareCSObjWithAdjustedNDV(row, 0, useDensityFunctionForNDVEstimation));
+          colStats.add(prepareCSObjWithAdjustedNDV(row, 0, useDensityFunctionForNDVEstimation, ndvTuner));
           Deadline.checkTimeout();
         }
         end = doTrace ? System.nanoTime() : 0;
@@ -1583,7 +1583,7 @@ class MetaStoreDirectSql {
               query.closeAll();
             }
           }
-          colStats.add(prepareCSObjWithAdjustedNDV(row, 0, useDensityFunctionForNDVEstimation));
+          colStats.add(prepareCSObjWithAdjustedNDV(row, 0, useDensityFunctionForNDVEstimation, ndvTuner));
           Deadline.checkTimeout();
         }
       }
@@ -1603,13 +1603,13 @@ class MetaStoreDirectSql {
   }
 
   private ColumnStatisticsObj prepareCSObjWithAdjustedNDV(Object[] row, int i,
-      boolean useDensityFunctionForNDVEstimation) throws MetaException {
+      boolean useDensityFunctionForNDVEstimation, double ndvTuner) throws MetaException {
     ColumnStatisticsData data = new ColumnStatisticsData();
    ColumnStatisticsObj cso = new ColumnStatisticsObj((String) row[i++], (String) row[i++], data);
    Object llow = row[i++], lhigh = row[i++], dlow = row[i++], dhigh = row[i++], declow = row[i++],
        dechigh = row[i++], nulls = row[i++], dist = row[i++], avglen = row[i++], maxlen = row[i++],
        trues = row[i++], falses = row[i++], avgLong = row[i++], avgDouble = row[i++],
        avgDecimal = row[i++], sumDist = row[i++];
    StatObjectConverter.fillColumnStatisticsData(cso.getColType(), data, llow, lhigh, dlow, dhigh,
         declow, dechigh, nulls, dist, avglen, maxlen, trues, falses, avgLong, avgDouble,
-        avgDecimal, sumDist, useDensityFunctionForNDVEstimation);
+        avgDecimal, sumDist, useDensityFunctionForNDVEstimation, ndvTuner);
     return cso;
   }
 

http://git-wip-us.apache.org/repos/asf/hive/blob/e356de8e/metastore/src/java/org/apache/hadoop/hive/metastore/ObjectStore.java
----------------------------------------------------------------------
diff --git a/metastore/src/java/org/apache/hadoop/hive/metastore/ObjectStore.java b/metastore/src/java/org/apache/hadoop/hive/metastore/ObjectStore.java
index 1f7b9ef..c39c725 100644
--- a/metastore/src/java/org/apache/hadoop/hive/metastore/ObjectStore.java
+++ b/metastore/src/java/org/apache/hadoop/hive/metastore/ObjectStore.java
@@ -7337,13 +7337,16 @@ public class ObjectStore implements RawStore, Configurable {
   @Override
   public AggrStats get_aggr_stats_for(String dbName, String tblName,
       final List<String> partNames, final List<String> colNames) throws MetaException,
NoSuchObjectException {
-    final boolean  useDensityFunctionForNDVEstimation = HiveConf.getBoolVar(getConf(), HiveConf.ConfVars.HIVE_METASTORE_STATS_NDV_DENSITY_FUNCTION);
+    final boolean useDensityFunctionForNDVEstimation = HiveConf.getBoolVar(getConf(),
+        HiveConf.ConfVars.HIVE_METASTORE_STATS_NDV_DENSITY_FUNCTION);
+    final double ndvTuner = HiveConf.getFloatVar(getConf(),
+        HiveConf.ConfVars.HIVE_METASTORE_STATS_NDV_TUNER);
     return new GetHelper<AggrStats>(dbName, tblName, true, false) {
       @Override
       protected AggrStats getSqlResult(GetHelper<AggrStats> ctx)
           throws MetaException {
         return directSql.aggrColStatsForPartitions(dbName, tblName, partNames,
-            colNames, useDensityFunctionForNDVEstimation);
+            colNames, useDensityFunctionForNDVEstimation, ndvTuner);
       }
       @Override
       protected AggrStats getJdoResult(GetHelper<AggrStats> ctx)

http://git-wip-us.apache.org/repos/asf/hive/blob/e356de8e/metastore/src/java/org/apache/hadoop/hive/metastore/StatObjectConverter.java
----------------------------------------------------------------------
diff --git a/metastore/src/java/org/apache/hadoop/hive/metastore/StatObjectConverter.java b/metastore/src/java/org/apache/hadoop/hive/metastore/StatObjectConverter.java
index b259dfa..fcf6f27 100644
--- a/metastore/src/java/org/apache/hadoop/hive/metastore/StatObjectConverter.java
+++ b/metastore/src/java/org/apache/hadoop/hive/metastore/StatObjectConverter.java
@@ -527,7 +527,7 @@ public class StatObjectConverter {
       Object llow, Object lhigh, Object dlow, Object dhigh, Object declow, Object dechigh,
       Object nulls, Object dist, Object avglen, Object maxlen, Object trues, Object falses,
       Object avgLong, Object avgDouble, Object avgDecimal, Object sumDist,
-      boolean useDensityFunctionForNDVEstimation) throws MetaException {
+      boolean useDensityFunctionForNDVEstimation, double ndvTuner) throws MetaException {
     colType = colType.toLowerCase();
     if (colType.equals("boolean")) {
       BooleanColumnStatsData boolStats = new BooleanColumnStatsData();
@@ -561,23 +561,29 @@ public class StatObjectConverter {
       }
       long lowerBound = MetaStoreDirectSql.extractSqlLong(dist);
       long higherBound = MetaStoreDirectSql.extractSqlLong(sumDist);
+      long rangeBound = Long.MAX_VALUE;
+      if (lhigh != null && llow != null) {
+        rangeBound = MetaStoreDirectSql.extractSqlLong(lhigh)
+            - MetaStoreDirectSql.extractSqlLong(llow) + 1;
+      }
+      long estimation;
      if (useDensityFunctionForNDVEstimation && lhigh != null && llow != null && avgLong != null
           && MetaStoreDirectSql.extractSqlDouble(avgLong) != 0.0) {
         // We have estimation, lowerbound and higherbound. We use estimation if
         // it is between lowerbound and higherbound.
-        long estimation = MetaStoreDirectSql
+        estimation = MetaStoreDirectSql
             .extractSqlLong((MetaStoreDirectSql.extractSqlLong(lhigh) - MetaStoreDirectSql
                 .extractSqlLong(llow)) / MetaStoreDirectSql.extractSqlDouble(avgLong));
         if (estimation < lowerBound) {
-          longStats.setNumDVs(lowerBound);
+          estimation = lowerBound;
         } else if (estimation > higherBound) {
-          longStats.setNumDVs(higherBound);
-        } else {
-          longStats.setNumDVs(estimation);
+          estimation = higherBound;
         }
       } else {
-        longStats.setNumDVs(lowerBound);
+        estimation = (long) (lowerBound + (higherBound - lowerBound) * ndvTuner);
       }
+      estimation = Math.min(estimation, rangeBound);
+      longStats.setNumDVs(estimation);
       data.setLongStats(longStats);
     } else if (colType.equals("date")) {
       DateColumnStatsData dateStats = new DateColumnStatsData();
@@ -590,23 +596,29 @@ public class StatObjectConverter {
       }
       long lowerBound = MetaStoreDirectSql.extractSqlLong(dist);
       long higherBound = MetaStoreDirectSql.extractSqlLong(sumDist);
+      long rangeBound = Long.MAX_VALUE;
+      if (lhigh != null && llow != null) {
+        rangeBound = MetaStoreDirectSql.extractSqlLong(lhigh)
+            - MetaStoreDirectSql.extractSqlLong(llow) + 1;
+      }
+      long estimation;
      if (useDensityFunctionForNDVEstimation && lhigh != null && llow != null && avgLong != null
           && MetaStoreDirectSql.extractSqlDouble(avgLong) != 0.0) {
         // We have estimation, lowerbound and higherbound. We use estimation if
         // it is between lowerbound and higherbound.
-        long estimation = MetaStoreDirectSql
+        estimation = MetaStoreDirectSql
             .extractSqlLong((MetaStoreDirectSql.extractSqlLong(lhigh) - MetaStoreDirectSql
                 .extractSqlLong(llow)) / MetaStoreDirectSql.extractSqlDouble(avgLong));
         if (estimation < lowerBound) {
-          dateStats.setNumDVs(lowerBound);
+          estimation = lowerBound;
         } else if (estimation > higherBound) {
-          dateStats.setNumDVs(higherBound);
-        } else {
-          dateStats.setNumDVs(estimation);
+          estimation = higherBound;
         }
       } else {
-        dateStats.setNumDVs(lowerBound);
+        estimation = (long) (lowerBound + (higherBound - lowerBound) * ndvTuner);
       }
+      estimation = Math.min(estimation, rangeBound);
+      dateStats.setNumDVs(estimation);
       data.setDateStats(dateStats);
     } else if (colType.equals("double") || colType.equals("float")) {
       DoubleColumnStatsData doubleStats = new DoubleColumnStatsData();
@@ -632,7 +644,7 @@ public class StatObjectConverter {
           doubleStats.setNumDVs(estimation);
         }
       } else {
-        doubleStats.setNumDVs(lowerBound);
+        doubleStats.setNumDVs((long) (lowerBound + (higherBound - lowerBound) * ndvTuner));
       }
       data.setDoubleStats(doubleStats);
     } else if (colType.startsWith("decimal")) {
@@ -673,7 +685,7 @@ public class StatObjectConverter {
           decimalStats.setNumDVs(estimation);
         }
       } else {
-        decimalStats.setNumDVs(lowerBound);
+        decimalStats.setNumDVs((long) (lowerBound + (higherBound - lowerBound) * ndvTuner));
       }
       data.setDecimalStats(decimalStats);
     }
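
The long/date branches above reduce to the following self-contained sketch (names are illustrative; the committed code inlines this logic per column type, and the density-function path first clamps its own estimate into [lowerBound, higherBound]):

    // Aggregate NDV for a partitioned column when the density function is off:
    // interpolate between the bounds with ndvTuner, then cap by the value range,
    // since an integral column cannot exceed (max - min + 1) distinct values.
    static long aggregateNdv(long lowerBound, long higherBound,
        Long min, Long max, double ndvTuner) {
      long rangeBound = Long.MAX_VALUE;
      if (min != null && max != null) {
        rangeBound = max - min + 1;
      }
      long estimation = (long) (lowerBound + (higherBound - lowerBound) * ndvTuner);
      return Math.min(estimation, rangeBound);
    }

The double and decimal branches apply the same interpolation but no range cap, since a continuous domain does not bound the distinct count.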

http://git-wip-us.apache.org/repos/asf/hive/blob/e356de8e/ql/src/test/queries/clientpositive/tunable_ndv.q
----------------------------------------------------------------------
diff --git a/ql/src/test/queries/clientpositive/tunable_ndv.q b/ql/src/test/queries/clientpositive/tunable_ndv.q
new file mode 100644
index 0000000..fed51f6
--- /dev/null
+++ b/ql/src/test/queries/clientpositive/tunable_ndv.q
@@ -0,0 +1,64 @@
+set hive.mapred.mode=nonstrict;
+set hive.stats.fetch.column.stats=true;
+set hive.exec.dynamic.partition=true;
+set hive.exec.dynamic.partition.mode=nonstrict;
+set hive.metastore.aggregate.stats.cache.enabled=false;
+
+create table if not exists ext_loc (
+  state string,
+  locid int,
+  zip int,
+  year string
+) row format delimited fields terminated by '|' stored as textfile;
+
+LOAD DATA LOCAL INPATH '../../data/files/extrapolate_stats_full.txt' OVERWRITE INTO TABLE ext_loc;
+
+create table if not exists loc_orc_1d (
+  state string,
+  locid int,
+  zip int
+) partitioned by(year string) stored as orc;
+
+insert overwrite table loc_orc_1d partition(year) select * from ext_loc;
+
+analyze table loc_orc_1d compute statistics for columns state,locid;
+
+describe formatted loc_orc_1d partition(year=2000) locid;
+describe formatted loc_orc_1d partition(year=2001) locid;
+
+describe formatted loc_orc_1d locid;
+
+set hive.metastore.stats.ndv.tuner=1.0;
+
+describe formatted loc_orc_1d locid;
+
+set hive.metastore.stats.ndv.tuner=0.5;
+
+describe formatted loc_orc_1d locid;
+
+create table if not exists loc_orc_2d (
+  state string,
+  locid int
+) partitioned by(zip int, year string) stored as orc;
+
+insert overwrite table loc_orc_2d partition(zip, year) select * from ext_loc;
+
+analyze table loc_orc_2d partition(zip=94086, year='2000') compute statistics for columns state,locid;
+
+analyze table loc_orc_2d partition(zip=94087, year='2000') compute statistics for columns state,locid;
+
+analyze table loc_orc_2d partition(zip=94086, year='2001') compute statistics for columns state,locid;
+
+analyze table loc_orc_2d partition(zip=94087, year='2001') compute statistics for columns state,locid;
+
+set hive.metastore.stats.ndv.tuner=0.0;
+
+describe formatted loc_orc_2d locid;
+
+set hive.metastore.stats.ndv.tuner=1.0;
+
+describe formatted loc_orc_2d locid;
+
+set hive.metastore.stats.ndv.tuner=0.5;
+
+describe formatted loc_orc_2d locid;
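
Reading this test against the expected output below: the two loc_orc_1d partitions report locid NDVs of 2 and 5, so the bounds are 5 and 7; tuner values of 0.0, 0.5, and 1.0 would give 5, 6, and 7, but all are capped by the range bound 4 - 1 + 1 = 4, so every table-level describe reports distinct_count 4. For loc_orc_2d, the tuner at 0.0 reports the lower bound of 3, while 0.5 and 1.0 report 4, consistent with the same range cap.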

http://git-wip-us.apache.org/repos/asf/hive/blob/e356de8e/ql/src/test/results/clientpositive/tunable_ndv.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/tunable_ndv.q.out b/ql/src/test/results/clientpositive/tunable_ndv.q.out
new file mode 100644
index 0000000..6ae54b4
--- /dev/null
+++ b/ql/src/test/results/clientpositive/tunable_ndv.q.out
@@ -0,0 +1,220 @@
+PREHOOK: query: create table if not exists ext_loc (
+  state string,
+  locid int,
+  zip int,
+  year string
+) row format delimited fields terminated by '|' stored as textfile
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@ext_loc
+POSTHOOK: query: create table if not exists ext_loc (
+  state string,
+  locid int,
+  zip int,
+  year string
+) row format delimited fields terminated by '|' stored as textfile
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@ext_loc
+PREHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/extrapolate_stats_full.txt' OVERWRITE INTO TABLE ext_loc
+PREHOOK: type: LOAD
+#### A masked pattern was here ####
+PREHOOK: Output: default@ext_loc
+POSTHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/extrapolate_stats_full.txt' OVERWRITE INTO TABLE ext_loc
+POSTHOOK: type: LOAD
+#### A masked pattern was here ####
+POSTHOOK: Output: default@ext_loc
+PREHOOK: query: create table if not exists loc_orc_1d (
+  state string,
+  locid int,
+  zip int
+) partitioned by(year string) stored as orc
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@loc_orc_1d
+POSTHOOK: query: create table if not exists loc_orc_1d (
+  state string,
+  locid int,
+  zip int
+) partitioned by(year string) stored as orc
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@loc_orc_1d
+PREHOOK: query: insert overwrite table loc_orc_1d partition(year) select * from ext_loc
+PREHOOK: type: QUERY
+PREHOOK: Input: default@ext_loc
+PREHOOK: Output: default@loc_orc_1d
+POSTHOOK: query: insert overwrite table loc_orc_1d partition(year) select * from ext_loc
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@ext_loc
+POSTHOOK: Output: default@loc_orc_1d@year=2000
+POSTHOOK: Output: default@loc_orc_1d@year=2001
+POSTHOOK: Lineage: loc_orc_1d PARTITION(year=2000).locid SIMPLE [(ext_loc)ext_loc.FieldSchema(name:locid, type:int, comment:null), ]
+POSTHOOK: Lineage: loc_orc_1d PARTITION(year=2000).state SIMPLE [(ext_loc)ext_loc.FieldSchema(name:state, type:string, comment:null), ]
+POSTHOOK: Lineage: loc_orc_1d PARTITION(year=2000).zip SIMPLE [(ext_loc)ext_loc.FieldSchema(name:zip, type:int, comment:null), ]
+POSTHOOK: Lineage: loc_orc_1d PARTITION(year=2001).locid SIMPLE [(ext_loc)ext_loc.FieldSchema(name:locid, type:int, comment:null), ]
+POSTHOOK: Lineage: loc_orc_1d PARTITION(year=2001).state SIMPLE [(ext_loc)ext_loc.FieldSchema(name:state, type:string, comment:null), ]
+POSTHOOK: Lineage: loc_orc_1d PARTITION(year=2001).zip SIMPLE [(ext_loc)ext_loc.FieldSchema(name:zip, type:int, comment:null), ]
+PREHOOK: query: analyze table loc_orc_1d compute statistics for columns state,locid
+PREHOOK: type: QUERY
+PREHOOK: Input: default@loc_orc_1d
+PREHOOK: Input: default@loc_orc_1d@year=2000
+PREHOOK: Input: default@loc_orc_1d@year=2001
+#### A masked pattern was here ####
+POSTHOOK: query: analyze table loc_orc_1d compute statistics for columns state,locid
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@loc_orc_1d
+POSTHOOK: Input: default@loc_orc_1d@year=2000
+POSTHOOK: Input: default@loc_orc_1d@year=2001
+#### A masked pattern was here ####
+PREHOOK: query: describe formatted loc_orc_1d partition(year=2000) locid
+PREHOOK: type: DESCTABLE
+PREHOOK: Input: default@loc_orc_1d
+POSTHOOK: query: describe formatted loc_orc_1d partition(year=2000) locid
+POSTHOOK: type: DESCTABLE
+POSTHOOK: Input: default@loc_orc_1d
+# col_name            	data_type           	min                 	max                 	num_nulls           	distinct_count      	avg_col_len         	max_col_len         	num_trues           	num_falses          	comment             
+	 	 	 	 	 	 	 	 	 	 
+locid               	int                 	1                   	2                   	0                   	2                   	                    	                    	                    	                    	from deserializer   
+PREHOOK: query: describe formatted loc_orc_1d partition(year=2001) locid
+PREHOOK: type: DESCTABLE
+PREHOOK: Input: default@loc_orc_1d
+POSTHOOK: query: describe formatted loc_orc_1d partition(year=2001) locid
+POSTHOOK: type: DESCTABLE
+POSTHOOK: Input: default@loc_orc_1d
+# col_name            	data_type           	min                 	max                 	num_nulls           	distinct_count      	avg_col_len         	max_col_len         	num_trues           	num_falses          	comment             
+	 	 	 	 	 	 	 	 	 	 
+locid               	int                 	1                   	4                   	0                   	5                   	                    	                    	                    	                    	from deserializer   
+PREHOOK: query: describe formatted loc_orc_1d locid
+PREHOOK: type: DESCTABLE
+PREHOOK: Input: default@loc_orc_1d
+POSTHOOK: query: describe formatted loc_orc_1d locid
+POSTHOOK: type: DESCTABLE
+POSTHOOK: Input: default@loc_orc_1d
+# col_name            	data_type           	min                 	max                 	num_nulls           	distinct_count      	avg_col_len         	max_col_len         	num_trues           	num_falses          	comment             
+	 	 	 	 	 	 	 	 	 	 
+locid               	int                 	1                   	4                   	0                   	4                   	                    	                    	                    	                    	from deserializer   
+COLUMN_STATS_ACCURATE	{\"COLUMN_STATS\":{\"locid\":\"true\"}}	 	 	 	 	 	 	 	 	 
+PREHOOK: query: describe formatted loc_orc_1d locid
+PREHOOK: type: DESCTABLE
+PREHOOK: Input: default@loc_orc_1d
+POSTHOOK: query: describe formatted loc_orc_1d locid
+POSTHOOK: type: DESCTABLE
+POSTHOOK: Input: default@loc_orc_1d
+# col_name            	data_type           	min                 	max                 	num_nulls           	distinct_count      	avg_col_len         	max_col_len         	num_trues           	num_falses          	comment             
+	 	 	 	 	 	 	 	 	 	 
+locid               	int                 	1                   	4                   	0                   	4                   	                    	                    	                    	                    	from deserializer   
+COLUMN_STATS_ACCURATE	{\"COLUMN_STATS\":{\"locid\":\"true\"}}	 	 	 	 	 	 	 	 	 
+PREHOOK: query: describe formatted loc_orc_1d locid
+PREHOOK: type: DESCTABLE
+PREHOOK: Input: default@loc_orc_1d
+POSTHOOK: query: describe formatted loc_orc_1d locid
+POSTHOOK: type: DESCTABLE
+POSTHOOK: Input: default@loc_orc_1d
+# col_name            	data_type           	min                 	max                 	num_nulls           	distinct_count      	avg_col_len         	max_col_len         	num_trues           	num_falses          	comment             
+	 	 	 	 	 	 	 	 	 	 
+locid               	int                 	1                   	4                   	0                   	4                   	                    	                    	                    	                    	from deserializer   
+COLUMN_STATS_ACCURATE	{\"COLUMN_STATS\":{\"locid\":\"true\"}}	 	 	 	 	 	 	 	 	 
+PREHOOK: query: create table if not exists loc_orc_2d (
+  state string,
+  locid int
+) partitioned by(zip int, year string) stored as orc
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@loc_orc_2d
+POSTHOOK: query: create table if not exists loc_orc_2d (
+  state string,
+  locid int
+) partitioned by(zip int, year string) stored as orc
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@loc_orc_2d
+PREHOOK: query: insert overwrite table loc_orc_2d partition(zip, year) select * from ext_loc
+PREHOOK: type: QUERY
+PREHOOK: Input: default@ext_loc
+PREHOOK: Output: default@loc_orc_2d
+POSTHOOK: query: insert overwrite table loc_orc_2d partition(zip, year) select * from ext_loc
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@ext_loc
+POSTHOOK: Output: default@loc_orc_2d@zip=94086/year=2000
+POSTHOOK: Output: default@loc_orc_2d@zip=94086/year=2001
+POSTHOOK: Output: default@loc_orc_2d@zip=94087/year=2000
+POSTHOOK: Output: default@loc_orc_2d@zip=94087/year=2001
+POSTHOOK: Lineage: loc_orc_2d PARTITION(zip=94086,year=2000).locid SIMPLE [(ext_loc)ext_loc.FieldSchema(name:locid, type:int, comment:null), ]
+POSTHOOK: Lineage: loc_orc_2d PARTITION(zip=94086,year=2000).state SIMPLE [(ext_loc)ext_loc.FieldSchema(name:state, type:string, comment:null), ]
+POSTHOOK: Lineage: loc_orc_2d PARTITION(zip=94086,year=2001).locid SIMPLE [(ext_loc)ext_loc.FieldSchema(name:locid, type:int, comment:null), ]
+POSTHOOK: Lineage: loc_orc_2d PARTITION(zip=94086,year=2001).state SIMPLE [(ext_loc)ext_loc.FieldSchema(name:state, type:string, comment:null), ]
+POSTHOOK: Lineage: loc_orc_2d PARTITION(zip=94087,year=2000).locid SIMPLE [(ext_loc)ext_loc.FieldSchema(name:locid, type:int, comment:null), ]
+POSTHOOK: Lineage: loc_orc_2d PARTITION(zip=94087,year=2000).state SIMPLE [(ext_loc)ext_loc.FieldSchema(name:state, type:string, comment:null), ]
+POSTHOOK: Lineage: loc_orc_2d PARTITION(zip=94087,year=2001).locid SIMPLE [(ext_loc)ext_loc.FieldSchema(name:locid, type:int, comment:null), ]
+POSTHOOK: Lineage: loc_orc_2d PARTITION(zip=94087,year=2001).state SIMPLE [(ext_loc)ext_loc.FieldSchema(name:state, type:string, comment:null), ]
+PREHOOK: query: analyze table loc_orc_2d partition(zip=94086, year='2000') compute statistics for columns state,locid
+PREHOOK: type: QUERY
+PREHOOK: Input: default@loc_orc_2d
+PREHOOK: Input: default@loc_orc_2d@zip=94086/year=2000
+#### A masked pattern was here ####
+POSTHOOK: query: analyze table loc_orc_2d partition(zip=94086, year='2000') compute statistics for columns state,locid
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@loc_orc_2d
+POSTHOOK: Input: default@loc_orc_2d@zip=94086/year=2000
+#### A masked pattern was here ####
+PREHOOK: query: analyze table loc_orc_2d partition(zip=94087, year='2000') compute statistics for columns state,locid
+PREHOOK: type: QUERY
+PREHOOK: Input: default@loc_orc_2d
+PREHOOK: Input: default@loc_orc_2d@zip=94087/year=2000
+#### A masked pattern was here ####
+POSTHOOK: query: analyze table loc_orc_2d partition(zip=94087, year='2000') compute statistics for columns state,locid
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@loc_orc_2d
+POSTHOOK: Input: default@loc_orc_2d@zip=94087/year=2000
+#### A masked pattern was here ####
+PREHOOK: query: analyze table loc_orc_2d partition(zip=94086, year='2001') compute statistics for columns state,locid
+PREHOOK: type: QUERY
+PREHOOK: Input: default@loc_orc_2d
+PREHOOK: Input: default@loc_orc_2d@zip=94086/year=2001
+#### A masked pattern was here ####
+POSTHOOK: query: analyze table loc_orc_2d partition(zip=94086, year='2001') compute statistics for columns state,locid
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@loc_orc_2d
+POSTHOOK: Input: default@loc_orc_2d@zip=94086/year=2001
+#### A masked pattern was here ####
+PREHOOK: query: analyze table loc_orc_2d partition(zip=94087, year='2001') compute statistics for columns state,locid
+PREHOOK: type: QUERY
+PREHOOK: Input: default@loc_orc_2d
+PREHOOK: Input: default@loc_orc_2d@zip=94087/year=2001
+#### A masked pattern was here ####
+POSTHOOK: query: analyze table loc_orc_2d partition(zip=94087, year='2001') compute statistics for columns state,locid
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@loc_orc_2d
+POSTHOOK: Input: default@loc_orc_2d@zip=94087/year=2001
+#### A masked pattern was here ####
+PREHOOK: query: describe formatted loc_orc_2d locid
+PREHOOK: type: DESCTABLE
+PREHOOK: Input: default@loc_orc_2d
+POSTHOOK: query: describe formatted loc_orc_2d locid
+POSTHOOK: type: DESCTABLE
+POSTHOOK: Input: default@loc_orc_2d
+# col_name            	data_type           	min                 	max                 	num_nulls           	distinct_count      	avg_col_len         	max_col_len         	num_trues           	num_falses          	comment             
+	 	 	 	 	 	 	 	 	 	 
+locid               	int                 	1                   	4                   	0                   	3                   	                    	                    	                    	                    	from deserializer   
+COLUMN_STATS_ACCURATE	{\"COLUMN_STATS\":{\"locid\":\"true\"}}	 	 	 	 	 	 	 	 	 
+PREHOOK: query: describe formatted loc_orc_2d locid
+PREHOOK: type: DESCTABLE
+PREHOOK: Input: default@loc_orc_2d
+POSTHOOK: query: describe formatted loc_orc_2d locid
+POSTHOOK: type: DESCTABLE
+POSTHOOK: Input: default@loc_orc_2d
+# col_name            	data_type           	min                 	max                 	num_nulls           	distinct_count      	avg_col_len         	max_col_len         	num_trues           	num_falses          	comment             
+	 	 	 	 	 	 	 	 	 	 
+locid               	int                 	1                   	4                   	0                   	4                   	                    	                    	                    	                    	from deserializer   
+COLUMN_STATS_ACCURATE	{\"COLUMN_STATS\":{\"locid\":\"true\"}}	 	 	 	 	 	 	 	 	 
+PREHOOK: query: describe formatted loc_orc_2d locid
+PREHOOK: type: DESCTABLE
+PREHOOK: Input: default@loc_orc_2d
+POSTHOOK: query: describe formatted loc_orc_2d locid
+POSTHOOK: type: DESCTABLE
+POSTHOOK: Input: default@loc_orc_2d
+# col_name            	data_type           	min                 	max                 	num_nulls           	distinct_count      	avg_col_len         	max_col_len         	num_trues           	num_falses          	comment             
+	 	 	 	 	 	 	 	 	 	 
+locid               	int                 	1                   	4                   	0                   	4                   	                    	                    	                    	                    	from deserializer   
+COLUMN_STATS_ACCURATE	{\"COLUMN_STATS\":{\"locid\":\"true\"}}	 	 	 	 	 	 	 	 	 

