impala-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From mi...@apache.org
Subject [2/4] incubator-impala git commit: IMPALA-5955: Use totalSize tblproperty instead of rawDataSize.
Date Fri, 22 Sep 2017 17:13:23 GMT
IMPALA-5955: Use totalSize tblproperty instead of rawDataSize.

Today, Impala populates the 'rawDataSize' property
during COMPUTE STATS for the purpose of extrapolating
row counts based on file sizes.

After this patch, Impala will populate 'totalSize' instead of
'rawDataSize'. The 'rawDataSize' property is no longer populated or used.

Intended meaning/use of tblproperties:
- 'rawDataSize' is the estimated in-memory size of a table
  (without encoding and compression)
- 'totalSize' represents the on-disk size

Using the fields correctly is important for compatibility
with other users of the HMS such as Hive and SparkSQL.
For example, SparkSQL relies on 'totalSize' for
join ordering.

Testing:
- core/hdfs run passed

Change-Id: If7c2c4e1e99b297c849f9f0d18b2bef34ad811c6
Reviewed-on: http://gerrit.cloudera.org:8080/8110
Tested-by: Impala Public Jenkins
Reviewed-by: Alex Behm <alex.behm@cloudera.com>


Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/71fd1941
Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/71fd1941
Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/71fd1941

Branch: refs/heads/master
Commit: 71fd1941f006bb8e7629c8bcfbcfd1da050deed1
Parents: eb276d4
Author: Alex Behm <alex.behm@cloudera.com>
Authored: Mon Sep 18 20:40:58 2017 -0700
Committer: Alex Behm <alex.behm@cloudera.com>
Committed: Fri Sep 22 03:39:04 2017 +0000

----------------------------------------------------------------------
 .../java/org/apache/impala/catalog/Table.java   |  6 ++--
 .../impala/service/CatalogOpExecutor.java       | 10 +++---
 .../impala/planner/StatsExtrapolationTest.java  | 32 ++++++++++----------
 .../PlannerTest/fk-pk-join-detection.test       |  2 +-
 4 files changed, 25 insertions(+), 25 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/71fd1941/fe/src/main/java/org/apache/impala/catalog/Table.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/catalog/Table.java b/fe/src/main/java/org/apache/impala/catalog/Table.java
index 6b27353..23fa7a4 100644
--- a/fe/src/main/java/org/apache/impala/catalog/Table.java
+++ b/fe/src/main/java/org/apache/impala/catalog/Table.java
@@ -129,7 +129,7 @@ public abstract class Table implements CatalogObject {
    */
   public void setTableStats(org.apache.hadoop.hive.metastore.api.Table msTbl) {
     tableStats_ = new TTableStats(getRowCount(msTbl.getParameters()));
-    tableStats_.setTotal_file_bytes(getRawDataSize(msTbl.getParameters()));
+    tableStats_.setTotal_file_bytes(getTotalSize(msTbl.getParameters()));
   }
 
   public void addColumn(Column col) {
@@ -213,8 +213,8 @@ public abstract class Table implements CatalogObject {
     return getLongParam(StatsSetupConst.ROW_COUNT, parameters);
   }
 
-  protected static long getRawDataSize(Map<String, String> parameters) {
-    return getLongParam(StatsSetupConst.RAW_DATA_SIZE, parameters);
+  protected static long getTotalSize(Map<String, String> parameters) {
+    return getLongParam(StatsSetupConst.TOTAL_SIZE, parameters);
   }
 
   private static long getLongParam(String key, Map<String, String> parameters) {

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/71fd1941/fe/src/main/java/org/apache/impala/service/CatalogOpExecutor.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/service/CatalogOpExecutor.java b/fe/src/main/java/org/apache/impala/service/CatalogOpExecutor.java
index cf1e677..edba72c 100644
--- a/fe/src/main/java/org/apache/impala/service/CatalogOpExecutor.java
+++ b/fe/src/main/java/org/apache/impala/service/CatalogOpExecutor.java
@@ -813,11 +813,11 @@ public class CatalogOpExecutor {
       }
     }
 
-    // Update the table's ROW_COUNT and RAW_DATA_SIZE parameters.
+    // Update the table's ROW_COUNT and TOTAL_SIZE parameters.
     msTbl.putToParameters(StatsSetupConst.ROW_COUNT,
         String.valueOf(params.getTable_stats().num_rows));
     if (params.getTable_stats().isSetTotal_file_bytes()) {
-      msTbl.putToParameters(StatsSetupConst.RAW_DATA_SIZE,
+      msTbl.putToParameters(StatsSetupConst.TOTAL_SIZE,
           String.valueOf(params.getTable_stats().total_file_bytes));
     }
     // HMS requires this param for stats changes to take effect.
@@ -1177,9 +1177,9 @@ public class CatalogOpExecutor {
     int numTargetedPartitions = 0;
     boolean droppedRowCount =
         msTbl.getParameters().remove(StatsSetupConst.ROW_COUNT) != null;
-    boolean droppedRawDataSize =
-        msTbl.getParameters().remove(StatsSetupConst.RAW_DATA_SIZE) != null;
-    if (droppedRowCount || droppedRawDataSize) {
+    boolean droppedTotalSize =
+        msTbl.getParameters().remove(StatsSetupConst.TOTAL_SIZE) != null;
+    if (droppedRowCount || droppedTotalSize) {
       applyAlterTable(msTbl);
       ++numTargetedPartitions;
     }

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/71fd1941/fe/src/test/java/org/apache/impala/planner/StatsExtrapolationTest.java
----------------------------------------------------------------------
diff --git a/fe/src/test/java/org/apache/impala/planner/StatsExtrapolationTest.java b/fe/src/test/java/org/apache/impala/planner/StatsExtrapolationTest.java
index f1e1a70..92b2f93 100644
--- a/fe/src/test/java/org/apache/impala/planner/StatsExtrapolationTest.java
+++ b/fe/src/test/java/org/apache/impala/planner/StatsExtrapolationTest.java
@@ -38,10 +38,10 @@ import com.google.common.base.Preconditions;
 public class StatsExtrapolationTest extends FrontendTestBase {
 
   /**
-   * Sets the row count and raw data size stats in the given table.
+   * Sets the row count and total file size stats in the given table.
    * Unsets the corresponding statistic if a null value is passed.
    */
-  private void setStats(Table tbl, Long rowCount, Long rawDataSize) {
+  private void setStats(Table tbl, Long rowCount, Long totalSize) {
     org.apache.hadoop.hive.metastore.api.Table msTbl =
         new org.apache.hadoop.hive.metastore.api.Table();
     msTbl.setParameters(new HashMap<String, String>());
@@ -49,29 +49,29 @@ public class StatsExtrapolationTest extends FrontendTestBase {
       msTbl.getParameters().put(StatsSetupConst.ROW_COUNT,
           String.valueOf(rowCount));
     }
-    if (rawDataSize != null) {
-      msTbl.getParameters().put(StatsSetupConst.RAW_DATA_SIZE,
-          String.valueOf(rawDataSize));
+    if (totalSize != null) {
+      msTbl.getParameters().put(StatsSetupConst.TOTAL_SIZE,
+          String.valueOf(totalSize));
     }
     tbl.setMetaStoreTable(msTbl);
     tbl.setTableStats(msTbl);
   }
 
-  private void runTest(Table tbl, Long rowCount, Long rawDataSize,
+  private void runTest(Table tbl, Long rowCount, Long totalSize,
       long fileBytes, long expectedExtrapNumRows) {
     Preconditions.checkState(tbl instanceof HdfsTable);
-    setStats(tbl, rowCount, rawDataSize);
+    setStats(tbl, rowCount, totalSize);
     long actualExrtapNumRows = ((HdfsTable)tbl).getExtrapolatedNumRows(fileBytes);
     assertEquals(expectedExtrapNumRows, actualExrtapNumRows);
   }
 
-  private void testInvalidStats(Table tbl, Long rowCount, Long rawDataSize) {
-    runTest(tbl, rowCount, rawDataSize, 0, 0);
-    runTest(tbl, rowCount, rawDataSize, 1, -1);
-    runTest(tbl, rowCount, rawDataSize, 100, -1);
-    runTest(tbl, rowCount, rawDataSize, 1000000000, -1);
-    runTest(tbl, rowCount, rawDataSize, Long.MAX_VALUE, -1);
-    runTest(tbl, rowCount, rawDataSize, Long.MIN_VALUE, -1);
+  private void testInvalidStats(Table tbl, Long rowCount, Long totalSize) {
+    runTest(tbl, rowCount, totalSize, 0, 0);
+    runTest(tbl, rowCount, totalSize, 1, -1);
+    runTest(tbl, rowCount, totalSize, 100, -1);
+    runTest(tbl, rowCount, totalSize, 1000000000, -1);
+    runTest(tbl, rowCount, totalSize, Long.MAX_VALUE, -1);
+    runTest(tbl, rowCount, totalSize, Long.MIN_VALUE, -1);
   }
 
   @Test
@@ -99,13 +99,13 @@ public class StatsExtrapolationTest extends FrontendTestBase {
       runTest(tbl, 1000000000L, 123456789L, 123456789*3, 3000000000L);
       runTest(tbl, 7777777777L, 33333333L, 33333333L*2, 15555555554L);
       runTest(tbl, 7777777777L, 33333333L, 33333333L*3, 23333333331L);
-      // Very small row count and very big raw data size.
+      // Very small row count and very big total file size.
       runTest(tbl, 1L, Long.MAX_VALUE, 1, 1);
       runTest(tbl, 1L, Long.MAX_VALUE, 100, 1);
       runTest(tbl, 1L, Long.MAX_VALUE, 1000000000, 1);
       runTest(tbl, 1L, Long.MAX_VALUE, Long.MAX_VALUE, 1);
       runTest(tbl, 1L, Long.MAX_VALUE, -100, -1);
-      // Very large row count and very small raw data size.
+      // Very large row count and very small total file size.
       runTest(tbl, Long.MAX_VALUE, 1L, 1, Long.MAX_VALUE);
       runTest(tbl, Long.MAX_VALUE, 1L, 100, Long.MAX_VALUE);
       runTest(tbl, Long.MAX_VALUE, 1L, 1000000000, Long.MAX_VALUE);

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/71fd1941/testdata/workloads/functional-planner/queries/PlannerTest/fk-pk-join-detection.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/fk-pk-join-detection.test b/testdata/workloads/functional-planner/queries/PlannerTest/fk-pk-join-detection.test
index 4af1bef..9dc9f22 100644
--- a/testdata/workloads/functional-planner/queries/PlannerTest/fk-pk-join-detection.test
+++ b/testdata/workloads/functional-planner/queries/PlannerTest/fk-pk-join-detection.test
@@ -360,7 +360,7 @@ PLAN-ROOT SINK
 |--01:SCAN HDFS [tpcds_seq_snap.customer]
 |     partitions=1/1 files=1 size=8.58MB
 |     stats-rows=unavailable extrapolated-rows=disabled
-|     table stats: rows=unavailable size=unavailable
+|     table stats: rows=unavailable size=8.59MB
 |     column stats: unavailable
 |     mem-estimate=48.00MB mem-reservation=0B
 |     tuple-ids=1 row-size=4B cardinality=unavailable


Mime
View raw message