hive-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From vihan...@apache.org
Subject hive git commit: HIVE-17874 : Parquet vectorization fails on tables with complex columns when there are no projected columns (Vihang Karajgaonkar, reviewed by Ferdinand Xu)
Date Mon, 06 Nov 2017 22:26:06 GMT
Repository: hive
Updated Branches:
  refs/heads/branch-2 e2d5d0005 -> 307f58270


HIVE-17874 : Parquet vectorization fails on tables with complex columns when there are no projected columns (Vihang Karajgaonkar, reviewed by Ferdinand Xu)


Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/307f5827
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/307f5827
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/307f5827

Branch: refs/heads/branch-2
Commit: 307f58270b7f7abd7311f192540fc969cd379869
Parents: e2d5d00
Author: Vihang Karajgaonkar <vihang@cloudera.com>
Authored: Mon Nov 6 14:16:07 2017 -0800
Committer: Vihang Karajgaonkar <vihang@cloudera.com>
Committed: Mon Nov 6 14:16:07 2017 -0800

----------------------------------------------------------------------
 .../test/resources/testconfiguration.properties |   1 +
 .../vector/VectorizedParquetRecordReader.java   |  28 +-
 .../vectorization_parquet_projection.q          |  77 ++++
 .../vectorization_parquet_projection.q.out      | 459 +++++++++++++++++++
 .../vectorization_parquet_projection.q.out      | 426 +++++++++++++++++
 5 files changed, 979 insertions(+), 12 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hive/blob/307f5827/itests/src/test/resources/testconfiguration.properties
----------------------------------------------------------------------
diff --git a/itests/src/test/resources/testconfiguration.properties b/itests/src/test/resources/testconfiguration.properties
index 639ffa8..e2c59f2 100644
--- a/itests/src/test/resources/testconfiguration.properties
+++ b/itests/src/test/resources/testconfiguration.properties
@@ -1343,6 +1343,7 @@ spark.query.files=add_part_multiple.q, \
   vectorization_not.q, \
   vectorization_part.q, \
   vectorization_part_project.q, \
+  vectorization_parquet_projection.q, \
   vectorization_pushdown.q, \
   vectorization_short_regress.q, \
   vectorized_case.q, \

http://git-wip-us.apache.org/repos/asf/hive/blob/307f5827/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/VectorizedParquetRecordReader.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/VectorizedParquetRecordReader.java b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/VectorizedParquetRecordReader.java
index 9359098..190c639 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/VectorizedParquetRecordReader.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/VectorizedParquetRecordReader.java
@@ -72,7 +72,6 @@ public class VectorizedParquetRecordReader extends ParquetRecordReaderBase
   private List<String> columnNamesList;
   private List<TypeInfo> columnTypesList;
   private VectorizedRowBatchCtx rbCtx;
-  private List<Integer> indexColumnsWanted;
   private Object[] partitionValues;
 
   /**
@@ -105,8 +104,6 @@ public class VectorizedParquetRecordReader extends ParquetRecordReaderBase
       serDeStats = new SerDeStats();
       projectionPusher = new ProjectionPusher();
       initialize(inputSplit, conf);
-      colsToInclude = ColumnProjectionUtils.getReadColumnIDs(conf);
-      rbCtx = Utilities.getVectorizedRowBatchCtx(conf);
     } catch (Throwable e) {
       LOG.error("Failed to create the vectorized reader due to exception " + e);
       throw new RuntimeException(e);
@@ -123,8 +120,6 @@ public class VectorizedParquetRecordReader extends ParquetRecordReaderBase
       if (inputSplit != null) {
         initialize(inputSplit, conf);
       }
-      colsToInclude = ColumnProjectionUtils.getReadColumnIDs(conf);
-      rbCtx = Utilities.getVectorizedRowBatchCtx(conf);
       initPartitionValues((FileSplit) oldInputSplit, conf);
     } catch (Throwable e) {
       LOG.error("Failed to create the vectorized reader due to exception " + e);
@@ -145,11 +140,14 @@ public class VectorizedParquetRecordReader extends ParquetRecordReaderBase
   public void initialize(
     InputSplit oldSplit,
     JobConf configuration) throws IOException, InterruptedException {
+    colsToInclude = ColumnProjectionUtils.getReadColumnIDs(configuration);
+    //initialize the rowbatchContext
+    jobConf = configuration;
+    rbCtx = Utilities.getVectorizedRowBatchCtx(jobConf);
     // the oldSplit may be null during the split phase
     if (oldSplit == null) {
       return;
     }
-    jobConf = configuration;
     ParquetMetadata footer;
     List<BlockMetaData> blocks;
     ParquetInputSplit split = (ParquetInputSplit) oldSplit;
@@ -206,7 +204,7 @@ public class VectorizedParquetRecordReader extends ParquetRecordReaderBase
     }
     this.fileSchema = footer.getFileMetaData().getSchema();
 
-    indexColumnsWanted = ColumnProjectionUtils.getReadColumnIDs(configuration);
+    colsToInclude = ColumnProjectionUtils.getReadColumnIDs(configuration);
     requestedSchema = DataWritableReadSupport
       .getRequestedSchema(indexAccess, columnNamesList, columnTypesList, fileSchema, configuration);
 
@@ -294,11 +292,17 @@ public class VectorizedParquetRecordReader extends ParquetRecordReaderBase
     List<Type> types = requestedSchema.getFields();
     columnReaders = new VectorizedColumnReader[columns.size()];
 
-    if (!ColumnProjectionUtils.isReadAllColumns(jobConf) && !indexColumnsWanted.isEmpty()) {
-      for (int i = 0; i < types.size(); ++i) {
-        columnReaders[i] =
-          buildVectorizedParquetReader(columnTypesList.get(indexColumnsWanted.get(i)), types.get(i),
-            pages, requestedSchema.getColumns(), skipTimestampConversion, 0);
+    if (!ColumnProjectionUtils.isReadAllColumns(jobConf)) {
+      //certain queries like select count(*) from table do not have
+      //any projected columns and still have isReadAllColumns as false
+      //in such cases columnReaders are not needed
+      //However, if colsToInclude is not empty we should initialize each columnReader
+      if(!colsToInclude.isEmpty()) {
+        for (int i = 0; i < types.size(); ++i) {
+          columnReaders[i] =
+              buildVectorizedParquetReader(columnTypesList.get(colsToInclude.get(i)), types.get(i),
+                  pages, requestedSchema.getColumns(), skipTimestampConversion, 0);
+        }
       }
     } else {
       for (int i = 0; i < types.size(); ++i) {

http://git-wip-us.apache.org/repos/asf/hive/blob/307f5827/ql/src/test/queries/clientpositive/vectorization_parquet_projection.q
----------------------------------------------------------------------
diff --git a/ql/src/test/queries/clientpositive/vectorization_parquet_projection.q b/ql/src/test/queries/clientpositive/vectorization_parquet_projection.q
new file mode 100644
index 0000000..76fbf0e
--- /dev/null
+++ b/ql/src/test/queries/clientpositive/vectorization_parquet_projection.q
@@ -0,0 +1,77 @@
+set hive.fetch.task.conversion=none;
+set hive.compute.query.using.stats=false;
+set hive.vectorized.use.row.serde.deserialize=false;
+set hive.vectorized.use.vector.serde.deserialize=false;
+set hive.vectorized.execution.enabled=true;
+set hive.vectorized.execution.reduce.enabled=true;
+set hive.mapred.mode=nonstrict;
+set hive.llap.cache.allow.synthetic.fileid=true;
+
+-- SORT_QUERY_RESULTS
+
+DROP TABLE IF EXISTS parquet_types_staging;
+
+CREATE TABLE parquet_types_staging (
+  cint int,
+  ctinyint tinyint,
+  csmallint smallint,
+  cfloat float,
+  cdouble double,
+  cstring1 string,
+  t timestamp,
+  cchar char(5),
+  cvarchar varchar(10),
+  cbinary string,
+  m1 map<string, varchar(3)>,
+  l1 array<int>,
+  st1 struct<c1:int, c2:char(1)>,
+  d date
+) ROW FORMAT DELIMITED
+FIELDS TERMINATED BY '|'
+COLLECTION ITEMS TERMINATED BY ','
+MAP KEYS TERMINATED BY ':';
+
+LOAD DATA LOCAL INPATH '../../data/files/parquet_types.txt' OVERWRITE INTO TABLE parquet_types_staging;
+
+-- test various number of projected columns
+
+DROP TABLE IF EXISTS parquet_project_test;
+
+CREATE TABLE parquet_project_test(
+cint int,
+m1 map<string, string>
+) STORED AS PARQUET;
+
+insert into parquet_project_test
+select ctinyint, map("color","red") from parquet_types_staging
+where ctinyint = 1;
+
+insert into parquet_project_test
+select ctinyint, map("color","green") from parquet_types_staging
+where ctinyint = 2;
+
+insert into parquet_project_test
+select ctinyint, map("color","blue") from parquet_types_staging
+where ctinyint = 3;
+
+-- no columns in the projection
+explain vectorization select * from parquet_project_test;
+select * from parquet_project_test;
+
+-- no columns in the projection, just count(*)
+explain vectorization select count(*) from parquet_project_test;
+select count(*) from parquet_project_test;
+
+-- project a primitive type
+explain vectorization select cint, count(*) from parquet_project_test
+group by cint;
+
+select cint, count(*) from parquet_project_test
+group by cint;
+
+-- test complex type in projection, this should not get vectorized
+explain vectorization select m1["color"], count(*) from parquet_project_test
+group by m1["color"];
+
+select m1["color"], count(*) from parquet_project_test
+group by m1["color"];

http://git-wip-us.apache.org/repos/asf/hive/blob/307f5827/ql/src/test/results/clientpositive/spark/vectorization_parquet_projection.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/spark/vectorization_parquet_projection.q.out b/ql/src/test/results/clientpositive/spark/vectorization_parquet_projection.q.out
new file mode 100644
index 0000000..c27e61c
--- /dev/null
+++ b/ql/src/test/results/clientpositive/spark/vectorization_parquet_projection.q.out
@@ -0,0 +1,459 @@
+PREHOOK: query: DROP TABLE IF EXISTS parquet_types_staging
+PREHOOK: type: DROPTABLE
+POSTHOOK: query: DROP TABLE IF EXISTS parquet_types_staging
+POSTHOOK: type: DROPTABLE
+PREHOOK: query: CREATE TABLE parquet_types_staging (
+  cint int,
+  ctinyint tinyint,
+  csmallint smallint,
+  cfloat float,
+  cdouble double,
+  cstring1 string,
+  t timestamp,
+  cchar char(5),
+  cvarchar varchar(10),
+  cbinary string,
+  m1 map<string, varchar(3)>,
+  l1 array<int>,
+  st1 struct<c1:int, c2:char(1)>,
+  d date
+) ROW FORMAT DELIMITED
+FIELDS TERMINATED BY '|'
+COLLECTION ITEMS TERMINATED BY ','
+MAP KEYS TERMINATED BY ':'
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@parquet_types_staging
+POSTHOOK: query: CREATE TABLE parquet_types_staging (
+  cint int,
+  ctinyint tinyint,
+  csmallint smallint,
+  cfloat float,
+  cdouble double,
+  cstring1 string,
+  t timestamp,
+  cchar char(5),
+  cvarchar varchar(10),
+  cbinary string,
+  m1 map<string, varchar(3)>,
+  l1 array<int>,
+  st1 struct<c1:int, c2:char(1)>,
+  d date
+) ROW FORMAT DELIMITED
+FIELDS TERMINATED BY '|'
+COLLECTION ITEMS TERMINATED BY ','
+MAP KEYS TERMINATED BY ':'
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@parquet_types_staging
+PREHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/parquet_types.txt' OVERWRITE INTO TABLE parquet_types_staging
+PREHOOK: type: LOAD
+#### A masked pattern was here ####
+PREHOOK: Output: default@parquet_types_staging
+POSTHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/parquet_types.txt' OVERWRITE INTO TABLE parquet_types_staging
+POSTHOOK: type: LOAD
+#### A masked pattern was here ####
+POSTHOOK: Output: default@parquet_types_staging
+PREHOOK: query: DROP TABLE IF EXISTS parquet_project_test
+PREHOOK: type: DROPTABLE
+POSTHOOK: query: DROP TABLE IF EXISTS parquet_project_test
+POSTHOOK: type: DROPTABLE
+PREHOOK: query: CREATE TABLE parquet_project_test(
+cint int,
+m1 map<string, string>
+) STORED AS PARQUET
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@parquet_project_test
+POSTHOOK: query: CREATE TABLE parquet_project_test(
+cint int,
+m1 map<string, string>
+) STORED AS PARQUET
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@parquet_project_test
+PREHOOK: query: insert into parquet_project_test
+select ctinyint, map("color","red") from parquet_types_staging
+where ctinyint = 1
+PREHOOK: type: QUERY
+PREHOOK: Input: default@parquet_types_staging
+PREHOOK: Output: default@parquet_project_test
+POSTHOOK: query: insert into parquet_project_test
+select ctinyint, map("color","red") from parquet_types_staging
+where ctinyint = 1
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@parquet_types_staging
+POSTHOOK: Output: default@parquet_project_test
+POSTHOOK: Lineage: parquet_project_test.cint EXPRESSION []
+POSTHOOK: Lineage: parquet_project_test.m1 EXPRESSION []
+PREHOOK: query: insert into parquet_project_test
+select ctinyint, map("color","green") from parquet_types_staging
+where ctinyint = 2
+PREHOOK: type: QUERY
+PREHOOK: Input: default@parquet_types_staging
+PREHOOK: Output: default@parquet_project_test
+POSTHOOK: query: insert into parquet_project_test
+select ctinyint, map("color","green") from parquet_types_staging
+where ctinyint = 2
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@parquet_types_staging
+POSTHOOK: Output: default@parquet_project_test
+POSTHOOK: Lineage: parquet_project_test.cint EXPRESSION []
+POSTHOOK: Lineage: parquet_project_test.m1 EXPRESSION []
+PREHOOK: query: insert into parquet_project_test
+select ctinyint, map("color","blue") from parquet_types_staging
+where ctinyint = 3
+PREHOOK: type: QUERY
+PREHOOK: Input: default@parquet_types_staging
+PREHOOK: Output: default@parquet_project_test
+POSTHOOK: query: insert into parquet_project_test
+select ctinyint, map("color","blue") from parquet_types_staging
+where ctinyint = 3
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@parquet_types_staging
+POSTHOOK: Output: default@parquet_project_test
+POSTHOOK: Lineage: parquet_project_test.cint EXPRESSION []
+POSTHOOK: Lineage: parquet_project_test.m1 EXPRESSION []
+PREHOOK: query: explain vectorization select * from parquet_project_test
+PREHOOK: type: QUERY
+POSTHOOK: query: explain vectorization select * from parquet_project_test
+POSTHOOK: type: QUERY
+PLAN VECTORIZATION:
+  enabled: true
+  enabledConditionsMet: [hive.vectorized.execution.enabled IS true]
+
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Spark
+#### A masked pattern was here ####
+      Vertices:
+        Map 1 
+            Map Operator Tree:
+                TableScan
+                  alias: parquet_project_test
+                  Statistics: Num rows: 22 Data size: 44 Basic stats: COMPLETE Column stats: NONE
+                  Select Operator
+                    expressions: cint (type: int), m1 (type: map<string,string>)
+                    outputColumnNames: _col0, _col1
+                    Statistics: Num rows: 22 Data size: 44 Basic stats: COMPLETE Column stats: NONE
+                    File Output Operator
+                      compressed: false
+                      Statistics: Num rows: 22 Data size: 44 Basic stats: COMPLETE Column stats: NONE
+                      table:
+                          input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                          output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                          serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+            Map Vectorization:
+                enabled: true
+                enabledConditionsMet: hive.vectorized.use.vectorized.input.format IS true
+                inputFileFormats: org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat
+                notVectorizedReason: Select expression for SELECT operator: Data type map<string,string> of Column[m1] not supported
+                vectorized: false
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+      Processor Tree:
+        ListSink
+
+PREHOOK: query: select * from parquet_project_test
+PREHOOK: type: QUERY
+PREHOOK: Input: default@parquet_project_test
+#### A masked pattern was here ####
+POSTHOOK: query: select * from parquet_project_test
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@parquet_project_test
+#### A masked pattern was here ####
+1	{"color":"red"}
+1	{"color":"red"}
+1	{"color":"red"}
+1	{"color":"red"}
+1	{"color":"red"}
+1	{"color":"red"}
+1	{"color":"red"}
+1	{"color":"red"}
+2	{"color":"green"}
+2	{"color":"green"}
+2	{"color":"green"}
+2	{"color":"green"}
+2	{"color":"green"}
+2	{"color":"green"}
+2	{"color":"green"}
+3	{"color":"blue"}
+3	{"color":"blue"}
+3	{"color":"blue"}
+3	{"color":"blue"}
+3	{"color":"blue"}
+3	{"color":"blue"}
+3	{"color":"blue"}
+PREHOOK: query: explain vectorization select count(*) from parquet_project_test
+PREHOOK: type: QUERY
+POSTHOOK: query: explain vectorization select count(*) from parquet_project_test
+POSTHOOK: type: QUERY
+PLAN VECTORIZATION:
+  enabled: true
+  enabledConditionsMet: [hive.vectorized.execution.enabled IS true]
+
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Spark
+      Edges:
+        Reducer 2 <- Map 1 (GROUP, 1)
+#### A masked pattern was here ####
+      Vertices:
+        Map 1 
+            Map Operator Tree:
+                TableScan
+                  alias: parquet_project_test
+                  Statistics: Num rows: 22 Data size: 44 Basic stats: COMPLETE Column stats: COMPLETE
+                  Select Operator
+                    Statistics: Num rows: 22 Data size: 44 Basic stats: COMPLETE Column stats: COMPLETE
+                    Group By Operator
+                      aggregations: count()
+                      mode: hash
+                      outputColumnNames: _col0
+                      Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE
+                      Reduce Output Operator
+                        sort order: 
+                        Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE
+                        value expressions: _col0 (type: bigint)
+            Execution mode: vectorized
+            Map Vectorization:
+                enabled: true
+                enabledConditionsMet: hive.vectorized.use.vectorized.input.format IS true
+                groupByVectorOutput: true
+                inputFileFormats: org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat
+                allNative: false
+                usesVectorUDFAdaptor: false
+                vectorized: true
+        Reducer 2 
+            Execution mode: vectorized
+            Reduce Vectorization:
+                enabled: true
+                enableConditionsMet: hive.vectorized.execution.reduce.enabled IS true, hive.execution.engine spark IN [tez, spark] IS true
+                groupByVectorOutput: true
+                allNative: false
+                usesVectorUDFAdaptor: false
+                vectorized: true
+            Reduce Operator Tree:
+              Group By Operator
+                aggregations: count(VALUE._col0)
+                mode: mergepartial
+                outputColumnNames: _col0
+                Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE
+                File Output Operator
+                  compressed: false
+                  Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE
+                  table:
+                      input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                      output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                      serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+      Processor Tree:
+        ListSink
+
+PREHOOK: query: select count(*) from parquet_project_test
+PREHOOK: type: QUERY
+PREHOOK: Input: default@parquet_project_test
+#### A masked pattern was here ####
+POSTHOOK: query: select count(*) from parquet_project_test
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@parquet_project_test
+#### A masked pattern was here ####
+22
+PREHOOK: query: explain vectorization select cint, count(*) from parquet_project_test
+group by cint
+PREHOOK: type: QUERY
+POSTHOOK: query: explain vectorization select cint, count(*) from parquet_project_test
+group by cint
+POSTHOOK: type: QUERY
+PLAN VECTORIZATION:
+  enabled: true
+  enabledConditionsMet: [hive.vectorized.execution.enabled IS true]
+
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Spark
+      Edges:
+        Reducer 2 <- Map 1 (GROUP, 2)
+#### A masked pattern was here ####
+      Vertices:
+        Map 1 
+            Map Operator Tree:
+                TableScan
+                  alias: parquet_project_test
+                  Statistics: Num rows: 22 Data size: 44 Basic stats: COMPLETE Column stats: NONE
+                  Select Operator
+                    expressions: cint (type: int)
+                    outputColumnNames: cint
+                    Statistics: Num rows: 22 Data size: 44 Basic stats: COMPLETE Column stats: NONE
+                    Group By Operator
+                      aggregations: count()
+                      keys: cint (type: int)
+                      mode: hash
+                      outputColumnNames: _col0, _col1
+                      Statistics: Num rows: 22 Data size: 44 Basic stats: COMPLETE Column stats: NONE
+                      Reduce Output Operator
+                        key expressions: _col0 (type: int)
+                        sort order: +
+                        Map-reduce partition columns: _col0 (type: int)
+                        Statistics: Num rows: 22 Data size: 44 Basic stats: COMPLETE Column stats: NONE
+                        value expressions: _col1 (type: bigint)
+            Execution mode: vectorized
+            Map Vectorization:
+                enabled: true
+                enabledConditionsMet: hive.vectorized.use.vectorized.input.format IS true
+                groupByVectorOutput: true
+                inputFileFormats: org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat
+                allNative: false
+                usesVectorUDFAdaptor: false
+                vectorized: true
+        Reducer 2 
+            Execution mode: vectorized
+            Reduce Vectorization:
+                enabled: true
+                enableConditionsMet: hive.vectorized.execution.reduce.enabled IS true, hive.execution.engine spark IN [tez, spark] IS true
+                groupByVectorOutput: true
+                allNative: false
+                usesVectorUDFAdaptor: false
+                vectorized: true
+            Reduce Operator Tree:
+              Group By Operator
+                aggregations: count(VALUE._col0)
+                keys: KEY._col0 (type: int)
+                mode: mergepartial
+                outputColumnNames: _col0, _col1
+                Statistics: Num rows: 11 Data size: 22 Basic stats: COMPLETE Column stats: NONE
+                File Output Operator
+                  compressed: false
+                  Statistics: Num rows: 11 Data size: 22 Basic stats: COMPLETE Column stats: NONE
+                  table:
+                      input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                      output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                      serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+      Processor Tree:
+        ListSink
+
+PREHOOK: query: select cint, count(*) from parquet_project_test
+group by cint
+PREHOOK: type: QUERY
+PREHOOK: Input: default@parquet_project_test
+#### A masked pattern was here ####
+POSTHOOK: query: select cint, count(*) from parquet_project_test
+group by cint
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@parquet_project_test
+#### A masked pattern was here ####
+1	8
+2	7
+3	7
+PREHOOK: query: explain vectorization select m1["color"], count(*) from parquet_project_test
+group by m1["color"]
+PREHOOK: type: QUERY
+POSTHOOK: query: explain vectorization select m1["color"], count(*) from parquet_project_test
+group by m1["color"]
+POSTHOOK: type: QUERY
+PLAN VECTORIZATION:
+  enabled: true
+  enabledConditionsMet: [hive.vectorized.execution.enabled IS true]
+
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Spark
+      Edges:
+        Reducer 2 <- Map 1 (GROUP, 2)
+#### A masked pattern was here ####
+      Vertices:
+        Map 1 
+            Map Operator Tree:
+                TableScan
+                  alias: parquet_project_test
+                  Statistics: Num rows: 22 Data size: 44 Basic stats: COMPLETE Column stats: NONE
+                  Select Operator
+                    expressions: m1['color'] (type: string)
+                    outputColumnNames: _col0
+                    Statistics: Num rows: 22 Data size: 44 Basic stats: COMPLETE Column stats: NONE
+                    Group By Operator
+                      aggregations: count()
+                      keys: _col0 (type: string)
+                      mode: hash
+                      outputColumnNames: _col0, _col1
+                      Statistics: Num rows: 22 Data size: 44 Basic stats: COMPLETE Column stats: NONE
+                      Reduce Output Operator
+                        key expressions: _col0 (type: string)
+                        sort order: +
+                        Map-reduce partition columns: _col0 (type: string)
+                        Statistics: Num rows: 22 Data size: 44 Basic stats: COMPLETE Column stats: NONE
+                        value expressions: _col1 (type: bigint)
+            Map Vectorization:
+                enabled: true
+                enabledConditionsMet: hive.vectorized.use.vectorized.input.format IS true
+                inputFileFormats: org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat
+                notVectorizedReason: Select expression for SELECT operator: Data type map<string,string> of Column[m1] not supported
+                vectorized: false
+        Reducer 2 
+            Execution mode: vectorized
+            Reduce Vectorization:
+                enabled: true
+                enableConditionsMet: hive.vectorized.execution.reduce.enabled IS true, hive.execution.engine spark IN [tez, spark] IS true
+                groupByVectorOutput: true
+                allNative: false
+                usesVectorUDFAdaptor: false
+                vectorized: true
+            Reduce Operator Tree:
+              Group By Operator
+                aggregations: count(VALUE._col0)
+                keys: KEY._col0 (type: string)
+                mode: mergepartial
+                outputColumnNames: _col0, _col1
+                Statistics: Num rows: 11 Data size: 22 Basic stats: COMPLETE Column stats: NONE
+                File Output Operator
+                  compressed: false
+                  Statistics: Num rows: 11 Data size: 22 Basic stats: COMPLETE Column stats: NONE
+                  table:
+                      input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                      output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                      serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+      Processor Tree:
+        ListSink
+
+PREHOOK: query: select m1["color"], count(*) from parquet_project_test
+group by m1["color"]
+PREHOOK: type: QUERY
+PREHOOK: Input: default@parquet_project_test
+#### A masked pattern was here ####
+POSTHOOK: query: select m1["color"], count(*) from parquet_project_test
+group by m1["color"]
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@parquet_project_test
+#### A masked pattern was here ####
+blue	7
+green	7
+red	8

http://git-wip-us.apache.org/repos/asf/hive/blob/307f5827/ql/src/test/results/clientpositive/vectorization_parquet_projection.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/vectorization_parquet_projection.q.out b/ql/src/test/results/clientpositive/vectorization_parquet_projection.q.out
new file mode 100644
index 0000000..02a28de
--- /dev/null
+++ b/ql/src/test/results/clientpositive/vectorization_parquet_projection.q.out
@@ -0,0 +1,426 @@
+PREHOOK: query: DROP TABLE IF EXISTS parquet_types_staging
+PREHOOK: type: DROPTABLE
+POSTHOOK: query: DROP TABLE IF EXISTS parquet_types_staging
+POSTHOOK: type: DROPTABLE
+PREHOOK: query: CREATE TABLE parquet_types_staging (
+  cint int,
+  ctinyint tinyint,
+  csmallint smallint,
+  cfloat float,
+  cdouble double,
+  cstring1 string,
+  t timestamp,
+  cchar char(5),
+  cvarchar varchar(10),
+  cbinary string,
+  m1 map<string, varchar(3)>,
+  l1 array<int>,
+  st1 struct<c1:int, c2:char(1)>,
+  d date
+) ROW FORMAT DELIMITED
+FIELDS TERMINATED BY '|'
+COLLECTION ITEMS TERMINATED BY ','
+MAP KEYS TERMINATED BY ':'
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@parquet_types_staging
+POSTHOOK: query: CREATE TABLE parquet_types_staging (
+  cint int,
+  ctinyint tinyint,
+  csmallint smallint,
+  cfloat float,
+  cdouble double,
+  cstring1 string,
+  t timestamp,
+  cchar char(5),
+  cvarchar varchar(10),
+  cbinary string,
+  m1 map<string, varchar(3)>,
+  l1 array<int>,
+  st1 struct<c1:int, c2:char(1)>,
+  d date
+) ROW FORMAT DELIMITED
+FIELDS TERMINATED BY '|'
+COLLECTION ITEMS TERMINATED BY ','
+MAP KEYS TERMINATED BY ':'
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@parquet_types_staging
+PREHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/parquet_types.txt' OVERWRITE INTO TABLE parquet_types_staging
+PREHOOK: type: LOAD
+#### A masked pattern was here ####
+PREHOOK: Output: default@parquet_types_staging
+POSTHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/parquet_types.txt' OVERWRITE INTO TABLE parquet_types_staging
+POSTHOOK: type: LOAD
+#### A masked pattern was here ####
+POSTHOOK: Output: default@parquet_types_staging
+PREHOOK: query: DROP TABLE IF EXISTS parquet_project_test
+PREHOOK: type: DROPTABLE
+POSTHOOK: query: DROP TABLE IF EXISTS parquet_project_test
+POSTHOOK: type: DROPTABLE
+PREHOOK: query: CREATE TABLE parquet_project_test(
+cint int,
+m1 map<string, string>
+) STORED AS PARQUET
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@parquet_project_test
+POSTHOOK: query: CREATE TABLE parquet_project_test(
+cint int,
+m1 map<string, string>
+) STORED AS PARQUET
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@parquet_project_test
+PREHOOK: query: insert into parquet_project_test
+select ctinyint, map("color","red") from parquet_types_staging
+where ctinyint = 1
+PREHOOK: type: QUERY
+PREHOOK: Input: default@parquet_types_staging
+PREHOOK: Output: default@parquet_project_test
+POSTHOOK: query: insert into parquet_project_test
+select ctinyint, map("color","red") from parquet_types_staging
+where ctinyint = 1
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@parquet_types_staging
+POSTHOOK: Output: default@parquet_project_test
+POSTHOOK: Lineage: parquet_project_test.cint EXPRESSION []
+POSTHOOK: Lineage: parquet_project_test.m1 EXPRESSION []
+PREHOOK: query: insert into parquet_project_test
+select ctinyint, map("color","green") from parquet_types_staging
+where ctinyint = 2
+PREHOOK: type: QUERY
+PREHOOK: Input: default@parquet_types_staging
+PREHOOK: Output: default@parquet_project_test
+POSTHOOK: query: insert into parquet_project_test
+select ctinyint, map("color","green") from parquet_types_staging
+where ctinyint = 2
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@parquet_types_staging
+POSTHOOK: Output: default@parquet_project_test
+POSTHOOK: Lineage: parquet_project_test.cint EXPRESSION []
+POSTHOOK: Lineage: parquet_project_test.m1 EXPRESSION []
+PREHOOK: query: insert into parquet_project_test
+select ctinyint, map("color","blue") from parquet_types_staging
+where ctinyint = 3
+PREHOOK: type: QUERY
+PREHOOK: Input: default@parquet_types_staging
+PREHOOK: Output: default@parquet_project_test
+POSTHOOK: query: insert into parquet_project_test
+select ctinyint, map("color","blue") from parquet_types_staging
+where ctinyint = 3
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@parquet_types_staging
+POSTHOOK: Output: default@parquet_project_test
+POSTHOOK: Lineage: parquet_project_test.cint EXPRESSION []
+POSTHOOK: Lineage: parquet_project_test.m1 EXPRESSION []
+PREHOOK: query: explain vectorization select * from parquet_project_test
+PREHOOK: type: QUERY
+POSTHOOK: query: explain vectorization select * from parquet_project_test
+POSTHOOK: type: QUERY
+PLAN VECTORIZATION:
+  enabled: true
+  enabledConditionsMet: [hive.vectorized.execution.enabled IS true]
+
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Map Reduce
+      Map Operator Tree:
+          TableScan
+            alias: parquet_project_test
+            Statistics: Num rows: 22 Data size: 44 Basic stats: COMPLETE Column stats: NONE
+            Select Operator
+              expressions: cint (type: int), m1 (type: map<string,string>)
+              outputColumnNames: _col0, _col1
+              Statistics: Num rows: 22 Data size: 44 Basic stats: COMPLETE Column stats: NONE
+              File Output Operator
+                compressed: false
+                Statistics: Num rows: 22 Data size: 44 Basic stats: COMPLETE Column stats: NONE
+                table:
+                    input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                    output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                    serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+      Map Vectorization:
+          enabled: true
+          enabledConditionsMet: hive.vectorized.use.vectorized.input.format IS true
+          inputFileFormats: org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat
+          notVectorizedReason: Select expression for SELECT operator: Data type map<string,string> of Column[m1] not supported
+          vectorized: false
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+      Processor Tree:
+        ListSink
+
+PREHOOK: query: select * from parquet_project_test
+PREHOOK: type: QUERY
+PREHOOK: Input: default@parquet_project_test
+#### A masked pattern was here ####
+POSTHOOK: query: select * from parquet_project_test
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@parquet_project_test
+#### A masked pattern was here ####
+1	{"color":"red"}
+1	{"color":"red"}
+1	{"color":"red"}
+1	{"color":"red"}
+1	{"color":"red"}
+1	{"color":"red"}
+1	{"color":"red"}
+1	{"color":"red"}
+2	{"color":"green"}
+2	{"color":"green"}
+2	{"color":"green"}
+2	{"color":"green"}
+2	{"color":"green"}
+2	{"color":"green"}
+2	{"color":"green"}
+3	{"color":"blue"}
+3	{"color":"blue"}
+3	{"color":"blue"}
+3	{"color":"blue"}
+3	{"color":"blue"}
+3	{"color":"blue"}
+3	{"color":"blue"}
+PREHOOK: query: explain vectorization select count(*) from parquet_project_test
+PREHOOK: type: QUERY
+POSTHOOK: query: explain vectorization select count(*) from parquet_project_test
+POSTHOOK: type: QUERY
+PLAN VECTORIZATION:
+  enabled: true
+  enabledConditionsMet: [hive.vectorized.execution.enabled IS true]
+
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Map Reduce
+      Map Operator Tree:
+          TableScan
+            alias: parquet_project_test
+            Statistics: Num rows: 22 Data size: 44 Basic stats: COMPLETE Column stats: COMPLETE
+            Select Operator
+              Statistics: Num rows: 22 Data size: 44 Basic stats: COMPLETE Column stats: COMPLETE
+              Group By Operator
+                aggregations: count()
+                mode: hash
+                outputColumnNames: _col0
+                Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE
+                Reduce Output Operator
+                  sort order: 
+                  Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE
+                  value expressions: _col0 (type: bigint)
+      Execution mode: vectorized
+      Map Vectorization:
+          enabled: true
+          enabledConditionsMet: hive.vectorized.use.vectorized.input.format IS true
+          groupByVectorOutput: true
+          inputFileFormats: org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat
+          allNative: false
+          usesVectorUDFAdaptor: false
+          vectorized: true
+      Reduce Vectorization:
+          enabled: false
+          enableConditionsMet: hive.vectorized.execution.reduce.enabled IS true
+          enableConditionsNotMet: hive.execution.engine mr IN [tez, spark] IS false
+      Reduce Operator Tree:
+        Group By Operator
+          aggregations: count(VALUE._col0)
+          mode: mergepartial
+          outputColumnNames: _col0
+          Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE
+          File Output Operator
+            compressed: false
+            Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE
+            table:
+                input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+      Processor Tree:
+        ListSink
+
+PREHOOK: query: select count(*) from parquet_project_test
+PREHOOK: type: QUERY
+PREHOOK: Input: default@parquet_project_test
+#### A masked pattern was here ####
+POSTHOOK: query: select count(*) from parquet_project_test
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@parquet_project_test
+#### A masked pattern was here ####
+22
+PREHOOK: query: explain vectorization select cint, count(*) from parquet_project_test
+group by cint
+PREHOOK: type: QUERY
+POSTHOOK: query: explain vectorization select cint, count(*) from parquet_project_test
+group by cint
+POSTHOOK: type: QUERY
+PLAN VECTORIZATION:
+  enabled: true
+  enabledConditionsMet: [hive.vectorized.execution.enabled IS true]
+
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Map Reduce
+      Map Operator Tree:
+          TableScan
+            alias: parquet_project_test
+            Statistics: Num rows: 22 Data size: 44 Basic stats: COMPLETE Column stats: NONE
+            Select Operator
+              expressions: cint (type: int)
+              outputColumnNames: cint
+              Statistics: Num rows: 22 Data size: 44 Basic stats: COMPLETE Column stats: NONE
+              Group By Operator
+                aggregations: count()
+                keys: cint (type: int)
+                mode: hash
+                outputColumnNames: _col0, _col1
+                Statistics: Num rows: 22 Data size: 44 Basic stats: COMPLETE Column stats: NONE
+                Reduce Output Operator
+                  key expressions: _col0 (type: int)
+                  sort order: +
+                  Map-reduce partition columns: _col0 (type: int)
+                  Statistics: Num rows: 22 Data size: 44 Basic stats: COMPLETE Column stats: NONE
+                  value expressions: _col1 (type: bigint)
+      Execution mode: vectorized
+      Map Vectorization:
+          enabled: true
+          enabledConditionsMet: hive.vectorized.use.vectorized.input.format IS true
+          groupByVectorOutput: true
+          inputFileFormats: org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat
+          allNative: false
+          usesVectorUDFAdaptor: false
+          vectorized: true
+      Reduce Vectorization:
+          enabled: false
+          enableConditionsMet: hive.vectorized.execution.reduce.enabled IS true
+          enableConditionsNotMet: hive.execution.engine mr IN [tez, spark] IS false
+      Reduce Operator Tree:
+        Group By Operator
+          aggregations: count(VALUE._col0)
+          keys: KEY._col0 (type: int)
+          mode: mergepartial
+          outputColumnNames: _col0, _col1
+          Statistics: Num rows: 11 Data size: 22 Basic stats: COMPLETE Column stats: NONE
+          File Output Operator
+            compressed: false
+            Statistics: Num rows: 11 Data size: 22 Basic stats: COMPLETE Column stats: NONE
+            table:
+                input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+      Processor Tree:
+        ListSink
+
+PREHOOK: query: select cint, count(*) from parquet_project_test
+group by cint
+PREHOOK: type: QUERY
+PREHOOK: Input: default@parquet_project_test
+#### A masked pattern was here ####
+POSTHOOK: query: select cint, count(*) from parquet_project_test
+group by cint
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@parquet_project_test
+#### A masked pattern was here ####
+1	8
+2	7
+3	7
+PREHOOK: query: explain vectorization select m1["color"], count(*) from parquet_project_test
+group by m1["color"]
+PREHOOK: type: QUERY
+POSTHOOK: query: explain vectorization select m1["color"], count(*) from parquet_project_test
+group by m1["color"]
+POSTHOOK: type: QUERY
+PLAN VECTORIZATION:
+  enabled: true
+  enabledConditionsMet: [hive.vectorized.execution.enabled IS true]
+
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Map Reduce
+      Map Operator Tree:
+          TableScan
+            alias: parquet_project_test
+            Statistics: Num rows: 22 Data size: 44 Basic stats: COMPLETE Column stats: NONE
+            Select Operator
+              expressions: m1['color'] (type: string)
+              outputColumnNames: _col0
+              Statistics: Num rows: 22 Data size: 44 Basic stats: COMPLETE Column stats: NONE
+              Group By Operator
+                aggregations: count()
+                keys: _col0 (type: string)
+                mode: hash
+                outputColumnNames: _col0, _col1
+                Statistics: Num rows: 22 Data size: 44 Basic stats: COMPLETE Column stats: NONE
+                Reduce Output Operator
+                  key expressions: _col0 (type: string)
+                  sort order: +
+                  Map-reduce partition columns: _col0 (type: string)
+                  Statistics: Num rows: 22 Data size: 44 Basic stats: COMPLETE Column stats: NONE
+                  value expressions: _col1 (type: bigint)
+      Map Vectorization:
+          enabled: true
+          enabledConditionsMet: hive.vectorized.use.vectorized.input.format IS true
+          inputFileFormats: org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat
+          notVectorizedReason: Select expression for SELECT operator: Data type map<string,string> of Column[m1] not supported
+          vectorized: false
+      Reduce Vectorization:
+          enabled: false
+          enableConditionsMet: hive.vectorized.execution.reduce.enabled IS true
+          enableConditionsNotMet: hive.execution.engine mr IN [tez, spark] IS false
+      Reduce Operator Tree:
+        Group By Operator
+          aggregations: count(VALUE._col0)
+          keys: KEY._col0 (type: string)
+          mode: mergepartial
+          outputColumnNames: _col0, _col1
+          Statistics: Num rows: 11 Data size: 22 Basic stats: COMPLETE Column stats: NONE
+          File Output Operator
+            compressed: false
+            Statistics: Num rows: 11 Data size: 22 Basic stats: COMPLETE Column stats: NONE
+            table:
+                input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+      Processor Tree:
+        ListSink
+
+PREHOOK: query: select m1["color"], count(*) from parquet_project_test
+group by m1["color"]
+PREHOOK: type: QUERY
+PREHOOK: Input: default@parquet_project_test
+#### A masked pattern was here ####
+POSTHOOK: query: select m1["color"], count(*) from parquet_project_test
+group by m1["color"]
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@parquet_project_test
+#### A masked pattern was here ####
+blue	7
+green	7
+red	8


Mime
View raw message