spark-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From wenc...@apache.org
Subject spark git commit: [SPARK-22475][SQL] show histogram in DESC COLUMN command
Date Tue, 21 Nov 2017 19:55:40 GMT
Repository: spark
Updated Branches:
  refs/heads/master 6d7ebf2f9 -> b96f61b6b


[SPARK-22475][SQL] show histogram in DESC COLUMN command

## What changes were proposed in this pull request?

Added the histogram representation to the output of the `DESCRIBE EXTENDED table_name column_name`
command.

## How was this patch tested?

Modified the SQL unit test (`describe-table-column.sql`) to cover histogram statistics and checked the regenerated expected output.

Please review http://spark.apache.org/contributing.html before opening a pull request.

Author: Marco Gaido <mgaido@hortonworks.com>

Closes #19774 from mgaido91/SPARK-22475.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b96f61b6
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b96f61b6
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b96f61b6

Branch: refs/heads/master
Commit: b96f61b6b262836e6be3f7657a3fe136d58b4dfe
Parents: 6d7ebf2
Author: Marco Gaido <mgaido@hortonworks.com>
Authored: Tue Nov 21 20:55:24 2017 +0100
Committer: Wenchen Fan <wenchen@databricks.com>
Committed: Tue Nov 21 20:55:24 2017 +0100

----------------------------------------------------------------------
 .../spark/sql/execution/command/tables.scala    | 17 +++++
 .../sql-tests/inputs/describe-table-column.sql  | 10 +++
 .../results/describe-table-column.sql.out       | 74 +++++++++++++++++---
 3 files changed, 93 insertions(+), 8 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/b96f61b6/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala
index 95f16b0..c9f6e57 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala
@@ -34,6 +34,7 @@ import org.apache.spark.sql.catalyst.catalog._
 import org.apache.spark.sql.catalyst.catalog.CatalogTableType._
 import org.apache.spark.sql.catalyst.catalog.CatalogTypes.TablePartitionSpec
 import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference}
+import org.apache.spark.sql.catalyst.plans.logical.Histogram
 import org.apache.spark.sql.catalyst.util.quoteIdentifier
 import org.apache.spark.sql.execution.datasources.{DataSource, PartitioningUtils}
 import org.apache.spark.sql.execution.datasources.csv.CSVFileFormat
@@ -689,9 +690,25 @@ case class DescribeColumnCommand(
       buffer += Row("distinct_count", cs.map(_.distinctCount.toString).getOrElse("NULL"))
       buffer += Row("avg_col_len", cs.map(_.avgLen.toString).getOrElse("NULL"))
       buffer += Row("max_col_len", cs.map(_.maxLen.toString).getOrElse("NULL"))
+      val histDesc = for {
+        c <- cs
+        hist <- c.histogram
+      } yield histogramDescription(hist)
+      buffer ++= histDesc.getOrElse(Seq(Row("histogram", "NULL")))
     }
     buffer
   }
+
+  private def histogramDescription(histogram: Histogram): Seq[Row] = {
+    val header = Row("histogram",
+      s"height: ${histogram.height}, num_of_bins: ${histogram.bins.length}")
+    val bins = histogram.bins.zipWithIndex.map {
+      case (bin, index) =>
+        Row(s"bin_$index",
+          s"lower_bound: ${bin.lo}, upper_bound: ${bin.hi}, distinct_count: ${bin.ndv}")
+    }
+    header +: bins
+  }
 }
 
 /**

http://git-wip-us.apache.org/repos/asf/spark/blob/b96f61b6/sql/core/src/test/resources/sql-tests/inputs/describe-table-column.sql
----------------------------------------------------------------------
diff --git a/sql/core/src/test/resources/sql-tests/inputs/describe-table-column.sql b/sql/core/src/test/resources/sql-tests/inputs/describe-table-column.sql
index a6ddcd9..2d180d1 100644
--- a/sql/core/src/test/resources/sql-tests/inputs/describe-table-column.sql
+++ b/sql/core/src/test/resources/sql-tests/inputs/describe-table-column.sql
@@ -34,6 +34,16 @@ DESC FORMATTED desc_complex_col_table col;
 -- Describe a nested column
 DESC FORMATTED desc_complex_col_table col.x;
 
+-- Test output for histogram statistics
+SET spark.sql.statistics.histogram.enabled=true;
+SET spark.sql.statistics.histogram.numBins=2;
+
+INSERT INTO desc_col_table values 1, 2, 3, 4;
+
+ANALYZE TABLE desc_col_table COMPUTE STATISTICS FOR COLUMNS key;
+
+DESC EXTENDED desc_col_table key;
+
 DROP VIEW desc_col_temp_view;
 
 DROP TABLE desc_col_table;

http://git-wip-us.apache.org/repos/asf/spark/blob/b96f61b6/sql/core/src/test/resources/sql-tests/results/describe-table-column.sql.out
----------------------------------------------------------------------
diff --git a/sql/core/src/test/resources/sql-tests/results/describe-table-column.sql.out b/sql/core/src/test/resources/sql-tests/results/describe-table-column.sql.out
index 30d0a2d..6ef8af6 100644
--- a/sql/core/src/test/resources/sql-tests/results/describe-table-column.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/describe-table-column.sql.out
@@ -1,5 +1,5 @@
 -- Automatically generated by SQLQueryTestSuite
--- Number of queries: 18
+-- Number of queries: 23
 
 
 -- !query 0
@@ -34,6 +34,7 @@ num_nulls	NULL
 distinct_count	NULL
 avg_col_len	NULL
 max_col_len	NULL
+histogram	NULL
 
 
 -- !query 3
@@ -50,6 +51,7 @@ num_nulls	NULL
 distinct_count	NULL
 avg_col_len	NULL
 max_col_len	NULL
+histogram	NULL
 
 
 -- !query 4
@@ -66,6 +68,7 @@ num_nulls	NULL
 distinct_count	NULL
 avg_col_len	NULL
 max_col_len	NULL
+histogram	NULL
 
 
 -- !query 5
@@ -117,6 +120,7 @@ num_nulls	0
 distinct_count	0
 avg_col_len	4
 max_col_len	4
+histogram	NULL
 
 
 -- !query 10
@@ -133,6 +137,7 @@ num_nulls	0
 distinct_count	0
 avg_col_len	4
 max_col_len	4
+histogram	NULL
 
 
 -- !query 11
@@ -157,6 +162,7 @@ num_nulls	NULL
 distinct_count	NULL
 avg_col_len	NULL
 max_col_len	NULL
+histogram	NULL
 
 
 -- !query 13
@@ -173,6 +179,7 @@ num_nulls	NULL
 distinct_count	NULL
 avg_col_len	NULL
 max_col_len	NULL
+histogram	NULL
 
 
 -- !query 14
@@ -185,24 +192,75 @@ DESC TABLE COLUMN command does not support nested data types: col.x;
 
 
 -- !query 15
-DROP VIEW desc_col_temp_view
+SET spark.sql.statistics.histogram.enabled=true
 -- !query 15 schema
-struct<>
+struct<key:string,value:string>
 -- !query 15 output
-
+spark.sql.statistics.histogram.enabled	true
 
 
 -- !query 16
-DROP TABLE desc_col_table
+SET spark.sql.statistics.histogram.numBins=2
 -- !query 16 schema
-struct<>
+struct<key:string,value:string>
 -- !query 16 output
-
+spark.sql.statistics.histogram.numBins	2
 
 
 -- !query 17
-DROP TABLE desc_complex_col_table
+INSERT INTO desc_col_table values 1, 2, 3, 4
 -- !query 17 schema
 struct<>
 -- !query 17 output
 
+
+
+-- !query 18
+ANALYZE TABLE desc_col_table COMPUTE STATISTICS FOR COLUMNS key
+-- !query 18 schema
+struct<>
+-- !query 18 output
+
+
+
+-- !query 19
+DESC EXTENDED desc_col_table key
+-- !query 19 schema
+struct<info_name:string,info_value:string>
+-- !query 19 output
+col_name	key
+data_type	int
+comment	column_comment
+min	1
+max	4
+num_nulls	0
+distinct_count	4
+avg_col_len	4
+max_col_len	4
+histogram	height: 2.0, num_of_bins: 2
+bin_0	lower_bound: 1.0, upper_bound: 2.0, distinct_count: 2
+bin_1	lower_bound: 2.0, upper_bound: 4.0, distinct_count: 2
+
+
+-- !query 20
+DROP VIEW desc_col_temp_view
+-- !query 20 schema
+struct<>
+-- !query 20 output
+
+
+
+-- !query 21
+DROP TABLE desc_col_table
+-- !query 21 schema
+struct<>
+-- !query 21 output
+
+
+
+-- !query 22
+DROP TABLE desc_complex_col_table
+-- !query 22 schema
+struct<>
+-- !query 22 output
+


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org


Mime
View raw message