From: rxin@apache.org
To: commits@spark.apache.org
Subject: spark git commit: [SPARK-16429][SQL] Include `StringType` columns in `describe()`
Date: Fri, 8 Jul 2016 21:36:54 +0000 (UTC)

Repository: spark
Updated Branches:
  refs/heads/master 67e085ef6 -> 142df4834


[SPARK-16429][SQL] Include `StringType` columns in `describe()`

## What changes were proposed in this pull request?

Currently, Spark's `describe` supports `StringType` columns when they are named explicitly. However, the no-argument form `describe()` returns a dataset covering only the numeric columns. This PR includes `StringType` columns in the no-argument `describe()`.

**Background**
```scala
scala> spark.read.json("examples/src/main/resources/people.json").describe("age", "name").show()
+-------+------------------+-------+
|summary|               age|   name|
+-------+------------------+-------+
|  count|                 2|      3|
|   mean|              24.5|   null|
| stddev|7.7781745930520225|   null|
|    min|                19|   Andy|
|    max|                30|Michael|
+-------+------------------+-------+
```

**Before**
```scala
scala> spark.read.json("examples/src/main/resources/people.json").describe().show()
+-------+------------------+
|summary|               age|
+-------+------------------+
|  count|                 2|
|   mean|              24.5|
| stddev|7.7781745930520225|
|    min|                19|
|    max|                30|
+-------+------------------+
```

**After**
```scala
scala> spark.read.json("examples/src/main/resources/people.json").describe().show()
+-------+------------------+-------+
|summary|               age|   name|
+-------+------------------+-------+
|  count|                 2|      3|
|   mean|              24.5|   null|
| stddev|7.7781745930520225|   null|
|    min|                19|   Andy|
|    max|                30|Michael|
+-------+------------------+-------+
```

## How was this patch tested?

Pass the Jenkins tests with an updated test case.

Author: Dongjoon Hyun

Closes #14095 from dongjoon-hyun/SPARK-16429.
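A note on the string-column semantics visible in the tables above: `count`, `min`, and `max` are defined for `StringType` (`min`/`max` use the ordering on string values), while `mean` and `stddev` come back null. A minimal sketch against an in-memory DataFrame rather than the example JSON file, assuming a spark-shell session where `spark.implicits._` is in scope; the data is illustrative:

```scala
// Illustrative data; any DataFrame mixing numeric and string columns
// behaves the same way.
val df = Seq((19, "Andy"), (30, "Michael"), (25, "Justin")).toDF("age", "name")

// With this patch, the no-argument describe() now covers "name" as well:
// count applies to both columns, mean/stddev are null for the string
// column, and min/max use the ordering on string values.
df.describe().show()
```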
Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/142df483
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/142df483
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/142df483

Branch: refs/heads/master
Commit: 142df4834bc33dc7b84b626c6ee3508ab1abe015
Parents: 67e085e
Author: Dongjoon Hyun
Authored: Fri Jul 8 14:36:50 2016 -0700
Committer: Reynold Xin
Committed: Fri Jul 8 14:36:50 2016 -0700

----------------------------------------------------------------------
 R/pkg/R/DataFrame.R                             |  4 +--
 R/pkg/inst/tests/testthat/test_sparkSQL.R       |  4 +--
 python/pyspark/sql/dataframe.py                 |  8 ++---
 .../scala/org/apache/spark/sql/Dataset.scala    | 16 +++++++--
 .../org/apache/spark/sql/DataFrameSuite.scala   | 36 ++++++++++----------
 5 files changed, 39 insertions(+), 29 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/142df483/R/pkg/R/DataFrame.R
----------------------------------------------------------------------
diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R
index a18eee3..47f9203 100644
--- a/R/pkg/R/DataFrame.R
+++ b/R/pkg/R/DataFrame.R
@@ -2587,8 +2587,8 @@ setMethod("saveAsTable",
 
 #' summary
 #'
-#' Computes statistics for numeric columns.
-#' If no columns are given, this function computes statistics for all numerical columns.
+#' Computes statistics for numeric and string columns.
+#' If no columns are given, this function computes statistics for all numerical or string columns.
 #'
 #' @param x A SparkDataFrame to be computed.
 #' @param col A string of name

http://git-wip-us.apache.org/repos/asf/spark/blob/142df483/R/pkg/inst/tests/testthat/test_sparkSQL.R
----------------------------------------------------------------------
diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R
index e2a1da0..fdd6020 100644
--- a/R/pkg/inst/tests/testthat/test_sparkSQL.R
+++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R
@@ -1824,11 +1824,11 @@ test_that("describe() and summarize() on a DataFrame", {
   expect_equal(collect(stats)[2, "age"], "24.5")
   expect_equal(collect(stats)[3, "age"], "7.7781745930520225")
   stats <- describe(df)
-  expect_equal(collect(stats)[4, "name"], NULL)
+  expect_equal(collect(stats)[4, "name"], "Andy")
   expect_equal(collect(stats)[5, "age"], "30")
 
   stats2 <- summary(df)
-  expect_equal(collect(stats2)[4, "name"], NULL)
+  expect_equal(collect(stats2)[4, "name"], "Andy")
   expect_equal(collect(stats2)[5, "age"], "30")
 
   # SPARK-16425: SparkR summary() fails on column of type logical

http://git-wip-us.apache.org/repos/asf/spark/blob/142df483/python/pyspark/sql/dataframe.py
----------------------------------------------------------------------
diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py
index dd670a9..ab41e88 100644
--- a/python/pyspark/sql/dataframe.py
+++ b/python/pyspark/sql/dataframe.py
@@ -751,15 +751,15 @@ class DataFrame(object):
 
     @since("1.3.1")
     def describe(self, *cols):
-        """Computes statistics for numeric columns.
+        """Computes statistics for numeric and string columns.
 
         This include count, mean, stddev, min, and max. If no columns are
-        given, this function computes statistics for all numerical columns.
+        given, this function computes statistics for all numerical or string columns.
 
        .. note:: This function is meant for exploratory data analysis, as we make no \
            guarantee about the backward compatibility of the schema of the resulting DataFrame.
 
-        >>> df.describe().show()
+        >>> df.describe(['age']).show()
         +-------+------------------+
         |summary|               age|
         +-------+------------------+
@@ -769,7 +769,7 @@ class DataFrame(object):
         |    min|                 2|
         |    max|                 5|
         +-------+------------------+
-        >>> df.describe(['age', 'name']).show()
+        >>> df.describe().show()
         +-------+------------------+-----+
         |summary|               age| name|
         +-------+------------------+-----+

http://git-wip-us.apache.org/repos/asf/spark/blob/142df483/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
index ededf7f..ed4ccdb 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
@@ -228,6 +228,15 @@ class Dataset[T] private[sql](
     }
   }
 
+  private def aggregatableColumns: Seq[Expression] = {
+    schema.fields
+      .filter(f => f.dataType.isInstanceOf[NumericType] || f.dataType.isInstanceOf[StringType])
+      .map { n =>
+        queryExecution.analyzed.resolveQuoted(n.name, sparkSession.sessionState.analyzer.resolver)
+          .get
+      }
+  }
+
   /**
    * Compose the string representing rows for output
    *
@@ -1886,8 +1895,9 @@ class Dataset[T] private[sql](
   }
 
   /**
-   * Computes statistics for numeric columns, including count, mean, stddev, min, and max.
-   * If no columns are given, this function computes statistics for all numerical columns.
+   * Computes statistics for numeric and string columns, including count, mean, stddev, min, and
+   * max. If no columns are given, this function computes statistics for all numerical or string
+   * columns.
    *
    * This function is meant for exploratory data analysis, as we make no guarantee about the
    * backward compatibility of the schema of the resulting Dataset. If you want to
@@ -1920,7 +1930,7 @@ class Dataset[T] private[sql](
       "max" -> ((child: Expression) => Max(child).toAggregateExpression()))
 
     val outputCols =
-      (if (cols.isEmpty) numericColumns.map(usePrettyExpression(_).sql) else cols).toList
+      (if (cols.isEmpty) aggregatableColumns.map(usePrettyExpression(_).sql) else cols).toList
 
     val ret: Seq[Row] = if (outputCols.nonEmpty) {
       val aggExprs = statistics.flatMap { case (_, colToAgg) =>

http://git-wip-us.apache.org/repos/asf/spark/blob/142df483/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
index 9d53be8..905da55 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
@@ -651,44 +651,44 @@ class DataFrameSuite extends QueryTest with SharedSQLContext {
       ("Amy", 24, 180)).toDF("name", "age", "height")
 
     val describeResult = Seq(
-      Row("count", "4", "4"),
-      Row("mean", "33.0", "178.0"),
-      Row("stddev", "19.148542155126762", "11.547005383792516"),
-      Row("min", "16", "164"),
-      Row("max", "60", "192"))
+      Row("count", "4", "4", "4"),
+      Row("mean", null, "33.0", "178.0"),
+      Row("stddev", null, "19.148542155126762", "11.547005383792516"),
+      Row("min", "Alice", "16", "164"),
+      Row("max", "David", "60", "192"))
 
     val emptyDescribeResult = Seq(
-      Row("count", "0", "0"),
-      Row("mean", null, null),
-      Row("stddev", null, null),
-      Row("min", null, null),
-      Row("max", null, null))
+      Row("count", "0", "0", "0"),
+      Row("mean", null, null, null),
+      Row("stddev", null, null, null),
+      Row("min", null, null, null),
+      Row("max", null, null, null))
 
     def getSchemaAsSeq(df: DataFrame): Seq[String] = df.schema.map(_.name)
 
-    val describeTwoCols = describeTestData.describe("age", "height")
-    assert(getSchemaAsSeq(describeTwoCols) === Seq("summary", "age", "height"))
+    val describeTwoCols = describeTestData.describe("name", "age", "height")
+    assert(getSchemaAsSeq(describeTwoCols) === Seq("summary", "name", "age", "height"))
     checkAnswer(describeTwoCols, describeResult)
     // All aggregate value should have been cast to string
     describeTwoCols.collect().foreach { row =>
-      assert(row.get(1).isInstanceOf[String], "expected string but found " + row.get(1).getClass)
       assert(row.get(2).isInstanceOf[String], "expected string but found " + row.get(2).getClass)
+      assert(row.get(3).isInstanceOf[String], "expected string but found " + row.get(3).getClass)
     }
 
     val describeAllCols = describeTestData.describe()
-    assert(getSchemaAsSeq(describeAllCols) === Seq("summary", "age", "height"))
+    assert(getSchemaAsSeq(describeAllCols) === Seq("summary", "name", "age", "height"))
     checkAnswer(describeAllCols, describeResult)
 
     val describeOneCol = describeTestData.describe("age")
     assert(getSchemaAsSeq(describeOneCol) === Seq("summary", "age"))
-    checkAnswer(describeOneCol, describeResult.map { case Row(s, d, _) => Row(s, d)} )
+    checkAnswer(describeOneCol, describeResult.map { case Row(s, _, d, _) => Row(s, d)} )
 
     val describeNoCol = describeTestData.select("name").describe()
-    assert(getSchemaAsSeq(describeNoCol) === Seq("summary"))
-    checkAnswer(describeNoCol, describeResult.map { case Row(s, _, _) => Row(s)} )
+    assert(getSchemaAsSeq(describeNoCol) === Seq("summary", "name"))
+    checkAnswer(describeNoCol, describeResult.map { case Row(s, n, _, _) => Row(s, n)} )
 
     val emptyDescription =
      describeTestData.limit(0).describe()
-    assert(getSchemaAsSeq(emptyDescription) === Seq("summary", "age", "height"))
+    assert(getSchemaAsSeq(emptyDescription) === Seq("summary", "name", "age", "height"))
     checkAnswer(emptyDescription, emptyDescribeResult)
   }
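The heart of the change is the column filter: `describe()` now selects columns whose type is numeric or string before building its count/mean/stddev/min/max grid, rather than numeric columns only. A minimal sketch of that filtering idea against the public DataFrame API, not the Dataset internals patched above; the helper name is hypothetical:

```scala
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.types.{NumericType, StringType}

// Hypothetical helper mirroring the filter in the new aggregatableColumns:
// keep only the columns that describe() can summarize, in schema order.
def describableColumns(df: DataFrame): Seq[String] =
  df.schema.fields
    .filter(f => f.dataType.isInstanceOf[NumericType] || f.dataType.isInstanceOf[StringType])
    .map(_.name)
    .toSeq

// For a DataFrame with columns (name: String, age: Int, height: Int) this
// yields Seq("name", "age", "height"), matching the updated schema
// assertions in DataFrameSuite.
```

Because `mean` and `stddev` are undefined for strings, those cells come back null, which keeps the result grid rectangular; the suite's updated `describeResult` reflects exactly that shape.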