spark-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From wenc...@apache.org
Subject spark git commit: [SPARK-23666][SQL] Do not display exprIds of Alias in user-facing info.
Date Wed, 21 Mar 2018 06:17:54 GMT
Repository: spark
Updated Branches:
  refs/heads/master 477d6bd72 -> 983e8d9d6


[SPARK-23666][SQL] Do not display exprIds of Alias in user-facing info.

## What changes were proposed in this pull request?
To drop `exprId`s for `Alias` in user-facing info., this pr added an entry for `Alias` in
`NonSQLExpression.sql`

## How was this patch tested?
Added tests in `UDFSuite`.

Author: Takeshi Yamamuro <yamamuro@apache.org>

Closes #20827 from maropu/SPARK-23666.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/983e8d9d
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/983e8d9d
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/983e8d9d

Branch: refs/heads/master
Commit: 983e8d9d64b6b1304c43ea6e5dffdc1078138ef9
Parents: 477d6bd
Author: Takeshi Yamamuro <yamamuro@apache.org>
Authored: Tue Mar 20 23:17:49 2018 -0700
Committer: Wenchen Fan <wenchen@databricks.com>
Committed: Tue Mar 20 23:17:49 2018 -0700

----------------------------------------------------------------------
 docs/sql-programming-guide.md                   |   1 +
 .../sql/catalyst/expressions/Expression.scala   |   1 +
 .../scala/org/apache/spark/sql/UDFSuite.scala   | 132 +++++++++++--------
 3 files changed, 78 insertions(+), 56 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/983e8d9d/docs/sql-programming-guide.md
----------------------------------------------------------------------
diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md
index 0e092e0..5b47fd7 100644
--- a/docs/sql-programming-guide.md
+++ b/docs/sql-programming-guide.md
@@ -1806,6 +1806,7 @@ working with timestamps in `pandas_udf`s to get the best performance,
see
   - Since Spark 2.4, Spark maximizes the usage of a vectorized ORC reader for ORC files by
default. To do that, `spark.sql.orc.impl` and `spark.sql.orc.filterPushdown` change their
default values to `native` and `true` respectively.
   - In PySpark, when Arrow optimization is enabled, previously `toPandas` just failed when
Arrow optimization is unabled to be used whereas `createDataFrame` from Pandas DataFrame allowed
the fallback to non-optimization. Now, both `toPandas` and `createDataFrame` from Pandas DataFrame
allow the fallback by default, which can be switched off by `spark.sql.execution.arrow.fallback.enabled`.
  - Since Spark 2.4, writing an empty dataframe to a directory launches at least one write
task, even if physically the dataframe has no partition. This introduces a small behavior
change that for self-describing file formats like Parquet and Orc, Spark creates a metadata-only
file in the target directory when writing a 0-partition dataframe, so that schema inference
can still work if users read that directory later. The new behavior is more reasonable and
more consistent regarding writing empty dataframe.
+ - Since Spark 2.4, expression IDs in UDF arguments do not appear in column names. For example,
an column name in Spark 2.4 is not `UDF:f(col0 AS colA#28)` but ``UDF:f(col0 AS `colA`)``.
 
 ## Upgrading From Spark SQL 2.2 to 2.3
 

http://git-wip-us.apache.org/repos/asf/spark/blob/983e8d9d/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
----------------------------------------------------------------------
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
index d7f9e38..38caf67 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
@@ -288,6 +288,7 @@ trait NonSQLExpression extends Expression {
   final override def sql: String = {
     transform {
       case a: Attribute => new PrettyAttribute(a)
+      case a: Alias => PrettyAttribute(a.sql, a.dataType)
     }.toString
   }
 }

http://git-wip-us.apache.org/repos/asf/spark/blob/983e8d9d/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala
index af6a10b..21afdc7 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala
@@ -144,73 +144,81 @@ class UDFSuite extends QueryTest with SharedSQLContext {
   }
 
   test("UDF in a WHERE") {
-    spark.udf.register("oneArgFilter", (n: Int) => { n > 80 })
+    withTempView("integerData") {
+      spark.udf.register("oneArgFilter", (n: Int) => { n > 80 })
 
-    val df = sparkContext.parallelize(
-      (1 to 100).map(i => TestData(i, i.toString))).toDF()
-    df.createOrReplaceTempView("integerData")
+      val df = sparkContext.parallelize(
+        (1 to 100).map(i => TestData(i, i.toString))).toDF()
+      df.createOrReplaceTempView("integerData")
 
-    val result =
-      sql("SELECT * FROM integerData WHERE oneArgFilter(key)")
-    assert(result.count() === 20)
+      val result =
+        sql("SELECT * FROM integerData WHERE oneArgFilter(key)")
+      assert(result.count() === 20)
+    }
   }
 
   test("UDF in a HAVING") {
-    spark.udf.register("havingFilter", (n: Long) => { n > 5 })
-
-    val df = Seq(("red", 1), ("red", 2), ("blue", 10),
-      ("green", 100), ("green", 200)).toDF("g", "v")
-    df.createOrReplaceTempView("groupData")
-
-    val result =
-      sql(
-        """
-         | SELECT g, SUM(v) as s
-         | FROM groupData
-         | GROUP BY g
-         | HAVING havingFilter(s)
-        """.stripMargin)
-
-    assert(result.count() === 2)
+    withTempView("groupData") {
+      spark.udf.register("havingFilter", (n: Long) => { n > 5 })
+
+      val df = Seq(("red", 1), ("red", 2), ("blue", 10),
+        ("green", 100), ("green", 200)).toDF("g", "v")
+      df.createOrReplaceTempView("groupData")
+
+      val result =
+        sql(
+          """
+           | SELECT g, SUM(v) as s
+           | FROM groupData
+           | GROUP BY g
+           | HAVING havingFilter(s)
+          """.stripMargin)
+
+      assert(result.count() === 2)
+    }
   }
 
   test("UDF in a GROUP BY") {
-    spark.udf.register("groupFunction", (n: Int) => { n > 10 })
-
-    val df = Seq(("red", 1), ("red", 2), ("blue", 10),
-      ("green", 100), ("green", 200)).toDF("g", "v")
-    df.createOrReplaceTempView("groupData")
-
-    val result =
-      sql(
-        """
-         | SELECT SUM(v)
-         | FROM groupData
-         | GROUP BY groupFunction(v)
-        """.stripMargin)
-    assert(result.count() === 2)
+    withTempView("groupData") {
+      spark.udf.register("groupFunction", (n: Int) => { n > 10 })
+
+      val df = Seq(("red", 1), ("red", 2), ("blue", 10),
+        ("green", 100), ("green", 200)).toDF("g", "v")
+      df.createOrReplaceTempView("groupData")
+
+      val result =
+        sql(
+          """
+           | SELECT SUM(v)
+           | FROM groupData
+           | GROUP BY groupFunction(v)
+          """.stripMargin)
+      assert(result.count() === 2)
+    }
   }
 
   test("UDFs everywhere") {
-    spark.udf.register("groupFunction", (n: Int) => { n > 10 })
-    spark.udf.register("havingFilter", (n: Long) => { n > 2000 })
-    spark.udf.register("whereFilter", (n: Int) => { n < 150 })
-    spark.udf.register("timesHundred", (n: Long) => { n * 100 })
-
-    val df = Seq(("red", 1), ("red", 2), ("blue", 10),
-      ("green", 100), ("green", 200)).toDF("g", "v")
-    df.createOrReplaceTempView("groupData")
-
-    val result =
-      sql(
-        """
-         | SELECT timesHundred(SUM(v)) as v100
-         | FROM groupData
-         | WHERE whereFilter(v)
-         | GROUP BY groupFunction(v)
-         | HAVING havingFilter(v100)
-        """.stripMargin)
-    assert(result.count() === 1)
+    withTempView("groupData") {
+      spark.udf.register("groupFunction", (n: Int) => { n > 10 })
+      spark.udf.register("havingFilter", (n: Long) => { n > 2000 })
+      spark.udf.register("whereFilter", (n: Int) => { n < 150 })
+      spark.udf.register("timesHundred", (n: Long) => { n * 100 })
+
+      val df = Seq(("red", 1), ("red", 2), ("blue", 10),
+        ("green", 100), ("green", 200)).toDF("g", "v")
+      df.createOrReplaceTempView("groupData")
+
+      val result =
+        sql(
+          """
+           | SELECT timesHundred(SUM(v)) as v100
+           | FROM groupData
+           | WHERE whereFilter(v)
+           | GROUP BY groupFunction(v)
+           | HAVING havingFilter(v100)
+          """.stripMargin)
+      assert(result.count() === 1)
+    }
   }
 
   test("struct UDF") {
@@ -304,4 +312,16 @@ class UDFSuite extends QueryTest with SharedSQLContext {
     assert(explainStr(spark.range(1).select(udf1(udf2(functions.lit(1)))))
       .contains(s"UDF:$udf1Name(UDF:$udf2Name(1))"))
   }
+
+  test("SPARK-23666 Do not display exprId in argument names") {
+    withTempView("x") {
+      Seq(((1, 2), 3)).toDF("a", "b").createOrReplaceTempView("x")
+      spark.udf.register("f", (a: Int) => a)
+      val outputStream = new java.io.ByteArrayOutputStream()
+      Console.withOut(outputStream) {
+        spark.sql("SELECT f(a._1) FROM x").show
+      }
+      assert(outputStream.toString.contains("UDF:f(a._1 AS `_1`)"))
+    }
+  }
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org


Mime
View raw message