spark-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From r...@apache.org
Subject spark git commit: [SPARK-7982][SQL] DataFrame.stat.crosstab should use 0 instead of null for pairs that don't appear
Date Tue, 02 Jun 2015 04:11:28 GMT
Repository: spark
Updated Branches:
  refs/heads/branch-1.4 cbfb682ab -> efc0e0532


[SPARK-7982][SQL] DataFrame.stat.crosstab should use 0 instead of null for pairs that don't
appear

Author: Reynold Xin <rxin@databricks.com>

Closes #6566 from rxin/crosstab and squashes the following commits:

e0ace1c [Reynold Xin] [SPARK-7982][SQL] DataFrame.stat.crosstab should use 0 instead of null
for pairs that don't appear

(cherry picked from commit 6396cc0303ceabea53c4df436ffa50b82b7e233f)
Signed-off-by: Reynold Xin <rxin@databricks.com>


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/efc0e053
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/efc0e053
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/efc0e053

Branch: refs/heads/branch-1.4
Commit: efc0e053230d8cb268479c1bd808d69adbeb132c
Parents: cbfb682a
Author: Reynold Xin <rxin@databricks.com>
Authored: Mon Jun 1 21:11:19 2015 -0700
Committer: Reynold Xin <rxin@databricks.com>
Committed: Mon Jun 1 21:11:26 2015 -0700

----------------------------------------------------------------------
 .../org/apache/spark/sql/execution/stat/StatFunctions.scala | 9 ++++++---
 .../scala/org/apache/spark/sql/DataFrameStatSuite.scala     | 4 ++--
 2 files changed, 8 insertions(+), 5 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/efc0e053/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/StatFunctions.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/StatFunctions.scala
b/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/StatFunctions.scala
index b1a8204..93383e5 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/StatFunctions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/StatFunctions.scala
@@ -18,7 +18,7 @@
 package org.apache.spark.sql.execution.stat
 
 import org.apache.spark.Logging
-import org.apache.spark.sql.{Column, DataFrame}
+import org.apache.spark.sql.{Row, Column, DataFrame}
 import org.apache.spark.sql.catalyst.expressions.{GenericMutableRow, Cast}
 import org.apache.spark.sql.catalyst.plans.logical.LocalRelation
 import org.apache.spark.sql.functions._
@@ -116,7 +116,10 @@ private[sql] object StatFunctions extends Logging {
       s"exceed 1e4. Currently $columnSize")
     val table = counts.groupBy(_.get(0)).map { case (col1Item, rows) =>
       val countsRow = new GenericMutableRow(columnSize + 1)
-      rows.foreach { row =>
+      rows.foreach { (row: Row) =>
+        // row.get(0) is column 1
+        // row.get(1) is column 2
+        // row.get(2) is the frequency
         countsRow.setLong(distinctCol2.get(row.get(1)).get + 1, row.getLong(2))
       }
       // the value of col1 is the first value, the rest are the counts
@@ -126,6 +129,6 @@ private[sql] object StatFunctions extends Logging {
     val headerNames = distinctCol2.map(r => StructField(r._1.toString, LongType)).toSeq
     val schema = StructType(StructField(tableName, StringType) +: headerNames)
 
-    new DataFrame(df.sqlContext, LocalRelation(schema.toAttributes, table))
+    new DataFrame(df.sqlContext, LocalRelation(schema.toAttributes, table)).na.fill(0.0)
   }
 }

http://git-wip-us.apache.org/repos/asf/spark/blob/efc0e053/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala
index d0a9a7b..10e0e06 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala
@@ -74,10 +74,10 @@ class DataFrameStatSuite extends FunSuite  {
     val rows: Array[Row] = crosstab.collect().sortBy(_.getString(0))
     assert(rows(0).get(0).toString === "0")
     assert(rows(0).getLong(1) === 2L)
-    assert(rows(0).get(2) === null)
+    assert(rows(0).get(2) === 0L)
     assert(rows(1).get(0).toString === "1")
     assert(rows(1).getLong(1) === 1L)
-    assert(rows(1).get(2) === null)
+    assert(rows(1).get(2) === 0L)
     assert(rows(2).get(0).toString === "2")
     assert(rows(2).getLong(1) === 2L)
     assert(rows(2).getLong(2) === 1L)


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org


Mime
View raw message