spark-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From marmb...@apache.org
Subject spark git commit: [SPARK-11303][SQL] filter should not be pushed down into sample
Date Tue, 27 Oct 2015 10:29:09 GMT
Repository: spark
Updated Branches:
  refs/heads/master 958a0ec8f -> 360ed832f


[SPARK-11303][SQL] filter should not be pushed down into sample

When sampling and then filtering a DataFrame, the SQL Optimizer will push down the filter into the sample
and produce a wrong result. This is because the sample is computed over the original data
rather than over the data remaining after filtering.

Author: Yanbo Liang <ybliang8@gmail.com>

Closes #9294 from yanboliang/spark-11303.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/360ed832
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/360ed832
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/360ed832

Branch: refs/heads/master
Commit: 360ed832f5213b805ac28cf1d2828be09480f2d6
Parents: 958a0ec
Author: Yanbo Liang <ybliang8@gmail.com>
Authored: Tue Oct 27 11:28:59 2015 +0100
Committer: Michael Armbrust <michael@databricks.com>
Committed: Tue Oct 27 11:28:59 2015 +0100

----------------------------------------------------------------------
 .../apache/spark/sql/catalyst/optimizer/Optimizer.scala   |  4 ----
 .../test/scala/org/apache/spark/sql/SQLQuerySuite.scala   | 10 ++++++++++
 2 files changed, 10 insertions(+), 4 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/360ed832/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
----------------------------------------------------------------------
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
index 0139b9e..d37f438 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
@@ -74,10 +74,6 @@ object DefaultOptimizer extends Optimizer {
 object SamplePushDown extends Rule[LogicalPlan] {
 
   def apply(plan: LogicalPlan): LogicalPlan = plan transform {
-    // Push down filter into sample
-    case Filter(condition, s @ Sample(lb, up, replace, seed, child)) =>
-      Sample(lb, up, replace, seed,
-        Filter(condition, child))
     // Push down projection into sample
     case Project(projectList, s @ Sample(lb, up, replace, seed, child)) =>
       Sample(lb, up, replace, seed,

http://git-wip-us.apache.org/repos/asf/spark/blob/360ed832/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
index 298c322..f5ae3ae 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
@@ -1860,4 +1860,14 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext {
         Row(1))
     }
   }
+
+  test("SPARK-11303: filter should not be pushed down into sample") {
+    val df = sqlContext.range(100)
+    List(true, false).foreach { withReplacement =>
+      val sampled = df.sample(withReplacement, 0.1, 1)
+      val sampledOdd = sampled.filter("id % 2 != 0")
+      val sampledEven = sampled.filter("id % 2 = 0")
+      assert(sampled.count() == sampledOdd.count() + sampledEven.count())
+    }
+  }
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org


Mime
View raw message