spark-issues mailing list archives

From "ASF GitHub Bot (JIRA)" <j...@apache.org>
Subject [jira] [Commented] (SPARK-26327) Metrics in FileSourceScanExec not update correctly while relation.partitionSchema is set
Date Fri, 14 Dec 2018 21:11:00 GMT

    [ https://issues.apache.org/jira/browse/SPARK-26327?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16721788#comment-16721788 ]

ASF GitHub Bot commented on SPARK-26327:
----------------------------------------

dongjoon-hyun closed pull request #23300: [SPARK-26327][SQL][BACKPORT-2.2] Bug fix for `FileSourceScanExec` metrics update
URL: https://github.com/apache/spark/pull/23300
 
 
   

This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:


diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala
index 6fb41b6425c4b..1f6a7c089bf07 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala
@@ -170,19 +170,14 @@ case class FileSourceScanExec(
     false
   }
 
+  private var metadataTime = 0L
+
   @transient private lazy val selectedPartitions: Seq[PartitionDirectory] = {
     val optimizerMetadataTimeNs = relation.location.metadataOpsTimeNs.getOrElse(0L)
     val startTime = System.nanoTime()
     val ret = relation.location.listFiles(partitionFilters, dataFilters)
     val timeTakenMs = ((System.nanoTime() - startTime) + optimizerMetadataTimeNs) / 1000 / 1000
-
-    metrics("numFiles").add(ret.map(_.files.size.toLong).sum)
-    metrics("metadataTime").add(timeTakenMs)
-
-    val executionId = sparkContext.getLocalProperty(SQLExecution.EXECUTION_ID_KEY)
-    SQLMetrics.postDriverMetricUpdates(sparkContext, executionId,
-      metrics("numFiles") :: metrics("metadataTime") :: Nil)
-
+    metadataTime = timeTakenMs
     ret
   }
 
@@ -281,6 +276,8 @@ case class FileSourceScanExec(
   }
 
   private lazy val inputRDD: RDD[InternalRow] = {
+    // Update metrics here so they take effect in both the code-generated and the normal execution paths.
+    updateDriverMetrics()
     val readFile: (PartitionedFile) => Iterator[InternalRow] =
       relation.fileFormat.buildReaderWithPartitionValues(
         sparkSession = relation.sparkSession,
@@ -514,6 +511,19 @@ case class FileSourceScanExec(
     }
   }
 
+  /**
+   * Send the updated metrics to the driver. By the time this function is called,
+   * selectedPartitions has been initialized. See SPARK-26327 for more detail.
+   */
+  private def updateDriverMetrics() = {
+    metrics("numFiles").add(selectedPartitions.map(_.files.size.toLong).sum)
+    metrics("metadataTime").add(metadataTime)
+
+    val executionId = sparkContext.getLocalProperty(SQLExecution.EXECUTION_ID_KEY)
+    SQLMetrics.postDriverMetricUpdates(sparkContext, executionId,
+      metrics("numFiles") :: metrics("metadataTime") :: Nil)
+  }
+
   override lazy val canonicalized: FileSourceScanExec = {
     FileSourceScanExec(
       relation,
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala
index 79d1fbfa3f072..26b822fe6208f 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala
@@ -372,6 +372,21 @@ class SQLMetricsSuite extends SparkFunSuite with SharedSQLContext {
       assert(res3 === (10L, 0L, 10L) :: (30L, 0L, 30L) :: (0L, 30L, 300L) :: (0L, 300L, 0L) :: Nil)
     }
   }
+
+  test("SPARK-26327: FileSourceScanExec metrics") {
+    withTable("testDataForScan") {
+      spark.range(10).selectExpr("id", "id % 3 as p")
+        .write.partitionBy("p").saveAsTable("testDataForScan")
+      // The execution plan only has 1 FileScan node.
+      val df = spark.sql(
+        "SELECT * FROM testDataForScan WHERE p = 1")
+      testSparkPlanMetrics(df, 1, Map(
+        0L -> (("Scan parquet default.testdataforscan", Map(
+          "number of output rows" -> 3L,
+          "number of files" -> 2L))))
+      )
+    }
+  }
 }
 
 object InputOutputMetricsHelper {
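
For readers who want the shape of the fix outside the Spark codebase, below is a
minimal runnable Scala sketch. All names (`MetricsBus`, `FixedScan`, `planString`)
are hypothetical stand-ins, not Spark's actual classes: lazy-val initialization only
records values into plain fields, and the metrics are posted at execution time, when
a listener is guaranteed to be registered.

    import scala.collection.mutable

    // Hypothetical stand-in for the listener's live-execution registry:
    // posts made while no execution is live are silently dropped.
    object MetricsBus {
      var liveExecution: Option[String] = None
      val posted = mutable.Buffer.empty[(String, Long)]
      def post(name: String, value: Long): Unit =
        liveExecution.foreach(_ => posted += name -> value)
    }

    class FixedScan {
      private var numFiles = 0L
      // Initialization only records the value; no posting here, so it is safe
      // to force this lazy val early (e.g. while rendering the plan string).
      lazy val selectedPartitions: Seq[Int] = { numFiles = 42L; Seq(1, 2, 3) }
      def planString: String = s"Scan(${selectedPartitions.size} partitions)"
      def execute(): Seq[Int] = {
        val ret = selectedPartitions          // ensures initialization happened
        MetricsBus.post("numFiles", numFiles) // deferred post; listener is ready
        ret
      }
    }

    object FixDemo extends App {
      val scan = new FixedScan
      scan.planString                           // early forcing is now harmless
      MetricsBus.liveExecution = Some("exec-1") // execution becomes live
      scan.execute()
      println(MetricsBus.posted)                // ListBuffer((numFiles,42))
    }

The patch applies the same idea: `selectedPartitions` only records `metadataTime`,
and `updateDriverMetrics()` posts from `inputRDD`, which is only built at execution
time.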


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


> Metrics in FileSourceScanExec not update correctly while relation.partitionSchema is set
> ----------------------------------------------------------------------------------------
>
>                 Key: SPARK-26327
>                 URL: https://issues.apache.org/jira/browse/SPARK-26327
>             Project: Spark
>          Issue Type: Sub-task
>          Components: SQL
>    Affects Versions: 2.4.0
>            Reporter: Yuanjian Li
>            Assignee: Yuanjian Li
>            Priority: Major
>             Fix For: 3.0.0
>
>
> With the current approach in `FileSourceScanExec`, the metrics "numFiles" and
> "metadataTime" (file listing time) are updated when the lazy val `selectedPartitions`
> is initialized, in the scenario where relation.partitionSchema is set. But
> `selectedPartitions` is first initialized by `metadata`, which is called by
> `queryExecution.toString` inside `SQLExecution.withNewExecutionId`. So when
> `SQLMetrics.postDriverMetricUpdates` is called, there is no corresponding entry in
> SQLAppStatusListener's liveExecutions, and the metrics update does not take effect.
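
To make that ordering concrete, here is a minimal runnable sketch (hypothetical
names such as `OrderingSketch`, `ExecutionStart`, and `DriverMetricUpdate`; this is
not Spark's actual event API): the metric update enters the event queue while the
plan description is being rendered, ahead of the execution-start event, so the
handler finds no live execution and drops it.

    import scala.collection.mutable

    object OrderingSketch extends App {
      sealed trait Event
      case class ExecutionStart(id: Long) extends Event
      case class DriverMetricUpdate(id: Long, value: Long) extends Event

      val live = mutable.Set.empty[Long]
      val accepted = mutable.Buffer.empty[Long]

      def handle(e: Event): Unit = e match {
        case ExecutionStart(id)                    => live += id
        case DriverMetricUpdate(id, v) if live(id) => accepted += v
        case _: DriverMetricUpdate                 => () // no live execution: dropped
      }

      val queue = mutable.Queue.empty[Event]
      val executionId = 1L

      // Rendering the plan description forces the lazy val, which posts the metric...
      def planDescription(): String = {
        queue += DriverMetricUpdate(executionId, 42L)
        "FileScan parquet ..."
      }

      // ...but the start event is only enqueued afterwards, so the handler sees the
      // metric update first, finds no live execution, and drops it.
      planDescription()
      queue += ExecutionStart(executionId)
      queue.dequeueAll(_ => true).foreach(handle)
      println(s"accepted = $accepted") // accepted = ListBuffer() -- the update was lost
    }

The patch fixes this by deferring the post until `inputRDD` is built, as shown in
the diff above.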



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)

---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscribe@spark.apache.org
For additional commands, e-mail: issues-help@spark.apache.org

