spark-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From felixche...@apache.org
Subject spark git commit: [SPARK-18817][SPARKR][SQL] change derby log output to temp dir
Date Sun, 19 Mar 2017 17:37:19 GMT
Repository: spark
Updated Branches:
  refs/heads/master 60262bc95 -> 422aa67d1


[SPARK-18817][SPARKR][SQL] change derby log output to temp dir

## What changes were proposed in this pull request?

Passes R `tempdir()` (this is the R session temp dir, shared with other temp files/dirs) to
JVM, set System.Property for derby home dir to move derby.log

## How was this patch tested?

Manually, unit tests

With this, these are relocated to under /tmp
```
# ls /tmp/RtmpG2M0cB/
derby.log
```
And they are removed automatically when the R session is ended.

Author: Felix Cheung <felixcheung_m@hotmail.com>

Closes #16330 from felixcheung/rderby.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/422aa67d
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/422aa67d
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/422aa67d

Branch: refs/heads/master
Commit: 422aa67d1bb84f913b06e6d94615adb6557e2870
Parents: 60262bc
Author: Felix Cheung <felixcheung_m@hotmail.com>
Authored: Sun Mar 19 10:37:15 2017 -0700
Committer: Felix Cheung <felixcheung@apache.org>
Committed: Sun Mar 19 10:37:15 2017 -0700

----------------------------------------------------------------------
 R/pkg/R/sparkR.R                                | 15 ++++++++-
 R/pkg/inst/tests/testthat/test_sparkSQL.R       | 34 ++++++++++++++++++++
 R/pkg/tests/run-all.R                           |  6 ++++
 .../scala/org/apache/spark/api/r/RRDD.scala     |  9 ++++++
 4 files changed, 63 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/422aa67d/R/pkg/R/sparkR.R
----------------------------------------------------------------------
diff --git a/R/pkg/R/sparkR.R b/R/pkg/R/sparkR.R
index 61773ed..d0a12b7 100644
--- a/R/pkg/R/sparkR.R
+++ b/R/pkg/R/sparkR.R
@@ -322,10 +322,19 @@ sparkRHive.init <- function(jsc = NULL) {
 #' SparkSession or initializes a new SparkSession.
 #' Additional Spark properties can be set in \code{...}, and these named parameters take priority
 #' over values in \code{master}, \code{appName}, named lists of \code{sparkConfig}.
-#' When called in an interactive session, this checks for the Spark installation, and, if not
+#'
+#' When called in an interactive session, this method checks for the Spark installation, and, if not
 #' found, it will be downloaded and cached automatically. Alternatively, \code{install.spark} can
 #' be called manually.
 #'
+#' A default warehouse is created automatically in the current directory when a managed table is
+#' created via \code{sql} statement \code{CREATE TABLE}, for example. To change the location of the
+#' warehouse, set the named parameter \code{spark.sql.warehouse.dir} to the SparkSession. Along with
+#' the warehouse, an accompanied metastore may also be automatically created in the current
+#' directory when a new SparkSession is initialized with \code{enableHiveSupport} set to
+#' \code{TRUE}, which is the default. For more details, refer to Hive configuration at
+#' \url{http://spark.apache.org/docs/latest/sql-programming-guide.html#hive-tables}.
+#'
 #' For details on how to initialize and use SparkR, refer to SparkR programming guide at
 #' \url{http://spark.apache.org/docs/latest/sparkr.html#starting-up-sparksession}.
 #'
@@ -381,6 +390,10 @@ sparkR.session <- function(
     deployMode <- sparkConfigMap[["spark.submit.deployMode"]]
   }
 
+  if (!exists("spark.r.sql.derby.temp.dir", envir = sparkConfigMap)) {
+    sparkConfigMap[["spark.r.sql.derby.temp.dir"]] <- tempdir()
+  }
+
   if (!exists(".sparkRjsc", envir = .sparkREnv)) {
     retHome <- sparkCheckInstall(sparkHome, master, deployMode)
     if (!is.null(retHome)) sparkHome <- retHome

http://git-wip-us.apache.org/repos/asf/spark/blob/422aa67d/R/pkg/inst/tests/testthat/test_sparkSQL.R
----------------------------------------------------------------------
diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R
index f7081cb..32856b3 100644
--- a/R/pkg/inst/tests/testthat/test_sparkSQL.R
+++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R
@@ -60,6 +60,7 @@ unsetHiveContext <- function() {
 
 # Tests for SparkSQL functions in SparkR
 
+filesBefore <- list.files(path = sparkRDir, all.files = TRUE)
 sparkSession <- sparkR.session()
 sc <- callJStatic("org.apache.spark.sql.api.r.SQLUtils", "getJavaSparkContext", sparkSession)
 
@@ -2909,6 +2910,39 @@ test_that("Collect on DataFrame when NAs exists at the top of a timestamp column
   expect_equal(class(ldf3$col3), c("POSIXct", "POSIXt"))
 })
 
+compare_list <- function(list1, list2) {
+  # get testthat to show the diff by first making the 2 lists equal in length
+  expect_equal(length(list1), length(list2))
+  l <- max(length(list1), length(list2))
+  length(list1) <- l
+  length(list2) <- l
+  expect_equal(sort(list1, na.last = TRUE), sort(list2, na.last = TRUE))
+}
+
+# This should always be the **very last test** in this test file.
+test_that("No extra files are created in SPARK_HOME by starting session and making calls", {
+  # Check that it is not creating any extra file.
+  # Does not check the tempdir which would be cleaned up after.
+  filesAfter <- list.files(path = sparkRDir, all.files = TRUE)
+
+  expect_true(length(sparkRFilesBefore) > 0)
+  # first, ensure derby.log is not there
+  expect_false("derby.log" %in% filesAfter)
+  # second, ensure only spark-warehouse is created when calling SparkSession, enableHiveSupport = F
+  # note: currently all other test files have enableHiveSupport = F, so we capture the list of files
+  # before creating a SparkSession with enableHiveSupport = T at the top of this test file
+  # (filesBefore). The test here is to compare that (filesBefore) against the list of files before
+  # any test is run in run-all.R (sparkRFilesBefore).
+  # sparkRWhitelistSQLDirs is also defined in run-all.R, and should contain only 2 whitelisted dirs,
+  # here allow the first value, spark-warehouse, in the diff, everything else should be exactly the
+  # same as before any test is run.
+  compare_list(sparkRFilesBefore, setdiff(filesBefore, sparkRWhitelistSQLDirs[[1]]))
+  # third, ensure only spark-warehouse and metastore_db are created when enableHiveSupport = T
+  # note: as the note above, after running all tests in this file while enableHiveSupport = T, we
+  # check the list of files again. This time we allow both whitelisted dirs to be in the diff.
+  compare_list(sparkRFilesBefore, setdiff(filesAfter, sparkRWhitelistSQLDirs))
+})
+
 unlink(parquetPath)
 unlink(orcPath)
 unlink(jsonPath)

http://git-wip-us.apache.org/repos/asf/spark/blob/422aa67d/R/pkg/tests/run-all.R
----------------------------------------------------------------------
diff --git a/R/pkg/tests/run-all.R b/R/pkg/tests/run-all.R
index ab8d1ca..cefaadd 100644
--- a/R/pkg/tests/run-all.R
+++ b/R/pkg/tests/run-all.R
@@ -22,6 +22,12 @@ library(SparkR)
 options("warn" = 2)
 
 # Setup global test environment
+sparkRDir <- file.path(Sys.getenv("SPARK_HOME"), "R")
+sparkRFilesBefore <- list.files(path = sparkRDir, all.files = TRUE)
+sparkRWhitelistSQLDirs <- c("spark-warehouse", "metastore_db")
+invisible(lapply(sparkRWhitelistSQLDirs,
+                 function(x) { unlink(file.path(sparkRDir, x), recursive = TRUE, force = TRUE)}))
+
 install.spark()
 
 test_package("SparkR")

http://git-wip-us.apache.org/repos/asf/spark/blob/422aa67d/core/src/main/scala/org/apache/spark/api/r/RRDD.scala
----------------------------------------------------------------------
diff --git a/core/src/main/scala/org/apache/spark/api/r/RRDD.scala b/core/src/main/scala/org/apache/spark/api/r/RRDD.scala
index a1a5eb8..72ae034 100644
--- a/core/src/main/scala/org/apache/spark/api/r/RRDD.scala
+++ b/core/src/main/scala/org/apache/spark/api/r/RRDD.scala
@@ -17,6 +17,7 @@
 
 package org.apache.spark.api.r
 
+import java.io.File
 import java.util.{Map => JMap}
 
 import scala.collection.JavaConverters._
@@ -127,6 +128,14 @@ private[r] object RRDD {
       sparkConf.setExecutorEnv(name.toString, value.toString)
     }
 
+    if (sparkEnvirMap.containsKey("spark.r.sql.derby.temp.dir") &&
+        System.getProperty("derby.stream.error.file") == null) {
+      // This must be set before SparkContext is instantiated.
+      System.setProperty("derby.stream.error.file",
+                         Seq(sparkEnvirMap.get("spark.r.sql.derby.temp.dir").toString, "derby.log")
+                         .mkString(File.separator))
+    }
+
     val jsc = new JavaSparkContext(sparkConf)
     jars.foreach { jar =>
       jsc.addJar(jar)


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org


Mime
View raw message