spark-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From lix...@apache.org
Subject spark git commit: [SPARK-20594][SQL] The staging directory should be a child directory starts with "." to avoid being deleted if we set hive.exec.stagingdir under the table directory.
Date Fri, 12 May 2017 18:24:22 GMT
Repository: spark
Updated Branches:
  refs/heads/branch-2.2 f14246959 -> d99165b97


[SPARK-20594][SQL] The staging directory should be a child directory starts with "." to avoid
being deleted if we set hive.exec.stagingdir under the table directory.

JIRA Issue: https://issues.apache.org/jira/browse/SPARK-20594

## What changes were proposed in this pull request?

The staging directory should be a child directory starts with "." to avoid being deleted before
moving staging directory to table directory if we set hive.exec.stagingdir under the table
directory.

## How was this patch tested?

Added unit tests

Author: zuotingbing <zuo.tingbing9@zte.com.cn>

Closes #17858 from zuotingbing/spark-stagingdir.

(cherry picked from commit e3d2022e4b73c5bc352eb001ed334d91ce36109c)
Signed-off-by: Xiao Li <gatorsmile@gmail.com>


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d99165b9
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d99165b9
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d99165b9

Branch: refs/heads/branch-2.2
Commit: d99165b970b81d0f3a6de5bb940bbbeeb0172a9c
Parents: f142469
Author: zuotingbing <zuo.tingbing9@zte.com.cn>
Authored: Fri May 12 11:24:07 2017 -0700
Committer: Xiao Li <gatorsmile@gmail.com>
Committed: Fri May 12 11:24:17 2017 -0700

----------------------------------------------------------------------
 .../sql/hive/execution/InsertIntoHiveTable.scala    | 16 ++++++++++++++--
 .../spark/sql/hive/InsertIntoHiveTableSuite.scala   | 11 +++++++++++
 2 files changed, 25 insertions(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/d99165b9/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala
----------------------------------------------------------------------
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala
b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala
index 3682dc8..3facf9f 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala
@@ -17,7 +17,7 @@
 
 package org.apache.spark.sql.hive.execution
 
-import java.io.IOException
+import java.io.{File, IOException}
 import java.net.URI
 import java.text.SimpleDateFormat
 import java.util.{Date, Locale, Random}
@@ -97,12 +97,24 @@ case class InsertIntoHiveTable(
     val inputPathUri: URI = inputPath.toUri
     val inputPathName: String = inputPathUri.getPath
     val fs: FileSystem = inputPath.getFileSystem(hadoopConf)
-    val stagingPathName: String =
+    var stagingPathName: String =
       if (inputPathName.indexOf(stagingDir) == -1) {
         new Path(inputPathName, stagingDir).toString
       } else {
         inputPathName.substring(0, inputPathName.indexOf(stagingDir) + stagingDir.length)
       }
+
+    // SPARK-20594: This is a walk-around fix to resolve a Hive bug. Hive requires that the
+    // staging directory needs to avoid being deleted when users set hive.exec.stagingdir
+    // under the table directory.
+    if (FileUtils.isSubDir(new Path(stagingPathName), inputPath, fs) &&
+      !stagingPathName.stripPrefix(inputPathName).stripPrefix(File.separator).startsWith("."))
{
+      logDebug(s"The staging dir '$stagingPathName' should be a child directory starts "
+
+        "with '.' to avoid being deleted if we set hive.exec.stagingdir under the table "
+
+        "directory.")
+      stagingPathName = new Path(inputPathName, ".hive-staging").toString
+    }
+
     val dir: Path =
       fs.makeQualified(
         new Path(stagingPathName + "_" + executionId + "-" + TaskRunner.getTaskRunnerID))

http://git-wip-us.apache.org/repos/asf/spark/blob/d99165b9/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertIntoHiveTableSuite.scala
----------------------------------------------------------------------
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertIntoHiveTableSuite.scala
b/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertIntoHiveTableSuite.scala
index d6999af..2c724f8 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertIntoHiveTableSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertIntoHiveTableSuite.scala
@@ -494,4 +494,15 @@ class InsertIntoHiveTableSuite extends QueryTest with TestHiveSingleton
with Bef
         spark.table("t").write.insertInto(tableName)
       }
   }
+
+  test("SPARK-20594: hive.exec.stagingdir was deleted by Hive") {
+    // Set hive.exec.stagingdir under the table directory without start with ".".
+    withSQLConf("hive.exec.stagingdir" -> "./test") {
+      withTable("test_table") {
+        sql("CREATE TABLE test_table (key int)")
+        sql("INSERT OVERWRITE TABLE test_table SELECT 1")
+        checkAnswer(sql("SELECT * FROM test_table"), Row(1))
+      }
+    }
+  }
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org


Mime
View raw message