Return-Path: X-Original-To: apmail-spark-commits-archive@minotaur.apache.org Delivered-To: apmail-spark-commits-archive@minotaur.apache.org Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by minotaur.apache.org (Postfix) with SMTP id E5A1118C2E for ; Wed, 4 Nov 2015 18:56:37 +0000 (UTC) Received: (qmail 34275 invoked by uid 500); 4 Nov 2015 18:56:37 -0000 Delivered-To: apmail-spark-commits-archive@spark.apache.org Received: (qmail 34181 invoked by uid 500); 4 Nov 2015 18:56:37 -0000 Mailing-List: contact commits-help@spark.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Delivered-To: mailing list commits@spark.apache.org Received: (qmail 34050 invoked by uid 99); 4 Nov 2015 18:56:37 -0000 Received: from git1-us-west.apache.org (HELO git1-us-west.apache.org) (140.211.11.23) by apache.org (qpsmtpd/0.29) with ESMTP; Wed, 04 Nov 2015 18:56:37 +0000 Received: by git1-us-west.apache.org (ASF Mail Server at git1-us-west.apache.org, from userid 33) id 99B4FDFD7B; Wed, 4 Nov 2015 18:56:37 +0000 (UTC) Content-Type: text/plain; charset="us-ascii" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit From: yhuai@apache.org To: commits@spark.apache.org Message-Id: <5e70a781fa5a4a37a0ac4735cfd8f7b5@git.apache.org> X-Mailer: ASF-Git Admin Mailer Subject: spark git commit: [SPARK-10304][SQL] Following up checking valid dir structure for partition discovery Date: Wed, 4 Nov 2015 18:56:37 +0000 (UTC) Repository: spark Updated Branches: refs/heads/master 987df4bfc -> de289bf27 [SPARK-10304][SQL] Following up checking valid dir structure for partition discovery This patch follows up #8840. Author: Liang-Chi Hsieh Closes #9459 from viirya/detect_invalid_part_dir_following. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/de289bf2 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/de289bf2 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/de289bf2 Branch: refs/heads/master Commit: de289bf279e14e47859b5fbcd70e97b9d0759f14 Parents: 987df4b Author: Liang-Chi Hsieh Authored: Wed Nov 4 10:56:32 2015 -0800 Committer: Yin Huai Committed: Wed Nov 4 10:56:32 2015 -0800 ---------------------------------------------------------------------- .../execution/datasources/PartitioningUtils.scala | 14 +++++++++++++- .../parquet/ParquetPartitionDiscoverySuite.scala | 16 ++++++++++++++++ 2 files changed, 29 insertions(+), 1 deletion(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/spark/blob/de289bf2/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningUtils.scala ---------------------------------------------------------------------- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningUtils.scala index 16dc236..86bc3a1 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningUtils.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningUtils.scala @@ -81,6 +81,8 @@ private[sql] object PartitioningUtils { parsePartition(path, defaultPartitionName, typeInference) }.unzip + // We create pairs of (path -> path's partition value) here + // If the corresponding partition value is None, the pair will be skiped val pathsWithPartitionValues = paths.zip(partitionValues).flatMap(x => x._2.map(x._1 -> _)) if (pathsWithPartitionValues.isEmpty) { @@ -89,11 +91,21 @@ private[sql] object PartitioningUtils { } else { // This dataset is partitioned. We need to check whether all partitions have the same // partition columns and resolve potential type conflicts. + + // Check if there is conflicting directory structure. + // For the paths such as: + // var paths = Seq( + // "hdfs://host:9000/invalidPath", + // "hdfs://host:9000/path/a=10/b=20", + // "hdfs://host:9000/path/a=10.5/b=hello") + // It will be recognised as conflicting directory structure: + // "hdfs://host:9000/invalidPath" + // "hdfs://host:9000/path" val basePaths = optBasePaths.flatMap(x => x) assert( basePaths.distinct.size == 1, "Conflicting directory structures detected. Suspicious paths:\b" + - basePaths.mkString("\n\t", "\n\t", "\n\n")) + basePaths.distinct.mkString("\n\t", "\n\t", "\n\n")) val resolvedPartitionValues = resolvePartitions(pathsWithPartitionValues) http://git-wip-us.apache.org/repos/asf/spark/blob/de289bf2/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetPartitionDiscoverySuite.scala ---------------------------------------------------------------------- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetPartitionDiscoverySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetPartitionDiscoverySuite.scala index 67b6a37..61cc0da 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetPartitionDiscoverySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetPartitionDiscoverySuite.scala @@ -88,6 +88,22 @@ class ParquetPartitionDiscoverySuite extends QueryTest with ParquetTest with Sha parsePartitions(paths.map(new Path(_)), defaultPartitionName, true) } assert(exception.getMessage().contains("Conflicting directory structures detected")) + + // Invalid + // Conflicting directory structure: + // "hdfs://host:9000/tmp/tables/partitionedTable" + // "hdfs://host:9000/tmp/tables/nonPartitionedTable1" + // "hdfs://host:9000/tmp/tables/nonPartitionedTable2" + paths = Seq( + "hdfs://host:9000/tmp/tables/partitionedTable", + "hdfs://host:9000/tmp/tables/partitionedTable/p=1/", + "hdfs://host:9000/tmp/tables/nonPartitionedTable1", + "hdfs://host:9000/tmp/tables/nonPartitionedTable2") + + exception = intercept[AssertionError] { + parsePartitions(paths.map(new Path(_)), defaultPartitionName, true) + } + assert(exception.getMessage().contains("Conflicting directory structures detected")) } test("parse partition") { --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org For additional commands, e-mail: commits-help@spark.apache.org