spark-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From hvanhov...@apache.org
Subject spark git commit: [SPARK-15056][SQL] Parse Unsupported Sampling Syntax and Issue Better Exceptions
Date Tue, 03 May 2016 21:20:32 GMT
Repository: spark
Updated Branches:
  refs/heads/master 2e2a6211c -> 71296c041


[SPARK-15056][SQL] Parse Unsupported Sampling Syntax and Issue Better Exceptions

#### What changes were proposed in this pull request?
Compared with the current Spark parser, there are two extra syntax are supported in Hive for
sampling
- In `On` clauses, `rand()` is used for indicating sampling on the entire row instead of an
individual column. For example,

   ```SQL
   SELECT * FROM source TABLESAMPLE(BUCKET 3 OUT OF 32 ON rand()) s;
   ```
- Users can specify the total length to be read. For example,

   ```SQL
   SELECT * FROM source TABLESAMPLE(100M) s;
   ```

Below is the link for references:
   https://cwiki.apache.org/confluence/display/Hive/LanguageManual+Sampling

This PR is to parse and capture these two extra syntax, and issue a better error message.

#### How was this patch tested?
Added test cases to verify the thrown exceptions

Author: gatorsmile <gatorsmile@gmail.com>

Closes #12838 from gatorsmile/bucketOnRand.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/71296c04
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/71296c04
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/71296c04

Branch: refs/heads/master
Commit: 71296c041e59159bd7c5836cf652c02843974077
Parents: 2e2a621
Author: gatorsmile <gatorsmile@gmail.com>
Authored: Tue May 3 23:20:18 2016 +0200
Committer: Herman van Hovell <hvanhovell@questtec.nl>
Committed: Tue May 3 23:20:18 2016 +0200

----------------------------------------------------------------------
 .../org/apache/spark/sql/catalyst/parser/SqlBase.g4     |  7 ++++++-
 .../apache/spark/sql/catalyst/parser/AstBuilder.scala   | 12 +++++++++++-
 .../spark/sql/catalyst/parser/PlanParserSuite.scala     |  6 +++++-
 3 files changed, 22 insertions(+), 3 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/71296c04/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4
----------------------------------------------------------------------
diff --git a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4
b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4
index cc4e5c8..3ab448d 100644
--- a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4
+++ b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4
@@ -389,7 +389,8 @@ sample
     : TABLESAMPLE '('
       ( (percentage=(INTEGER_VALUE | DECIMAL_VALUE) sampleType=PERCENTLIT)
       | (expression sampleType=ROWS)
-      | (sampleType=BUCKET numerator=INTEGER_VALUE OUT OF denominator=INTEGER_VALUE (ON identifier)?))
+      | sampleType=BYTELENGTH_LITERAL
+      | (sampleType=BUCKET numerator=INTEGER_VALUE OUT OF denominator=INTEGER_VALUE (ON (identifier
| qualifiedName '(' ')'))?))
       ')'
     ;
 
@@ -895,6 +896,10 @@ TINYINT_LITERAL
     : DIGIT+ 'Y'
     ;
 
+BYTELENGTH_LITERAL
+    : DIGIT+ ('B' | 'K' | 'M' | 'G')
+    ;
+
 INTEGER_VALUE
     : DIGIT+
     ;

http://git-wip-us.apache.org/repos/asf/spark/blob/71296c04/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala
----------------------------------------------------------------------
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala
index c397462..1d4e1ec 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala
@@ -632,8 +632,18 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with Logging {
         val fraction = ctx.percentage.getText.toDouble
         sample(fraction / 100.0d)
 
+      case SqlBaseParser.BYTELENGTH_LITERAL =>
+        throw new ParseException(
+          "TABLESAMPLE(byteLengthLiteral) is not supported", ctx)
+
       case SqlBaseParser.BUCKET if ctx.ON != null =>
-        throw new ParseException("TABLESAMPLE(BUCKET x OUT OF y ON id) is not supported",
ctx)
+        if (ctx.identifier != null) {
+          throw new ParseException(
+            "TABLESAMPLE(BUCKET x OUT OF y ON colname) is not supported", ctx)
+        } else {
+          throw new ParseException(
+            "TABLESAMPLE(BUCKET x OUT OF y ON function) is not supported", ctx)
+        }
 
       case SqlBaseParser.BUCKET =>
         sample(ctx.numerator.getText.toDouble / ctx.denominator.getText.toDouble)

http://git-wip-us.apache.org/repos/asf/spark/blob/71296c04/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/PlanParserSuite.scala
----------------------------------------------------------------------
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/PlanParserSuite.scala
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/PlanParserSuite.scala
index b7af2ce..aaf8426 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/PlanParserSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/PlanParserSuite.scala
@@ -372,9 +372,13 @@ class PlanParserSuite extends PlanTest {
     assertEqual(s"$sql tablesample(bucket 4 out of 10) as x",
       Sample(0, .4d, withReplacement = false, 10L, table("t").as("x"))(true).select(star()))
     intercept(s"$sql tablesample(bucket 4 out of 10 on x) as x",
-      "TABLESAMPLE(BUCKET x OUT OF y ON id) is not supported")
+      "TABLESAMPLE(BUCKET x OUT OF y ON colname) is not supported")
     intercept(s"$sql tablesample(bucket 11 out of 10) as x",
       s"Sampling fraction (${11.0/10.0}) must be on interval [0, 1]")
+    intercept("SELECT * FROM parquet_t0 TABLESAMPLE(300M) s",
+      "TABLESAMPLE(byteLengthLiteral) is not supported")
+    intercept("SELECT * FROM parquet_t0 TABLESAMPLE(BUCKET 3 OUT OF 32 ON rand()) s",
+      "TABLESAMPLE(BUCKET x OUT OF y ON function) is not supported")
   }
 
   test("sub-query") {


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org


Mime
View raw message