spark-reviews mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From GitBox <...@apache.org>
Subject [GitHub] [spark] beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] Support ANSI SQL filter clause for aggregate expression
Date Thu, 28 Nov 2019 01:48:49 GMT
beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] Support ANSI SQL
filter clause for aggregate expression
URL: https://github.com/apache/spark/pull/26656#discussion_r351563035
 
 

 ##########
 File path: sql/core/src/test/resources/sql-tests/results/group-by-filter.sql.out
 ##########
 @@ -0,0 +1,332 @@
+-- Automatically generated by SQLQueryTestSuite
+-- Number of queries: 27
+
+
+-- !query 0
+CREATE OR REPLACE TEMPORARY VIEW testData AS SELECT * FROM VALUES
+(1, 1), (1, 2), (2, 1), (2, 2), (3, 1), (3, 2), (null, 1), (3, null), (null, null)
+AS testData(a, b)
+-- !query 0 schema
+struct<>
+-- !query 0 output
+
+
+
+-- !query 1
+SELECT a, COUNT(b) FILTER (WHERE a >= 2) FROM testData
+-- !query 1 schema
+struct<>
+-- !query 1 output
+org.apache.spark.sql.AnalysisException
+grouping expressions sequence is empty, and 'testdata.`a`' is not an aggregate function.
Wrap '(count(testdata.`b`) AS `count(b)`)' in windowing function(s) or wrap 'testdata.`a`'
in first() (or first_value) if you don't care which value you get.;
+
+
+-- !query 2
+SELECT COUNT(a) FILTER (WHERE a = 1), COUNT(b) FILTER (WHERE a > 1) FROM testData
+-- !query 2 schema
+struct<count(a):bigint,count(b):bigint>
+-- !query 2 output
+2	4
+
+
+-- !query 3
+SELECT a, COUNT(b) FILTER (WHERE a >= 2) FROM testData GROUP BY a
+-- !query 3 schema
+struct<a:int,count(b):bigint>
+-- !query 3 output
+1	0
+2	2
+3	2
+NULL	0
+
+
+-- !query 4
+SELECT a, COUNT(b) FILTER (WHERE a != 2) FROM testData GROUP BY b
+-- !query 4 schema
+struct<>
+-- !query 4 output
+org.apache.spark.sql.AnalysisException
+expression 'testdata.`a`' is neither present in the group by, nor is it an aggregate function.
Add to group by or wrap in first() (or first_value) if you don't care which value you get.;
+
+
+-- !query 5
+SELECT COUNT(a) FILTER (WHERE a >= 0), COUNT(b) FILTER (WHERE a >= 3) FROM testData
GROUP BY a
+-- !query 5 schema
+struct<count(a):bigint,count(b):bigint>
+-- !query 5 output
+0	0
+2	0
+2	0
+3	2
+
+
+-- !query 6
+SELECT 'foo', COUNT(a) FILTER (WHERE b <= 2) FROM testData GROUP BY 1
+-- !query 6 schema
+struct<foo:string,count(a):bigint>
+-- !query 6 output
+foo	6
+
+
+-- !query 7
+SELECT 'foo', APPROX_COUNT_DISTINCT(a) FILTER (WHERE b >= 0) FROM testData WHERE a = 0
GROUP BY 1
+-- !query 7 schema
+struct<foo:string,approx_count_distinct(a):bigint>
+-- !query 7 output
+
+
+
+-- !query 8
+SELECT 'foo', MAX(STRUCT(a)) FILTER (WHERE b >= 1) FROM testData WHERE a = 0 GROUP BY
1
+-- !query 8 schema
+struct<foo:string,max(named_struct(a, a)):struct<a:int>>
+-- !query 8 output
+
+
+
+-- !query 9
+SELECT a + b, COUNT(b) FILTER (WHERE b >= 2) FROM testData GROUP BY a + b
+-- !query 9 schema
+struct<(a + b):int,count(b):bigint>
+-- !query 9 output
+2	0
+3	1
+4	1
+5	1
+NULL	0
+
+
+-- !query 10
+SELECT a + 2, COUNT(b) FILTER (WHERE b IN (1, 2)) FROM testData GROUP BY a + 1
+-- !query 10 schema
+struct<>
+-- !query 10 output
+org.apache.spark.sql.AnalysisException
+expression 'testdata.`a`' is neither present in the group by, nor is it an aggregate function.
Add to group by or wrap in first() (or first_value) if you don't care which value you get.;
+
+
+-- !query 11
+SELECT a + 1 + 1, COUNT(b) FILTER (WHERE b > 0) FROM testData GROUP BY a + 1
+-- !query 11 schema
+struct<((a + 1) + 1):int,count(b):bigint>
+-- !query 11 output
+3	2
+4	2
+5	2
+NULL	1
+
+
+-- !query 12
+SELECT COUNT(DISTINCT b) FILTER (WHERE b > 0), COUNT(DISTINCT b, c) FILTER (WHERE b >
0 AND c > 2)
+FROM (SELECT 1 AS a, 2 AS b, 3 AS c) GROUP BY a
+-- !query 12 schema
+struct<count(DISTINCT b):bigint,count(DISTINCT b, c):bigint>
+-- !query 12 output
+1	1
+
+
+-- !query 13
+SELECT a AS k, COUNT(b) FILTER (WHERE b = 1 OR b = 2) FROM testData GROUP BY k
+-- !query 13 schema
+struct<k:int,count(b):bigint>
+-- !query 13 output
+1	2
+2	2
+3	2
+NULL	1
+
+
+-- !query 14
+SELECT a AS k, COUNT(b) FILTER (WHERE NOT b < 0) FROM testData GROUP BY k HAVING k >
1
+-- !query 14 schema
+struct<k:int,count(b):bigint>
+-- !query 14 output
+2	2
+3	2
+
+
+-- !query 15
+SELECT COUNT(b) FILTER (WHERE a > 0) AS k FROM testData GROUP BY k
+-- !query 15 schema
+struct<>
+-- !query 15 output
+org.apache.spark.sql.AnalysisException
+aggregate functions are not allowed in GROUP BY, but found count(testdata.`b`);
+
+
+-- !query 16
+SELECT a AS k, COUNT(b) FILTER (WHERE b > 0) FROM testData GROUP BY k
+-- !query 16 schema
+struct<k:int,count(b):bigint>
+-- !query 16 output
+1	2
+2	2
+3	2
+NULL	1
+
+
+-- !query 17
+SELECT a, COUNT(1) FILTER (WHERE b > 1) FROM testData WHERE false GROUP BY a
+-- !query 17 schema
+struct<a:int,count(1):bigint>
+-- !query 17 output
+
+
+
+-- !query 18
+SELECT COUNT(1) FILTER (WHERE b = 2) FROM testData WHERE false
+-- !query 18 schema
+struct<count(1):bigint>
+-- !query 18 output
+0
+
+
+-- !query 19
+SELECT 1 FROM (SELECT COUNT(1) FILTER (WHERE a >= 3 OR b <= 1) FROM testData WHERE
false) t
+-- !query 19 schema
+struct<1:int>
+-- !query 19 output
+1
+
+
+-- !query 20
+CREATE TEMPORARY VIEW EMP AS SELECT * FROM VALUES
+  (100, "emp 1", date "2005-01-01", 100.00D, 10),
+  (100, "emp 1", date "2005-01-01", 100.00D, 10),
+  (200, "emp 2", date "2003-01-01", 200.00D, 10),
+  (300, "emp 3", date "2002-01-01", 300.00D, 20),
+  (400, "emp 4", date "2005-01-01", 400.00D, 30),
+  (500, "emp 5", date "2001-01-01", 400.00D, NULL),
+  (600, "emp 6 - no dept", date "2001-01-01", 400.00D, 100),
+  (700, "emp 7", date "2010-01-01", 400.00D, 100),
+  (800, "emp 8", date "2016-01-01", 150.00D, 70)
+AS EMP(id, emp_name, hiredate, salary, dept_id)
+-- !query 20 schema
+struct<>
+-- !query 20 output
+
+
+
+-- !query 21
+CREATE TEMPORARY VIEW DEPT AS SELECT * FROM VALUES
+  (10, "dept 1", "CA"),
+  (20, "dept 2", "NY"),
+  (30, "dept 3", "TX"),
+  (40, "dept 4 - unassigned", "OR"),
+  (50, "dept 5 - unassigned", "NJ"),
+  (70, "dept 7", "FL")
+AS DEPT(dept_id, dept_name, state)
+-- !query 21 schema
+struct<>
+-- !query 21 output
+
+
+
+-- !query 22
+SELECT emp.dept_id,
+       avg(salary),
+       avg(salary) FILTER (WHERE EXISTS (SELECT state
+               FROM dept
+               WHERE dept.dept_id = emp.dept_id))
+FROM emp
+GROUP BY dept_id
+-- !query 22 schema
+struct<>
+-- !query 22 output
+org.apache.spark.sql.AnalysisException
+IN/EXISTS predicate sub-queries can only be used in Filter/Join and a few commands: Aggregate
[dept_id#x], [dept_id#x, avg(salary#x) AS avg(salary)#x, avg(salary#x) AS avg(salary)#x]
 
 Review comment:
   ```
   SELECT COMPANY.DEP_ID,
          avg(SALARY),
          avg(SALARY) FILTER (WHERE EXISTS (SELECT ID
                  FROM DEPARTMENT
                  WHERE DEPARTMENT.ID = COMPANY.DEP_ID))
   FROM COMPANY GROUP BY DEP_ID;
   ```
   I'm sorry, I lost the group by clause.
   PgSQL supports this query.

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


With regards,
Apache Git Services

---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org


Mime
View raw message