tajo-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From hyun...@apache.org
Subject git commit: TAJO-934: Multiple DISTINCT returns null grouping key value. (Hyoungjun Kim via hyunsik)
Date Fri, 11 Jul 2014 07:10:38 GMT
Repository: tajo
Updated Branches:
  refs/heads/master 329e508ca -> 95292d29d


TAJO-934: Multiple DISTINCT returns null grouping key value. (Hyoungjun Kim via hyunsik)

Closes #63


Project: http://git-wip-us.apache.org/repos/asf/tajo/repo
Commit: http://git-wip-us.apache.org/repos/asf/tajo/commit/95292d29
Tree: http://git-wip-us.apache.org/repos/asf/tajo/tree/95292d29
Diff: http://git-wip-us.apache.org/repos/asf/tajo/diff/95292d29

Branch: refs/heads/master
Commit: 95292d29d96963c769ce2fd17a3350375145683e
Parents: 329e508
Author: Hyunsik Choi <hyunsik@apache.org>
Authored: Fri Jul 11 15:52:59 2014 +0900
Committer: Hyunsik Choi <hyunsik@apache.org>
Committed: Fri Jul 11 15:52:59 2014 +0900

----------------------------------------------------------------------
 CHANGES                                         |  3 +
 .../tajo/engine/planner/LogicalPlanner.java     |  3 +-
 .../DistinctGroupbyHashAggregationExec.java     | 30 ++++++---
 .../tajo/engine/query/TestGroupByQuery.java     | 68 +++++++++++++++++++-
 .../java/org/apache/tajo/storage/v2/RCFile.java |  5 --
 5 files changed, 93 insertions(+), 16 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tajo/blob/95292d29/CHANGES
----------------------------------------------------------------------
diff --git a/CHANGES b/CHANGES
index 921183e..43075f0 100644
--- a/CHANGES
+++ b/CHANGES
@@ -82,6 +82,9 @@ Release 0.9.0 - unreleased
 
   BUG FIXES
  
+    TAJO-934: Multiple DISTINCT returns null grouping key value.
+    (Hyoungjun Kim via hyunsik)
+
     TAJO-929: Broadcast join with empty outer join table returns empty result.
     (Hyoungjun Kim via hyunsik)
 

http://git-wip-us.apache.org/repos/asf/tajo/blob/95292d29/tajo-core/src/main/java/org/apache/tajo/engine/planner/LogicalPlanner.java
----------------------------------------------------------------------
diff --git a/tajo-core/src/main/java/org/apache/tajo/engine/planner/LogicalPlanner.java b/tajo-core/src/main/java/org/apache/tajo/engine/planner/LogicalPlanner.java
index 80390d3..ea517c0 100644
--- a/tajo-core/src/main/java/org/apache/tajo/engine/planner/LogicalPlanner.java
+++ b/tajo-core/src/main/java/org/apache/tajo/engine/planner/LogicalPlanner.java
@@ -676,7 +676,8 @@ public class LogicalPlanner extends BaseAlgebraVisitor<LogicalPlanner.PlanContex
     for (Iterator<NamedExpr> it = block.namedExprsMgr.getIteratorForUnevaluatedExprs();
it.hasNext();) {
       NamedExpr rawTarget = it.next();
       try {
-        includeDistinctFunction = PlannerUtil.existsDistinctAggregationFunction(rawTarget.getExpr());
+        // check if at least distinct aggregation function
+        includeDistinctFunction |= PlannerUtil.existsDistinctAggregationFunction(rawTarget.getExpr());
         EvalNode evalNode = exprAnnotator.createEvalNode(context.plan, context.queryBlock,
rawTarget.getExpr());
         if (evalNode.getType() == EvalType.AGG_FUNCTION) {
           aggEvalNames.add(rawTarget.getAlias());

http://git-wip-us.apache.org/repos/asf/tajo/blob/95292d29/tajo-core/src/main/java/org/apache/tajo/engine/planner/physical/DistinctGroupbyHashAggregationExec.java
----------------------------------------------------------------------
diff --git a/tajo-core/src/main/java/org/apache/tajo/engine/planner/physical/DistinctGroupbyHashAggregationExec.java
b/tajo-core/src/main/java/org/apache/tajo/engine/planner/physical/DistinctGroupbyHashAggregationExec.java
index 1a4b706..3fac509 100644
--- a/tajo-core/src/main/java/org/apache/tajo/engine/planner/physical/DistinctGroupbyHashAggregationExec.java
+++ b/tajo-core/src/main/java/org/apache/tajo/engine/planner/physical/DistinctGroupbyHashAggregationExec.java
@@ -114,6 +114,7 @@ public class DistinctGroupbyHashAggregationExec extends PhysicalExec {
     }
     if (first) {
       loadChildHashTable();
+
       progress = 0.5f;
       first = false;
     }
@@ -141,9 +142,12 @@ public class DistinctGroupbyHashAggregationExec extends PhysicalExec
{
     //--------------------------------------------------------------------------------------
 
     List<List<Tuple>> tupleSlots = new ArrayList<List<Tuple>>();
+
+    // aggregation with single grouping key
     for (int i = 0; i < hashAggregators.length; i++) {
       if (!hashAggregators[i].iterator.hasNext()) {
         nullCount++;
+        tupleSlots.add(new ArrayList<Tuple>());
         continue;
       }
       Entry<Tuple, Map<Tuple, FunctionContext[]>> entry = hashAggregators[i].iterator.next();
@@ -158,10 +162,10 @@ public class DistinctGroupbyHashAggregationExec extends PhysicalExec
{
       finished = true;
       progress = 1.0f;
 
-      // If DistinctGroupbyHashAggregationExec didn't has any rows,
+      // If DistinctGroupbyHashAggregationExec does not have any rows,
       // it should return NullDatum.
       if (totalNumRows == 0 && groupbyNodeNum == 0) {
-        Tuple tuple = new VTuple(hashAggregators.length);
+        Tuple tuple = new VTuple(outputColumnNum);
         for (int i = 0; i < tuple.size(); i++) {
           tuple.put(i, DatumFactory.createNullDatum());
         }
@@ -199,9 +203,11 @@ public class DistinctGroupbyHashAggregationExec extends PhysicalExec
{
 
     */
 
+    // currentAggregatedTuples has tuples which has same group key.
     currentAggregatedTuples = new ArrayList<Tuple>();
     int listIndex = 0;
     while (true) {
+      // Each item in tuples is VTuple. So the tuples variable is two dimensions(tuple[aggregator][datum]).
       Tuple[] tuples = new Tuple[hashAggregators.length];
       for (int i = 0; i < hashAggregators.length; i++) {
         List<Tuple> aggregatedTuples = tupleSlots.get(i);
@@ -212,7 +218,7 @@ public class DistinctGroupbyHashAggregationExec extends PhysicalExec {
 
       //merge
       Tuple mergedTuple = new VTuple(outputColumnNum);
-      int mergeTupleIndex = 0;
+      int resultColumnIdx = 0;
 
       boolean allNull = true;
       for (int i = 0; i < hashAggregators.length; i++) {
@@ -222,14 +228,22 @@ public class DistinctGroupbyHashAggregationExec extends PhysicalExec
{
 
         int tupleSize = hashAggregators[i].getTupleSize();
         for (int j = 0; j < tupleSize; j++) {
-          if (resultColumnIdIndexes[mergeTupleIndex] >= 0) {
-            if (tuples[i] != null) {
-              mergedTuple.put(resultColumnIdIndexes[mergeTupleIndex], tuples[i].get(j));
+          int mergeTupleIndex = resultColumnIdIndexes[resultColumnIdx];
+          if (mergeTupleIndex >= 0) {
+            if (mergeTupleIndex < distinctGroupingKey.size()) {
+              // set group key tuple
+              // Because each hashAggregator has different number of tuples,
+              // sometimes getting group key from each hashAggregator will be null value.
+              mergedTuple.put(mergeTupleIndex, distinctGroupingKey.get(mergeTupleIndex));
             } else {
-              mergedTuple.put(resultColumnIdIndexes[mergeTupleIndex], NullDatum.get());
+              if (tuples[i] != null) {
+                mergedTuple.put(mergeTupleIndex, tuples[i].get(j));
+              } else {
+                mergedTuple.put(mergeTupleIndex, NullDatum.get());
+              }
             }
           }
-          mergeTupleIndex++;
+          resultColumnIdx++;
         }
       }
 

http://git-wip-us.apache.org/repos/asf/tajo/blob/95292d29/tajo-core/src/test/java/org/apache/tajo/engine/query/TestGroupByQuery.java
----------------------------------------------------------------------
diff --git a/tajo-core/src/test/java/org/apache/tajo/engine/query/TestGroupByQuery.java b/tajo-core/src/test/java/org/apache/tajo/engine/query/TestGroupByQuery.java
index 40ee54e..935e520 100644
--- a/tajo-core/src/test/java/org/apache/tajo/engine/query/TestGroupByQuery.java
+++ b/tajo-core/src/test/java/org/apache/tajo/engine/query/TestGroupByQuery.java
@@ -288,20 +288,84 @@ public class TestGroupByQuery extends QueryTestCaseBase {
     schema.addColumn("code", Type.TEXT);
     schema.addColumn("qty", Type.INT4);
     schema.addColumn("qty2", Type.FLOAT8);
-    String[] data = new String[]{ "1|a|3|3.0", "1|a|4|4.0", "1|b|5|5.0", "2|a|1|6.0", "2|c|2|7.0",
"2|d|3|8.0" };
+    String[] data = new String[]{"1|a|3|3.0", "1|a|4|4.0", "1|b|5|5.0", "2|a|1|6.0", "2|c|2|7.0",
"2|d|3|8.0"};
     TajoTestingCluster.createTable("table10", schema, tableOptions, data);
 
     res = executeString("select id, count(distinct code), " +
         "avg(qty), min(qty), max(qty), sum(qty), " +
         "cast(avg(qty2) as INT8), cast(min(qty2) as INT8), cast(max(qty2) as INT8), cast(sum(qty2)
as INT8) " +
         "from table10 group by id");
-    String result = resultSetToString(res);
 
     String expected = "id,?count_4,?avg_5,?min_6,?max_7,?sum_8,?cast_9,?cast_10,?cast_11,?cast_12\n"
+
         "-------------------------------\n" +
         "1,2,4.0,0,5,12,4,0,5,12\n" +
         "2,3,2.0,0,3,6,7,0,8,21\n";
 
+    assertEquals(expected, resultSetToString(res));
+
+  // multiple distinct with expression
+    res = executeString(
+        "select count(distinct code) + count(distinct qty) from table10"
+    );
+
+    expected = "?plus_2\n" +
+        "-------------------------------\n" +
+        "9\n";
+
+    assertEquals(expected, resultSetToString(res));
+    res.close();
+
+    res = executeString(
+        "select id, count(distinct code) + count(distinct qty) from table10 group by id"
+    );
+
+    expected = "id,?plus_2\n" +
+        "-------------------------------\n" +
+        "1,5\n" +
+        "2,6\n";
+
+    assertEquals(expected, resultSetToString(res));
+    res.close();
+
+    executeString("DROP TABLE table10 PURGE").close();
+  }
+
+  @Test
+  public final void testDistinctAggregationCasebyCase2() throws Exception {
+    // first distinct is smaller than second distinct.
+    KeyValueSet tableOptions = new KeyValueSet();
+    tableOptions.put(StorageConstants.CSVFILE_DELIMITER, StorageConstants.DEFAULT_FIELD_DELIMITER);
+    tableOptions.put(StorageConstants.CSVFILE_NULL, "\\\\N");
+
+    Schema schema = new Schema();
+    schema.addColumn("col1", Type.TEXT);
+    schema.addColumn("col2", Type.TEXT);
+    schema.addColumn("col3", Type.TEXT);
+
+    String[] data = new String[]{
+        "a|b-1|\\N",
+        "a|b-2|\\N",
+        "a|b-2|\\N",
+        "a|b-3|\\N",
+        "a|b-3|\\N",
+        "a|b-3|\\N"
+    };
+
+    TajoTestingCluster.createTable("table10", schema, tableOptions, data);
+
+    ResultSet res = executeString(
+        "select col1 \n" +
+            ",count(distinct col2) as cnt1\n" +
+            ",count(distinct case when col3 is not null then col2 else null end) as cnt2\n"
+
+            "from table10 \n" +
+            "group by col1"
+    );
+    String result = resultSetToString(res);
+
+    String expected = "col1,cnt1,cnt2\n" +
+        "-------------------------------\n" +
+        "a,3,1\n";
+
     assertEquals(expected, result);
 
     executeString("DROP TABLE table10 PURGE").close();

http://git-wip-us.apache.org/repos/asf/tajo/blob/95292d29/tajo-storage/src/main/java/org/apache/tajo/storage/v2/RCFile.java
----------------------------------------------------------------------
diff --git a/tajo-storage/src/main/java/org/apache/tajo/storage/v2/RCFile.java b/tajo-storage/src/main/java/org/apache/tajo/storage/v2/RCFile.java
index 4b79c51..47dce74 100644
--- a/tajo-storage/src/main/java/org/apache/tajo/storage/v2/RCFile.java
+++ b/tajo-storage/src/main/java/org/apache/tajo/storage/v2/RCFile.java
@@ -1457,11 +1457,6 @@ public class RCFile {
       currentKeyLength = sin.readInt();
       compressedKeyLen = sin.readInt();
       
-//      System.out.println(">>>currentRecordLength=" + currentRecordLength + 
-//    		  ",currentKeyLength=" + currentKeyLength + 
-//    		  ",compressedKeyLen=" + compressedKeyLen + 
-//    		  ",decompress=" + decompress);
-      
       if (decompress) {
         keyTempBuffer.reset();
         keyTempBuffer.write(sin, compressedKeyLen);


Mime
View raw message