asterixdb-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From wangs...@apache.org
Subject asterixdb git commit: ASTERIXDB-1892: Sets a proper hash table cardinality during hash-group by
Date Tue, 02 May 2017 17:26:58 GMT
Repository: asterixdb
Updated Branches:
  refs/heads/master 4d2c7cd57 -> 2065eab84


ASTERIXDB-1892: Sets a proper hash table cardinality during hash-group by

 - Set a proper hash table cardinality during the merge phase
   of the external hash group-by operator.
 - Currently, the number of tuples in a spilled partition is
   used as the hash table cardinality. This can cause an issue
   because the compiler.groupmemory size is not considered.
 - So, like the initial group-by build phase, the hash table
   cardinality will be set properly based on the memory budget for
   the group-by operator.

Change-Id: I651139b2b559ad4d2f6137a5c844814606516a90
Reviewed-on: https://asterix-gerrit.ics.uci.edu/1702
Sonar-Qube: Jenkins <jenkins@fulliautomatix.ics.uci.edu>
Tested-by: Jenkins <jenkins@fulliautomatix.ics.uci.edu>
BAD: Jenkins <jenkins@fulliautomatix.ics.uci.edu>
Integration-Tests: Jenkins <jenkins@fulliautomatix.ics.uci.edu>
Reviewed-by: Yingyi Bu <buyingyi@gmail.com>


Project: http://git-wip-us.apache.org/repos/asf/asterixdb/repo
Commit: http://git-wip-us.apache.org/repos/asf/asterixdb/commit/2065eab8
Tree: http://git-wip-us.apache.org/repos/asf/asterixdb/tree/2065eab8
Diff: http://git-wip-us.apache.org/repos/asf/asterixdb/diff/2065eab8

Branch: refs/heads/master
Commit: 2065eab84de129ee4c757e9c02d0a37a58138fdb
Parents: 4d2c7cd
Author: Taewoo Kim <wangsaeu@yahoo.com>
Authored: Mon May 1 21:41:37 2017 -0700
Committer: Taewoo Kim <wangsaeu@gmail.com>
Committed: Tue May 2 10:26:38 2017 -0700

----------------------------------------------------------------------
 .../algebricks/algebricks-core/pom.xml          |  11 --
 .../physical/ExternalGroupByPOperator.java      |  52 +-------
 .../physical/ExternalGroupByPOperatorTest.java  | 126 -------------------
 .../hyracks/hyracks-dataflow-std/pom.xml        |   6 +
 .../ExternalGroupOperatorDescriptor.java        |  48 +++++++
 .../ExternalGroupWriteOperatorNodePushable.java |   8 +-
 .../ExternalGroupOperatorDescriptorTest.java    | 117 +++++++++++++++++
 7 files changed, 180 insertions(+), 188 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/asterixdb/blob/2065eab8/hyracks-fullstack/algebricks/algebricks-core/pom.xml
----------------------------------------------------------------------
diff --git a/hyracks-fullstack/algebricks/algebricks-core/pom.xml b/hyracks-fullstack/algebricks/algebricks-core/pom.xml
index 6fdaec5..3c2912e 100644
--- a/hyracks-fullstack/algebricks/algebricks-core/pom.xml
+++ b/hyracks-fullstack/algebricks/algebricks-core/pom.xml
@@ -81,16 +81,5 @@
       <groupId>org.apache.commons</groupId>
       <artifactId>commons-lang3</artifactId>
     </dependency>
-    <dependency>
-      <groupId>com.e-movimento.tinytools</groupId>
-      <artifactId>privilegedaccessor</artifactId>
-      <version>1.2.2</version>
-      <scope>test</scope>
-    </dependency>
-    <dependency>
-      <groupId>junit</groupId>
-      <artifactId>junit</artifactId>
-      <scope>test</scope>
-    </dependency>
   </dependencies>
 </project>

http://git-wip-us.apache.org/repos/asf/asterixdb/blob/2065eab8/hyracks-fullstack/algebricks/algebricks-core/src/main/java/org/apache/hyracks/algebricks/core/algebra/operators/physical/ExternalGroupByPOperator.java
----------------------------------------------------------------------
diff --git a/hyracks-fullstack/algebricks/algebricks-core/src/main/java/org/apache/hyracks/algebricks/core/algebra/operators/physical/ExternalGroupByPOperator.java
b/hyracks-fullstack/algebricks/algebricks-core/src/main/java/org/apache/hyracks/algebricks/core/algebra/operators/physical/ExternalGroupByPOperator.java
index 8555ade..9e7daf0 100644
--- a/hyracks-fullstack/algebricks/algebricks-core/src/main/java/org/apache/hyracks/algebricks/core/algebra/operators/physical/ExternalGroupByPOperator.java
+++ b/hyracks-fullstack/algebricks/algebricks-core/src/main/java/org/apache/hyracks/algebricks/core/algebra/operators/physical/ExternalGroupByPOperator.java
@@ -65,7 +65,6 @@ import org.apache.hyracks.api.job.IOperatorDescriptorRegistry;
 import org.apache.hyracks.dataflow.std.group.HashSpillableTableFactory;
 import org.apache.hyracks.dataflow.std.group.IAggregatorDescriptorFactory;
 import org.apache.hyracks.dataflow.std.group.external.ExternalGroupOperatorDescriptor;
-import org.apache.hyracks.dataflow.std.structures.SerializableHashTable;
 
 public class ExternalGroupByPOperator extends AbstractPhysicalOperator {
 
@@ -259,8 +258,8 @@ public class ExternalGroupByPOperator extends AbstractPhysicalOperator
{
         // Calculates the hash table size (# of unique hash values) based on the budget and
a tuple size.
         int memoryBudgetInBytes = context.getFrameSize() * frameLimit;
         int groupByColumnsCount = gby.getGroupByList().size() + numFds;
-        int hashTableSize = calculateGroupByTableCardinality(memoryBudgetInBytes, groupByColumnsCount,
-                context.getFrameSize());
+        int hashTableSize = ExternalGroupOperatorDescriptor.calculateGroupByTableCardinality(memoryBudgetInBytes,
+                groupByColumnsCount, context.getFrameSize());
 
         ExternalGroupOperatorDescriptor gbyOpDesc = new ExternalGroupOperatorDescriptor(spec,
hashTableSize, inputSize,
                 keyAndDecFields, frameLimit, comparatorFactories, normalizedKeyFactory, aggregatorFactory,
mergeFactory,
@@ -282,51 +281,4 @@ public class ExternalGroupByPOperator extends AbstractPhysicalOperator
{
         return true;
     }
 
-    /**
-     * Based on a rough estimation of a tuple (each field size: 4 bytes) size and the number
of possible hash values
-     * for the given number of group-by columns, calculates the number of hash entries for
the hash table in Group-by.
-     * The formula is min(# of possible hash values, # of possible tuples in the data table).
-     * This method assumes that the group-by table consists of hash table that stores hash
value of tuple pointer
-     * and data table actually stores the aggregated tuple.
-     * For more details, refer to this JIRA issue: https://issues.apache.org/jira/browse/ASTERIXDB-1556
-     *
-     * @param memoryBudgetByteSize
-     * @param numberOfGroupByColumns
-     * @return group-by table size (the cardinality of group-by table)
-     */
-    public static int calculateGroupByTableCardinality(long memoryBudgetByteSize, int numberOfGroupByColumns,
-            int frameSize) {
-        // Estimates a minimum tuple size with n fields:
-        // (4:tuple offset in a frame, 4n:each field offset in a tuple, 4n:each field size
4 bytes)
-        int tupleByteSize = 4 + 8 * numberOfGroupByColumns;
-
-        // Maximum number of tuples
-        long maxNumberOfTuplesInDataTable = memoryBudgetByteSize / tupleByteSize;
-
-        // To calculate possible hash values, this counts the number of bits.
-        // We assume that each field consists of 4 bytes.
-        // Also, too high range that is greater than Long.MAXVALUE (64 bits) is not necessary
for our calculation.
-        // And, this should not generate negative numbers when shifting the number.
-        int numberOfBits = Math.min(61, numberOfGroupByColumns * 4 * 8);
-
-        // Possible number of unique hash entries
-        long possibleNumberOfHashEntries = 2L << numberOfBits;
-
-        // Between # of entries in Data table and # of possible hash values, we choose the
smaller one.
-        long groupByTableCardinality = Math.min(possibleNumberOfHashEntries, maxNumberOfTuplesInDataTable);
-        long groupByTableByteSize = SerializableHashTable.getExpectedTableByteSize(groupByTableCardinality,
frameSize);
-
-        // Gets the ratio of hash-table size in the total size (hash + data table).
-        double hashTableRatio = (double) groupByTableByteSize / (groupByTableByteSize + memoryBudgetByteSize);
-
-        // Gets the table size based on the ratio that we have calculated.
-        long finalGroupByTableByteSize = (long) (hashTableRatio * memoryBudgetByteSize);
-
-        long finalGroupByTableCardinality = finalGroupByTableByteSize
-                / SerializableHashTable.getExpectedByteSizePerHashValue();
-
-        // The maximum cardinality of a hash table: Integer.MAX_VALUE
-        return finalGroupByTableCardinality > Integer.MAX_VALUE ? Integer.MAX_VALUE
-                : (int) finalGroupByTableCardinality;
-    }
 }

http://git-wip-us.apache.org/repos/asf/asterixdb/blob/2065eab8/hyracks-fullstack/algebricks/algebricks-core/src/test/java/org/apache/hyracks/algebricks/core/algebra/operators/physical/ExternalGroupByPOperatorTest.java
----------------------------------------------------------------------
diff --git a/hyracks-fullstack/algebricks/algebricks-core/src/test/java/org/apache/hyracks/algebricks/core/algebra/operators/physical/ExternalGroupByPOperatorTest.java
b/hyracks-fullstack/algebricks/algebricks-core/src/test/java/org/apache/hyracks/algebricks/core/algebra/operators/physical/ExternalGroupByPOperatorTest.java
deleted file mode 100644
index a633998..0000000
--- a/hyracks-fullstack/algebricks/algebricks-core/src/test/java/org/apache/hyracks/algebricks/core/algebra/operators/physical/ExternalGroupByPOperatorTest.java
+++ /dev/null
@@ -1,126 +0,0 @@
-/*
- *  Licensed to the Apache Software Foundation (ASF) under one
- *  or more contributor license agreements.  See the NOTICE file
- *  distributed with this work for additional information
- *  regarding copyright ownership.  The ASF licenses this file
- *  to you under the Apache License, Version 2.0 (the
- *  "License"); you may not use this file except in compliance
- *  with the License.  You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing,
- *  software distributed under the License is distributed on an
- *  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- *  KIND, either express or implied.  See the License for the
- *  specific language governing permissions and limitations
- *  under the License.
- */
-
-package org.apache.hyracks.algebricks.core.algebra.operators.physical;
-
-import java.util.ArrayList;
-import java.util.List;
-
-import org.apache.commons.lang3.mutable.Mutable;
-import org.apache.commons.lang3.mutable.MutableObject;
-import org.apache.hyracks.algebricks.common.utils.Pair;
-import org.apache.hyracks.algebricks.core.algebra.base.ILogicalExpression;
-import org.apache.hyracks.algebricks.core.algebra.base.LogicalVariable;
-import org.apache.hyracks.algebricks.core.algebra.expressions.VariableReferenceExpression;
-import org.junit.Assert;
-import org.junit.Test;
-
-import junit.extensions.PA;
-
-public class ExternalGroupByPOperatorTest {
-
-    @Test
-    public void testCalculateGroupByTableCardinality() throws Exception {
-
-        // Creates a dummy variable and an expression that are needed by the operator. They
are not used by this test.
-        LogicalVariable v = new LogicalVariable(0);
-        MutableObject<ILogicalExpression> e = new MutableObject<ILogicalExpression>(new
VariableReferenceExpression(v));
-        List<Pair<LogicalVariable, Mutable<ILogicalExpression>>> gbyList
= new ArrayList<>();
-        gbyList.add(new Pair<>(v, e));
-        ExternalGroupByPOperator eGByOp = new ExternalGroupByPOperator(gbyList, 0, 0);
-
-        // Test 1: compiler.groupmemory: 512 bytes, frame size: 256 bytes, with 1 column
group-by
-        long memoryBudgetInBytes = 512;
-        int numberOfGroupByColumns = 1;
-        int frameSize = 256;
-        int resultCardinality = (int) PA.invokeMethod(eGByOp, "calculateGroupByTableCardinality(long,int,int)",
-                memoryBudgetInBytes, numberOfGroupByColumns, frameSize);
-        Assert.assertTrue(resultCardinality == 9);
-
-        // Sets the frame size to 128KB.
-        frameSize = 128 * 1024;
-
-        // Test 2: memory size: 1 MB, frame size: 128 KB, 1 column group-by
-        memoryBudgetInBytes = 1024 * 1024;
-        resultCardinality = (int) PA.invokeMethod(eGByOp, "calculateGroupByTableCardinality(long,int,int)",
-                memoryBudgetInBytes, numberOfGroupByColumns, frameSize);
-        Assert.assertTrue(resultCardinality == 19660);
-
-        // Test 3: memory size: 100 MB, frame size: 128 KB, 1 column group-by
-        memoryBudgetInBytes = 1024 * 1024 * 100;
-        resultCardinality = (int) PA.invokeMethod(eGByOp, "calculateGroupByTableCardinality(long,int,int)",
-                memoryBudgetInBytes, numberOfGroupByColumns, frameSize);
-        Assert.assertTrue(resultCardinality == 1937883);
-
-        // Test 4: memory size: 1 GB, frame size: 128 KB, 1 column group-by
-        memoryBudgetInBytes = 1024 * 1024 * 1024;
-        resultCardinality = (int) PA.invokeMethod(eGByOp, "calculateGroupByTableCardinality(long,int,int)",
-                memoryBudgetInBytes, numberOfGroupByColumns, frameSize);
-        Assert.assertTrue(resultCardinality == 19841178);
-
-        // Test 5: memory size: 10 GB, frame size: 128 KB, 1 column group-by
-        memoryBudgetInBytes = 1024 * 1024 * 1024 * 10L;
-        resultCardinality = (int) PA.invokeMethod(eGByOp, "calculateGroupByTableCardinality(long,int,int)",
-                memoryBudgetInBytes, numberOfGroupByColumns, frameSize);
-        Assert.assertTrue(resultCardinality == 198409112);
-
-        // Test 6: memory size: 100 GB, frame size: 128 KB, 1 column group-by
-        memoryBudgetInBytes = 1024 * 1024 * 1024 * 100L;
-        resultCardinality = (int) PA.invokeMethod(eGByOp, "calculateGroupByTableCardinality(long,int,int)",
-                memoryBudgetInBytes, numberOfGroupByColumns, frameSize);
-        Assert.assertTrue(resultCardinality == 1962753871);
-
-        // Test 7: memory size: 1 TB, frame size: 128 KB, 1 column group-by
-        // The cardinality will be set to Integer.MAX_VALUE in this case since the budget
is too huge.
-        memoryBudgetInBytes = 1024 * 1024 * 1024 * 1024L;
-        frameSize = 128 * 1024;
-        resultCardinality = (int) PA.invokeMethod(eGByOp, "calculateGroupByTableCardinality(long,int,int)",
-                memoryBudgetInBytes, numberOfGroupByColumns, frameSize);
-        Assert.assertTrue(resultCardinality == 2147483647);
-
-        // Test 8: memory size: 1 MB, frame size: 128 KB, 2 columns group-by
-        memoryBudgetInBytes = 1024 * 1024;
-        numberOfGroupByColumns = 2;
-        resultCardinality = (int) PA.invokeMethod(eGByOp, "calculateGroupByTableCardinality(long,int,int)",
-                memoryBudgetInBytes, numberOfGroupByColumns, frameSize);
-        Assert.assertTrue(resultCardinality == 16681);
-
-        // Test 9: memory size: 1 MB, frame size: 128 KB, 3 columns group-by
-        memoryBudgetInBytes = 1024 * 1024;
-        numberOfGroupByColumns = 3;
-        resultCardinality = (int) PA.invokeMethod(eGByOp, "calculateGroupByTableCardinality(long,int,int)",
-                memoryBudgetInBytes, numberOfGroupByColumns, frameSize);
-        Assert.assertTrue(resultCardinality == 15176);
-
-        // Test 10: memory size: 1 MB, frame size: 128 KB, 4 columns group-by
-        memoryBudgetInBytes = 1024 * 1024;
-        numberOfGroupByColumns = 4;
-        resultCardinality = (int) PA.invokeMethod(eGByOp, "calculateGroupByTableCardinality(long,int,int)",
-                memoryBudgetInBytes, numberOfGroupByColumns, frameSize);
-        Assert.assertTrue(resultCardinality == 13878);
-
-        // Test 11: memory size: 32 MB, frame size: 128 KB, 2 columns group-by
-        memoryBudgetInBytes = 1024 * 1024 * 32L;
-        numberOfGroupByColumns = 4;
-        resultCardinality = (int) PA.invokeMethod(eGByOp, "calculateGroupByTableCardinality(long,int,int)",
-                memoryBudgetInBytes, numberOfGroupByColumns, frameSize);
-        Assert.assertTrue(resultCardinality == 408503);
-    }
-
-}

http://git-wip-us.apache.org/repos/asf/asterixdb/blob/2065eab8/hyracks-fullstack/hyracks/hyracks-dataflow-std/pom.xml
----------------------------------------------------------------------
diff --git a/hyracks-fullstack/hyracks/hyracks-dataflow-std/pom.xml b/hyracks-fullstack/hyracks/hyracks-dataflow-std/pom.xml
index 72a1bb6..0285069 100644
--- a/hyracks-fullstack/hyracks/hyracks-dataflow-std/pom.xml
+++ b/hyracks-fullstack/hyracks/hyracks-dataflow-std/pom.xml
@@ -76,6 +76,12 @@
       <version>${project.version}</version>
     </dependency>
     <dependency>
+      <groupId>com.e-movimento.tinytools</groupId>
+      <artifactId>privilegedaccessor</artifactId>
+      <version>1.2.2</version>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
       <groupId>junit</groupId>
       <artifactId>junit</artifactId>
       <scope>test</scope>

http://git-wip-us.apache.org/repos/asf/asterixdb/blob/2065eab8/hyracks-fullstack/hyracks/hyracks-dataflow-std/src/main/java/org/apache/hyracks/dataflow/std/group/external/ExternalGroupOperatorDescriptor.java
----------------------------------------------------------------------
diff --git a/hyracks-fullstack/hyracks/hyracks-dataflow-std/src/main/java/org/apache/hyracks/dataflow/std/group/external/ExternalGroupOperatorDescriptor.java
b/hyracks-fullstack/hyracks/hyracks-dataflow-std/src/main/java/org/apache/hyracks/dataflow/std/group/external/ExternalGroupOperatorDescriptor.java
index 4e0724c..2d8433d 100644
--- a/hyracks-fullstack/hyracks/hyracks-dataflow-std/src/main/java/org/apache/hyracks/dataflow/std/group/external/ExternalGroupOperatorDescriptor.java
+++ b/hyracks-fullstack/hyracks/hyracks-dataflow-std/src/main/java/org/apache/hyracks/dataflow/std/group/external/ExternalGroupOperatorDescriptor.java
@@ -33,6 +33,7 @@ import org.apache.hyracks.dataflow.std.base.AbstractActivityNode;
 import org.apache.hyracks.dataflow.std.base.AbstractOperatorDescriptor;
 import org.apache.hyracks.dataflow.std.group.IAggregatorDescriptorFactory;
 import org.apache.hyracks.dataflow.std.group.ISpillableTableFactory;
+import org.apache.hyracks.dataflow.std.structures.SerializableHashTable;
 
 /**
  *
@@ -151,4 +152,51 @@ public class ExternalGroupOperatorDescriptor extends AbstractOperatorDescriptor
 
     }
 
+    /**
+     * Based on a rough estimation of a tuple (each field size: 4 bytes) size and the number
of possible hash values
+     * for the given number of group-by columns, calculates the number of hash entries for
the hash table in Group-by.
+     * The formula is min(# of possible hash values, # of possible tuples in the data table).
+     * This method assumes that the group-by table consists of hash table that stores hash
value of tuple pointer
+     * and data table actually stores the aggregated tuple.
+     * For more details, refer to this JIRA issue: https://issues.apache.org/jira/browse/ASTERIXDB-1556
+     *
+     * @param memoryBudgetByteSize
+     * @param numberOfGroupByColumns
+     * @return group-by table size (the cardinality of group-by table)
+     */
+    public static int calculateGroupByTableCardinality(long memoryBudgetByteSize, int numberOfGroupByColumns,
+            int frameSize) {
+        // Estimates a minimum tuple size with n fields:
+        // (4:tuple offset in a frame, 4n:each field offset in a tuple, 4n:each field size
4 bytes)
+        int tupleByteSize = 4 + 8 * numberOfGroupByColumns;
+
+        // Maximum number of tuples
+        long maxNumberOfTuplesInDataTable = memoryBudgetByteSize / tupleByteSize;
+
+        // To calculate possible hash values, this counts the number of bits.
+        // We assume that each field consists of 4 bytes.
+        // Also, a range greater than Long.MAX_VALUE (64 bits) is not necessary
for our calculation.
+        // And, this should not generate negative numbers when shifting the number.
+        int numberOfBits = Math.min(61, numberOfGroupByColumns * 4 * 8);
+
+        // Possible number of unique hash entries
+        long possibleNumberOfHashEntries = 2L << numberOfBits;
+
+        // Between # of entries in Data table and # of possible hash values, we choose the
smaller one.
+        long groupByTableCardinality = Math.min(possibleNumberOfHashEntries, maxNumberOfTuplesInDataTable);
+        long groupByTableByteSize = SerializableHashTable.getExpectedTableByteSize(groupByTableCardinality,
frameSize);
+
+        // Gets the ratio of hash-table size in the total size (hash + data table).
+        double hashTableRatio = (double) groupByTableByteSize / (groupByTableByteSize + memoryBudgetByteSize);
+
+        // Gets the table size based on the ratio that we have calculated.
+        long finalGroupByTableByteSize = (long) (hashTableRatio * memoryBudgetByteSize);
+
+        long finalGroupByTableCardinality =
+                finalGroupByTableByteSize / SerializableHashTable.getExpectedByteSizePerHashValue();
+
+        // The maximum cardinality of a hash table: Integer.MAX_VALUE
+        return finalGroupByTableCardinality > Integer.MAX_VALUE ? Integer.MAX_VALUE
+                : (int) finalGroupByTableCardinality;
+    }
 }

http://git-wip-us.apache.org/repos/asf/asterixdb/blob/2065eab8/hyracks-fullstack/hyracks/hyracks-dataflow-std/src/main/java/org/apache/hyracks/dataflow/std/group/external/ExternalGroupWriteOperatorNodePushable.java
----------------------------------------------------------------------
diff --git a/hyracks-fullstack/hyracks/hyracks-dataflow-std/src/main/java/org/apache/hyracks/dataflow/std/group/external/ExternalGroupWriteOperatorNodePushable.java
b/hyracks-fullstack/hyracks/hyracks-dataflow-std/src/main/java/org/apache/hyracks/dataflow/std/group/external/ExternalGroupWriteOperatorNodePushable.java
index b17215f..9a3668e 100644
--- a/hyracks-fullstack/hyracks/hyracks-dataflow-std/src/main/java/org/apache/hyracks/dataflow/std/group/external/ExternalGroupWriteOperatorNodePushable.java
+++ b/hyracks-fullstack/hyracks/hyracks-dataflow-std/src/main/java/org/apache/hyracks/dataflow/std/group/external/ExternalGroupWriteOperatorNodePushable.java
@@ -120,7 +120,13 @@ public class ExternalGroupWriteOperatorNodePushable extends AbstractUnaryOutputS
 
         for (int i = 0; i < runs.length; i++) {
             if (runs[i] != null) {
-                ISpillableTable partitionTable = spillableTableFactory.buildSpillableTable(ctx,
numOfTuples[i],
+                // Calculates the hash table size (# of unique hash values) based on the
budget and a tuple size.
+                int memoryBudgetInBytes = ctx.getInitialFrameSize() * frameLimit;
+                int groupByColumnsCount = mergeGroupFields.length;
+                int hashTableCardinality = ExternalGroupOperatorDescriptor.calculateGroupByTableCardinality(
+                        memoryBudgetInBytes, groupByColumnsCount, ctx.getInitialFrameSize());
+                hashTableCardinality = (int) Math.min(hashTableCardinality, numOfTuples[i]);
+                ISpillableTable partitionTable = spillableTableFactory.buildSpillableTable(ctx,
hashTableCardinality,
                         runs[i].getFileSize(), mergeGroupFields, groupByComparators, nmkComputer,
                         mergeAggregatorFactory, partialAggRecordDesc, outRecordDesc, frameLimit,
level);
                 RunFileWriter[] runFileWriters = new RunFileWriter[partitionTable.getNumPartitions()];

http://git-wip-us.apache.org/repos/asf/asterixdb/blob/2065eab8/hyracks-fullstack/hyracks/hyracks-dataflow-std/src/test/java/org/apache/hyracks/dataflow/std/group/external/ExternalGroupOperatorDescriptorTest.java
----------------------------------------------------------------------
diff --git a/hyracks-fullstack/hyracks/hyracks-dataflow-std/src/test/java/org/apache/hyracks/dataflow/std/group/external/ExternalGroupOperatorDescriptorTest.java
b/hyracks-fullstack/hyracks/hyracks-dataflow-std/src/test/java/org/apache/hyracks/dataflow/std/group/external/ExternalGroupOperatorDescriptorTest.java
new file mode 100644
index 0000000..392aab5
--- /dev/null
+++ b/hyracks-fullstack/hyracks/hyracks-dataflow-std/src/test/java/org/apache/hyracks/dataflow/std/group/external/ExternalGroupOperatorDescriptorTest.java
@@ -0,0 +1,117 @@
+/*
+ *  Licensed to the Apache Software Foundation (ASF) under one
+ *  or more contributor license agreements.  See the NOTICE file
+ *  distributed with this work for additional information
+ *  regarding copyright ownership.  The ASF licenses this file
+ *  to you under the Apache License, Version 2.0 (the
+ *  "License"); you may not use this file except in compliance
+ *  with the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing,
+ *  software distributed under the License is distributed on an
+ *  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ *  KIND, either express or implied.  See the License for the
+ *  specific language governing permissions and limitations
+ *  under the License.
+ */
+
+package org.apache.hyracks.dataflow.std.group.external;
+
+import org.apache.hyracks.api.job.IOperatorDescriptorRegistry;
+import org.apache.hyracks.api.job.JobSpecification;
+import org.junit.Assert;
+import org.junit.Test;
+
+import junit.extensions.PA;
+
+public class ExternalGroupOperatorDescriptorTest {
+
+    @Test
+    public void testCalculateGroupByTableCardinality() throws Exception {
+
+        // Sets a dummy variable.
+        IOperatorDescriptorRegistry spec = new JobSpecification(32768);
+        ExternalGroupOperatorDescriptor eGByOp =
+                new ExternalGroupOperatorDescriptor(spec, 0, 0, null, 4, null, null, null,
null, null, null, null);
+
+        // Test 1: compiler.groupmemory: 512 bytes, frame size: 256 bytes, with 1 column
group-by
+        long memoryBudgetInBytes = 512;
+        int numberOfGroupByColumns = 1;
+        int frameSize = 256;
+        int resultCardinality = (int) PA.invokeMethod(eGByOp, "calculateGroupByTableCardinality(long,int,int)",
+                memoryBudgetInBytes, numberOfGroupByColumns, frameSize);
+        Assert.assertTrue(resultCardinality == 9);
+
+        // Sets the frame size to 128KB.
+        frameSize = 128 * 1024;
+
+        // Test 2: memory size: 1 MB, frame size: 128 KB, 1 column group-by
+        memoryBudgetInBytes = 1024 * 1024;
+        resultCardinality = (int) PA.invokeMethod(eGByOp, "calculateGroupByTableCardinality(long,int,int)",
+                memoryBudgetInBytes, numberOfGroupByColumns, frameSize);
+        Assert.assertTrue(resultCardinality == 19660);
+
+        // Test 3: memory size: 100 MB, frame size: 128 KB, 1 column group-by
+        memoryBudgetInBytes = 1024 * 1024 * 100;
+        resultCardinality = (int) PA.invokeMethod(eGByOp, "calculateGroupByTableCardinality(long,int,int)",
+                memoryBudgetInBytes, numberOfGroupByColumns, frameSize);
+        Assert.assertTrue(resultCardinality == 1937883);
+
+        // Test 4: memory size: 1 GB, frame size: 128 KB, 1 column group-by
+        memoryBudgetInBytes = 1024 * 1024 * 1024;
+        resultCardinality = (int) PA.invokeMethod(eGByOp, "calculateGroupByTableCardinality(long,int,int)",
+                memoryBudgetInBytes, numberOfGroupByColumns, frameSize);
+        Assert.assertTrue(resultCardinality == 19841178);
+
+        // Test 5: memory size: 10 GB, frame size: 128 KB, 1 column group-by
+        memoryBudgetInBytes = 1024 * 1024 * 1024 * 10L;
+        resultCardinality = (int) PA.invokeMethod(eGByOp, "calculateGroupByTableCardinality(long,int,int)",
+                memoryBudgetInBytes, numberOfGroupByColumns, frameSize);
+        Assert.assertTrue(resultCardinality == 198409112);
+
+        // Test 6: memory size: 100 GB, frame size: 128 KB, 1 column group-by
+        memoryBudgetInBytes = 1024 * 1024 * 1024 * 100L;
+        resultCardinality = (int) PA.invokeMethod(eGByOp, "calculateGroupByTableCardinality(long,int,int)",
+                memoryBudgetInBytes, numberOfGroupByColumns, frameSize);
+        Assert.assertTrue(resultCardinality == 1962753871);
+
+        // Test 7: memory size: 1 TB, frame size: 128 KB, 1 column group-by
+        // The cardinality will be set to Integer.MAX_VALUE in this case since the budget
is too huge.
+        memoryBudgetInBytes = 1024 * 1024 * 1024 * 1024L;
+        frameSize = 128 * 1024;
+        resultCardinality = (int) PA.invokeMethod(eGByOp, "calculateGroupByTableCardinality(long,int,int)",
+                memoryBudgetInBytes, numberOfGroupByColumns, frameSize);
+        Assert.assertTrue(resultCardinality == 2147483647);
+
+        // Test 8: memory size: 1 MB, frame size: 128 KB, 2 columns group-by
+        memoryBudgetInBytes = 1024 * 1024;
+        numberOfGroupByColumns = 2;
+        resultCardinality = (int) PA.invokeMethod(eGByOp, "calculateGroupByTableCardinality(long,int,int)",
+                memoryBudgetInBytes, numberOfGroupByColumns, frameSize);
+        Assert.assertTrue(resultCardinality == 16681);
+
+        // Test 9: memory size: 1 MB, frame size: 128 KB, 3 columns group-by
+        memoryBudgetInBytes = 1024 * 1024;
+        numberOfGroupByColumns = 3;
+        resultCardinality = (int) PA.invokeMethod(eGByOp, "calculateGroupByTableCardinality(long,int,int)",
+                memoryBudgetInBytes, numberOfGroupByColumns, frameSize);
+        Assert.assertTrue(resultCardinality == 15176);
+
+        // Test 10: memory size: 1 MB, frame size: 128 KB, 4 columns group-by
+        memoryBudgetInBytes = 1024 * 1024;
+        numberOfGroupByColumns = 4;
+        resultCardinality = (int) PA.invokeMethod(eGByOp, "calculateGroupByTableCardinality(long,int,int)",
+                memoryBudgetInBytes, numberOfGroupByColumns, frameSize);
+        Assert.assertTrue(resultCardinality == 13878);
+
+        // Test 11: memory size: 32 MB, frame size: 128 KB, 4 columns group-by
+        memoryBudgetInBytes = 1024 * 1024 * 32L;
+        numberOfGroupByColumns = 4;
+        resultCardinality = (int) PA.invokeMethod(eGByOp, "calculateGroupByTableCardinality(long,int,int)",
+                memoryBudgetInBytes, numberOfGroupByColumns, frameSize);
+        Assert.assertTrue(resultCardinality == 408503);
+    }
+
+}


Mime
View raw message