carbondata-commits mailing list archives

From gvram...@apache.org
Subject [1/2] incubator-carbondata git commit: When data contains long max and min values for a measure column with bigInt datatype, the delta compression selected is DATA_BYTE which is incorrect. For selecting the delta compression min value is decremented from
Date Mon, 20 Mar 2017 12:43:19 GMT
Repository: incubator-carbondata
Updated Branches:
  refs/heads/master 8d0a672b9 -> e3fd98bbf


When the data for a measure column with bigInt datatype contains the long max and min values, the
delta compression selected is DATA_BYTE, which is incorrect. To select the delta compression, the
min value is subtracted from the max value; here the min value is negative, so the subtraction
effectively becomes an addition and the result goes out of the long range. When a long value
exceeds the maximum, it wraps around and continues from the most negative long value, which
results in the wrong compression being selected. This leads to data loss and incorrect query results.
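
The wraparound is easy to reproduce outside CarbonData. This standalone Java snippet
(illustrative only, not project code) shows the delta computation landing on -1, a value that
fits in a byte and therefore steered the selection to DATA_BYTE:

    // Demonstrates the signed 64-bit wraparound in the delta computation.
    public class LongDeltaOverflow {
      public static void main(String[] args) {
        long max = Long.MAX_VALUE;  //  9223372036854775807
        long min = Long.MIN_VALUE;  // -9223372036854775808
        long delta = max - min;     // true delta is 2^64 - 1, which overflows
        System.out.println(delta);  // prints -1: fits in a byte, so the
                                    // byte-sized delta compression was chosen
      }
    }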


Project: http://git-wip-us.apache.org/repos/asf/incubator-carbondata/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-carbondata/commit/4f915d10
Tree: http://git-wip-us.apache.org/repos/asf/incubator-carbondata/tree/4f915d10
Diff: http://git-wip-us.apache.org/repos/asf/incubator-carbondata/diff/4f915d10

Branch: refs/heads/master
Commit: 4f915d102a5f6186aed5a74cfe23a0a81ea62aee
Parents: 8d0a672
Author: manishgupta88 <tomanishgupta18@gmail.com>
Authored: Mon Mar 20 17:13:02 2017 +0530
Committer: manishgupta88 <tomanishgupta18@gmail.com>
Committed: Mon Mar 20 17:13:21 2017 +0530

----------------------------------------------------------------------
 .../carbondata/core/util/ValueCompressionUtil.java  | 12 ++++++++++--
 .../test/resources/testBigInt_boundary_value.csv    | 16 ++++++++++++++++
 .../spark/testsuite/bigdecimal/TestBigInt.scala     | 13 +++++++++++++
 3 files changed, 39 insertions(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-carbondata/blob/4f915d10/core/src/main/java/org/apache/carbondata/core/util/ValueCompressionUtil.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/org/apache/carbondata/core/util/ValueCompressionUtil.java b/core/src/main/java/org/apache/carbondata/core/util/ValueCompressionUtil.java
index 6a14373..c8a9397 100644
--- a/core/src/main/java/org/apache/carbondata/core/util/ValueCompressionUtil.java
+++ b/core/src/main/java/org/apache/carbondata/core/util/ValueCompressionUtil.java
@@ -181,8 +181,16 @@ public final class ValueCompressionUtil {
       int mantissa, byte dataTypeSelected, char measureStoreType) {
     DataType adaptiveDataType = getDataType((long) maxValue, mantissa, dataTypeSelected);
     int adaptiveSize = getSize(adaptiveDataType);
-    DataType deltaDataType = getDataType((long) maxValue - (long) minValue, mantissa,
-        dataTypeSelected);
+    DataType deltaDataType = null;
+    // we cannot apply delta compression when the actual data type of the column is long:
+    // consider the scenario where the max and min values equal the long max and min values, or
+    // where subtracting the min from the max value goes beyond the long max value; in these
+    // cases it is not possible to determine the delta compression type correctly.
+    if (adaptiveDataType == DataType.DATA_LONG) {
+      deltaDataType = DataType.DATA_BIGINT;
+    } else {
+      deltaDataType = getDataType((long) maxValue - (long) minValue, mantissa, dataTypeSelected);
+    }
     int deltaSize = getSize(deltaDataType);
     if (adaptiveSize > deltaSize) {
       return new CompressionFinder(COMPRESSION_TYPE.DELTA_DOUBLE, DataType.DATA_BIGINT,
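
An alternative to keying off the adaptive data type would be to detect the overflow directly.
A minimal sketch of that idea (Math.subtractExact is standard Java 8+; the class and method
names here are hypothetical, not CarbonData API):

    // Sketch: detect the overflowing subtraction instead of checking the adaptive type.
    public class DeltaOverflowGuard {
      static boolean deltaFitsInLong(double maxValue, double minValue) {
        try {
          Math.subtractExact((long) maxValue, (long) minValue);
          return true;              // delta is representable: safe to size the delta type
        } catch (ArithmeticException e) {
          return false;             // overflow: fall back to DATA_BIGINT
        }
      }
    }

The committed fix takes a simpler route: whenever the adaptive type is already DATA_LONG, the
delta type is pinned to DATA_BIGINT, so the adaptive encoding wins the size comparison and the
overflowing subtraction is never performed.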

http://git-wip-us.apache.org/repos/asf/incubator-carbondata/blob/4f915d10/integration/spark-common-test/src/test/resources/testBigInt_boundary_value.csv
----------------------------------------------------------------------
diff --git a/integration/spark-common-test/src/test/resources/testBigInt_boundary_value.csv b/integration/spark-common-test/src/test/resources/testBigInt_boundary_value.csv
new file mode 100644
index 0000000..80ec0b4
--- /dev/null
+++ b/integration/spark-common-test/src/test/resources/testBigInt_boundary_value.csv
@@ -0,0 +1,16 @@
+Invalid values,92233720368547758071234
+Invalid values,def
+All_null_values,null
+All_zeros_values,0
+Max_range_values,9223372036854775807
+Max_range_values,9223372036854775807
+Max_range_values,9223372036854775807
+Max_range_values,9223372036854775807
+Min_range_values,-9223372036854775808
+Min_range_values,-9223372036854775807
+Min_range_values,-9223372036854775806
+Min_range_values,-9223372036854775805
+Normal_values,2345
+Normal_values,1234
+Normal_values,3456
+Normal_values,4567
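
Every Min_range_values row in this file pairs with the long max to overflow the delta, so the
whole boundary family exercises the bug. A quick standalone check (illustrative Java, not part
of the test):

    // Each boundary pair in the CSV wraps the signed 64-bit delta.
    public class BoundaryDeltaCheck {
      public static void main(String[] args) {
        long max = 9223372036854775807L;                 // Long.MAX_VALUE
        long[] mins = {-9223372036854775808L, -9223372036854775807L,
                       -9223372036854775806L, -9223372036854775805L};
        for (long min : mins) {
          System.out.println(max - min);                 // prints -1, -2, -3, -4: all wrapped
        }
      }
    }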

http://git-wip-us.apache.org/repos/asf/incubator-carbondata/blob/4f915d10/integration/spark-common-test/src/test/scala/org/apache/carbondata/integration/spark/testsuite/bigdecimal/TestBigInt.scala
----------------------------------------------------------------------
diff --git a/integration/spark-common-test/src/test/scala/org/apache/carbondata/integration/spark/testsuite/bigdecimal/TestBigInt.scala b/integration/spark-common-test/src/test/scala/org/apache/carbondata/integration/spark/testsuite/bigdecimal/TestBigInt.scala
index 59fc9d3..f738040 100644
--- a/integration/spark-common-test/src/test/scala/org/apache/carbondata/integration/spark/testsuite/bigdecimal/TestBigInt.scala
+++ b/integration/spark-common-test/src/test/scala/org/apache/carbondata/integration/spark/testsuite/bigdecimal/TestBigInt.scala
@@ -79,6 +79,19 @@ class TestBigInt extends QueryTest with BeforeAndAfterAll {
       sql("select count(distinct salary) from hiveTable"))
   }
 
+  test("test big int data type storage for boundary values") {
+    sql("DROP TABLE IF EXISTS Test_Boundary_carbon")
+    sql("DROP TABLE IF EXISTS Test_Boundary_hive")
+    sql("create table Test_Boundary_carbon (c1_String string, c2_Bigint Bigint) STORED BY
'org.apache.carbondata.format'")
+    sql(s"LOAD DATA LOCAL INPATH '$resourcesPath/testBigInt_boundary_value.csv' into table
Test_Boundary_carbon OPTIONS('FILEHEADER'='c1_String,c2_Bigint')")
+    sql("create table Test_Boundary_hive (c1_String string, c2_Bigint Bigint) row format
delimited fields terminated by ','")
+    sql(s"LOAD DATA LOCAL INPATH '$resourcesPath/testBigInt_boundary_value.csv' into table
Test_Boundary_hive")
+    checkAnswer(sql("select c2_bigInt*23 from Test_Boundary_carbon where c2_BigInt<9223372036854775807"),
+      sql("select c2_bigInt*23 from Test_Boundary_hive where c2_BigInt<9223372036854775807"))
+    sql("DROP TABLE IF EXISTS Test_Boundary_carbon")
+    sql("DROP TABLE IF EXISTS Test_Boundary_hive")
+  }
+
   override def afterAll {
     sql("drop table if exists carbonTable")
     sql("drop table if exists hiveTable")

