carbondata-commits mailing list archives

From gvram...@apache.org
Subject [1/2] incubator-carbondata git commit: Problem: Higher MAXCOLUMNS value in load DML options is leading to out of memory error
Date Sat, 17 Sep 2016 20:27:41 GMT
Repository: incubator-carbondata
Updated Branches:
  refs/heads/master 4003811b0 -> de56d0e40


Problem: A high MAXCOLUMNS value in the load DML options leads to an out-of-memory error

Analysis: When a very high value, for example Integer.MAX_VALUE, is configured for the maxcolumns
option in the load DML and executor memory is low, UnivocityCsvParser throws an out-of-memory
error when it tries to create an array sized to the maxColumns option value.

Fix: Define a threshold for the maxColumns option value that the system can support; if the
configured maxColumns value exceeds that threshold, use the threshold value instead.

Impact: Data loading
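
For reference, a minimal standalone sketch of the clamping logic this fix introduces is given
below. The constant names mirror the patch further down; the wrapper class, method name and
example values are illustrative only and not part of the commit.

public final class MaxColumnsClampSketch {

  // Defaults as defined in UnivocityCsvParser by this patch.
  private static final int DEFAULT_MAX_NUMBER_OF_COLUMNS_FOR_PARSING = 2000;
  private static final int THRESHOLD_MAX_NUMBER_OF_COLUMNS_FOR_PARSING = 20000;

  // Returns the column count the CSV parser is actually configured with,
  // given the MAXCOLUMNS load option and the number of columns in the schema.
  static int maxColumnsForParsing(int maxColumns, int columnCountInSchema) {
    int result = DEFAULT_MAX_NUMBER_OF_COLUMNS_FOR_PARSING;
    if (maxColumns > 0) {
      if (columnCountInSchema > maxColumns) {
        // Schema is wider than the requested limit: parse at least the schema width.
        result = columnCountInSchema;
      } else if (maxColumns > THRESHOLD_MAX_NUMBER_OF_COLUMNS_FOR_PARSING) {
        // Requested limit exceeds what the system supports: clamp to the threshold
        // instead of allocating an array sized to maxColumns.
        result = THRESHOLD_MAX_NUMBER_OF_COLUMNS_FOR_PARSING;
      } else {
        result = maxColumns;
      }
    } else if (columnCountInSchema > DEFAULT_MAX_NUMBER_OF_COLUMNS_FOR_PARSING) {
      result = columnCountInSchema;
    }
    return result;
  }

  public static void main(String[] args) {
    // MAXCOLUMNS no longer drives the parser's array size past the threshold.
    System.out.println(maxColumnsForParsing(Integer.MAX_VALUE, 8)); // 20000
    System.out.println(maxColumnsForParsing(22000, 8));             // 20000
    System.out.println(maxColumnsForParsing(1000, 8));              // 1000
    System.out.println(maxColumnsForParsing(0, 2500));              // 2500
  }
}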


Project: http://git-wip-us.apache.org/repos/asf/incubator-carbondata/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-carbondata/commit/15c72428
Tree: http://git-wip-us.apache.org/repos/asf/incubator-carbondata/tree/15c72428
Diff: http://git-wip-us.apache.org/repos/asf/incubator-carbondata/diff/15c72428

Branch: refs/heads/master
Commit: 15c72428ce63b2103f05363aa0390075753fb73b
Parents: 4003811
Author: manishgupta88 <tomanishgupta18@gmail.com>
Authored: Sat Sep 17 10:52:27 2016 +0530
Committer: Venkata Ramana G <ramana.gollamudi@huawei.com>
Committed: Sun Sep 18 01:50:36 2016 +0530

----------------------------------------------------------------------
 .../TestDataLoadWithColumnsMoreThanSchema.scala        | 12 ++++++++++++
 .../processing/csvreaderstep/UnivocityCsvParser.java   | 13 +++++++++++--
 2 files changed, 23 insertions(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-carbondata/blob/15c72428/integration/spark/src/test/scala/org/apache/carbondata/spark/testsuite/dataload/TestDataLoadWithColumnsMoreThanSchema.scala
----------------------------------------------------------------------
diff --git a/integration/spark/src/test/scala/org/apache/carbondata/spark/testsuite/dataload/TestDataLoadWithColumnsMoreThanSchema.scala
b/integration/spark/src/test/scala/org/apache/carbondata/spark/testsuite/dataload/TestDataLoadWithColumnsMoreThanSchema.scala
index 7bd29d5..4e5a207 100644
--- a/integration/spark/src/test/scala/org/apache/carbondata/spark/testsuite/dataload/TestDataLoadWithColumnsMoreThanSchema.scala
+++ b/integration/spark/src/test/scala/org/apache/carbondata/spark/testsuite/dataload/TestDataLoadWithColumnsMoreThanSchema.scala
@@ -87,6 +87,18 @@ class TestDataLoadWithColumnsMoreThanSchema extends QueryTest with BeforeAndAfte
     }
   }
 
+  test("test for maxcolumns option value greater than threshold value for maxcolumns") {
+    sql("DROP TABLE IF EXISTS valid_max_columns_test")
+    sql("CREATE TABLE valid_max_columns_test (imei string,age int,task bigint,num double,level
decimal(10,3),productdate timestamp,mark int,name string)STORED BY 'org.apache.carbondata.format'")
+    try {
+      sql("LOAD DATA LOCAL INPATH './src/test/resources/character_carbon.csv' into table
valid_max_columns_test options('MAXCOLUMNS'='22000')")
+      checkAnswer(sql("select count(*) from valid_max_columns_test"),
+        sql("select count(*) from hive_char_test"))
+    } catch {
+      case _ => assert(false)
+    }
+  }
+
   override def afterAll {
     sql("DROP TABLE IF EXISTS char_test")
     sql("DROP TABLE IF EXISTS hive_char_test")

http://git-wip-us.apache.org/repos/asf/incubator-carbondata/blob/15c72428/processing/src/main/java/org/apache/carbondata/processing/csvreaderstep/UnivocityCsvParser.java
----------------------------------------------------------------------
diff --git a/processing/src/main/java/org/apache/carbondata/processing/csvreaderstep/UnivocityCsvParser.java
b/processing/src/main/java/org/apache/carbondata/processing/csvreaderstep/UnivocityCsvParser.java
index 89eec54..f72dd5b 100644
--- a/processing/src/main/java/org/apache/carbondata/processing/csvreaderstep/UnivocityCsvParser.java
+++ b/processing/src/main/java/org/apache/carbondata/processing/csvreaderstep/UnivocityCsvParser.java
@@ -49,6 +49,10 @@ public class UnivocityCsvParser {
    */
   private static final int DEFAULT_MAX_NUMBER_OF_COLUMNS_FOR_PARSING = 2000;
   /**
+   * Maximum allowed value for number of columns to be parsed in each row
+   */
+  private static final int THRESHOLD_MAX_NUMBER_OF_COLUMNS_FOR_PARSING = 20000;
+  /**
    * reader for csv
    */
   private Reader inputStreamReader;
@@ -125,12 +129,17 @@ public class UnivocityCsvParser {
     int maxNumberOfColumnsForParsing = DEFAULT_MAX_NUMBER_OF_COLUMNS_FOR_PARSING;
     if (maxColumns > 0) {
       if (columnCountInSchema > maxColumns) {
-        maxNumberOfColumnsForParsing = columnCountInSchema + 10;
+        maxNumberOfColumnsForParsing = columnCountInSchema;
+      } else if (maxColumns > THRESHOLD_MAX_NUMBER_OF_COLUMNS_FOR_PARSING) {
+        maxNumberOfColumnsForParsing = THRESHOLD_MAX_NUMBER_OF_COLUMNS_FOR_PARSING;
+        LOGGER.info("MAXCOLUMNS option value configured is more than system allowed limit. "
+            + "Therefore threshold value for max column parsing will be considered: "
+            + THRESHOLD_MAX_NUMBER_OF_COLUMNS_FOR_PARSING);
       } else {
         maxNumberOfColumnsForParsing = maxColumns;
       }
     } else if (columnCountInSchema > DEFAULT_MAX_NUMBER_OF_COLUMNS_FOR_PARSING) {
-      maxNumberOfColumnsForParsing = columnCountInSchema + 10;
+      maxNumberOfColumnsForParsing = columnCountInSchema;
     }
     return maxNumberOfColumnsForParsing;
   }
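
Note: when the new cap applies (as in the MAXCOLUMNS='22000' test above, with the 20000 threshold
defined in this patch), the info-level message added in this hunk is written to the load logs,
which is a quick way to confirm that the configured MAXCOLUMNS value was clamped rather than used
as-is (assuming info-level logging is enabled on the executors).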

