trafodion-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From sure...@apache.org
Subject [1/5] incubator-trafodion git commit: [TRAFODION-2138] Hive scan on wide tables can result in lost rows or error
Date Sun, 07 Aug 2016 23:48:06 GMT
Repository: incubator-trafodion
Updated Branches:
  refs/heads/master 9f3542dcf -> a72306e4e


[TRAFODION-2138] Hive scan on wide tables can result in lost rows or error

The fix has three parts:
a)	RangeTailIOSize now defaults to the maximum row length of the table, with a minimum of 16KB.
Each ESP range looks ahead into the next range by this size. Previously this was hard-coded to 16KB.
b)	If a whole buffer does not contain the start of a row, that is no longer treated as an error.
c)	We raise an error at compile time if the maximum row size is greater than the size of the IO buffer.
We need to have the whole row (from start to finish) in a contiguous buffer for the rest
of our logic to work.
Currently the IO buffer defaults to 64 MB.


Project: http://git-wip-us.apache.org/repos/asf/incubator-trafodion/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-trafodion/commit/6d266221
Tree: http://git-wip-us.apache.org/repos/asf/incubator-trafodion/tree/6d266221
Diff: http://git-wip-us.apache.org/repos/asf/incubator-trafodion/diff/6d266221

Branch: refs/heads/master
Commit: 6d266221b959c14319a760716939f261a2460276
Parents: 98b00b6
Author: Suresh Subbiah <sureshs@apache.org>
Authored: Wed Aug 3 20:35:43 2016 +0000
Committer: Suresh Subbiah <sureshs@apache.org>
Committed: Wed Aug 3 20:35:43 2016 +0000

----------------------------------------------------------------------
 core/sql/bin/SqlciErrors.txt         |   1 +
 core/sql/executor/ExHdfsScan.cpp     |  52 +++++++++-----
 core/sql/exp/ExpLOBaccess.cpp        |   2 +-
 core/sql/generator/GenRelScan.cpp    |  10 ++-
 core/sql/optimizer/BindRelExpr.cpp   |  18 +++++
 core/sql/regress/seabase/EXPECTED031 | 108 +++++++++++++++++++++++++-----
 core/sql/regress/seabase/TEST031     |  19 ++++++
 core/sql/sqlcomp/nadefaults.cpp      |   3 +-
 8 files changed, 179 insertions(+), 34 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-trafodion/blob/6d266221/core/sql/bin/SqlciErrors.txt
----------------------------------------------------------------------
diff --git a/core/sql/bin/SqlciErrors.txt b/core/sql/bin/SqlciErrors.txt
index 88ea933..b5704f8 100644
--- a/core/sql/bin/SqlciErrors.txt
+++ b/core/sql/bin/SqlciErrors.txt
@@ -1282,6 +1282,7 @@ $1~String1 --------------------------------
 4223 42000 99999 BEGINNER MAJOR DBADMIN $0~String0 not supported in this software version.
 4224 42000 99999 BEGINNER MAJOR DBADMIN The directory $0~String0 is not a valid Hive location.
 4225 42000 99999 BEGINNER MAJOR DBADMIN Number of column families cannot exceed 32.
+4226 42000 99999 BEGINNER MAJOR DBADMIN Table $0~TableName has a maximum record length of
$0~Int0 which is greater than HDFS_IO_BUFFERSIZE. Increase buffer size setting, or reduce
HIVE_MAX_STRING_LENGTH.
 4240 ZZZZZ 99999 BEGINNER MAJOR DBADMIN Expression $0~string0 in the DIVISION BY clause references
columns other than clustering key columns.
 4241 ZZZZZ 99999 BEGINNER MAJOR DBADMIN The value $0~string0 is not supported at this place
in the DIVISION BY clause, only constants are allowed.
 4242 ZZZZZ 99999 BEGINNER MAJOR DBADMIN The value $0~string0 is not supported at this place
in the DIVISION BY clause, only key columns are allowed.

http://git-wip-us.apache.org/repos/asf/incubator-trafodion/blob/6d266221/core/sql/executor/ExHdfsScan.cpp
----------------------------------------------------------------------
diff --git a/core/sql/executor/ExHdfsScan.cpp b/core/sql/executor/ExHdfsScan.cpp
index 7f4cf90..978bd41 100644
--- a/core/sql/executor/ExHdfsScan.cpp
+++ b/core/sql/executor/ExHdfsScan.cpp
@@ -563,7 +563,8 @@ ExWorkProcRetcode ExHdfsScanTcb::work()
 	      }
 	    else
 	      {
-               
+                Int64 rangeTail  = hdfo_->fileIsSplitEnd() ? 
+                  hdfsScanTdb().rangeTailIOSize_ : 0;
                 openType = 2; // must open
                 retcode = ExpLOBInterfaceSelectCursor
                   (lobGlob_,
@@ -573,7 +574,7 @@ ExWorkProcRetcode ExHdfsScanTcb::work()
                    hdfsScanTdb().hostName_,
                    hdfsScanTdb().port_,
                    0, NULL, // handle not valid for non lob access
-                   bytesLeft_, // max bytes
+                   bytesLeft_ + rangeTail, // max bytes
                    cursorId_, 
 		       
                    requestTag_, Lob_Memory,
@@ -608,6 +609,9 @@ ExWorkProcRetcode ExHdfsScanTcb::work()
                 
                     hdfsFileName_ = hdfo->fileName();
                     sprintf(cursorId, "%d", currRangeNum_ + 1);
+		    rangeTail  = hdfo->fileIsSplitEnd() ? 
+		      hdfsScanTdb().rangeTailIOSize_ : 0;
+
                     openType = 1; // preOpen
                 
                     retcode = ExpLOBInterfaceSelectCursor
@@ -618,7 +622,7 @@ ExWorkProcRetcode ExHdfsScanTcb::work()
                        hdfsScanTdb().hostName_,
                        hdfsScanTdb().port_,
                        0, NULL,//handle not relevant for non lob access
-                       hdfo->getBytesToRead(), // max bytes
+                       hdfo->getBytesToRead() + rangeTail, // max bytes
                        cursorId, 
                            
                        requestTag_, Lob_Memory,
@@ -812,23 +816,39 @@ ExWorkProcRetcode ExHdfsScanTcb::work()
 	      {
 		// Position in the hdfsScanBuffer_ to the
 		// first record delimiter.  
-		hdfsBufNextRow_ = hdfs_strchr(hdfsScanBuffer_,
-                                         hdfsScanTdb().recordDelimiter_, hdfsScanBuffer_+trailingPrevRead_+
bytesRead_, checkRangeDelimiter_, hdfsScanTdb().getHiveScanMode(), &changedLen);
+		hdfsBufNextRow_ = 
+		  hdfs_strchr(hdfsScanBuffer_,
+			      hdfsScanTdb().recordDelimiter_, 
+			      hdfsScanBuffer_+trailingPrevRead_+ 
+			      min(bytesRead_, hdfo_->bytesToRead_), 
+			      checkRangeDelimiter_, 
+			      hdfsScanTdb().getHiveScanMode(), &changedLen);
 		// May be that the record is too long? Or data isn't ascii?
 		// Or delimiter is incorrect.
 		if (! hdfsBufNextRow_)
 		  {
-		    ComDiagsArea *diagsArea = NULL;
-
-		    ExRaiseSqlError(getHeap(), &diagsArea, 
-				    (ExeErrorCode)(8446), NULL, 
-				    NULL, NULL, NULL,
-				    (char*)"No record delimiter found in buffer from hdfsRead.",
-				    NULL);
-		    // no need to log errors in this case (bulk load) since this is a major issue
-		    // and need to be correxted
-		    pentry_down->setDiagsArea(diagsArea);
-		    step_ = HANDLE_ERROR_WITH_CLOSE;
+		    if (hdfo_->bytesToRead_ < hdfsScanTdb().rangeTailIOSize_)
+		      {
+			// for wide rows it is not an error if a whole range
+			// does not include a record delimiter. RangeTailIOSize
+			// is set to max row size in generator by default.
+			// It is also checked in the compiler that rowsize
+			// is less than buffer size.
+			step_ = CLOSE_HDFS_CURSOR;
+		      }
+		    else 
+		      {
+			ComDiagsArea *diagsArea = NULL;
+			ExRaiseSqlError(getHeap(), &diagsArea, 
+					(ExeErrorCode)(8446), NULL, 
+					NULL, NULL, NULL,
+					(char*)"No record delimiter found in buffer from hdfsRead.",
+					NULL);
+			// no need to log errors in this case (bulk load) since
+			// this is a major issue and needs to be corrected
+			pentry_down->setDiagsArea(diagsArea);
+			step_ = HANDLE_ERROR_WITH_CLOSE;
+		      }
 		    break;
 		  }
 		

http://git-wip-us.apache.org/repos/asf/incubator-trafodion/blob/6d266221/core/sql/exp/ExpLOBaccess.cpp
----------------------------------------------------------------------
diff --git a/core/sql/exp/ExpLOBaccess.cpp b/core/sql/exp/ExpLOBaccess.cpp
index 575bb92..773ff92 100644
--- a/core/sql/exp/ExpLOBaccess.cpp
+++ b/core/sql/exp/ExpLOBaccess.cpp
@@ -2800,7 +2800,7 @@ Ex_Lob_Error ExLobGlobals::performRequest(ExLobHdfsRequest *request)
           buf->data_ = (char *) (getHeap())->allocateMemory( cursor->bufMaxSize_);
           lobPtr->stats_.buffersUsed++;
         }
-        size = min(cursor->bufMaxSize_, (cursor->maxBytes_ - cursor->bytesRead_
+ (16 * 1024)));
+        size = min(cursor->bufMaxSize_, (cursor->maxBytes_ - cursor->bytesRead_));
         if (buf->data_) {
           lobPtr->readCursorDataSimple(buf->data_, size, *cursor, buf->bytesRemaining_);
           buf->bytesUsed_ = 0;

http://git-wip-us.apache.org/repos/asf/incubator-trafodion/blob/6d266221/core/sql/generator/GenRelScan.cpp
----------------------------------------------------------------------
diff --git a/core/sql/generator/GenRelScan.cpp b/core/sql/generator/GenRelScan.cpp
index 0b152ff..1ea3823 100644
--- a/core/sql/generator/GenRelScan.cpp
+++ b/core/sql/generator/GenRelScan.cpp
@@ -1173,7 +1173,6 @@ if (hTabStats->isOrcFile())
    {
      hdfsBufSize = (Int64)CmpCommon::getDefaultNumeric(HDFS_IO_BUFFERSIZE);
      hdfsBufSize = hdfsBufSize * 1024; // convert to bytes
-     
      Int64 hdfsBufSizeTesting = (Int64)
        CmpCommon::getDefaultNumeric(HDFS_IO_BUFFERSIZE_BYTES);
      if (hdfsBufSizeTesting)
@@ -1182,6 +1181,15 @@ if (hTabStats->isOrcFile())
 
   UInt32 rangeTailIOSize = (UInt32)
       CmpCommon::getDefaultNumeric(HDFS_IO_RANGE_TAIL);
+  if (rangeTailIOSize == 0) 
+    {
+      rangeTailIOSize = getTableDesc()->getNATable()->getRecordLength() +
+	(getTableDesc()->getNATable()->getClusteringIndex()->
+	 getAllColumns().entries())*2;
+      // for each range we look ahead in the next range up to the maximum
+      // record length to find the end of record delimiter.
+      rangeTailIOSize = MAXOF(rangeTailIOSize, 16*1024);
+    }
 
   char * tablename = 
     space->AllocateAndCopyToAlignedSpace(GenGetQualifiedName(getIndexDesc()->getNAFileSet()->getFileSetName()),
0);

http://git-wip-us.apache.org/repos/asf/incubator-trafodion/blob/6d266221/core/sql/optimizer/BindRelExpr.cpp
----------------------------------------------------------------------
diff --git a/core/sql/optimizer/BindRelExpr.cpp b/core/sql/optimizer/BindRelExpr.cpp
index e2960f9..70c0b12 100644
--- a/core/sql/optimizer/BindRelExpr.cpp
+++ b/core/sql/optimizer/BindRelExpr.cpp
@@ -7582,6 +7582,24 @@ RelExpr *Scan::bindNode(BindWA *bindWA)
         }
     }
 
+   if (naTable->isHiveTable() && 
+       !(naTable->getClusteringIndex()->getHHDFSTableStats()->isOrcFile() ||
+	 naTable->getClusteringIndex()->getHHDFSTableStats()
+	 ->isSequenceFile()) &&
+       (CmpCommon::getDefaultNumeric(HDFS_IO_BUFFERSIZE_BYTES) == 0) && 
+       (naTable->getRecordLength() >
+	CmpCommon::getDefaultNumeric(HDFS_IO_BUFFERSIZE)*1024))
+     {
+       // do not raise error if buffersize is set through HDFS_IO_BUFFERSIZE_BYTES.
+       // Typically this setting is used for testing alone.
+       *CmpCommon::diags() << DgSqlCode(-4226)
+			   << DgTableName(
+					  naTable->getTableName().
+					  getQualifiedNameAsAnsiString())
+			   << DgInt0(naTable->getRecordLength());
+       bindWA->setErrStatus();
+       return NULL;
+     }
   // Bind the base class.
   //
   RelExpr *boundExpr = bindSelf(bindWA);

http://git-wip-us.apache.org/repos/asf/incubator-trafodion/blob/6d266221/core/sql/regress/seabase/EXPECTED031
----------------------------------------------------------------------
diff --git a/core/sql/regress/seabase/EXPECTED031 b/core/sql/regress/seabase/EXPECTED031
index 873ed2c..c3141d2 100644
--- a/core/sql/regress/seabase/EXPECTED031
+++ b/core/sql/regress/seabase/EXPECTED031
@@ -431,19 +431,19 @@ Z            Z            (EXPR)  (EXPR)
 --- 1 row(s) selected.
 >>select substring(a, 1, 10), cast(b as char(10)) from hive.hive.t031hive;
 
-(EXPR)                                    (EXPR)
-----------------------------------------  ----------------------------------------
-
-aaaaaaaaaa                                def                                     
-aaaaaaaaaa                                zzz                                     
-aaaaaaaaaa                                zzz                                     
-aaaaaaaaaa                                zzz                                     
-aaaaaaaaaa                                zzz                                     
-aaaaaaaaaa                                zzz                                     
-a                                         zzz                                     
-aaaaaaaaaa                                ?                                       
-aaaaaaaaaa                                ?                                       
-aaaaaaaaaa                                ?                                       
+(EXPR)      (EXPR)    
+----------  ----------
+
+aaaaaaaaaa  def       
+aaaaaaaaaa  zzz       
+aaaaaaaaaa  zzz       
+aaaaaaaaaa  zzz       
+aaaaaaaaaa  zzz       
+aaaaaaaaaa  zzz       
+a           zzz       
+aaaaaaaaaa  ?         
+aaaaaaaaaa  ?         
+aaaaaaaaaa  ?         
 
 --- 10 row(s) selected.
 >>select [last 0] * from hive.hive.t031hive;
@@ -472,6 +472,84 @@ aaaaaaaaaa                                ?
 
 --- 1 row(s) selected.
 >>
+>>control query shape sort_groupby(exchange(cut));
+
+--- SQL operation complete.
+>>cqd hive_min_bytes_per_esp_partition '2000000' ;
+
+--- SQL operation complete.
+>>prepare s2 from select count(*) from hive.hive.t031hive;
+
+--- SQL command prepared.
+>>explain options 'f' s2 ;
+
+LC   RC   OP   OPERATOR              OPT       DESCRIPTION           CARD
+---- ---- ---- --------------------  --------  --------------------  ---------
+
+4    .    5    root                                                  1.00E+000
+3    .    4    sort_partial_aggr_ro                                  1.00E+000
+2    .    3    esp_exchange                    1:2(hash2)            1.00E+000
+1    .    2    sort_partial_aggr_le                                  1.00E+000
+.    .    1    hive_scan                       T031HIVE              1.00E+000
+
+--- SQL operation complete.
+>>execute s2 ;
+
+(EXPR)              
+--------------------
+
+                  10
+
+--- 1 row(s) selected.
+>>
+>>cqd hive_min_bytes_per_esp_partition '1000000' ;
+
+--- SQL operation complete.
+>>prepare s2 from select count(*) from hive.hive.t031hive;
+
+--- SQL command prepared.
+>>explain options 'f' s2 ;
+
+LC   RC   OP   OPERATOR              OPT       DESCRIPTION           CARD
+---- ---- ---- --------------------  --------  --------------------  ---------
+
+4    .    5    root                                                  1.00E+000
+3    .    4    sort_partial_aggr_ro                                  1.00E+000
+2    .    3    esp_exchange                    1:4(hash2)            1.00E+000
+1    .    2    sort_partial_aggr_le                                  1.00E+000
+.    .    1    hive_scan                       T031HIVE              1.00E+000
+
+--- SQL operation complete.
+>>execute s2 ;
+
+(EXPR)              
+--------------------
+
+                  10
+
+--- 1 row(s) selected.
+>>
+>>-- test error
+>>cqd hdfs_io_buffersize '3' ;
+
+--- SQL operation complete.
+>>prepare s2 from select count(*) from hive.hive.t031hive;
+
+*** ERROR[4226] Table HIVE.HIVE.T031HIVE has a maximum record length of 2000016 which is
greater than HDFS_IO_BUFFERSIZE. Increase buffer size setting, or reduce HIVE_MAX_STRING_LENGTH.
+
+*** ERROR[8822] The statement was not prepared.
+
+>>
+>>control query shape cut ;
+
+--- SQL operation complete.
+>>cqd hdfs_io_buffersize reset ;
+
+--- SQL operation complete.
+>>cqd hive_min_bytes_per_esp_partition reset;
+
+--- SQL operation complete.
+>>
 >>-- default USER
 >>drop table if exists t031t1;
 
@@ -482,7 +560,7 @@ aaaaaaaaaa                                ?
 >>invoke t031t1;
 
 -- Definition of Trafodion table TRAFODION.SCH.T031T1
--- Definition current  Wed Jul 13 23:30:01 2016
+-- Definition current  Wed Aug  3 15:45:17 2016
 
   (
     SYSKEY                           LARGEINT NO DEFAULT NOT NULL NOT DROPPABLE
@@ -513,7 +591,7 @@ A            B
 >>invoke t031t1;
 
 -- Definition of Trafodion table TRAFODION.SCH.T031T1
--- Definition current  Wed Jul 13 23:30:02 2016
+-- Definition current  Wed Aug  3 15:45:22 2016
 
   (
     SYSKEY                           LARGEINT NO DEFAULT NOT NULL NOT DROPPABLE

http://git-wip-us.apache.org/repos/asf/incubator-trafodion/blob/6d266221/core/sql/regress/seabase/TEST031
----------------------------------------------------------------------
diff --git a/core/sql/regress/seabase/TEST031 b/core/sql/regress/seabase/TEST031
index 382e18f..da3dc51 100644
--- a/core/sql/regress/seabase/TEST031
+++ b/core/sql/regress/seabase/TEST031
@@ -166,6 +166,25 @@ delete from t031t1;
 insert into t031t1 select * from hive.hive.t031hive;
 select count(*) from t031t1;
 
+control query shape sort_groupby(exchange(cut));
+cqd hive_min_bytes_per_esp_partition '2000000' ;
+prepare s2 from select count(*) from hive.hive.t031hive;
+explain options 'f' s2 ;
+execute s2 ;
+
+cqd hive_min_bytes_per_esp_partition '1000000' ;
+prepare s2 from select count(*) from hive.hive.t031hive;
+explain options 'f' s2 ;
+execute s2 ;
+
+-- test error
+cqd hdfs_io_buffersize '3' ;
+prepare s2 from select count(*) from hive.hive.t031hive;
+
+control query shape cut ;
+cqd hdfs_io_buffersize reset ;
+cqd hive_min_bytes_per_esp_partition reset;
+
 -- default USER
 drop table if exists t031t1;
 create table t031t1 (a int, b varchar(20) default user);

http://git-wip-us.apache.org/repos/asf/incubator-trafodion/blob/6d266221/core/sql/sqlcomp/nadefaults.cpp
----------------------------------------------------------------------
diff --git a/core/sql/sqlcomp/nadefaults.cpp b/core/sql/sqlcomp/nadefaults.cpp
index 25c5672..4120de3 100644
--- a/core/sql/sqlcomp/nadefaults.cpp
+++ b/core/sql/sqlcomp/nadefaults.cpp
@@ -1796,7 +1796,8 @@ SDDkwd__(EXE_DIAGNOSTIC_EVENTS,		"OFF"),
 
   DDui1__(HDFS_IO_BUFFERSIZE,                            "65536"),
   DDui___(HDFS_IO_BUFFERSIZE_BYTES,               "0"),
-  DDui1__(HDFS_IO_RANGE_TAIL,                     "16384"),
+  // The value 0 denotes RangeTail = max record length of table.
+  DDui___(HDFS_IO_RANGE_TAIL,                     "0"),
   DDkwd__(HDFS_PREFETCH,                           "ON"),
   DDkwd__(HDFS_READ_CONTINUE_ON_ERROR,                           "OFF"),
   DDui1__(HDFS_REPLICATION,                            "1"),


Mime
View raw message