trafodion-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From dbirds...@apache.org
Subject [1/2] incubator-trafodion git commit: [TRAFODION-1634] Update stats on Hive tables is slow
Date Tue, 24 Nov 2015 16:40:22 GMT
Repository: incubator-trafodion
Updated Branches:
  refs/heads/master d2c47ec52 -> f347e24fa


[TRAFODION-1634] Update stats on Hive tables is slow

Fix suggested by Feng Qiang (Vito). Update stats for Hive table uses
INSERT to populate the sample table. It has now been changed to use
UPSERT USING LOAD.
There is also some cleanup change to remove code related to dp2sampling
and the adjustment it required when table had varchar columns.


Project: http://git-wip-us.apache.org/repos/asf/incubator-trafodion/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-trafodion/commit/11c61dc0
Tree: http://git-wip-us.apache.org/repos/asf/incubator-trafodion/tree/11c61dc0
Diff: http://git-wip-us.apache.org/repos/asf/incubator-trafodion/diff/11c61dc0

Branch: refs/heads/master
Commit: 11c61dc0d325494cb435f37d4f05802ee2def9ee
Parents: e54c732
Author: Suresh Subbiah <sureshs@apache.org>
Authored: Sat Nov 21 01:55:00 2015 +0000
Committer: Suresh Subbiah <sureshs@apache.org>
Committed: Sat Nov 21 01:55:00 2015 +0000

----------------------------------------------------------------------
 core/sql/ustat/hs_cli.cpp     |   4 +-
 core/sql/ustat/hs_cli.h       |   2 +-
 core/sql/ustat/hs_globals.cpp | 128 ++-----------------------------------
 core/sql/ustat/hs_globals.h   |   2 -
 core/sql/ustat/hs_la.h        |   6 +-
 core/sql/ustat/hs_parser.cpp  |   1 -
 6 files changed, 11 insertions(+), 132 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-trafodion/blob/11c61dc0/core/sql/ustat/hs_cli.cpp
----------------------------------------------------------------------
diff --git a/core/sql/ustat/hs_cli.cpp b/core/sql/ustat/hs_cli.cpp
index 4761796..ed10d97 100644
--- a/core/sql/ustat/hs_cli.cpp
+++ b/core/sql/ustat/hs_cli.cpp
@@ -1720,8 +1720,8 @@ Lng32 HSPersSamples::createAndInsert(HSTableDef *tabDef, NAString &sampleName,
                           createDandI,
                           minRowCtPerPartition
                          );
-      // sampleName output, actualRows & sampleRows will get modified if necessary
-      //  (based on isEstimate and the use of DP2 sampling respectively).
+      // sampleName output & actualRows will get modified if necessary
+      //  (based on isEstimate).
     if (!retcode)
     {
 

http://git-wip-us.apache.org/repos/asf/incubator-trafodion/blob/11c61dc0/core/sql/ustat/hs_cli.h
----------------------------------------------------------------------
diff --git a/core/sql/ustat/hs_cli.h b/core/sql/ustat/hs_cli.h
index 9cda361..3c8bb01 100644
--- a/core/sql/ustat/hs_cli.h
+++ b/core/sql/ustat/hs_cli.h
@@ -145,7 +145,7 @@ class HSSample
   public:
     HSSample(HSTableDef *tableDef,           // input
              Lng32        type,               // input - sample type used for table creation
-             double      sampleTblPercent,   // input - used to determine DP2 sampling
+             double      sampleTblPercent,   // input 
              NABoolean   persistent = FALSE, // input - used to determine whether to drop
              NABoolean   isIUS = FALSE       // input - whether the sample table
                                              // is created for IUS

http://git-wip-us.apache.org/repos/asf/incubator-trafodion/blob/11c61dc0/core/sql/ustat/hs_globals.cpp
----------------------------------------------------------------------
diff --git a/core/sql/ustat/hs_globals.cpp b/core/sql/ustat/hs_globals.cpp
index cd997c0..aeb5433 100644
--- a/core/sql/ustat/hs_globals.cpp
+++ b/core/sql/ustat/hs_globals.cpp
@@ -3245,9 +3245,7 @@ Lng32 HSGlobalsClass::Initialize()
             if (sampleTblPercent > 100) sampleTblPercent=100;
             if (sampleTblPercent < 0)   sampleTblPercent=0;
             samplePercentX100 = (short) (sampleTblPercent * 100); 
-               // saved for automation: percent * 100.  Note that if DP2 sampling
-               // is enabled, this value may actually be adjusted, however, we 
-               // want to save the unadjusted value.
+               // saved for automation: percent * 100.  
           }
       }
     else
@@ -3594,96 +3592,6 @@ Lng32 createSampleOption(Lng32 sampleType, double samplePercent, NAString
&sampl
 }
 
 
-/**********************************************/
-/* FUNCTION: enableDp2SamplingIfSuitable()    */
-/* PURPOSE: helper method that will           */
-/*          (a) determine if sampling % is    */
-/*              appropriate for Dp2 sampling  */
-/*          (b) check if minimal rowcount     */
-/*              per partition is > 10         */
-/*          (c) if not, return FALSE          */
-/*          (d) if YES, check for varchars    */ 
-/*              and adjust sampling % or do   */
-/*              not use DP2 sampling.         */
-/*          (e) if using DP2 sampling, set    */
-/*              CQD ALLOW_DP2_ROW_SAMPLING.   */
-/*          (f) return TRUE or FALSE.         */
-/* INPUT:  objDef - pointer to source table   */ 
-/*           object.                          */
-/* OUTPUT: samplePercent (may be adjusted)    */
-/*         sampleRowCnt  (may be adjusted)    */
-/* RETCODE: NABoolean - TRUE if DP2 sampling  */
-/*                      enabled.              */
-/**********************************************/
-NABoolean enableDp2SamplingIfSuitable(HSTableDef *objDef, 
-                                      double &samplePercent, Int64 &sampleRowCnt,
-                                      Int64 minRowCtPerPartition)
-{
-  HSLogMan *LM = HSLogMan::Instance();
-  NABoolean dp2SamplingUsed = FALSE;
-
-  // (a) Experiments have shown that if sampling % > 5, DP2 sampling does not 
-  // provide much benefit.  So the default setting for ALLOW_DP2_ROW_SAMPLING 
-  // (system) will not allow DP2 sampling above 5%.  Setting allow_dp2_sampling 
-  // to ON allows the user to force dp2Sampling.
-  const double MAX_SAMPLE_FRACTION_IN_DP2_SYSTEM = 5;
-  if ((samplePercent < MAX_SAMPLE_FRACTION_IN_DP2_SYSTEM &&
-       CmpCommon::getDefault(ALLOW_DP2_ROW_SAMPLING) == DF_SYSTEM) ||
-       CmpCommon::getDefault(ALLOW_DP2_ROW_SAMPLING) == DF_ON)
-  {
-    Lng32 varcharLength = objDef->getTotalVarcharLength();
-
-    // Unconditionally test the feasibility of performing DP2 sampling.
-    // DP2 sampling is not recommended when #rows per partition
-    // is less than 10.
-    {
-
-      // Do not use DP2 sampling if the minimal rowcount per partition is equal to
-      // or less than 10
-      if (minRowCtPerPartition > 10) 
-      {
-        dp2SamplingUsed = TRUE;
-
-        // DP2 sampling tends to retrieve a smaller sample than expected if
-        // the table contains varchars.  Increase the sampling ratio by
-        // assuming that all varchars are half full.
-        // this is the average number of rows in a block
-        if ( varcharLength > 0 ) {
-
-           Lng32 recLen = objDef->getRecordLength();             
-           Lng32 blockSize = objDef->getBlockSize();
-
-           Lng32 dp2_calc_num_rows_per_block =  blockSize/recLen ;
-           float expected_avg_num_rows_per_block =  ((float) blockSize)/(recLen - (varcharLength/2));
-           float adjustment_factor = 
-             (expected_avg_num_rows_per_block - dp2_calc_num_rows_per_block)/expected_avg_num_rows_per_block
;
-           double oldsamplePercent = samplePercent;
-           samplePercent = samplePercent*(1+adjustment_factor);
-           if (samplePercent > 100) samplePercent = 100;
-           sampleRowCnt  = (Int64)((double)sampleRowCnt *(samplePercent/oldsamplePercent));
-   
-           if (LM->LogNeeded())
-           {
-             sprintf(LM->msg, "%s %f %s %f", 
-                              "\t\tSAMPLING %% INCREASED.\n\t\tOLD SAMPLING %% =",
-                              oldsamplePercent, 
-                              "\n\t\tNEW SAMPLING %%  = %f \n", 
-                              samplePercent); 
-             LM->Log(LM->msg);
-           }
-        }
-      }
-    }
-    if (dp2SamplingUsed == TRUE) 
-    {
-      LM->Log("\t\tDP2 ROW SAMPLING ENABLED.\n");
-      HSFuncExecQuery("CONTROL QUERY DEFAULT ALLOW_DP2_ROW_SAMPLING 'ON'");   // step (c)
-    }
-  }
-  return dp2SamplingUsed;   // step (d)
-}
-
-
 /***********************************************/
 /* METHOD:  HSSample makeTableName() member    */
 /* PURPOSE: Creates a unique sample table name */
@@ -3786,7 +3694,6 @@ void HSSample::makeTableName(NABoolean isPersSample)
 /*              rowCountIsEstimate = TRUE.     */
 /* INPUT:   sampleRowCnt - the size of the     */
 /*              sample table to create.        */
-/*              Adjusted if DP2 sampling used. */
 /* RETCODE:  0 - successful                    */
 /*           non-zero otherwise                */
 /***********************************************/
@@ -3803,7 +3710,6 @@ Lng32 HSSample::make(NABoolean rowCountIsEstimate, // input
     Lng32 retcode = 0;
     NAString dml, insertType, sampleOption;
     char intStr[30];
-    NABoolean dp2SamplingUsed = FALSE;
     NABoolean forceNoPartitioning = TRUE;
 
     HSTranMan *TM = HSTranMan::Instance();
@@ -3812,11 +3718,7 @@ Lng32 HSSample::make(NABoolean rowCountIsEstimate, // input
 
     LM->StartTimer("Create/populate sample table");
     (void)getTimeDiff(TRUE);
-
-
-    // Enable DP2 sampling when suitable - may adjust 'samplePercent' & 'sampleRowCnt'.
-    dp2SamplingUsed = enableDp2SamplingIfSuitable(objDef, samplePercent, sampleRowCnt, 
-                                                  minRowCtPerPartition);     
+     
     sampleRowCount = sampleRowCnt;  // Save sample row count for HSSample object.
 
     // Create sample option based on sampling type, using 'samplePercent'.
@@ -3857,7 +3759,8 @@ Lng32 HSSample::make(NABoolean rowCountIsEstimate, // input
     // need to use a vanilla INSERT statement. Otherwise, we can use SIDETREE
     // INSERTS for better performance. A current bug in the HBase interface
     // requires the use of Upsert.
-    if (hs_globals->isHbaseTable)
+    // For Hive tables the sample table used is a Trafodion table
+    if (hs_globals->isHbaseTable || hs_globals->isHiveTable)
       {
         if (CmpCommon::getDefault(TRAF_LOAD_USE_FOR_STATS) == DF_ON)
           {
@@ -3994,8 +3897,7 @@ Lng32 HSSample::make(NABoolean rowCountIsEstimate, // input
     //plan issues, the cqd should be removed.                              //Workaround:
10-040706-7608
     if (!(TM->InTransaction()))                                            //Workaround:
10-040706-7608
       HSFuncExecQuery("CONTROL QUERY DEFAULT PLAN_STEALING RESET");        //Workaround:
10-040706-7608
-    if (dp2SamplingUsed)
-      HSFuncExecQuery("CONTROL QUERY DEFAULT ALLOW_DP2_ROW_SAMPLING RESET");
+    
     HSFuncExecQuery("CONTROL QUERY DEFAULT POS RESET");
     HSFuncExecQuery("CONTROL QUERY DEFAULT POS_NUM_OF_PARTNS RESET");
     
@@ -4029,15 +3931,13 @@ Lng32 HSSample::make(NABoolean rowCountIsEstimate, // input
       //    (b) user has not specified the rowcount and
       //    (c) we appear to get a meaningful rowcount for the source table 
       //        (source table rowcount >= rows inserted into sample table) and
-      //    (d) dp2 sampling has not been used
-      //    (e) CLUSTER sampling not used
+      //    (d) CLUSTER sampling not used
       // we set the actualRowCount to the value obtained from the statistics table
       // This works since every row of the source table is scanned for EID sampling
       // and the number of rows scanned is recorded in the stats area.
       if (rowCountIsEstimate &&
           !(hs_globals->optFlags & ROWCOUNT_OPT) &&
           (sourceTableRowCount > sampleRowCount) &&
-          !(dp2SamplingUsed) &&
           (hs_globals->optFlags & SAMPLE_REQUESTED) != SAMPLE_RAND_2) 
         {
           tableRowCnt = sourceTableRowCount;
@@ -4923,7 +4823,6 @@ void HSGlobalsClass::getMemoryRequirementsForOneGroup(HSColGroupStruct*
group, I
 // sample table. The maximum number of rows we will actually read must be based
 // on the amount of memory allocated to hold their values.
 //
-// Note: row count may be adjusted up if DP2 sampling and heavy amount of varchars.
 Int64 HSGlobalsClass::getInternalSortMemoryRequirements(NABoolean performISForMC)
 {
   HSLogMan *LM = HSLogMan::Instance();
@@ -4942,17 +4841,6 @@ Int64 HSGlobalsClass::getInternalSortMemoryRequirements(NABoolean performISForMC
      getMCMemoryRequirements(multiGroup, rows);
   }
 
-  // Double rows to read if we are sampling with DP2 and more than half the 
-  // row data may be varchars.  DP2 row sampling can be thrown off in this
-  // case, resulting in many more rows read than expected, so we need to have
-  // memory to store these rows.   -- Disabled for now. --
-#if 0
-  if (sampleRowCount > 0 &&
-      sampleRowCount*100/actualRowCount < 5 && // DP2 sampling may be enabled.
-      (float) cumuVarCharSize/(float) cumuElementSize > 0.5) 
-    rows *= 2;  
-#endif
-
   return rows;
 }
 
@@ -5396,7 +5284,6 @@ Lng32 HSGlobalsClass::CollectStatistics()
 
         mapInternalSortTypes(singleGroup);
         Int64 maxRowsToRead = getInternalSortMemoryRequirements(TRUE); 
-          // row count may be adjusted up if DP2 sampling and heavy amount of varchars.
 
         if (trySampleTableBypassForIS && multiGroup ) {
 
@@ -5496,8 +5383,6 @@ Lng32 HSGlobalsClass::CollectStatistics()
               if (LM->LogNeeded())
                 LM->Log("Internal sort: reading sample directly from base table; no sample
table created");
               *hssample_table = getTableName(user_table->data(), nameSpace);
-              enableDp2SamplingIfSuitable(objDef, sampleTblPercent, sampleRowCount,
-                                          minRowCtPerPartition_); 
                 // sampleTblPercent and sampleRowCount may get adjusted.
               retcode = createSampleOption(optFlags & SAMPLE_REQUESTED, 
                                            sampleTblPercent, *sampleOption, 
@@ -5576,7 +5461,6 @@ Lng32 HSGlobalsClass::CollectStatistics()
 
         HSFuncExecQuery("CONTROL QUERY DEFAULT FLOATTYPE RESET");
         HSFuncExecQuery("CONTROL QUERY DEFAULT LIMIT_MAX_NUMERIC_PRECISION RESET");
-        HSFuncExecQuery("CONTROL QUERY DEFAULT ALLOW_DP2_ROW_SAMPLING RESET");
       }
 
                                           /*=================================*/

http://git-wip-us.apache.org/repos/asf/incubator-trafodion/blob/11c61dc0/core/sql/ustat/hs_globals.h
----------------------------------------------------------------------
diff --git a/core/sql/ustat/hs_globals.h b/core/sql/ustat/hs_globals.h
index 76ee429..ab650cf 100644
--- a/core/sql/ustat/hs_globals.h
+++ b/core/sql/ustat/hs_globals.h
@@ -73,8 +73,6 @@ class AbstractFastStatsHist;
 Lng32 AddNecessaryColumns();
 Lng32 AddAllColumnsForIUS();
 
-NABoolean enableDp2SamplingIfSuitable(HSTableDef *objDef, double &samplePercent,
-                                      Int64 &sampleRowCnt);
 Lng32 createSampleOption(Lng32 sampleType, double samplePercent, NAString &sampleOpt,
                         Int64 sampleValue1=0, Int64 sampleValue2=0);
 Lng32 doubleToHSDataBuffer(const double dbl, HSDataBuffer& dbf);

http://git-wip-us.apache.org/repos/asf/incubator-trafodion/blob/11c61dc0/core/sql/ustat/hs_la.h
----------------------------------------------------------------------
diff --git a/core/sql/ustat/hs_la.h b/core/sql/ustat/hs_la.h
index e9e7286..bd70e53 100644
--- a/core/sql/ustat/hs_la.h
+++ b/core/sql/ustat/hs_la.h
@@ -299,8 +299,7 @@ class HSHiveTableDef : public HSTableDef
       }
     Lng32 getTotalVarcharLength() const
       {
-        // Used in determination of whether dp2 sampling should be used, and
-        // also for row count estimation for sq tables. Not needed for hive.
+        // Used for row count estimation for sq tables. Not needed for hive.
         return 0;
       }
     Lng32 getBlockSize() const
@@ -407,8 +406,7 @@ class HSHbaseTableDef : public HSTableDef
       }
     Lng32 getTotalVarcharLength() const
       {
-        // Used in determination of whether dp2 sampling should be used, and
-        // also for row count estimation for sq tables. Not needed for HBase.
+        // Used for row count estimation for sq tables. Not needed for HBase.
         return 0;
       }
     Lng32 getBlockSize() const

http://git-wip-us.apache.org/repos/asf/incubator-trafodion/blob/11c61dc0/core/sql/ustat/hs_parser.cpp
----------------------------------------------------------------------
diff --git a/core/sql/ustat/hs_parser.cpp b/core/sql/ustat/hs_parser.cpp
index 6b3d555..014f7ad 100644
--- a/core/sql/ustat/hs_parser.cpp
+++ b/core/sql/ustat/hs_parser.cpp
@@ -459,7 +459,6 @@ Lng32 AddTableName( const hs_table_type type
         hs_globals->hsintval_table->append(".HISTINTS");        
 
         // RESET CQDS:
-        HSFuncExecQuery("CONTROL QUERY DEFAULT ALLOW_DP2_ROW_SAMPLING RESET");
         HSFuncExecQuery("CONTROL QUERY DEFAULT POS RESET");
         HSFuncExecQuery("CONTROL QUERY DEFAULT POS_NUM_OF_PARTNS RESET");
         // LCOV_EXCL_STOP


Mime
View raw message