hive-commits mailing list archives

From na...@apache.org
Subject svn commit: r1404924 [1/6] - in /hive/trunk: common/src/java/org/apache/hadoop/hive/common/ common/src/java/org/apache/hadoop/hive/conf/ conf/ ql/src/java/org/apache/hadoop/hive/ql/ ql/src/java/org/apache/hadoop/hive/ql/exec/ ql/src/java/org/apache/had...
Date Fri, 02 Nov 2012 11:20:29 GMT
Author: namit
Date: Fri Nov  2 11:20:26 2012
New Revision: 1404924

URL: http://svn.apache.org/viewvc?rev=1404924&view=rev
Log:
HIVE-3554 Hive List Bucketing - Query logic
(Gang Tim Liu via namit)


Added:
    hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/listbucketingpruner/
    hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/listbucketingpruner/LBExprProcCtx.java
    hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/listbucketingpruner/LBExprProcFactory.java
    hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/listbucketingpruner/LBOpPartitionWalkerCtx.java
    hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/listbucketingpruner/LBOpWalkerCtx.java
    hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/listbucketingpruner/LBPartitionProcFactory.java
    hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/listbucketingpruner/LBProcFactory.java
    hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/listbucketingpruner/ListBucketingPruner.java
    hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/listbucketingpruner/ListBucketingPrunerUtils.java
    hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/plan/ValidationUtility.java
    hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/optimizer/
    hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/optimizer/listbucketingpruner/
    hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/optimizer/listbucketingpruner/DynamicMultiDimeCollectionTest.java
    hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/optimizer/listbucketingpruner/ListBucketingPrunnerTest.java
    hive/trunk/ql/src/test/queries/clientnegative/column_change_skewedcol_type1.q
    hive/trunk/ql/src/test/queries/clientnegative/invalid_config1.q
    hive/trunk/ql/src/test/queries/clientpositive/alter_skewed_table.q
    hive/trunk/ql/src/test/queries/clientpositive/list_bucket_query_multiskew_1.q
    hive/trunk/ql/src/test/queries/clientpositive/list_bucket_query_multiskew_2.q
    hive/trunk/ql/src/test/queries/clientpositive/list_bucket_query_multiskew_3.q
    hive/trunk/ql/src/test/queries/clientpositive/list_bucket_query_oneskew_1.q
    hive/trunk/ql/src/test/queries/clientpositive/list_bucket_query_oneskew_2.q
    hive/trunk/ql/src/test/queries/clientpositive/list_bucket_query_oneskew_3.q
    hive/trunk/ql/src/test/results/clientnegative/column_change_skewedcol_type1.q.out
    hive/trunk/ql/src/test/results/clientnegative/invalid_config1.q.out
    hive/trunk/ql/src/test/results/clientpositive/alter_skewed_table.q.out
    hive/trunk/ql/src/test/results/clientpositive/list_bucket_query_multiskew_1.q.out
    hive/trunk/ql/src/test/results/clientpositive/list_bucket_query_multiskew_2.q.out
    hive/trunk/ql/src/test/results/clientpositive/list_bucket_query_multiskew_3.q.out
    hive/trunk/ql/src/test/results/clientpositive/list_bucket_query_oneskew_1.q.out
    hive/trunk/ql/src/test/results/clientpositive/list_bucket_query_oneskew_2.q.out
    hive/trunk/ql/src/test/results/clientpositive/list_bucket_query_oneskew_3.q.out
Modified:
    hive/trunk/common/src/java/org/apache/hadoop/hive/common/FileUtils.java
    hive/trunk/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
    hive/trunk/conf/hive-default.xml.template
    hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/Driver.java
    hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/ErrorMsg.java
    hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/DDLTask.java
    hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/metadata/Partition.java
    hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/metadata/Table.java
    hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMapRedUtils.java
    hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/Optimizer.java
    hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/PrunerOperatorFactory.java
    hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/parse/BaseSemanticAnalyzer.java
    hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/parse/DDLSemanticAnalyzer.java
    hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/parse/Hive.g
    hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/parse/ParseContext.java
    hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java
    hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzerFactory.java
    hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/parse/TypeCheckProcFactory.java
    hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/plan/AlterTableDesc.java
    hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/plan/CreateTableDesc.java
    hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/plan/HiveOperation.java
    hive/trunk/ql/src/test/queries/clientnegative/column_rename5.q
    hive/trunk/ql/src/test/queries/clientnegative/create_skewed_table_col_name_value_no_mismatch.q
    hive/trunk/ql/src/test/queries/clientnegative/create_skewed_table_dup_col_name.q
    hive/trunk/ql/src/test/queries/clientnegative/create_skewed_table_failure_invalid_col_name.q
    hive/trunk/ql/src/test/queries/clientpositive/create_skewed_table1.q
    hive/trunk/ql/src/test/queries/clientpositive/skewjoinopt1.q
    hive/trunk/ql/src/test/queries/clientpositive/skewjoinopt10.q
    hive/trunk/ql/src/test/queries/clientpositive/skewjoinopt11.q
    hive/trunk/ql/src/test/queries/clientpositive/skewjoinopt12.q
    hive/trunk/ql/src/test/queries/clientpositive/skewjoinopt13.q
    hive/trunk/ql/src/test/queries/clientpositive/skewjoinopt14.q
    hive/trunk/ql/src/test/queries/clientpositive/skewjoinopt15.q
    hive/trunk/ql/src/test/queries/clientpositive/skewjoinopt16.q
    hive/trunk/ql/src/test/queries/clientpositive/skewjoinopt17.q
    hive/trunk/ql/src/test/queries/clientpositive/skewjoinopt18.q
    hive/trunk/ql/src/test/queries/clientpositive/skewjoinopt19.q
    hive/trunk/ql/src/test/queries/clientpositive/skewjoinopt2.q
    hive/trunk/ql/src/test/queries/clientpositive/skewjoinopt20.q
    hive/trunk/ql/src/test/queries/clientpositive/skewjoinopt3.q
    hive/trunk/ql/src/test/queries/clientpositive/skewjoinopt4.q
    hive/trunk/ql/src/test/queries/clientpositive/skewjoinopt5.q
    hive/trunk/ql/src/test/queries/clientpositive/skewjoinopt6.q
    hive/trunk/ql/src/test/queries/clientpositive/skewjoinopt7.q
    hive/trunk/ql/src/test/queries/clientpositive/skewjoinopt8.q
    hive/trunk/ql/src/test/queries/clientpositive/skewjoinopt9.q
    hive/trunk/ql/src/test/results/clientnegative/column_rename5.q.out

Modified: hive/trunk/common/src/java/org/apache/hadoop/hive/common/FileUtils.java
URL: http://svn.apache.org/viewvc/hive/trunk/common/src/java/org/apache/hadoop/hive/common/FileUtils.java?rev=1404924&r1=1404923&r2=1404924&view=diff
==============================================================================
--- hive/trunk/common/src/java/org/apache/hadoop/hive/common/FileUtils.java (original)
+++ hive/trunk/common/src/java/org/apache/hadoop/hive/common/FileUtils.java Fri Nov  2 11:20:26 2012
@@ -26,6 +26,7 @@ import org.apache.hadoop.conf.Configurat
 import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.conf.HiveConf;
 
 import org.apache.hadoop.util.Shell;
 
@@ -255,4 +256,28 @@ public final class FileUtils {
       results.add(fileStatus);
     }
   }
+
+  /**
+   * The default directory will have the same depth as the number of skewed columns;
+   * this makes future operations, such as DML merge and concatenate merge, easier.
+   * @param skewedCols the skewed column names
+   * @param hconf the configuration
+   * @return the default list bucketing directory path
+   */
+  public static String makeDefaultListBucketingDirName(List<String> skewedCols,
+      Configuration hconf) {
+    String lbDirName;
+    String defaultDir = FileUtils.escapePathName(HiveConf.getVar(hconf,
+        HiveConf.ConfVars.HIVE_LIST_BUCKETING_DEFAULT_DIR_NAME));
+    StringBuilder defaultDirPath = new StringBuilder();
+    for (int i = 0; i < skewedCols.size(); i++) {
+      if (i > 0) {
+        defaultDirPath.append(Path.SEPARATOR);
+      }
+      defaultDirPath.append(defaultDir);
+    }
+    lbDirName = defaultDirPath.toString();
+    return lbDirName;
+  }
+
 }
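
As a quick illustration of the new helper: with two skewed columns and the default value of
hive.exec.list.bucketing.default.dir, it yields a two-level path, one directory level per
skewed column. A minimal usage sketch (class name is illustrative, not part of this commit;
assumes hive-common is on the classpath so the ConfVars default is picked up):

    import java.util.Arrays;
    import java.util.List;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.hive.common.FileUtils;

    public class LBDirNameSketch {
      public static void main(String[] args) {
        // Two skewed columns -> the default directory name is nested twice.
        List<String> skewedCols = Arrays.asList("c1", "c2");
        String lbDirName = FileUtils.makeDefaultListBucketingDirName(skewedCols,
            new Configuration());
        // Expected with the default config value:
        // HIVE_DEFAULT_LIST_BUCKETING_DIR_NAME/HIVE_DEFAULT_LIST_BUCKETING_DIR_NAME
        System.out.println(lbDirName);
      }
    }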

Modified: hive/trunk/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
URL: http://svn.apache.org/viewvc/hive/trunk/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java?rev=1404924&r1=1404923&r2=1404924&view=diff
==============================================================================
--- hive/trunk/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java (original)
+++ hive/trunk/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java Fri Nov  2 11:20:26 2012
@@ -651,13 +651,18 @@ public class HiveConf extends Configurat
     HIVE_MULTI_INSERT_MOVE_TASKS_SHARE_DEPENDENCIES(
         "hive.multi.insert.move.tasks.share.dependencies", false),
 
-    /**
-     * Enable list bucketing DDL. Default value is false so that we disable it by default.
-     *
-     * This will be removed once the rest of the DML changes are committed.
-     */
+    /* The following section contains all configurations used for the list bucketing feature. */
+    // Enable list bucketing DDL. Disabled (false) by default.
+    // This will be removed once the rest of the DML changes are committed.
     HIVE_INTERNAL_DDL_LIST_BUCKETING_ENABLE("hive.internal.ddl.list.bucketing.enable", false),
 
+    // Default list bucketing directory name.
+    HIVE_LIST_BUCKETING_DEFAULT_DIR_NAME("hive.exec.list.bucketing.default.dir",
+        "HIVE_DEFAULT_LIST_BUCKETING_DIR_NAME"),
+    // Enable list bucketing optimizer. Disabled (false) by default.
+    // This will be removed once the rest of the DML changes are committed.
+    HIVEOPTLISTBUCKETING("hive.optimize.listbucketing", false),
+
     // Allow TCP Keep alive socket option for HiveServer or a maximum timeout for the socket.
 
     SERVER_READ_SOCKET_TIMEOUT("hive.server.read.socket.timeout", 10),

Modified: hive/trunk/conf/hive-default.xml.template
URL: http://svn.apache.org/viewvc/hive/trunk/conf/hive-default.xml.template?rev=1404924&r1=1404923&r2=1404924&view=diff
==============================================================================
--- hive/trunk/conf/hive-default.xml.template (original)
+++ hive/trunk/conf/hive-default.xml.template Fri Nov  2 11:20:26 2012
@@ -529,6 +529,23 @@
 </property>
 
 <property>
+  <name>hive.exec.list.bucketing.default.dir</name>
+  <value>HIVE_DEFAULT_LIST_BUCKETING_DIR_NAME</value>
+  <description>Default directory name used in list bucketing.
+    The list bucketing feature creates a sub-directory for each skewed value and a default
+    directory for non-skewed values. This config specifies the name of that default directory.
+    Sub-directories are created by list bucketing DML under the partition directory. Users don't
+    need to know how to construct the canonical path; this config merely lets them change the
+    default directory name.
+    For example, with 2 skewed columns c1 and c2 and 2 skewed values (1,a) and (2,b):
+    <partition-dir>/c1=1/c2=a/
+    <partition-dir>/c1=2/c2=b/
+    <partition-dir>/HIVE_DEFAULT_LIST_BUCKETING_DIR_NAME/HIVE_DEFAULT_LIST_BUCKETING_DIR_NAME/
+    Note: this config has no impact on users who don't use list bucketing.
+  </description>
+</property>
+
+<property>
   <name>hive.skewjoin.key</name>
   <value>100000</value>
   <description>Determine if we get a skew key in join. If we see more

Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/Driver.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/Driver.java?rev=1404924&r1=1404923&r2=1404924&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/Driver.java (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/Driver.java Fri Nov  2 11:20:26 2012
@@ -879,6 +879,10 @@ public class Driver implements CommandPr
     errorMessage = null;
     SQLState = null;
 
+    if (!validateConfVariables()) {
+      return new CommandProcessorResponse(12, errorMessage, SQLState);
+    }
+
     HiveDriverRunHookContext hookContext = new HiveDriverRunHookContextImpl(conf, command);
     // Get all the driver run hooks and pre-execute them.
     List<HiveDriverRunHook> driverRunHooks;
@@ -972,6 +976,26 @@ public class Driver implements CommandPr
   }
 
   /**
+   * Validate configuration variables.
+   *
+   * @return true if the configuration is valid, false otherwise
+   */
+  private boolean validateConfVariables() {
+    boolean valid = true;
+    if ((!conf.getBoolVar(HiveConf.ConfVars.HIVE_HADOOP_SUPPORTS_SUBDIRECTORIES))
+        && ((conf.getBoolVar(HiveConf.ConfVars.HIVE_INTERNAL_DDL_LIST_BUCKETING_ENABLE))
+            || (conf.getBoolVar(HiveConf.ConfVars.HADOOPMAPREDINPUTDIRRECURSIVE)) || (conf
+              .getBoolVar(HiveConf.ConfVars.HIVEOPTLISTBUCKETING)))) {
+      errorMessage = "FAILED: Hive Internal Error: "
+          + ErrorMsg.SUPPORT_DIR_MUST_TRUE_FOR_LIST_BUCKETING.getMsg();
+      SQLState = ErrorMsg.findSQLState(errorMessage);
+      console.printError(errorMessage + "\n");
+      valid = false;
+    }
+    return valid;
+  }
+
+  /**
    * Returns a set of hooks specified in a configuration variable.
    *
    * See getHooks(HiveConf.ConfVars hookConfVar, Class<T> clazz)
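
The rule the new check enforces can be restated compactly: subdirectory support is a
prerequisite for any of the three list bucketing flags. A self-contained sketch of just that
predicate (class and method names are illustrative, not part of this commit):

    public class SubdirRuleSketch {
      // Mirrors the condition in validateConfVariables(): the configuration is
      // invalid only when a list-bucketing-related flag is on while
      // hive.mapred.supports.subdirectories is off.
      static boolean isValid(boolean supportsSubdirs, boolean ddlEnable,
          boolean recursiveInputDirs, boolean optListBucketing) {
        return supportsSubdirs || !(ddlEnable || recursiveInputDirs || optListBucketing);
      }

      public static void main(String[] args) {
        System.out.println(isValid(false, false, false, true)); // false -> error 10199
        System.out.println(isValid(true, false, false, true));  // true  -> run() proceeds
      }
    }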

Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/ErrorMsg.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/ErrorMsg.java?rev=1404924&r1=1404923&r2=1404924&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/ErrorMsg.java (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/ErrorMsg.java Fri Nov  2 11:20:26 2012
@@ -245,22 +245,31 @@ public enum ErrorMsg {
 
   SHOW_CREATETABLE_INDEX(10144, "SHOW CREATE TABLE does not support tables of type INDEX_TABLE."),
 
-  CREATE_SKEWED_TABLE_NO_COLUMN_NAME(10200, "No skewed column name."),
-  CREATE_SKEWED_TABLE_NO_COLUMN_VALUE(10201, "No skewed values."),
-  CREATE_SKEWED_TABLE_DUPLICATE_COLUMN_NAMES(10202,
+
+  ALTER_TBL_SKEWED_LOC_NO_LOC(10197, "Alter table skewed location doesn't have locations."),
+  ALTER_TBL_SKEWED_LOC_NO_MAP(10198, "Alter table skewed location doesn't have location map."),
+  SUPPORT_DIR_MUST_TRUE_FOR_LIST_BUCKETING(
+      10199,
+      "hive.mapred.supports.subdirectories must be true"
+          + " if any one of following is true: hive.internal.ddl.list.bucketing.enable,"
+          + " hive.optimize.listbucketing and mapred.input.dir.recursive"),
+  SKEWED_TABLE_NO_COLUMN_NAME(10200, "No skewed column name."),
+  SKEWED_TABLE_NO_COLUMN_VALUE(10201, "No skewed values."),
+  SKEWED_TABLE_DUPLICATE_COLUMN_NAMES(10202,
       "Duplicate skewed column name:"),
-  CREATE_SKEWED_TABLE_INVALID_COLUMN(10203,
+  SKEWED_TABLE_INVALID_COLUMN(10203,
       "Invalid skewed column name:"),
-  CREATE_SKEWED_TABLE_SKEWED_COL_NAME_VALUE_MISMATCH_1(10204,
+  SKEWED_TABLE_SKEWED_COL_NAME_VALUE_MISMATCH_1(10204,
       "Skewed column name is empty but skewed value is not."),
-  CREATE_SKEWED_TABLE_SKEWED_COL_NAME_VALUE_MISMATCH_2(10205,
+  SKEWED_TABLE_SKEWED_COL_NAME_VALUE_MISMATCH_2(10205,
       "Skewed column value is empty but skewed name is not."),
-  CREATE_SKEWED_TABLE_SKEWED_COL_NAME_VALUE_MISMATCH_3(10206,
+  SKEWED_TABLE_SKEWED_COL_NAME_VALUE_MISMATCH_3(10206,
       "The number of skewed column names and the number of " +
       "skewed column values are different: "),
   ALTER_TABLE_NOT_ALLOWED_RENAME_SKEWED_COLUMN(10207,
-          " is a skewed column. It's not allowed to rename skewed column."),
-  HIVE_INTERNAL_DDL_LIST_BUCKETING_DISABLED(10208,
+      " is a skewed column. It's not allowed to rename skewed column"
+          + " or change skewed column type."),
+  HIVE_INTERNAL_DDL_LIST_BUCKETING_DISABLED(10208,
               "List Bucketing DDL is not allowed to use since feature is not completed yet."),
 
   HIVE_GROUPING_SETS_AGGR_NOMAPAGGR(10209,

Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/DDLTask.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/DDLTask.java?rev=1404924&r1=1404923&r2=1404924&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/DDLTask.java (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/DDLTask.java Fri Nov  2 11:20:26 2012
@@ -77,6 +77,7 @@ import org.apache.hadoop.hive.metastore.
 import org.apache.hadoop.hive.metastore.api.PrivilegeGrantInfo;
 import org.apache.hadoop.hive.metastore.api.Role;
 import org.apache.hadoop.hive.metastore.api.SerDeInfo;
+import org.apache.hadoop.hive.metastore.api.SkewedInfo;
 import org.apache.hadoop.hive.metastore.api.StorageDescriptor;
 import org.apache.hadoop.hive.ql.Context;
 import org.apache.hadoop.hive.ql.DriverContext;
@@ -3182,6 +3183,49 @@ public class DDLTask extends Task<DDLWor
       } catch (URISyntaxException e) {
         throw new HiveException(e);
       }
+    } else if (alterTbl.getOp() == AlterTableDesc.AlterTableTypes.ADDSKEWEDBY) {
+      /* Validation was done at compile time; no validation is needed here. */
+      List<String> skewedColNames = null;
+      List<List<String>> skewedValues = null;
+
+      if (alterTbl.isTurnOffSkewed()) {
+        /* Convert skewed table to non-skewed table. */
+        skewedColNames = new ArrayList<String>();
+        skewedValues = new ArrayList<List<String>>();
+      } else {
+        skewedColNames = alterTbl.getSkewedColNames();
+        skewedValues = alterTbl.getSkewedColValues();
+      }
+
+      if (null == tbl.getSkewedInfo()) {
+        /* Convert non-skewed table to skewed table. */
+        SkewedInfo skewedInfo = new SkewedInfo();
+        skewedInfo.setSkewedColNames(skewedColNames);
+        skewedInfo.setSkewedColValues(skewedValues);
+        tbl.setSkewedInfo(skewedInfo);
+      } else {
+        tbl.setSkewedColNames(skewedColNames);
+        tbl.setSkewedColValues(skewedValues);
+      }
+    } else if (alterTbl.getOp() == AlterTableDesc.AlterTableTypes.ALTERSKEWEDLOCATION) {
+      // process location one-by-one
+      Map<List<String>, String> locMaps = alterTbl.getSkewedLocations();
+      Set<List<String>> keys = locMaps.keySet();
+      for (List<String> key : keys) {
+        String newLocation = locMaps.get(key);
+        try {
+          URI locUri = new URI(newLocation);
+          if (part != null) {
+            List<String> slk = new ArrayList<String>(key);
+            part.setSkewedValueLocationMap(slk, locUri.toString());
+          } else {
+            List<String> slk = new ArrayList<String>(key);
+            tbl.setSkewedValueLocationMap(slk, locUri.toString());
+          }
+        } catch (URISyntaxException e) {
+          throw new HiveException(e);
+        }
+      }
     } else {
       formatter.consoleError(console,
                              "Unsupported Alter commnad",

Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/metadata/Partition.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/metadata/Partition.java?rev=1404924&r1=1404923&r2=1404924&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/metadata/Partition.java (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/metadata/Partition.java Fri Nov  2 11:20:26 2012
@@ -22,6 +22,7 @@ import java.io.Serializable;
 import java.net.URI;
 import java.util.ArrayList;
 import java.util.Arrays;
+import java.util.HashMap;
 import java.util.LinkedHashMap;
 import java.util.List;
 import java.util.Map;
@@ -612,4 +613,29 @@ public class Partition implements Serial
   public void setLastAccessTime(int lastAccessTime) {
     tPartition.setLastAccessTime(lastAccessTime);
   }
+
+  public List<List<String>> getSkewedColValues() {
+    return tPartition.getSd().getSkewedInfo().getSkewedColValues();
+  }
+
+  public List<String> getSkewedColNames() {
+    return tPartition.getSd().getSkewedInfo().getSkewedColNames();
+  }
+
+  public void setSkewedValueLocationMap(List<String> valList, String dirName)
+      throws HiveException {
+    Map<List<String>, String> mappings = tPartition.getSd().getSkewedInfo()
+        .getSkewedColValueLocationMaps();
+    if (null == mappings) {
+      mappings = new HashMap<List<String>, String>();
+      tPartition.getSd().getSkewedInfo().setSkewedColValueLocationMaps(mappings);
+    }
+
+    // Add or update new mapping
+    mappings.put(valList, dirName);
+  }
+
+  public Map<List<String>, String> getSkewedColValueLocationMaps() {
+    return tPartition.getSd().getSkewedInfo().getSkewedColValueLocationMaps();
+  }
 }

Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/metadata/Table.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/metadata/Table.java?rev=1404924&r1=1404923&r2=1404924&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/metadata/Table.java (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/metadata/Table.java Fri Nov  2 11:20:26 2012
@@ -550,6 +550,13 @@ public class Table implements Serializab
         .getSkewedColNames() : new ArrayList<String>();
   }
 
+  public SkewedInfo getSkewedInfo() {
+    return tTable.getSd().getSkewedInfo();
+  }
+
+  public void setSkewedInfo(SkewedInfo skewedInfo) throws HiveException {
+    tTable.getSd().setSkewedInfo(skewedInfo);
+  }
 
   private boolean isField(String col) {
     for (FieldSchema field : getCols()) {

Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMapRedUtils.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMapRedUtils.java?rev=1404924&r1=1404923&r2=1404924&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMapRedUtils.java (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMapRedUtils.java Fri Nov  2 11:20:26 2012
@@ -50,6 +50,7 @@ import org.apache.hadoop.hive.ql.metadat
 import org.apache.hadoop.hive.ql.optimizer.GenMRProcContext.GenMRMapJoinCtx;
 import org.apache.hadoop.hive.ql.optimizer.GenMRProcContext.GenMRUnionCtx;
 import org.apache.hadoop.hive.ql.optimizer.GenMRProcContext.GenMapRedCtx;
+import org.apache.hadoop.hive.ql.optimizer.listbucketingpruner.ListBucketingPruner;
 import org.apache.hadoop.hive.ql.optimizer.ppr.PartitionPruner;
 import org.apache.hadoop.hive.ql.optimizer.unionproc.UnionProcContext;
 import org.apache.hadoop.hive.ql.optimizer.unionproc.UnionProcContext.UnionParseContext;
@@ -59,6 +60,7 @@ import org.apache.hadoop.hive.ql.parse.P
 import org.apache.hadoop.hive.ql.parse.RowResolver;
 import org.apache.hadoop.hive.ql.parse.SemanticException;
 import org.apache.hadoop.hive.ql.plan.BucketMapJoinContext;
+import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
 import org.apache.hadoop.hive.ql.plan.FetchWork;
 import org.apache.hadoop.hive.ql.plan.FileSinkDesc;
 import org.apache.hadoop.hive.ql.plan.FilterDesc.sampleDesc;
@@ -709,9 +711,19 @@ public final class GenMapRedUtils {
       Path[] paths = null;
       sampleDesc sampleDescr = parseCtx.getOpToSamplePruner().get(topOp);
 
+      // Lookup list bucketing pruner
+      Map<String, ExprNodeDesc> partToPruner = parseCtx.getOpToPartToSkewedPruner().get(topOp);
+      ExprNodeDesc listBucketingPruner = (partToPruner != null) ? partToPruner.get(part.getName())
+          : null;
+
       if (sampleDescr != null) {
+        assert (listBucketingPruner == null) : "Sampling and list bucketing can't coexist.";
         paths = SamplePruner.prune(part, sampleDescr);
         parseCtx.getGlobalLimitCtx().disableOpt();
+      } else if (listBucketingPruner != null) {
+        assert (sampleDescr == null) : "Sampling and list bucketing can't coexist.";
+        /* Use list bucketing pruner's path. */
+        paths = ListBucketingPruner.prune(parseCtx, part, listBucketingPruner);
       } else {
         // Now we only try the first partition, if the first partition doesn't
         // contain enough size, we change to normal mode.

Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/Optimizer.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/Optimizer.java?rev=1404924&r1=1404923&r2=1404924&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/Optimizer.java (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/Optimizer.java Fri Nov  2 11:20:26 2012
@@ -24,6 +24,7 @@ import java.util.List;
 import org.apache.hadoop.hive.conf.HiveConf;
 import org.apache.hadoop.hive.ql.optimizer.index.RewriteGBUsingIndex;
 import org.apache.hadoop.hive.ql.optimizer.lineage.Generator;
+import org.apache.hadoop.hive.ql.optimizer.listbucketingpruner.ListBucketingPruner;
 import org.apache.hadoop.hive.ql.optimizer.pcr.PartitionConditionRemover;
 import org.apache.hadoop.hive.ql.optimizer.ppr.PartitionPruner;
 import org.apache.hadoop.hive.ql.optimizer.unionproc.UnionProcessor;
@@ -56,6 +57,10 @@ public class Optimizer {
       transformations.add(new PredicatePushDown());
       transformations.add(new PartitionPruner());
       transformations.add(new PartitionConditionRemover());
+      if (HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVEOPTLISTBUCKETING)) {
+        /* Add list bucketing pruner. */
+        transformations.add(new ListBucketingPruner());
+      }
     }
     if (HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVE_OPTIMIZE_SKEWJOIN_COMPILETIME)) {
       transformations.add(new SkewJoinOptimizer());

Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/PrunerOperatorFactory.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/PrunerOperatorFactory.java?rev=1404924&r1=1404923&r2=1404924&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/PrunerOperatorFactory.java (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/PrunerOperatorFactory.java Fri Nov  2 11:20:26 2012
@@ -17,6 +17,7 @@
  */
 package org.apache.hadoop.hive.ql.optimizer;
 
+import java.util.HashMap;
 import java.util.Map;
 import java.util.Stack;
 
@@ -26,6 +27,7 @@ import org.apache.hadoop.hive.ql.exec.UD
 import org.apache.hadoop.hive.ql.lib.Node;
 import org.apache.hadoop.hive.ql.lib.NodeProcessor;
 import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx;
+import org.apache.hadoop.hive.ql.metadata.Partition;
 import org.apache.hadoop.hive.ql.parse.SemanticException;
 import org.apache.hadoop.hive.ql.parse.TypeCheckProcFactory;
 import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
@@ -100,7 +102,6 @@ public abstract class PrunerOperatorFact
      */
     protected abstract void generatePredicate(NodeProcessorCtx procCtx, FilterOperator fop,
         TableScanOperator top) throws SemanticException, UDFArgumentException;
-
     /**
      * Add pruning predicate.
      *
@@ -126,6 +127,42 @@ public abstract class PrunerOperatorFact
 
       return;
     }
+
+    /**
+     * Add pruning predicate.
+     *
+     * @param opToPrunner mapping from table scan operator to partition-to-pruner map
+     * @param top the table scan operator
+     * @param new_pruner_pred the new pruning predicate
+     * @param part the partition being walked
+     * @throws UDFArgumentException
+     */
+    protected void addPruningPred(Map<TableScanOperator, Map<String, ExprNodeDesc>> opToPrunner,
+        TableScanOperator top, ExprNodeDesc new_pruner_pred, Partition part)
+        throws UDFArgumentException {
+      Map<String, ExprNodeDesc> oldPartToPruner = opToPrunner.get(top);
+      ExprNodeDesc pruner_pred = null;
+      if (oldPartToPruner == null) {
+        pruner_pred = new_pruner_pred;
+      } else {
+        ExprNodeDesc old_pruner_pred = oldPartToPruner.get(part.getName());
+        if (old_pruner_pred != null) {
+          // or the old_pruner_pred and the new_ppr_pred
+          pruner_pred = TypeCheckProcFactory.DefaultExprProcessor.getFuncExprNodeDesc("OR",
+              old_pruner_pred, new_pruner_pred);
+        } else {
+          pruner_pred = new_pruner_pred;
+        }
+      }
+
+      // Reuse any existing part-to-pruner map so mappings for other partitions survive
+      Map<String, ExprNodeDesc> partToPruner = (oldPartToPruner != null) ? oldPartToPruner
+          : new HashMap<String, ExprNodeDesc>();
+      partToPruner.put(part.getName(), pruner_pred);
+      opToPrunner.put(top, partToPruner);
+
+      return;
+    }
   }
 
   /**

Added: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/listbucketingpruner/LBExprProcCtx.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/listbucketingpruner/LBExprProcCtx.java?rev=1404924&view=auto
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/listbucketingpruner/LBExprProcCtx.java (added)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/listbucketingpruner/LBExprProcCtx.java Fri Nov  2 11:20:26 2012
@@ -0,0 +1,66 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.optimizer.listbucketingpruner;
+
+import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx;
+import org.apache.hadoop.hive.ql.metadata.Partition;
+
+/**
+ * The processor context for list bucketing pruner. This contains the table alias
+ * that is being currently processed.
+ */
+public class LBExprProcCtx implements NodeProcessorCtx {
+
+  /**
+   * The table alias that is being currently processed.
+   */
+  private String tabAlias;
+
+  // the partition this walker is working on
+  private final Partition part;
+
+  public LBExprProcCtx(String tabAlias, Partition part) {
+    this.tabAlias = tabAlias;
+    this.part = part;
+  }
+
+  /**
+   * Get the table alias.
+   * @return the table alias
+   */
+  public String getTabAlias() {
+    return tabAlias;
+  }
+
+  /**
+   * Set the table alias.
+   * @param tabAlias the table alias
+   */
+  public void setTabAlias(String tabAlias) {
+    this.tabAlias = tabAlias;
+  }
+
+  /**
+   * @return the part
+   */
+  public Partition getPart() {
+    return part;
+  }
+
+}

Added: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/listbucketingpruner/LBExprProcFactory.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/listbucketingpruner/LBExprProcFactory.java?rev=1404924&view=auto
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/listbucketingpruner/LBExprProcFactory.java (added)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/listbucketingpruner/LBExprProcFactory.java Fri Nov  2 11:20:26 2012
@@ -0,0 +1,110 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.ql.optimizer.listbucketingpruner;
+
+import java.util.Map;
+
+import org.apache.hadoop.hive.ql.lib.Node;
+import org.apache.hadoop.hive.ql.lib.NodeProcessor;
+import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx;
+import org.apache.hadoop.hive.ql.metadata.Partition;
+import org.apache.hadoop.hive.ql.optimizer.PrunerExpressionOperatorFactory;
+import org.apache.hadoop.hive.ql.optimizer.PrunerUtils;
+import org.apache.hadoop.hive.ql.parse.SemanticException;
+import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc;
+import org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc;
+import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
+
+/**
+ * Expression processor factory for list bucketing pruning. Each processor tries to
+ * convert the expression subtree into a list bucketing pruning expression. This
+ * expression is then used to figure out which skewed values to use.
+ */
+public class LBExprProcFactory extends PrunerExpressionOperatorFactory {
+
+  private LBExprProcFactory() {
+    // prevent instantiation
+  }
+
+  /**
+   * Processor for list bucketing pruner column expressions.
+   */
+  public static class LBPRColumnExprProcessor extends ColumnExprProcessor {
+
+    @Override
+    protected ExprNodeDesc processColumnDesc(NodeProcessorCtx procCtx, ExprNodeColumnDesc cd) {
+      ExprNodeDesc newcd;
+      LBExprProcCtx ctx = (LBExprProcCtx) procCtx;
+      Partition part = ctx.getPart();
+      if (cd.getTabAlias().equalsIgnoreCase(ctx.getTabAlias())
+          && isPruneForListBucketing(part, cd.getColumn())) {
+        newcd = cd.clone();
+      } else {
+        newcd = new ExprNodeConstantDesc(cd.getTypeInfo(), null);
+      }
+      return newcd;
+    }
+
+    /**
+     * Check whether we prune for list bucketing:
+     * 1. the column name is one of the skewed columns
+     * 2. the partition has a skewed-value-to-location map
+     * @param part the partition
+     * @param columnName the column name
+     * @return true if the column can be pruned for list bucketing
+     */
+    private boolean isPruneForListBucketing(Partition part, String columnName) {
+      return ListBucketingPrunerUtils.isListBucketingPart(part)
+          && (part.getSkewedColNames().contains(columnName));
+    }
+  }
+
+  /**
+   * Generates the list bucketing pruner for the expression tree.
+   *
+   * @param tabAlias
+   *          The table alias of the partition table that is being considered
+   *          for pruning
+   * @param pred
+   *          The predicate from which the list bucketing pruner needs to be
+   *          generated
+   * @param part
+   *          The partition this walker is walking
+   * @throws SemanticException
+   */
+  public static ExprNodeDesc genPruner(String tabAlias, ExprNodeDesc pred, Partition part)
+      throws SemanticException {
+    // Create the walker, the rules dispatcher and the context.
+    NodeProcessorCtx lbprCtx = new LBExprProcCtx(tabAlias, part);
+
+    Map<Node, Object> outputMap = PrunerUtils.walkExprTree(pred, lbprCtx, getColumnProcessor(),
+        getFieldProcessor(), getGenericFuncProcessor(), getDefaultExprProcessor());
+
+    // Get the exprNodeDesc corresponding to the first start node;
+    return (ExprNodeDesc) outputMap.get(pred);
+  }
+
+  /**
+   * Instantiate the column processor.
+   *
+   * @return a new column expression processor
+   */
+  public static NodeProcessor getColumnProcessor() {
+    return new LBPRColumnExprProcessor();
+  }
+}
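
The column processor above replaces comparisons on non-skewed columns with null constants, so
the generated pruner is evaluated with SQL-style three-valued logic and a directory is skipped
only on a definite false, consistent with the True/False/unknown evaluation described in
ListBucketingPruner.prune() below. A self-contained sketch of that logic (illustrative only,
not the Hive API; null stands for unknown):

    public class ThreeValuedLogicSketch {
      // null models the "unknown" produced by ExprNodeConstantDesc(typeInfo, null).
      static Boolean and(Boolean a, Boolean b) {
        if (Boolean.FALSE.equals(a) || Boolean.FALSE.equals(b)) return Boolean.FALSE;
        if (a == null || b == null) return null;
        return Boolean.TRUE;
      }

      public static void main(String[] args) {
        // (c1 = 1) AND (c3 = 'x') where c3 is not skewed: second operand is unknown.
        System.out.println(and(Boolean.TRUE, null));  // null (unknown): directory kept
        System.out.println(and(Boolean.FALSE, null)); // false: directory skipped
      }
    }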

Added: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/listbucketingpruner/LBOpPartitionWalkerCtx.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/listbucketingpruner/LBOpPartitionWalkerCtx.java?rev=1404924&view=auto
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/listbucketingpruner/LBOpPartitionWalkerCtx.java (added)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/listbucketingpruner/LBOpPartitionWalkerCtx.java Fri Nov  2 11:20:26 2012
@@ -0,0 +1,68 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.ql.optimizer.listbucketingpruner;
+
+import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx;
+import org.apache.hadoop.hive.ql.parse.ParseContext;
+import org.apache.hadoop.hive.ql.parse.PrunedPartitionList;
+
+/**
+ * Context used by the list bucketing pruner to get all partitions.
+ *
+ */
+public class LBOpPartitionWalkerCtx implements NodeProcessorCtx {
+
+  private final ParseContext parseContext;
+
+  private PrunedPartitionList partitions;
+
+  /**
+   * Constructor.
+   */
+  public LBOpPartitionWalkerCtx(ParseContext parseContext) {
+    this.parseContext = parseContext;
+  }
+
+  /**
+   * Return parse context.
+   *
+   * @return the parse context
+   */
+  public ParseContext getParseContext() {
+    return parseContext;
+  }
+
+  /**
+   * Return partitions.
+   *
+   * @return the partitions
+   */
+  public PrunedPartitionList getPartitions() {
+    return partitions;
+  }
+
+  /**
+   * Set partitions.
+   *
+   * @param partitions the partitions to set
+   */
+  public void setPartitions(PrunedPartitionList partitions) {
+    this.partitions = partitions;
+  }
+
+}

Added: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/listbucketingpruner/LBOpWalkerCtx.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/listbucketingpruner/LBOpWalkerCtx.java?rev=1404924&view=auto
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/listbucketingpruner/LBOpWalkerCtx.java (added)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/listbucketingpruner/LBOpWalkerCtx.java Fri Nov  2 11:20:26 2012
@@ -0,0 +1,65 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.ql.optimizer.listbucketingpruner;
+
+import java.util.Map;
+
+import org.apache.hadoop.hive.ql.exec.TableScanOperator;
+import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx;
+import org.apache.hadoop.hive.ql.metadata.Partition;
+import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
+
+/**
+ * Context used by list bucketing to walk operator trees and generate the expression tree.
+ *
+ */
+public class LBOpWalkerCtx implements NodeProcessorCtx {
+  /**
+   * Map from tablescan operator to list bucketing pruning descriptor that is
+   * initialized from the ParseContext.
+   */
+  private final Map<TableScanOperator, Map<String, ExprNodeDesc>> opToPartToLBPruner;
+
+  // the partition this walker is working on
+  private final Partition part;
+
+  /**
+   * Constructor.
+   */
+  public LBOpWalkerCtx(Map<TableScanOperator, Map<String, ExprNodeDesc>> opToPartToLBPruner,
+      Partition part) {
+    this.opToPartToLBPruner = opToPartToLBPruner;
+    this.part = part;
+  }
+
+  /**
+   * @return the opToPartToLBPruner
+   */
+  public Map<TableScanOperator, Map<String, ExprNodeDesc>> getOpToPartToLBPruner() {
+    return opToPartToLBPruner;
+  }
+
+  /**
+   * @return the part
+   */
+  public Partition getPart() {
+    return part;
+  }
+
+
+}

Added: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/listbucketingpruner/LBPartitionProcFactory.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/listbucketingpruner/LBPartitionProcFactory.java?rev=1404924&view=auto
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/listbucketingpruner/LBPartitionProcFactory.java (added)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/listbucketingpruner/LBPartitionProcFactory.java Fri Nov  2 11:20:26 2012
@@ -0,0 +1,95 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.ql.optimizer.listbucketingpruner;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.hive.ql.exec.FilterOperator;
+import org.apache.hadoop.hive.ql.exec.TableScanOperator;
+import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
+import org.apache.hadoop.hive.ql.lib.NodeProcessor;
+import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.ql.optimizer.PrunerOperatorFactory;
+import org.apache.hadoop.hive.ql.optimizer.pcr.PcrOpProcFactory;
+import org.apache.hadoop.hive.ql.optimizer.ppr.PartitionPruner;
+import org.apache.hadoop.hive.ql.parse.ParseContext;
+import org.apache.hadoop.hive.ql.parse.PrunedPartitionList;
+import org.apache.hadoop.hive.ql.parse.SemanticException;
+import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
+
+/**
+ * Walk through top operators in tree to find all partitions.
+ *
+ * It should already have been done by {@link PcrOpProcFactory}.
+ *
+ */
+public class LBPartitionProcFactory extends PrunerOperatorFactory {
+  static final Log LOG = LogFactory.getLog(LBPartitionProcFactory.class.getName());
+
+  /**
+   * Retrieve partitions for the filter. This is called only when
+   * the filter follows a table scan operator.
+   */
+  public static class LBPRPartitionFilterPruner extends FilterPruner {
+
+    @Override
+    protected void generatePredicate(NodeProcessorCtx procCtx, FilterOperator fop,
+        TableScanOperator top) throws SemanticException, UDFArgumentException {
+      LBOpPartitionWalkerCtx owc = (LBOpPartitionWalkerCtx) procCtx;
+
+      // Run partition pruner to get partitions
+      ParseContext parseCtx = owc.getParseContext();
+      PrunedPartitionList prunedPartList = parseCtx.getOpToPartList().get(top);
+      if (prunedPartList == null) {
+        // We never pruned the partition. Try to prune it.
+        ExprNodeDesc ppr_pred = parseCtx.getOpToPartPruner().get(top);
+        if (ppr_pred != null) {
+          try {
+            prunedPartList = PartitionPruner.prune(parseCtx.getTopToTable().get(top),
+                ppr_pred, parseCtx.getConf(),
+                (String) parseCtx.getTopOps().keySet()
+                .toArray()[0], parseCtx.getPrunedPartitions());
+            if (prunedPartList != null) {
+              owc.getParseContext().getOpToPartList().put(top, prunedPartList);
+            }
+          } catch (HiveException e) {
+            // Has to use full name to make sure it does not conflict with
+            // org.apache.commons.lang.StringUtils
+            throw new SemanticException(e.getMessage(), e);
+          }
+        }
+      }
+
+      if (prunedPartList != null) {
+        owc.setPartitions(prunedPartList);
+      }
+
+    }
+
+  }
+
+  public static NodeProcessor getFilterProc() {
+    return new LBPRPartitionFilterPruner();
+  }
+
+  private LBPartitionProcFactory() {
+    // prevent instantiation
+  }
+
+}

Added: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/listbucketingpruner/LBProcFactory.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/listbucketingpruner/LBProcFactory.java?rev=1404924&view=auto
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/listbucketingpruner/LBProcFactory.java (added)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/listbucketingpruner/LBProcFactory.java Fri Nov  2 11:20:26 2012
@@ -0,0 +1,69 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.ql.optimizer.listbucketingpruner;
+
+import org.apache.hadoop.hive.ql.exec.FilterOperator;
+import org.apache.hadoop.hive.ql.exec.TableScanOperator;
+import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
+import org.apache.hadoop.hive.ql.lib.NodeProcessor;
+import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx;
+import org.apache.hadoop.hive.ql.optimizer.PrunerOperatorFactory;
+import org.apache.hadoop.hive.ql.parse.SemanticException;
+import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
+
+/**
+ * Operator factory for list bucketing pruning processing of the operator graph. We find
+ * all the filter operators that appear just beneath the table scan operators.
+ * We then pass each filter to the list bucketing pruner to construct a pruner for
+ * that table alias and store a mapping from the table scan operator to that
+ * pruner. We invoke that pruner during plan generation.
+ */
+public final class LBProcFactory extends PrunerOperatorFactory {
+  /**
+   * Determines the list bucketing pruner for the filter. This is called only when
+   * the filter follows a table scan operator.
+   */
+  public static class LBPRFilterPruner extends FilterPruner {
+
+    @Override
+    protected void generatePredicate(NodeProcessorCtx procCtx, FilterOperator fop,
+        TableScanOperator top) throws SemanticException, UDFArgumentException {
+      LBOpWalkerCtx owc = (LBOpWalkerCtx) procCtx;
+      // Extract the filter predicate and the alias of the underlying table scan
+      ExprNodeDesc predicate = fop.getConf().getPredicate();
+      String alias = top.getConf().getAlias();
+
+      // Generate the list bucketing pruning predicate
+      ExprNodeDesc lbprPred = LBExprProcFactory.genPruner(alias, predicate, owc.getPart());
+
+      /*
+       * add the list bucketing predicate to the table scan operator
+       */
+      addPruningPred(owc.getOpToPartToLBPruner(), top, lbprPred, owc.getPart());
+    }
+
+  }
+
+  public static NodeProcessor getFilterProc() {
+    return new LBPRFilterPruner();
+  }
+
+  private LBProcFactory() {
+    // prevent instantiation
+  }
+}

Added: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/listbucketingpruner/ListBucketingPruner.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/listbucketingpruner/ListBucketingPruner.java?rev=1404924&view=auto
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/listbucketingpruner/ListBucketingPruner.java (added)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/listbucketingpruner/ListBucketingPruner.java Fri Nov  2 11:20:26 2012
@@ -0,0 +1,638 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.optimizer.listbucketingpruner;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.common.FileUtils;
+import org.apache.hadoop.hive.conf.HiveConf.ConfVars;
+import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx;
+import org.apache.hadoop.hive.ql.metadata.Partition;
+import org.apache.hadoop.hive.ql.optimizer.PrunerUtils;
+import org.apache.hadoop.hive.ql.optimizer.Transform;
+import org.apache.hadoop.hive.ql.parse.ParseContext;
+import org.apache.hadoop.hive.ql.parse.PrunedPartitionList;
+import org.apache.hadoop.hive.ql.parse.SemanticException;
+import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
+import org.apache.hadoop.hive.ql.session.SessionState;
+
+/**
+ * The transformation step that does list bucketing pruning.
+ *
+ */
+public class ListBucketingPruner implements Transform {
+
+  public static final String DEFAULT_SKEWED_DIRECTORY = SessionState.get()
+      .getConf().getVar(ConfVars.HIVE_LIST_BUCKETING_DEFAULT_DIR_NAME);
+  public static final String DEFAULT_SKEWED_KEY = "HIVE_DEFAULT_LIST_BUCKETING_KEY";
+  static final Log LOG = LogFactory.getLog(ListBucketingPruner.class.getName());
+
+  /*
+   * (non-Javadoc)
+   *
+   * @see org.apache.hadoop.hive.ql.optimizer.Transform#transform(org.apache.hadoop.hive.ql.parse.
+   * ParseContext)
+   */
+  @Override
+  public ParseContext transform(ParseContext pctx) throws SemanticException {
+    // create the context for walking operators
+    NodeProcessorCtx opPartWalkerCtx = new LBOpPartitionWalkerCtx(pctx);
+
+    // Retrieve all partitions generated from partition pruner and partition column pruner
+    PrunerUtils.walkOperatorTree(pctx, opPartWalkerCtx, LBPartitionProcFactory.getFilterProc(),
+        LBPartitionProcFactory.getDefaultProc());
+
+    PrunedPartitionList partsList = ((LBOpPartitionWalkerCtx) opPartWalkerCtx).getPartitions();
+    if (partsList != null) {
+      Set<Partition> parts = partsList.getConfirmedPartns();
+      parts.addAll(partsList.getUnknownPartns());
+      if ((parts != null) && (parts.size() > 0)) {
+        for (Partition part : parts) {
+          // only process partition which is skewed and list bucketed
+          if (ListBucketingPrunerUtils.isListBucketingPart(part)) {
+            // create the context for walking operators
+            NodeProcessorCtx opWalkerCtx = new LBOpWalkerCtx(pctx.getOpToPartToSkewedPruner(),
+                part);
+
+            // walk operator tree to create expression tree for list bucketing
+            PrunerUtils.walkOperatorTree(pctx, opWalkerCtx, LBProcFactory.getFilterProc(),
+                LBProcFactory.getDefaultProc());
+          }
+        }
+      }
+    }
+
+    return pctx;
+  }
+
+  /**
+   * Prunes to the directories that match the skewed keys in the where clause.
+   *
+   *
+   * Algorithm
+   * =========
+   *
+   *
+   * For each possible skewed element combination:
+   * 1. walk through ExprNode tree
+   * 2. decide Boolean (True/False/unknown(null))
+   *
+   * Go through each skewed element combination again:
+   * 1. if it is a skewed value, skip the directory only if it is false, otherwise keep it
+   * 2. skip the default directory only if all combinations that are not skewed values are false
+   *
+   * Example
+   * =======
+   * For example:
+   * 1. skewed columns (list): C1, C2
+   * 2. skewed values (list of list): (1,a), (2,b), (1,c)
+   *
+   * Unique skewed elements for each skewed column (list of list):
+   * (1,2,other), (a,b,c,other)
+   *
+   * Index: (0,1,2) (0,1,2,3)
+   * Output matches order of skewed column. Output can be read as:
+   *
+   * C1 has unique element list (1,2,other)
+   * C2 has unique element list (a,b,c,other)
+   *
+   * C1\C2 |   a   |   b   |   c   | other
+   * 1     | (1,a) |   X   | (1,c) |   X
+   * 2     |   X   | (2,b) |   X   |   X
+   * other |   X   |   X   |   X   |   X
+   *
+   * Complete dynamic-multi-dimension collection
+   *
+   * (0,0) (1,a) * -> T
+   * (0,1) (1,b) -> T
+   * (0,2) (1,c) *-> F
+   * (0,3) (1,other)-> F
+   * (1,0) (2,a)-> F
+   * (1,1) (2,b) * -> T
+   * (1,2) (2,c)-> F
+   * (1,3) (2,other)-> F
+   * (2,0) (other,a) -> T
+   * (2,1) (other,b) -> T
+   * (2,2) (other,c) -> T
+   * (2,3) (other,other) -> T
+   * * is skewed value entry
+   *
+   * Expression Tree : ((c1=1) and (c2=a)) or ((c1=3) or (c2=b))
+   *
+   *               or
+   *             /    \
+   *           and     or
+   *          /   \   /  \
+   *      c1=1  c2=a c1=3 c2=b
+   *
+   *
+   * For each entry in the dynamic multi-dimensional container:
+   *
+   * 1. walk through the tree to decide the value (please see the map's values above)
+   * 2. if it is a skewed value:
+   * 2.1 remove the entry from the map
+   * 2.2 add the directory to the path unless the value is false
+   * 3. otherwise, add the value to the map
+   *
+   * Once that is done, go through the remaining map entries to decide on the default directory:
+   * 1. we know none of them is a skewed value
+   * 2. we skip the default directory only if every value is false
+   *
+   * What do we choose at the end?
+   *
+   * 1. directory for (1,a) because it's a skewed value and match returns true
+   * 2. directory for (2,b) because it's a skewed value and match returns true
+   * 3. default directory because not all non-skewed values return false
+   *
+   * We skip the directory for (1,c) since match returns false.
+   *
+   * Note: unknown is marked in {@link #transform(ParseContext)} <blockquote>
+   * <pre>
+   * newcd = new ExprNodeConstantDesc(cd.getTypeInfo(), null)
+   * </pre>
+   *
+   * </blockquote> can be checked via <blockquote>
+   *
+   * <pre>
+   *     child_nd instanceof ExprNodeConstantDesc
+   *               && ((ExprNodeConstantDesc) child_nd).getValue() == null)
+   * </pre>
+   *
+   * </blockquote>
+   *
+   * @param ctx
+   *          parse context
+   * @param part
+   *          partition
+   * @param pruner
+   *          expression node tree
+   * @return paths of the selected directories; falls back to the full partition path on failure
+   */
+  public static Path[] prune(ParseContext ctx, Partition part, ExprNodeDesc pruner) {
+    Path[] finalPaths = null;
+
+    try {
+      finalPaths = execute(ctx, part, pruner);
+    } catch (SemanticException e) {
+      // Use full partition path for error case.
+      LOG.warn("Using full partition scan :" + part.getPath() + ".", e);
+      finalPaths = part.getPath();
+    }
+
+    return finalPaths;
+  }
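
To make the worked example above concrete, the following self-contained sketch (plain
Java, no Hive dependencies; all names are illustrative) evaluates
((c1=1) and (c2=a)) or ((c1=3) or (c2=b)) over all twelve cells and prints whether each
directory would be kept. It uses Kleene-style or/and helpers with null as unknown; the
operand helpers in this commit are slightly more conservative (e.g. true-or-unknown
yields unknown), which can only keep more directories. A directory is skipped only on a
definite false:

import java.util.Arrays;
import java.util.List;

public class ListBucketingPruneSketch {

  static final String OTHER = "other"; // stands in for the default skewed key
  static final List<String> UNIQ_C1 = Arrays.asList("1", "2");      // unique skewed values of c1
  static final List<String> UNIQ_C2 = Arrays.asList("a", "b", "c"); // unique skewed values of c2

  // Three-valued "column = constant": true on an exact skewed-value match, unknown when the
  // cell is "other" and the constant is not a known skewed value, false otherwise.
  static Boolean eq(String cellValue, String constant, List<String> uniq) {
    if (cellValue.equals(constant) && !cellValue.equals(OTHER)) {
      return Boolean.TRUE;
    }
    if (cellValue.equals(OTHER) && !uniq.contains(constant)) {
      return null; // unknown
    }
    return Boolean.FALSE;
  }

  // Kleene-style OR/AND with null as unknown.
  static Boolean or(Boolean x, Boolean y) {
    if (Boolean.TRUE.equals(x) || Boolean.TRUE.equals(y)) {
      return Boolean.TRUE;
    }
    return (x == null || y == null) ? null : Boolean.FALSE;
  }

  static Boolean and(Boolean x, Boolean y) {
    if (Boolean.FALSE.equals(x) || Boolean.FALSE.equals(y)) {
      return Boolean.FALSE;
    }
    return (x == null || y == null) ? null : Boolean.TRUE;
  }

  public static void main(String[] args) {
    for (String c1 : Arrays.asList("1", "2", OTHER)) {
      for (String c2 : Arrays.asList("a", "b", "c", OTHER)) {
        // ((c1=1) and (c2=a)) or ((c1=3) or (c2=b))
        Boolean match = or(and(eq(c1, "1", UNIQ_C1), eq(c2, "a", UNIQ_C2)),
            or(eq(c1, "3", UNIQ_C1), eq(c2, "b", UNIQ_C2)));
        // A directory is skipped only on a definite false.
        System.out.println("(" + c1 + "," + c2 + ") -> "
            + (Boolean.FALSE.equals(match) ? "skip" : "keep"));
      }
    }
  }
}
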
+
+  /**
+   * Main skeleton for list bucketing pruning.
+   *
+   * @param ctx
+   * @param part
+   * @param pruner
+   * @return
+   * @throws SemanticException
+   */
+  private static Path[] execute(ParseContext ctx, Partition part, ExprNodeDesc pruner)
+      throws SemanticException {
+    Path[] finalPaths;
+
+    List<Path> selectedPaths = new ArrayList<Path>();
+
+    if (ListBucketingPrunerUtils.isUnknownState(pruner)) {
+      // Use full partition path for error case.
+      LOG.warn("List bucketing pruner is either null or in unknown state "
+          + " so that it uses full partition scan :" + part.getPath());
+      finalPaths = part.getPath();
+    } else {
+      // Retrieve skewed columns.
+      List<List<String>> sVals = part.getSkewedColValues();
+
+      assert ((sVals != null) && (sVals.size() > 0)) :
+        part.getName() + " skewed metadata is corrupted. No skewed value information.";
+      // Calculate collection.
+      List<List<String>> indexCollection = DynamicMultiDimensionalCollection
+          .generateCollection(sVals);
+
+      assert (indexCollection != null) : "Collection is null.";
+
+      // Calculate unique skewed elements for each skewed column.
+      List<List<String>> uniqSkewValues = DynamicMultiDimensionalCollection.uniqueSkewedValueList(
+          sVals);
+      // Decide skewed value directory selection.
+      List<Boolean> nonSkewedValueMatchResult = decideSkewedValueDirSelection(part, pruner,
+          selectedPaths, indexCollection, uniqSkewValues);
+
+      // Decide default directory selection.
+      decideDefaultDirSelection(part, selectedPaths, nonSkewedValueMatchResult);
+
+      // Finalize paths.
+      finalPaths = generateFinalPath(part, selectedPaths);
+
+    }
+    return finalPaths;
+  }
+
+  /**
+   * Walk through every entry in complete collection
+   * 1. calculate if it matches expression tree
+   * 2. decide if select skewed value directory
+   * 3. store match result for non-skewed value for later handle on default directory
+   * C1\C2 | a | b | c |Other
+   * 1 | (1,a) | X | (1,c) |X
+   * 2 | X |(2,b) | X |X
+   * other | X | X | X |X
+   * Final result
+   * Complete dynamic-multi-dimension collection
+   * (0,0) (1,a) *       -> T
+   * (0,1) (1,b)         -> T
+   * (0,2) (1,c) *       -> F
+   * (0,3) (1,other)     -> F
+   * (1,0) (2,a)         -> F
+   * (1,1) (2,b) *       -> T
+   * (1,2) (2,c)         -> F
+   * (1,3) (2,other)     -> F
+   * (2,0) (other,a)     -> T
+   * (2,1) (other,b)     -> T
+   * (2,2) (other,c)     -> T
+   * (2,3) (other,other) -> T
+   *
+   * (* marks a skewed-value entry)
+   *
+   * 1. the directory for (1,a) is chosen because it's a skewed value and its match returns true
+   * 2. the directory for (2,b) is chosen because it's a skewed value and its match returns true
+   *
+   * @param part
+   * @param pruner
+   * @param selectedPaths
+   * @param collections
+   * @param uniqSkewedValues
+   * @return match results for the non-skewed values, used later to decide on the default directory
+   * @throws SemanticException
+   */
+  private static List<Boolean> decideSkewedValueDirSelection(Partition part, ExprNodeDesc pruner,
+      List<Path> selectedPaths, List<List<String>> collections,
+      List<List<String>> uniqSkewedValues) throws SemanticException {
+    // For each entry in dynamic-multi-dimension collection.
+    List<String> skewedCols = part.getSkewedColNames(); // Retrieve skewed columns.
+    // Retrieve the skewed-value-to-location map.
+    Map<List<String>, String> mappings = part.getSkewedColValueLocationMaps();
+    assert ListBucketingPrunerUtils.isListBucketingPart(part) : part.getName()
+        + " skewed metadata is corrupted. No skewed column and/or location mappings information.";
+    List<List<String>> skewedValues = part.getSkewedColValues();
+    List<Boolean> nonSkewedValueMatchResult = new ArrayList<Boolean>();
+    for (List<String> cell : collections) {
+      // Walk through the tree to decide value.
+      // Example: skewed column: C1, C2 ;
+      // cell: (1,a) ;
+      // expression tree: ((c1=1) and (c2=a)) or ((c1=3) or (c2=b))
+      Boolean matchResult = ListBucketingPrunerUtils.evaluateExprOnCell(skewedCols, cell, pruner,
+          uniqSkewedValues);
+      // Handle skewed value.
+      if (skewedValues.contains(cell)) { // if it is skewed value
+        if ((matchResult == null) || matchResult) { // add directory to path unless value is false
+          assert (mappings.containsKey(cell)) : "Skewed location mappings doesn't have an entry "
+            + "for a skewed value: " + cell;
+          selectedPaths.add(new Path(mappings.get(cell)));
+        }
+      } else {
+        // Non-skewed value, add it to list for later handle on default directory.
+        nonSkewedValueMatchResult.add(matchResult);
+      }
+    }
+    return nonSkewedValueMatchResult;
+  }
+
+  /**
+   * Decide whether to select the default directory.
+   *
+   * @param part
+   * @param selectedPaths
+   * @param nonSkewedValueMatchResult
+   */
+  private static void decideDefaultDirSelection(Partition part, List<Path> selectedPaths,
+      List<Boolean> nonSkewedValueMatchResult) {
+    boolean skipDefDir = true;
+    for (Boolean v : nonSkewedValueMatchResult) {
+      if ((v == null) || v) {
+        skipDefDir = false; // skip the default directory only if all values are false
+        break;
+      }
+    }
+    if (!skipDefDir) {
+      StringBuilder builder = new StringBuilder();
+      builder.append(part.getLocation());
+      builder.append(Path.SEPARATOR);
+      builder.append((FileUtils.makeDefaultListBucketingDirName(
+          part.getSkewedColNames(), SessionState.get().getConf())));
+      selectedPaths.add(new Path(builder.toString()));
+    }
+  }
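
Stated in isolation, the default-directory rule above amounts to the following minimal
sketch; skipDefaultDir is a hypothetical standalone helper, not the private method
itself:

import java.util.Arrays;
import java.util.List;

public class DefaultDirRuleSketch {

  // The default directory is skipped only when every non-skewed match result is a
  // definite false; a true or unknown (null) result keeps it.
  static boolean skipDefaultDir(List<Boolean> nonSkewedValueMatchResults) {
    for (Boolean v : nonSkewedValueMatchResults) {
      if ((v == null) || v) {
        return false;
      }
    }
    return true;
  }

  public static void main(String[] args) {
    System.out.println(skipDefaultDir(Arrays.asList(false, false)));        // true: all false
    System.out.println(skipDefaultDir(Arrays.asList(false, null)));         // false: one unknown
    System.out.println(skipDefaultDir(Arrays.asList(false, true, false)));  // false: one true
  }
}
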
+
+  /**
+   * Decide the final path.
+   *
+   * @param part
+   * @param selectedPaths
+   * @return
+   */
+  private static Path[] generateFinalPath(Partition part, List<Path> selectedPaths) {
+    Path[] finalPaths;
+    if (selectedPaths.size() == 0) {
+      LOG.warn("Using full partition scan :" + part.getPath() + ".");
+      finalPaths = part.getPath();
+    } else {
+      finalPaths = selectedPaths.toArray(new Path[0]);
+    }
+    return finalPaths;
+  }
+
+
+  /**
+   * Note: this class is not designed for general use; it exists for the list bucketing pruner
+   * only. The structure addresses the following requirements:
+   * 1. a multi-dimensional collection
+   * 2. the length of each dimension is dynamic, decided at runtime
+   * The first user is the list bucketing pruner, during the pruning phase:
+   * 1. Each skewed column has a batch of skewed elements.
+   * 2. One skewed column represents one dimension.
+   * 3. The length of a dimension is the number of skewed elements.
+   * 4. The number of skewed columns and the dimension lengths are dynamic, configured by the user.
+   *
+   * Use cases
+   * =========
+   * Use case #1:
+   * The multi-dimensional collection represents whether to select the directory represented by
+   * each cell.
+   * skewed column: C1, C2, C3
+   * skewed value: (1,a,x), (2,b,x), (1,c,x), (2,a,y)
+   * Other: represents any value for the column that is not part of a skewed value.
+   *
+   * C3 = x
+   * C1\C2 |       a        |       b        |       c        | other
+   *   1   | Boolean(1,a,x) |       X        | Boolean(1,c,x) |   X
+   *   2   |       X        | Boolean(2,b,x) |       X        |   X
+   * other |       X        |       X        |       X        |   X
+   *
+   * C3 = y
+   * C1\C2 |       a        |       b        |       c        | other
+   *   1   |       X        |       X        |       X        |   X
+   *   2   | Boolean(2,a,y) |       X        |       X        |   X
+   * other |       X        |       X        |       X        |   X
+   * Boolean is the cell type and can be false/true/null (unknown).
+   * (1,a,x) is shown only to indicate which skewed value the cell represents.
+   * 1. the value of Boolean(1,a,x) represents whether we select the directory for list bucketing
+   * 2. the value of Boolean(2,b,x) represents whether we select the directory for list bucketing
+   * ...
+   * 3. All the rest, marked as "X", decide whether to pick up the default directory:
+   * 4. not only the "other" columns/rows, but every cell that doesn't represent a skewed value.
+   * For a cell representing a skewed value:
+   * 1. false: skip the directory
+   * 2. true/unknown: select the directory
+   * For the cells representing the default directory:
+   * 1. skip the directory only if all such cells are false
+   * 2. in all other cases, select the directory
+   * Use case #2:
+   * The multi-dimensional collection represents skewed elements, so the expression tree can be
+   * walked cell by cell.
+   * A cell is a List<String> representing the skewed value that an index path maps to.
+   * skewed column: C1, C2, C3
+   * skewed value: (1,a,x), (2,b,x), (1,c,x), (2,a,y)
+   * Other: represents any value for the column that is not part of a skewed value.
+   *
+   * C3 = x
+   * C1\C2 |    a    |    b    |    c    | other
+   *   1   | (1,a,x) |    X    | (1,c,x) |   X
+   *   2   |    X    | (2,b,x) |    X    |   X
+   * other |    X    |    X    |    X    |   X
+   *
+   * C3 = y
+   * C1\C2 |    a    |    b    |    c    | other
+   *   1   |    X    |    X    |    X    |   X
+   *   2   | (2,a,y) |    X    |    X    |   X
+   * other |    X    |    X    |    X    |   X
+   * Implementation
+   * ==============
+   * Please see another example in {@link ListBucketingPruner#prune}.
+   * We use a HashMap to represent the dynamic multi-dimensional collection:
+   * 1. the key is a List<Integer> representing the index path to the cell
+   * 2. the value represents the cell (Boolean for use case #1, List<String> for use case #2)
+   * For example:
+   * 1. skewed column (list): C1, C2, C3
+   * 2. skewed value (list of list): (1,a,x), (2,b,x), (1,c,x), (2,a,y)
+   * From skewed value, we calculate the unique skewed element for each skewed column:
+   * C1: (1,2)
+   * C2: (a,b,c)
+   * C3: (x,y)
+   * We store them in a list of lists. We don't need to store the skewed column names since we
+   * match by order:
+   * 1. skewed columns (list): C1, C2, C3
+   * 2. unique skewed elements for each skewed column (list of list):
+   * (1,2,other), (a,b,c,other), (x,y,other)
+   * 3. index: (0,1,2) (0,1,2,3) (0,1,2)
+   *
+   * We use the index, starting at 0, to construct the HashMap representing the
+   * dynamic multi-dimensional collection:
+   * key (with the skewed value it represents) -> value (Boolean for use case #1, List<String> for
+   * use case #2).
+   * (0,0,0) (1,a,x)
+   * (0,0,1) (1,a,y)
+   * (0,1,0) (1,b,x)
+   * (0,1,1) (1,b,y)
+   * (0,2,0) (1,c,x)
+   * (0,2,1) (1,c,y)
+   * (1,0,0) (2,a,x)
+   * (1,0,1) (2,a,y)
+   * (1,1,0) (2,b,x)
+   * (1,1,1) (2,b,y)
+   * (1,2,0) (2,c,x)
+   * (1,2,1) (2,c,y)
+   * ...
+   */
+  public static class DynamicMultiDimensionalCollection {
+
+    /**
+     * Find the complete skewed-element collection.
+     * For example:
+     * 1. skewed column (list): C1, C2
+     * 2. skewed value (list of list): (1,a), (2,b), (1,c)
+     * It returns the complete collection
+     * (1,a) , (1,b) , (1,c) , (1,other), (2,a), (2,b) , (2,c), (2,other), (other,a), (other,b),
+     * (other,c), (other,other)
+     * @throws SemanticException
+     */
+    public static List<List<String>> generateCollection(List<List<String>> values)
+        throws SemanticException {
+      // Calculate unique skewed elements for each skewed column.
+      List<List<String>> uniqSkewedElements = DynamicMultiDimensionalCollection.uniqueElementsList(
+          values, DEFAULT_SKEWED_KEY);
+      // Calculate complete dynamic-multi-dimension collection.
+      return DynamicMultiDimensionalCollection.flat(uniqSkewedElements);
+    }
+
+    /**
+     * Convert values to unique element lists. This is specific to the skewed-value use case:
+     * For example:
+     * 1. skewed column (list): C1, C2, C3
+     * 2. skewed value (list of list): (1,a,x), (2,b,x), (1,c,x), (2,a,y)
+     * Input: skewed value (list of list): (1,a,x), (2,b,x), (1,c,x), (2,a,y)
+     * Output: Unique skewed elements for each skewed column (list of list):
+     * (1,2,other), (a,b,c,other), (x,y,other)
+     * Output matches order of skewed column. Output can be read as:
+     * C1 has unique element list (1,2,other)
+     * C2 has unique element list (a,b,c,other)
+     * C3 has unique element list (x,y,other)
+     * Other represents any value that is not part of a skewed-value combination.
+     * @param values
+     *          skewed value list
+     * @return a list of unique element lists
+     */
+    public static List<List<String>> uniqueElementsList(List<List<String>> values,
+        String defaultDirName) {
+      // Get unique skewed value list.
+      List<List<String>> result = uniqueSkewedValueList(values);
+
+      // Add default dir at the end of each list
+      for (List<String> list : result) {
+        list.add(defaultDirName);
+      }
+
+      return result;
+    }
+
+    /**
+     * Convert value to unique skewed value list. It is used in
+     * {@link ListBucketingPrunerUtils#evaluateExprOnCell}
+     *
+     * For example:
+     *
+     * 1. skewed column (list): C1, C2, C3
+     * 2. skewed value (list of list): (1,a,x), (2,b,x), (1,c,x), (2,a,y)
+     *
+     * Input: skewed value (list of list): (1,a,x), (2,b,x), (1,c,x), (2,a,y)
+     * Output: Unique skewed value for each skewed column (list of list):
+     * (1,2), (a,b,c), (x,y)
+     *
+     * Output matches order of skewed column. Output can be read as:
+     * C1 has unique skewed value list (1,2)
+     * C2 has unique skewed value list (a,b,c)
+     * C3 has unique skewed value list (x,y)
+     *
+     * @param values
+     *          skewed value list
+     * @return a list of unique skewed value lists
+     */
+    public static List<List<String>> uniqueSkewedValueList(List<List<String>> values) {
+      if ((values == null) || (values.size() == 0)) {
+        return null;
+      }
+
+      // All skewed values have the same length.
+      List<List<String>> result = new ArrayList<List<String>>();
+      for (int i = 0; i < values.get(0).size(); i++) {
+        result.add(new ArrayList<String>());
+      }
+
+      // Add each unique element to its column's list, in order of first occurrence in the
+      // skewed values. The occurrence order itself doesn't matter; once added to a list,
+      // the order is preserved from then on.
+      for (List<String> value : values) {
+        for (int i = 0; i < value.size(); i++) {
+          if (!result.get(i).contains(value.get(i))) {
+            result.get(i).add(value.get(i));
+          }
+        }
+      }
+
+      return result;
+    }
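
For instance, the Javadoc example above translates into this small driver, a sketch
assuming the hive-exec classes from this commit are on the classpath (the driver class
name is hypothetical):

import java.util.Arrays;
import java.util.List;

import org.apache.hadoop.hive.ql.optimizer.listbucketingpruner.ListBucketingPruner.DynamicMultiDimensionalCollection;

public class UniqueSkewedValueListExample {
  public static void main(String[] args) {
    // Skewed values for columns (C1, C2, C3): (1,a,x), (2,b,x), (1,c,x), (2,a,y)
    List<List<String>> skewedValues = Arrays.asList(
        Arrays.asList("1", "a", "x"),
        Arrays.asList("2", "b", "x"),
        Arrays.asList("1", "c", "x"),
        Arrays.asList("2", "a", "y"));
    // Expected output: [[1, 2], [a, b, c], [x, y]] -- one unique-value list per column.
    System.out.println(DynamicMultiDimensionalCollection.uniqueSkewedValueList(skewedValues));
  }
}
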
+
+    /**
+     * Flatten a dynamic multi-dimensional collection.
+     *
+     * For example:
+     * 1. skewed column (list): C1, C2
+     * 2. skewed value (list of list): (1,a), (2,b), (1,c)
+     *
+     * Unique skewed elements for each skewed column (list of list):
+     * (1,2,other), (a,b,c,other)
+     * Index: (0,1,2) (0,1,2,3)
+     *
+     * Complete dynamic-multi-dimension collection
+     * (0,0) (1,a) *       -> T
+     * (0,1) (1,b)         -> T
+     * (0,2) (1,c) *       -> F
+     * (0,3) (1,other)     -> F
+     * (1,0) (2,a)         -> F
+     * (1,1) (2,b) *       -> T
+     * (1,2) (2,c)         -> F
+     * (1,3) (2,other)     -> F
+     * (2,0) (other,a)     -> T
+     * (2,1) (other,b)     -> T
+     * (2,2) (other,c)     -> T
+     * (2,3) (other,other) -> T
+     * (* marks a skewed-value entry)
+     *
+     * @param uniqSkewedElements
+     *
+     * @return the complete flattened collection
+     * @throws SemanticException
+     */
+    public static List<List<String>> flat(List<List<String>> uniqSkewedElements)
+        throws SemanticException {
+      if (uniqSkewedElements == null) {
+        return null;
+      }
+      List<List<String>> collection = new ArrayList<List<String>>();
+      walker(collection, uniqSkewedElements, new ArrayList<String>(), 0);
+      return collection;
+    }
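
Continuing the same example, a small driver for flat under the same classpath
assumption; the literal "other" is written out as in the Javadoc examples (the real
code appends ListBucketingPruner.DEFAULT_SKEWED_KEY instead):

import java.util.Arrays;
import java.util.List;

import org.apache.hadoop.hive.ql.optimizer.listbucketingpruner.ListBucketingPruner.DynamicMultiDimensionalCollection;

public class FlatExample {
  public static void main(String[] args) throws Exception {
    // Unique skewed elements per column, already extended with the default key.
    List<List<String>> uniq = Arrays.asList(
        Arrays.asList("1", "2", "other"),
        Arrays.asList("a", "b", "c", "other"));
    // Prints all 3 x 4 = 12 combinations: [1, a], [1, b], ..., [other, other]
    for (List<String> cell : DynamicMultiDimensionalCollection.flat(uniq)) {
      System.out.println(cell);
    }
  }
}
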
+
+    /**
+     * Flatten the collection recursively.
+     *
+     * @param finalResult
+     * @param input
+     * @param listSoFar
+     * @param level
+     * @throws SemanticException
+     */
+    private static void walker(List<List<String>> finalResult, final List<List<String>> input,
+        List<String> listSoFar, final int level) throws SemanticException {
+      // Base case.
+      if (level == (input.size() - 1)) {
+        assert (input.get(level) != null) : "Unique skewed element list has null list in " + level
+            + "th position.";
+        for (String v : input.get(level)) {
+          List<String> oneCompleteIndex = new ArrayList<String>(listSoFar);
+          oneCompleteIndex.add(v);
+          finalResult.add(oneCompleteIndex);
+        }
+        return;
+      }
+
+      // Recursive.
+      for (String v : input.get(level)) {
+        List<String> clonedListSoFar = new ArrayList<String>(listSoFar);
+        clonedListSoFar.add(v);
+        int nextLevel = level + 1;
+        walker(finalResult, input, clonedListSoFar, nextLevel);
+      }
+    }
+
+  }
+}

Added: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/listbucketingpruner/ListBucketingPrunerUtils.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/listbucketingpruner/ListBucketingPrunerUtils.java?rev=1404924&view=auto
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/listbucketingpruner/ListBucketingPrunerUtils.java (added)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/listbucketingpruner/ListBucketingPrunerUtils.java Fri Nov  2 11:20:26 2012
@@ -0,0 +1,374 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.ql.optimizer.listbucketingpruner;
+
+import java.util.List;
+
+import org.apache.hadoop.hive.ql.exec.FunctionRegistry;
+import org.apache.hadoop.hive.ql.metadata.Partition;
+import org.apache.hadoop.hive.ql.parse.ParseContext;
+import org.apache.hadoop.hive.ql.parse.SemanticException;
+import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc;
+import org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc;
+import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
+import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPEqual;
+
+
+/**
+ * Utilities for list bucketing pruning.
+ *
+ */
+public final class ListBucketingPrunerUtils {
+
+  /**
+   * Decide whether the pruner skips a skewed directory.
+   * Input: whether the skewed value matches the expression tree.
+   * Output: whether the pruner should skip the directory represented by the skewed value.
+   * If the match result is unknown (null) or true, the pruner doesn't skip the directory;
+   * if the match result is false, the pruner skips the directory.
+   * @param bool
+   *          whether the skewed value matches the expression tree
+   * @return true if the directory should be skipped
+   */
+  public static boolean skipSkewedDirectory(Boolean bool) {
+    if (bool == null) {
+      return false;
+    }
+    return !bool.booleanValue();
+  }
+
+  /**
+   * OR two Boolean operands in the context of a pruning match.
+   *
+   * operand one | operand two | OR result
+   * unknown     | T           | T
+   * unknown     | F           | unknown
+   * unknown     | unknown     | unknown
+   * T           | T           | T
+   * T           | F           | T
+   * T           | unknown     | unknown
+   * F           | T           | T
+   * F           | F           | F
+   * F           | unknown     | unknown
+   */
+  public static Boolean orBoolOperand(Boolean o, Boolean a) {
+    // pick up unknown case
+    if (o == null) {
+      if ((a == null) || !a) {
+        return null;
+      } else {
+        return a;
+      }
+    } else if (a == null) {
+      return null;
+    }
+
+    return (o || a);
+  }
+
+  /**
+   * AND two Boolean operands in the context of a pruning match.
+   *
+   * operand one | operand two | AND result
+   * unknown     | T           | unknown
+   * unknown     | F           | F
+   * unknown     | unknown     | unknown
+   * T           | T           | T
+   * T           | F           | F
+   * T           | unknown     | unknown
+   * F           | T           | F
+   * F           | F           | F
+   * F           | unknown     | F
+   * @param o
+   *          one operand
+   * @param a
+   *          another operand
+   * @return result
+   */
+  public static Boolean andBoolOperand(Boolean o, Boolean a) {
+    // pick up unknown case and let and operator handle the rest
+    if (o == null) {
+      if ((a == null) || a) {
+        return null;
+      } else {
+        return a;
+      }
+    } else if (a == null) {
+      return o ? null : Boolean.FALSE;
+    }
+
+    return (o && a);
+  }
+
+  /**
+   * NOT a Boolean operand in the context of a pruning match.
+   *
+   * operand | NOT result
+   * T       | F
+   * F       | T
+   * unknown | unknown
+   * @param input
+   *          match result
+   * @return the negated match result
+   */
+  public static Boolean notBoolOperand(Boolean input) {
+    if (input == null) {
+      return null;
+    }
+    return input ? Boolean.FALSE : Boolean.TRUE;
+  }
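
A tiny driver makes the three tables above concrete; it assumes the hive-exec jar built
from this commit is on the classpath, and prints every combination plus the resulting
skip decision from skipSkewedDirectory:

import org.apache.hadoop.hive.ql.optimizer.listbucketingpruner.ListBucketingPrunerUtils;

public class ThreeValuedOperandDemo {
  public static void main(String[] args) {
    Boolean[] vals = {Boolean.TRUE, Boolean.FALSE, null}; // null stands for unknown
    for (Boolean o : vals) {
      for (Boolean a : vals) {
        System.out.println(o + " or " + a + " -> " + ListBucketingPrunerUtils.orBoolOperand(o, a));
        System.out.println(o + " and " + a + " -> " + ListBucketingPrunerUtils.andBoolOperand(o, a));
      }
      // not, plus the final decision: only a definite false skips a skewed directory.
      System.out.println("not " + o + " -> " + ListBucketingPrunerUtils.notBoolOperand(o));
      System.out.println("skip dir for " + o + "? " + ListBucketingPrunerUtils.skipSkewedDirectory(o));
    }
  }
}
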
+
+  /**
+   * 1. Walk through the tree to decide the value:
+   * 1.1 true means the element matches the expression tree
+   * 1.2 false means the element doesn't match the expression tree
+   * 1.3 unknown means it is unclear whether the element matches the expression tree
+   *
+   * Example:
+   * skewed column: C1, C2
+   * cell: (1,a) , (1,b) , (1,c) , (1,other), (2,a), (2,b) , (2,c), (2,other), (other,a), (other,b),
+   * (other,c), (other,other)
+   *
+   * Expression tree: ((c1=1) and (c2=a)) or ((c1=3) or (c2=b))
+   *
+   *               or
+   *             /    \
+   *          and      or
+   *         /   \    /   \
+   *      c1=1  c2=a c1=3  c2=b
+   * @throws SemanticException
+   *
+   */
+  static Boolean evaluateExprOnCell(List<String> skewedCols, List<String> cell,
+      ExprNodeDesc pruner, List<List<String>> uniqSkewedValues) throws SemanticException {
+    return recursiveExpr(pruner, skewedCols, cell, uniqSkewedValues);
+  }
+
+  /**
+   * Walk through expression tree recursively to evaluate.
+   *
+   *
+   * @param node
+   * @param skewedCols
+   * @param cell
+   * @return
+   * @throws SemanticException
+   */
+  private static Boolean recursiveExpr(final ExprNodeDesc node, final List<String> skewedCols,
+      final List<String> cell, final List<List<String>> uniqSkewedValues)
+      throws SemanticException {
+    if (isUnknownState(node)) {
+      return null;
+    }
+
+    if (node instanceof ExprNodeGenericFuncDesc) {
+      if (((ExprNodeGenericFuncDesc) node).getGenericUDF() instanceof GenericUDFOPEqual) {
+        return evaluateEqualNd(node, skewedCols, cell, uniqSkewedValues);
+      } else if (FunctionRegistry.isOpAnd(node)) {
+        return evaluateAndNode(node, skewedCols, cell, uniqSkewedValues);
+      } else if (FunctionRegistry.isOpOr(node)) {
+        return evaluateOrNode(node, skewedCols, cell, uniqSkewedValues);
+      } else if (FunctionRegistry.isOpNot(node)) {
+        return evaluateNotNode(node, skewedCols, cell, uniqSkewedValues);
+      } else {
+        return null;
+      }
+    } else {
+      return null;
+    }
+  }
+
+  /**
+   * Evaluate equal node.
+   *
+   *
+   * @param node
+   * @param skewedCols
+   * @param cell
+   * @param uniqSkewedValues
+   * @return
+   * @throws SemanticException
+   */
+  private static Boolean evaluateEqualNd(final ExprNodeDesc node, final List<String> skewedCols,
+      final List<String> cell, final List<List<String>> uniqSkewedValues) throws SemanticException {
+    Boolean result = null;
+    List<ExprNodeDesc> children = ((ExprNodeGenericFuncDesc) node).getChildren();
+    assert ((children != null) && (children.size() == 2)) : "GenericUDFOPEqual should have 2 " +
+        "ExprNodeDesc. Node name : " + node.getName();
+    ExprNodeDesc left = children.get(0);
+    ExprNodeDesc right = children.get(1);
+    assert (left instanceof ExprNodeColumnDesc && right instanceof ExprNodeConstantDesc) :
+      "GenericUDFOPEqual should have 2 children: "
+        + " the first is ExprNodeColumnDesc and the second is ExprNodeConstantDesc. "
+        + "But this one, the first one is " + left.getName() + " and the second is "
+        + right.getName();
+    result = startComparisonInEqualNode(skewedCols, cell, uniqSkewedValues, result, left, right);
+    return result;
+  }
+
+  /**
+   * Comparison in equal node
+   *
+   * @param skewedCols
+   * @param cell
+   * @param uniqSkewedValues
+   * @param result
+   * @param left
+   * @param right
+   * @return
+   * @throws SemanticException
+   */
+  private static Boolean startComparisonInEqualNode(final List<String> skewedCols,
+      final List<String> cell, final List<List<String>> uniqSkewedValues, Boolean result,
+      ExprNodeDesc left, ExprNodeDesc right) throws SemanticException {
+    String columnNameInFilter = ((ExprNodeColumnDesc) left).getColumn();
+    String constantValueInFilter = ((ExprNodeConstantDesc) right).getValue().toString();
+    assert (skewedCols.contains(columnNameInFilter)) : "List bucketing pruner has a column name "
+        + columnNameInFilter
+        + " which is not found in the partiton's skewed column list";
+    int index = skewedCols.indexOf(columnNameInFilter);
+    assert (index < cell.size()) : "GenericUDFOPEqual has an ExprNodeColumnDesc ("
+        + columnNameInFilter + ") which is the " + index + "th skewed column,"
+        + " but the matching part can't be found in the cell because the cell size is only "
+        + cell.size();
+    String cellValueInPosition = cell.get(index);
+    assert (index < uniqSkewedValues.size()) : "GenericUDFOPEqual has an ExprNodeColumnDesc ("
+        + columnNameInFilter + ") which is the " + index + "th skewed column,"
+        + " but the matching part can't be found in the unique skewed value list"
+        + " because its size is only " + uniqSkewedValues.size();
+    List<String> uniqSkewedValuesInPosition = uniqSkewedValues.get(index);
+    result = coreComparisonInEqualNode(constantValueInFilter, cellValueInPosition,
+        uniqSkewedValuesInPosition);
+    return result;
+  }
+
+  /**
+   * Compare
+   * @param constantValueInFilter
+   * @param cellValueInPosition
+   * @param uniqSkewedValuesInPosition
+   * @return
+   */
+  private static Boolean coreComparisonInEqualNode(String constantValueInFilter,
+      String cellValueInPosition, List<String> uniqSkewedValuesInPosition) {
+    Boolean result;
+    // Compare the cell value with the constant value in the filter:
+    // 1. if they match and the cell value isn't "other", return true
+    // 2. if they don't match but the cell is "other" and the constant in the filter is not a
+    //    skewed value, return unknown. Why not true? True is not enough: not(true) is false,
+    //    while not(unknown) stays unknown.
+    //    For example, skewed column C with skewed values 1 and 2, and the clause: where not (c = 3).
+    //    For the "other" cell we evaluate not(c=3). If (c=3) were true, not(c=3) would be false
+    //    and we would wrongly skip the default dir; if (c=3) is unknown, not(c=3) is unknown
+    //    and we keep the default dir.
+    // 3. in all other cases, return false
+    if (cellValueInPosition.equals(constantValueInFilter)
+        && !cellValueInPosition.equals(ListBucketingPruner.DEFAULT_SKEWED_KEY)) {
+      result = Boolean.TRUE;
+    } else if (cellValueInPosition.equals(ListBucketingPruner.DEFAULT_SKEWED_KEY)
+        && !uniqSkewedValuesInPosition.contains(constantValueInFilter)) {
+      result = null;
+    } else {
+      result = Boolean.FALSE;
+    }
+    return result;
+  }
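
Restated as a minimal standalone sketch (the helper name compare is hypothetical, and
the literal "other" stands in for ListBucketingPruner.DEFAULT_SKEWED_KEY):

import java.util.Arrays;
import java.util.List;

public class EqualNodeComparisonSketch {

  static Boolean compare(String constantInFilter, String cellValue, List<String> uniqSkewedValues) {
    if (cellValue.equals(constantInFilter) && !cellValue.equals("other")) {
      return Boolean.TRUE;   // case 1: exact skewed-value match
    }
    if (cellValue.equals("other") && !uniqSkewedValues.contains(constantInFilter)) {
      return null;           // case 2: "other" vs. a non-skewed constant -> unknown
    }
    return Boolean.FALSE;    // case 3: everything else
  }

  public static void main(String[] args) {
    List<String> uniq = Arrays.asList("1", "2"); // skewed values of column c
    System.out.println(compare("1", "1", uniq));     // TRUE
    System.out.println(compare("3", "other", uniq)); // null (unknown), so not(c=3) stays unknown
    System.out.println(compare("2", "1", uniq));     // FALSE
  }
}
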
+
+  private static Boolean evaluateNotNode(final ExprNodeDesc node, final List<String> skewedCols,
+      final List<String> cell, final List<List<String>> uniqSkewedValues) throws SemanticException {
+    List<ExprNodeDesc> children = ((ExprNodeGenericFuncDesc) node).getChildren();
+    if ((children == null) || (children.size() != 1)) {
+      throw new SemanticException("GenericUDFOPNot should have 1 ExprNodeDesc. Node name : "
+          + node.getName());
+    }
+    ExprNodeDesc child = children.get(0);
+    return notBoolOperand(recursiveExpr(child, skewedCols, cell, uniqSkewedValues));
+  }
+
+  private static Boolean evaluateOrNode(final ExprNodeDesc node, final List<String> skewedCols,
+      final List<String> cell, final List<List<String>> uniqSkewedValues) throws SemanticException {
+    List<ExprNodeDesc> children = ((ExprNodeGenericFuncDesc) node).getChildren();
+    if ((children == null) || (children.size() != 2)) {
+      throw new SemanticException("GenericUDFOPOr should have 2 ExprNodeDesc. Node name : "
+          + node.getName());
+    }
+    ExprNodeDesc left = children.get(0);
+    ExprNodeDesc right = children.get(1);
+    return orBoolOperand(recursiveExpr(left, skewedCols, cell, uniqSkewedValues),
+        recursiveExpr(right, skewedCols, cell, uniqSkewedValues));
+  }
+
+  private static Boolean evaluateAndNode(final ExprNodeDesc node, final List<String> skewedCols,
+      final List<String> cell, final List<List<String>> uniqSkewedValues) throws SemanticException {
+    List<ExprNodeDesc> children = ((ExprNodeGenericFuncDesc) node).getChildren();
+    if ((children == null) || (children.size() != 2)) {
+      throw new SemanticException("GenericUDFOPAnd should have 2 ExprNodeDesc. Node name : "
+          + node.getName());
+    }
+    ExprNodeDesc left = children.get(0);
+    ExprNodeDesc right = children.get(1);
+    return andBoolOperand(recursiveExpr(left, skewedCols, cell, uniqSkewedValues),
+        recursiveExpr(right, skewedCols, cell, uniqSkewedValues));
+  }
+
+  /**
+   * Check if the node is unknown
+   *
+   *
+   * unknown is marked in {@link ListBucketingPruner#transform(ParseContext)} <blockquote>
+   *
+   * <pre>
+   * newcd = new ExprNodeConstantDesc(cd.getTypeInfo(), null)
+   * </pre>
+   *
+   * for example:
+   *
+   * 1. a non-skewed column
+   *
+   * 2. an operator other than and/or/not ...
+   *
+   *
+   * @param descNd
+   * @return
+   */
+  static boolean isUnknownState(ExprNodeDesc descNd) {
+    boolean unknown = false;
+    if ((descNd == null)
+        || (descNd instanceof ExprNodeConstantDesc
+        && ((ExprNodeConstantDesc) descNd).getValue() == null)) {
+      unknown = true;
+    }
+    return unknown;
+  }
+
+  /**
+   * Check whether the partition is list bucketed.
+   *
+   * @param part
+   * @return
+   */
+  public static boolean isListBucketingPart(Partition part) {
+    return (part.getSkewedColNames() != null) && (part.getSkewedColNames().size() > 0)
+        && (part.getSkewedColValues() != null) && (part.getSkewedColValues().size() > 0)
+        && (part.getSkewedColValueLocationMaps() != null)
+        && (part.getSkewedColValueLocationMaps().size() > 0);
+  }
+
+}


