From: weiz@apache.org
To: commits@hive.apache.org
Subject: hive git commit: HIVE-13040 : Handle empty bucket creations more efficiently (Ashutosh Chauhan, reviewed by Prasanth Jayachandran)
Date: Thu, 14 Jul 2016 22:09:02 +0000 (UTC)

Repository: hive
Updated Branches:
  refs/heads/branch-1 8f500f8ad -> 3e51861a2


HIVE-13040 : Handle empty bucket creations more efficiently (Ashutosh Chauhan, reviewed by Prasanth Jayachandran)


Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/3e51861a
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/3e51861a
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/3e51861a

Branch: refs/heads/branch-1
Commit: 3e51861a215f62e842489f584a87b5be96316a41
Parents: 8f500f8
Author: Wei Zheng
Authored: Thu Jul 14 15:09:48 2016 -0700
Committer: Wei Zheng
Committed: Thu Jul 14 15:09:48 2016 -0700

----------------------------------------------------------------------
 .../hadoop/hive/ql/exec/StatsNoJobTask.java      | 67 ++++++++++---------
 .../apache/hadoop/hive/ql/exec/Utilities.java    |  5 +-
 .../org/apache/hadoop/hive/ql/io/AcidUtils.java  | 17 +++--
 .../apache/hadoop/hive/ql/io/orc/OrcFile.java    |  8 +++
 .../hadoop/hive/ql/io/orc/OrcInputFormat.java    | 16 +++--
 .../hadoop/hive/ql/io/orc/OrcOutputFormat.java   | 17 ++---
 .../hive/ql/txn/compactor/CompactorMR.java       |  2 +-
 .../hadoop/hive/ql/txn/compactor/Initiator.java  |  2 +-
 .../hive/ql/io/orc/TestInputOutputFormat.java    | 10 +--
 .../dynpart_sort_opt_vectorization.q.out         |  4 +-
 .../tez/dynpart_sort_opt_vectorization.q.out     |  8 +--
 .../tez/dynpart_sort_optimization.q.out          | 70 +++++++++----------
 .../apache/hadoop/hive/shims/Hadoop23Shims.java  |  2 +-
 13 files changed, 121 insertions(+), 107 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hive/blob/3e51861a/ql/src/java/org/apache/hadoop/hive/ql/exec/StatsNoJobTask.java
----------------------------------------------------------------------
diff --git
a/ql/src/java/org/apache/hadoop/hive/ql/exec/StatsNoJobTask.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/StatsNoJobTask.java index 0d99cbc..fe49e15 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/StatsNoJobTask.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/StatsNoJobTask.java @@ -123,7 +123,7 @@ public class StatsNoJobTask extends Task implements Serializable class StatsCollection implements Runnable { - private Partition partn; + private final Partition partn; public StatsCollection(Partition part) { this.partn = part; @@ -148,7 +148,7 @@ public class StatsNoJobTask extends Task implements Serializable boolean statsAvailable = false; for(FileStatus file: fileList) { if (!file.isDir()) { - InputFormat inputFormat = (InputFormat) ReflectionUtil.newInstance( + InputFormat inputFormat = ReflectionUtil.newInstance( partn.getInputFormatClass(), jc); InputSplit dummySplit = new FileSplit(file.getPath(), 0, 0, new String[] { partn.getLocation() }); @@ -193,7 +193,7 @@ public class StatsNoJobTask extends Task implements Serializable "Failed with exception " + e.getMessage() + "\n" + StringUtils.stringifyException(e)); // Before updating the partition params, if any partition params is null - // and if statsReliable is true then updatePartition() function will fail + // and if statsReliable is true then updatePartition() function will fail // the task by returning 1 if (work.isStatsReliable()) { partUpdates.put(tPart.getSd().getLocation(), null); @@ -244,40 +244,45 @@ public class StatsNoJobTask extends Task implements Serializable boolean statsAvailable = false; for(FileStatus file: fileList) { if (!file.isDir()) { - InputFormat inputFormat = (InputFormat) ReflectionUtil.newInstance( + InputFormat inputFormat = ReflectionUtil.newInstance( table.getInputFormatClass(), jc); - InputSplit dummySplit = new FileSplit(file.getPath(), 0, 0, new String[] { table - .getDataLocation().toString() }); - org.apache.hadoop.mapred.RecordReader recordReader = - inputFormat.getRecordReader(dummySplit, jc, Reporter.NULL); - StatsProvidingRecordReader statsRR; - if (recordReader instanceof StatsProvidingRecordReader) { - statsRR = (StatsProvidingRecordReader) recordReader; - numRows += statsRR.getStats().getRowCount(); - rawDataSize += statsRR.getStats().getRawDataSize(); - fileSize += file.getLen(); + InputSplit dummySplit = new FileSplit(file.getPath(), 0, 0, new String[]{table + .getDataLocation().toString()}); + if (file.getLen() == 0) { numFiles += 1; statsAvailable = true; + } else { + org.apache.hadoop.mapred.RecordReader recordReader = + inputFormat.getRecordReader(dummySplit, jc, Reporter.NULL); + StatsProvidingRecordReader statsRR; + if (recordReader instanceof StatsProvidingRecordReader) { + statsRR = (StatsProvidingRecordReader) recordReader; + numRows += statsRR.getStats().getRowCount(); + rawDataSize += statsRR.getStats().getRawDataSize(); + fileSize += file.getLen(); + numFiles += 1; + statsAvailable = true; + } + recordReader.close(); } - recordReader.close(); } - } - if (statsAvailable) { - parameters.put(StatsSetupConst.ROW_COUNT, String.valueOf(numRows)); - parameters.put(StatsSetupConst.RAW_DATA_SIZE, String.valueOf(rawDataSize)); - parameters.put(StatsSetupConst.TOTAL_SIZE, String.valueOf(fileSize)); - parameters.put(StatsSetupConst.NUM_FILES, String.valueOf(numFiles)); - parameters.put(StatsSetupConst.STATS_GENERATED_VIA_STATS_TASK, StatsSetupConst.TRUE); - - db.alterTable(tableFullName, new Table(tTable)); - - String msg = "Table " + tableFullName + " stats: [" + 
toString(parameters) + ']'; - LOG.debug(msg); - console.printInfo(msg); - } else { - String msg = "Table " + tableFullName + " does not provide stats."; - LOG.debug(msg); + if (statsAvailable) { + parameters.put(StatsSetupConst.ROW_COUNT, String.valueOf(numRows)); + parameters.put(StatsSetupConst.RAW_DATA_SIZE, String.valueOf(rawDataSize)); + parameters.put(StatsSetupConst.TOTAL_SIZE, String.valueOf(fileSize)); + parameters.put(StatsSetupConst.NUM_FILES, String.valueOf(numFiles)); + parameters.put(StatsSetupConst.STATS_GENERATED_VIA_STATS_TASK, StatsSetupConst.TRUE); + + db.alterTable(tableFullName, new Table(tTable)); + + String msg = "Table " + tableFullName + " stats: [" + toString(parameters) + ']'; + LOG.debug(msg); + console.printInfo(msg); + } else { + String msg = "Table " + tableFullName + " does not provide stats."; + LOG.debug(msg); + } } } catch (Exception e) { console.printInfo("[Warning] could not update stats for " + tableFullName + ".", http://git-wip-us.apache.org/repos/asf/hive/blob/3e51861a/ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java index 4093134..0a32e6c 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java @@ -2135,7 +2135,7 @@ public final class Utilities { taskIDToFile = removeTempOrDuplicateFiles(items, fs); // if the table is bucketed and enforce bucketing, we should check and generate all buckets - if (dpCtx.getNumBuckets() > 0 && taskIDToFile != null) { + if (dpCtx.getNumBuckets() > 0 && taskIDToFile != null && !"tez".equalsIgnoreCase(hconf.get(ConfVars.HIVE_EXECUTION_ENGINE.varname))) { // refresh the file list items = fs.listStatus(parts[i].getPath()); // get the missing buckets and generate empty buckets @@ -2155,8 +2155,7 @@ public final class Utilities { FileStatus[] items = fs.listStatus(path); taskIDToFile = removeTempOrDuplicateFiles(items, fs); if(taskIDToFile != null && taskIDToFile.size() > 0 && conf != null && conf.getTable() != null - && (conf.getTable().getNumBuckets() > taskIDToFile.size()) - && (HiveConf.getBoolVar(hconf, HiveConf.ConfVars.HIVEENFORCEBUCKETING))) { + && (conf.getTable().getNumBuckets() > taskIDToFile.size()) && !"tez".equalsIgnoreCase(hconf.get(ConfVars.HIVE_EXECUTION_ENGINE.varname))) { // get the missing buckets and generate empty buckets for non-dynamic partition String taskID1 = taskIDToFile.keySet().iterator().next(); Path bucketPath = taskIDToFile.values().iterator().next().getPath(); http://git-wip-us.apache.org/repos/asf/hive/blob/3e51861a/ql/src/java/org/apache/hadoop/hive/ql/io/AcidUtils.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/AcidUtils.java b/ql/src/java/org/apache/hadoop/hive/ql/io/AcidUtils.java index 8bcf6d7..7d1517d 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/io/AcidUtils.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/AcidUtils.java @@ -102,6 +102,7 @@ public class AcidUtils { Pattern.compile("[0-9]+_[0-9]+"); public static final PathFilter hiddenFileFilter = new PathFilter(){ + @Override public boolean accept(Path p){ String name = p.getName(); return !name.startsWith("_") && !name.startsWith("."); @@ -460,7 +461,14 @@ public class AcidUtils { return false; } - /** + public static Directory getAcidState(Path directory, + 
Configuration conf, + ValidTxnList txnList + ) throws IOException { + return getAcidState(directory, conf, txnList, false); + } + + /** * Get the ACID state of the given directory. It finds the minimal set of * base and diff directories. Note that because major compactions don't * preserve the history, we can't use a base directory that includes a @@ -473,7 +481,8 @@ public class AcidUtils { */ public static Directory getAcidState(Path directory, Configuration conf, - ValidTxnList txnList + ValidTxnList txnList, + boolean ignoreEmptyFiles ) throws IOException { FileSystem fs = directory.getFileSystem(conf); FileStatus bestBase = null; @@ -513,7 +522,7 @@ public class AcidUtils { // it is possible that the cleaner is running and removing these original files, // in which case recursing through them could cause us to get an error. originalDirectories.add(child); - } else { + } else if (!ignoreEmptyFiles || child.getLen() != 0) { original.add(child); } } @@ -590,7 +599,7 @@ public class AcidUtils { } }; } - + /** * Find the original files (non-ACID layout) recursively under the partition * directory. http://git-wip-us.apache.org/repos/asf/hive/blob/3e51861a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcFile.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcFile.java b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcFile.java index 906eb6b..dc00e38 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcFile.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcFile.java @@ -491,6 +491,14 @@ public final class OrcFile { return this; } + public FileSystem getFileSystem() { + return fileSystemValue; + } + + public Configuration getConfiguration() { + return configuration; + } + public int getBufferSize() { return bufferSizeValue; } http://git-wip-us.apache.org/repos/asf/hive/blob/3e51861a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java index 35469d1..94b5461 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java @@ -633,12 +633,14 @@ public class OrcInputFormat implements InputFormat, public List getSplits() throws IOException { List splits = Lists.newArrayList(); for (FileStatus fileStatus : fileStatuses) { - TreeMap blockOffsets = SHIMS.getLocationsWithOffset(fs, fileStatus); - for (Map.Entry entry : blockOffsets.entrySet()) { - OrcSplit orcSplit = new OrcSplit(fileStatus.getPath(), entry.getKey(), - entry.getValue().getLength(), entry.getValue().getHosts(), null, isOriginal, true, - deltas, -1, fileStatus.getLen()); - splits.add(orcSplit); + if (fileStatus.getLen() != 0) { + TreeMap blockOffsets = SHIMS.getLocationsWithOffset(fs, fileStatus); + for (Map.Entry entry : blockOffsets.entrySet()) { + OrcSplit orcSplit = new OrcSplit(fileStatus.getPath(), entry.getKey(), + entry.getValue().getLength(), entry.getValue().getHosts(), null, isOriginal, true, + deltas, -1, fileStatus.getLen()); + splits.add(orcSplit); + } } } @@ -710,7 +712,7 @@ public class OrcInputFormat implements InputFormat, public SplitStrategy call() throws IOException { final SplitStrategy splitStrategy; AcidUtils.Directory dirInfo = AcidUtils.getAcidState(dir, - context.conf, context.transactionList); + 
context.conf, context.transactionList, true); List deltas = AcidUtils.serializeDeltas(dirInfo.getCurrentDirectories()); Path base = dirInfo.getBaseDirectory(); List original = dirInfo.getOriginalFiles(); http://git-wip-us.apache.org/repos/asf/hive/blob/3e51861a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcOutputFormat.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcOutputFormat.java b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcOutputFormat.java index bc55677..7d1b994 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcOutputFormat.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcOutputFormat.java @@ -93,23 +93,20 @@ public class OrcOutputFormat extends FileOutputFormat @Override public void close(boolean b) throws IOException { - // if we haven't written any rows, we need to create a file with a - // generic schema. if (writer == null) { - // a row with no columns - ObjectInspector inspector = ObjectInspectorFactory. - getStandardStructObjectInspector(new ArrayList(), - new ArrayList()); - options.inspector(inspector); - writer = OrcFile.createWriter(path, options); + // we are closing a file without writing any data in it + FileSystem fs = options.getFileSystem() == null ? + path.getFileSystem(options.getConfiguration()) : options.getFileSystem(); + fs.createNewFile(path); + return; } writer.close(); } @Override public SerDeStats getStats() { - stats.setRawDataSize(writer.getRawDataSize()); - stats.setRowCount(writer.getNumberOfRows()); + stats.setRawDataSize(null == writer ? 0 : writer.getRawDataSize()); + stats.setRowCount(null == writer ? 0 : writer.getNumberOfRows()); return stats; } } http://git-wip-us.apache.org/repos/asf/hive/blob/3e51861a/ql/src/java/org/apache/hadoop/hive/ql/txn/compactor/CompactorMR.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/txn/compactor/CompactorMR.java b/ql/src/java/org/apache/hadoop/hive/ql/txn/compactor/CompactorMR.java index e7ea70f..d99bbd4 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/txn/compactor/CompactorMR.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/txn/compactor/CompactorMR.java @@ -200,7 +200,7 @@ public class CompactorMR { // and discovering that in getSplits is too late as we then have no way to pass it to our // mapper. 
- AcidUtils.Directory dir = AcidUtils.getAcidState(new Path(sd.getLocation()), conf, txns); + AcidUtils.Directory dir = AcidUtils.getAcidState(new Path(sd.getLocation()), conf, txns, true); List parsedDeltas = dir.getCurrentDirectories(); int maxDeltastoHandle = conf.getIntVar(HiveConf.ConfVars.COMPACTOR_MAX_NUM_DELTA); if(parsedDeltas.size() > maxDeltastoHandle) { http://git-wip-us.apache.org/repos/asf/hive/blob/3e51861a/ql/src/java/org/apache/hadoop/hive/ql/txn/compactor/Initiator.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/txn/compactor/Initiator.java b/ql/src/java/org/apache/hadoop/hive/ql/txn/compactor/Initiator.java index 1a63f99..d654b76 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/txn/compactor/Initiator.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/txn/compactor/Initiator.java @@ -253,7 +253,7 @@ public class Initiator extends CompactorThread { boolean noBase = false; Path location = new Path(sd.getLocation()); FileSystem fs = location.getFileSystem(conf); - AcidUtils.Directory dir = AcidUtils.getAcidState(location, conf, txns); + AcidUtils.Directory dir = AcidUtils.getAcidState(location, conf, txns, false); Path base = dir.getBaseDirectory(); long baseSize = 0; FileStatus stat = null; http://git-wip-us.apache.org/repos/asf/hive/blob/3e51861a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java ---------------------------------------------------------------------- diff --git a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java b/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java index 31d561b..47abc74 100644 --- a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java +++ b/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java @@ -537,11 +537,11 @@ public class TestInputOutputFormat { public void testFileGenerator() throws Exception { OrcInputFormat.Context context = new OrcInputFormat.Context(conf); MockFileSystem fs = new MockFileSystem(conf, - new MockFile("mock:/a/b/part-00", 1000, new byte[0]), - new MockFile("mock:/a/b/part-01", 1000, new byte[0]), - new MockFile("mock:/a/b/_part-02", 1000, new byte[0]), - new MockFile("mock:/a/b/.part-03", 1000, new byte[0]), - new MockFile("mock:/a/b/part-04", 1000, new byte[0])); + new MockFile("mock:/a/b/part-00", 1000, new byte[1]), + new MockFile("mock:/a/b/part-01", 1000, new byte[1]), + new MockFile("mock:/a/b/_part-02", 1000, new byte[1]), + new MockFile("mock:/a/b/.part-03", 1000, new byte[1]), + new MockFile("mock:/a/b/part-04", 1000, new byte[1])); OrcInputFormat.FileGenerator gen = new OrcInputFormat.FileGenerator(context, fs, new MockPath(fs, "mock:/a/b")); http://git-wip-us.apache.org/repos/asf/hive/blob/3e51861a/ql/src/test/results/clientpositive/dynpart_sort_opt_vectorization.q.out ---------------------------------------------------------------------- diff --git a/ql/src/test/results/clientpositive/dynpart_sort_opt_vectorization.q.out b/ql/src/test/results/clientpositive/dynpart_sort_opt_vectorization.q.out index 4c8ddd3..87f19ab 100644 --- a/ql/src/test/results/clientpositive/dynpart_sort_opt_vectorization.q.out +++ b/ql/src/test/results/clientpositive/dynpart_sort_opt_vectorization.q.out @@ -1114,7 +1114,7 @@ Partition Parameters: numFiles 8 numRows 6 rawDataSize 120 - totalSize 2212 + totalSize 2016 #### A masked pattern was here #### # Storage Information @@ -1200,7 +1200,7 @@ Partition Parameters: numFiles 8 numRows 6 rawDataSize 120 - 
totalSize 2212 + totalSize 2016 #### A masked pattern was here #### # Storage Information http://git-wip-us.apache.org/repos/asf/hive/blob/3e51861a/ql/src/test/results/clientpositive/tez/dynpart_sort_opt_vectorization.q.out ---------------------------------------------------------------------- diff --git a/ql/src/test/results/clientpositive/tez/dynpart_sort_opt_vectorization.q.out b/ql/src/test/results/clientpositive/tez/dynpart_sort_opt_vectorization.q.out index 22afcbf..6967548 100644 --- a/ql/src/test/results/clientpositive/tez/dynpart_sort_opt_vectorization.q.out +++ b/ql/src/test/results/clientpositive/tez/dynpart_sort_opt_vectorization.q.out @@ -1163,10 +1163,10 @@ Protect Mode: None #### A masked pattern was here #### Partition Parameters: COLUMN_STATS_ACCURATE true - numFiles 8 + numFiles 4 numRows 6 rawDataSize 120 - totalSize 2212 + totalSize 2016 #### A masked pattern was here #### # Storage Information @@ -1249,10 +1249,10 @@ Protect Mode: None #### A masked pattern was here #### Partition Parameters: COLUMN_STATS_ACCURATE true - numFiles 8 + numFiles 4 numRows 6 rawDataSize 120 - totalSize 2212 + totalSize 2016 #### A masked pattern was here #### # Storage Information http://git-wip-us.apache.org/repos/asf/hive/blob/3e51861a/ql/src/test/results/clientpositive/tez/dynpart_sort_optimization.q.out ---------------------------------------------------------------------- diff --git a/ql/src/test/results/clientpositive/tez/dynpart_sort_optimization.q.out b/ql/src/test/results/clientpositive/tez/dynpart_sort_optimization.q.out index 6df203a..3032b21 100644 --- a/ql/src/test/results/clientpositive/tez/dynpart_sort_optimization.q.out +++ b/ql/src/test/results/clientpositive/tez/dynpart_sort_optimization.q.out @@ -1076,7 +1076,7 @@ Protect Mode: None #### A masked pattern was here #### Partition Parameters: COLUMN_STATS_ACCURATE true - numFiles 8 + numFiles 4 numRows 6 rawDataSize 156 totalSize 162 @@ -1162,7 +1162,7 @@ Protect Mode: None #### A masked pattern was here #### Partition Parameters: COLUMN_STATS_ACCURATE true - numFiles 8 + numFiles 4 numRows 6 rawDataSize 156 totalSize 162 @@ -2435,7 +2435,6 @@ STAGE DEPENDENCIES: STAGE PLANS: Stage: Stage-1 Tez -#### A masked pattern was here #### Edges: Reducer 2 <- Map 1 (SIMPLE_EDGE) #### A masked pattern was here #### @@ -2449,15 +2448,15 @@ STAGE PLANS: predicate: (s = 'foo') (type: boolean) Statistics: Num rows: 1974 Data size: 53304 Basic stats: COMPLETE Column stats: NONE Select Operator - expressions: si (type: smallint), b (type: bigint), f (type: float), t (type: tinyint), i (type: int) - outputColumnNames: _col0, _col1, _col2, _col4, _col5 + expressions: si (type: smallint), b (type: bigint), f (type: float), 'foo' (type: string), t (type: tinyint), i (type: int) + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5 Statistics: Num rows: 1974 Data size: 53304 Basic stats: COMPLETE Column stats: NONE Reduce Output Operator - key expressions: 'foo' (type: string), _col4 (type: tinyint), _col5 (type: int) + key expressions: _col3 (type: string), _col4 (type: tinyint), _col5 (type: int) sort order: +++ - Map-reduce partition columns: 'foo' (type: string), _col4 (type: tinyint), _col5 (type: int) + Map-reduce partition columns: _col3 (type: string), _col4 (type: tinyint), _col5 (type: int) Statistics: Num rows: 1974 Data size: 53304 Basic stats: COMPLETE Column stats: NONE - value expressions: _col0 (type: smallint), _col1 (type: bigint), _col2 (type: float), 'foo' (type: string), _col4 (type: tinyint), _col5 (type: int) + 
value expressions: _col0 (type: smallint), _col1 (type: bigint), _col2 (type: float), _col3 (type: string), _col4 (type: tinyint), _col5 (type: int) Reducer 2 Reduce Operator Tree: Select Operator @@ -2506,7 +2505,6 @@ STAGE DEPENDENCIES: STAGE PLANS: Stage: Stage-1 Tez -#### A masked pattern was here #### Edges: Reducer 2 <- Map 1 (SIMPLE_EDGE) #### A masked pattern was here #### @@ -2520,15 +2518,15 @@ STAGE PLANS: predicate: (t = 27) (type: boolean) Statistics: Num rows: 429 Data size: 53255 Basic stats: COMPLETE Column stats: NONE Select Operator - expressions: si (type: smallint), b (type: bigint), f (type: float), s (type: string), i (type: int) - outputColumnNames: _col0, _col1, _col2, _col3, _col5 + expressions: si (type: smallint), b (type: bigint), f (type: float), s (type: string), 27 (type: tinyint), i (type: int) + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5 Statistics: Num rows: 429 Data size: 53255 Basic stats: COMPLETE Column stats: NONE Reduce Output Operator - key expressions: _col3 (type: string), 27 (type: tinyint), _col5 (type: int) + key expressions: _col3 (type: string), _col4 (type: tinyint), _col5 (type: int) sort order: +++ - Map-reduce partition columns: _col3 (type: string), 27 (type: tinyint), _col5 (type: int) + Map-reduce partition columns: _col3 (type: string), _col4 (type: tinyint), _col5 (type: int) Statistics: Num rows: 429 Data size: 53255 Basic stats: COMPLETE Column stats: NONE - value expressions: _col0 (type: smallint), _col1 (type: bigint), _col2 (type: float), _col3 (type: string), 27 (type: tinyint), _col5 (type: int) + value expressions: _col0 (type: smallint), _col1 (type: bigint), _col2 (type: float), _col3 (type: string), _col4 (type: tinyint), _col5 (type: int) Reducer 2 Reduce Operator Tree: Select Operator @@ -2577,7 +2575,6 @@ STAGE DEPENDENCIES: STAGE PLANS: Stage: Stage-1 Tez -#### A masked pattern was here #### Edges: Reducer 2 <- Map 1 (SIMPLE_EDGE) #### A masked pattern was here #### @@ -2591,15 +2588,15 @@ STAGE PLANS: predicate: (i = 100) (type: boolean) Statistics: Num rows: 429 Data size: 53255 Basic stats: COMPLETE Column stats: NONE Select Operator - expressions: si (type: smallint), b (type: bigint), f (type: float), s (type: string), t (type: tinyint) - outputColumnNames: _col0, _col1, _col2, _col3, _col4 + expressions: si (type: smallint), b (type: bigint), f (type: float), s (type: string), t (type: tinyint), 100 (type: int) + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5 Statistics: Num rows: 429 Data size: 53255 Basic stats: COMPLETE Column stats: NONE Reduce Output Operator - key expressions: _col3 (type: string), _col4 (type: tinyint), 100 (type: int) + key expressions: _col3 (type: string), _col4 (type: tinyint), _col5 (type: int) sort order: +++ - Map-reduce partition columns: _col3 (type: string), _col4 (type: tinyint), 100 (type: int) + Map-reduce partition columns: _col3 (type: string), _col4 (type: tinyint), _col5 (type: int) Statistics: Num rows: 429 Data size: 53255 Basic stats: COMPLETE Column stats: NONE - value expressions: _col0 (type: smallint), _col1 (type: bigint), _col2 (type: float), _col3 (type: string), _col4 (type: tinyint), 100 (type: int) + value expressions: _col0 (type: smallint), _col1 (type: bigint), _col2 (type: float), _col3 (type: string), _col4 (type: tinyint), _col5 (type: int) Reducer 2 Reduce Operator Tree: Select Operator @@ -2648,7 +2645,6 @@ STAGE DEPENDENCIES: STAGE PLANS: Stage: Stage-1 Tez -#### A masked pattern was here #### Edges: Reducer 2 <- Map 1 
(SIMPLE_EDGE) #### A masked pattern was here #### @@ -2662,15 +2658,15 @@ STAGE PLANS: predicate: ((i = 100) and (t = 27)) (type: boolean) Statistics: Num rows: 214 Data size: 26565 Basic stats: COMPLETE Column stats: NONE Select Operator - expressions: si (type: smallint), b (type: bigint), f (type: float), s (type: string) - outputColumnNames: _col0, _col1, _col2, _col3 + expressions: si (type: smallint), b (type: bigint), f (type: float), s (type: string), 27 (type: tinyint), 100 (type: int) + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5 Statistics: Num rows: 214 Data size: 26565 Basic stats: COMPLETE Column stats: NONE Reduce Output Operator - key expressions: _col3 (type: string), 27 (type: tinyint), 100 (type: int) + key expressions: _col3 (type: string), _col4 (type: tinyint), _col5 (type: int) sort order: +++ - Map-reduce partition columns: _col3 (type: string), 27 (type: tinyint), 100 (type: int) + Map-reduce partition columns: _col3 (type: string), _col4 (type: tinyint), _col5 (type: int) Statistics: Num rows: 214 Data size: 26565 Basic stats: COMPLETE Column stats: NONE - value expressions: _col0 (type: smallint), _col1 (type: bigint), _col2 (type: float), _col3 (type: string), 27 (type: tinyint), 100 (type: int) + value expressions: _col0 (type: smallint), _col1 (type: bigint), _col2 (type: float), _col3 (type: string), _col4 (type: tinyint), _col5 (type: int) Reducer 2 Reduce Operator Tree: Select Operator @@ -2719,7 +2715,6 @@ STAGE DEPENDENCIES: STAGE PLANS: Stage: Stage-1 Tez -#### A masked pattern was here #### Edges: Reducer 2 <- Map 1 (SIMPLE_EDGE) #### A masked pattern was here #### @@ -2733,15 +2728,15 @@ STAGE PLANS: predicate: ((i = 100) and (s = 'foo')) (type: boolean) Statistics: Num rows: 987 Data size: 26652 Basic stats: COMPLETE Column stats: NONE Select Operator - expressions: si (type: smallint), b (type: bigint), f (type: float), t (type: tinyint) - outputColumnNames: _col0, _col1, _col2, _col4 + expressions: si (type: smallint), b (type: bigint), f (type: float), 'foo' (type: string), t (type: tinyint), 100 (type: int) + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5 Statistics: Num rows: 987 Data size: 26652 Basic stats: COMPLETE Column stats: NONE Reduce Output Operator - key expressions: 'foo' (type: string), _col4 (type: tinyint), 100 (type: int) + key expressions: _col3 (type: string), _col4 (type: tinyint), _col5 (type: int) sort order: +++ - Map-reduce partition columns: 'foo' (type: string), _col4 (type: tinyint), 100 (type: int) + Map-reduce partition columns: _col3 (type: string), _col4 (type: tinyint), _col5 (type: int) Statistics: Num rows: 987 Data size: 26652 Basic stats: COMPLETE Column stats: NONE - value expressions: _col0 (type: smallint), _col1 (type: bigint), _col2 (type: float), 'foo' (type: string), _col4 (type: tinyint), 100 (type: int) + value expressions: _col0 (type: smallint), _col1 (type: bigint), _col2 (type: float), _col3 (type: string), _col4 (type: tinyint), _col5 (type: int) Reducer 2 Reduce Operator Tree: Select Operator @@ -2790,7 +2785,6 @@ STAGE DEPENDENCIES: STAGE PLANS: Stage: Stage-1 Tez -#### A masked pattern was here #### Edges: Reducer 2 <- Map 1 (SIMPLE_EDGE) #### A masked pattern was here #### @@ -2804,15 +2798,15 @@ STAGE PLANS: predicate: ((t = 27) and (s = 'foo')) (type: boolean) Statistics: Num rows: 987 Data size: 26652 Basic stats: COMPLETE Column stats: NONE Select Operator - expressions: si (type: smallint), b (type: bigint), f (type: float), i (type: int) - outputColumnNames: 
_col0, _col1, _col2, _col5 + expressions: si (type: smallint), b (type: bigint), f (type: float), 'foo' (type: string), 27 (type: tinyint), i (type: int) + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5 Statistics: Num rows: 987 Data size: 26652 Basic stats: COMPLETE Column stats: NONE Reduce Output Operator - key expressions: 'foo' (type: string), 27 (type: tinyint), _col5 (type: int) + key expressions: _col3 (type: string), _col4 (type: tinyint), _col5 (type: int) sort order: +++ - Map-reduce partition columns: 'foo' (type: string), 27 (type: tinyint), _col5 (type: int) + Map-reduce partition columns: _col3 (type: string), _col4 (type: tinyint), _col5 (type: int) Statistics: Num rows: 987 Data size: 26652 Basic stats: COMPLETE Column stats: NONE - value expressions: _col0 (type: smallint), _col1 (type: bigint), _col2 (type: float), 'foo' (type: string), 27 (type: tinyint), _col5 (type: int) + value expressions: _col0 (type: smallint), _col1 (type: bigint), _col2 (type: float), _col3 (type: string), _col4 (type: tinyint), _col5 (type: int) Reducer 2 Reduce Operator Tree: Select Operator http://git-wip-us.apache.org/repos/asf/hive/blob/3e51861a/shims/0.23/src/main/java/org/apache/hadoop/hive/shims/Hadoop23Shims.java ---------------------------------------------------------------------- diff --git a/shims/0.23/src/main/java/org/apache/hadoop/hive/shims/Hadoop23Shims.java b/shims/0.23/src/main/java/org/apache/hadoop/hive/shims/Hadoop23Shims.java index 2e09882..5f78481 100644 --- a/shims/0.23/src/main/java/org/apache/hadoop/hive/shims/Hadoop23Shims.java +++ b/shims/0.23/src/main/java/org/apache/hadoop/hive/shims/Hadoop23Shims.java @@ -163,7 +163,7 @@ public class Hadoop23Shims extends HadoopShimsSecure { Iterator it = result.iterator(); while (it.hasNext()) { FileStatus stat = it.next(); - if (!stat.isFile()) { + if (!stat.isFile() || (stat.getLen() == 0 && !stat.getPath().toUri().getScheme().equals("nullscan"))) { it.remove(); } }
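
----------------------------------------------------------------------
A note on the convention this patch establishes (not part of the commit itself): a bucket that
receives no rows is now materialized as a plain zero-length file instead of a schema-only ORC
file, and every consumer that enumerates bucket files (StatsNoJobTask, OrcInputFormat split
generation, AcidUtils, the Hadoop23Shims listing filter above) treats length == 0 as "no data".
A minimal sketch of both sides of that convention, using hypothetical helper names
(createEmptyBucket, hasData) that do not appear in the patch:

    import java.io.IOException;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileStatus;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;

    public class EmptyBucketSketch {
      /** Writer side: what OrcOutputFormat's close() now does when no rows were written. */
      static void createEmptyBucket(Configuration conf, Path bucketPath) throws IOException {
        FileSystem fs = bucketPath.getFileSystem(conf);
        fs.createNewFile(bucketPath);   // zero bytes on disk, no ORC footer to read back
      }

      /** Reader side: the zero-length check the patch adds before opening a record reader. */
      static boolean hasData(FileStatus file) {
        return !file.isDirectory() && file.getLen() != 0;
      }
    }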
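
The reader-side opt-in is the new getAcidState overload introduced in AcidUtils above. A hedged
usage sketch follows — the class and variable names are illustrative, only the AcidUtils API is
from the patch — showing a caller that skips the empty placeholders, as OrcInputFormat's split
generation and CompactorMR do, while Initiator keeps the old behaviour by passing false:

    import java.io.IOException;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.hive.common.ValidTxnList;
    import org.apache.hadoop.hive.ql.io.AcidUtils;

    public class AcidStateSketch {
      static AcidUtils.Directory listForSplits(Path partitionDir, Configuration conf,
                                               ValidTxnList validTxns) throws IOException {
        // final argument is ignoreEmptyFiles: true skips the zero-length bucket
        // placeholders so no ORC reader is ever opened on them
        return AcidUtils.getAcidState(partitionDir, conf, validTxns, true);
      }
    }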