drill-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From sudhe...@apache.org
Subject [2/2] drill git commit: DRILL-5009: Skip reading of empty row groups while reading Parquet metadata
Date Wed, 09 Nov 2016 17:55:50 GMT
DRILL-5009: Skip reading of empty row groups while reading Parquet metadata

+ We will no longer attempt to scan such row groups.

closes #651


Project: http://git-wip-us.apache.org/repos/asf/drill/repo
Commit: http://git-wip-us.apache.org/repos/asf/drill/commit/4b1902c0
Tree: http://git-wip-us.apache.org/repos/asf/drill/tree/4b1902c0
Diff: http://git-wip-us.apache.org/repos/asf/drill/diff/4b1902c0

Branch: refs/heads/master
Commit: 4b1902c042d3e8f426f54ec04b78813ac64aa120
Parents: e03507a
Author: Parth Chandra <parthc@apache.org>
Authored: Mon Nov 7 20:29:23 2016 -0800
Committer: Sudheesh Katkam <sudheesh@apache.org>
Committed: Wed Nov 9 09:16:34 2016 -0800

----------------------------------------------------------------------
 .../hive/HiveDrillNativeScanBatchCreator.java   |  4 ++++
 .../drill/exec/store/parquet/Metadata.java      | 22 ++++++++++++++++++++
 2 files changed, 26 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/drill/blob/4b1902c0/contrib/storage-hive/core/src/main/java/org/apache/drill/exec/store/hive/HiveDrillNativeScanBatchCreator.java
----------------------------------------------------------------------
diff --git a/contrib/storage-hive/core/src/main/java/org/apache/drill/exec/store/hive/HiveDrillNativeScanBatchCreator.java b/contrib/storage-hive/core/src/main/java/org/apache/drill/exec/store/hive/HiveDrillNativeScanBatchCreator.java
index d78c620..4be2ced 100644
--- a/contrib/storage-hive/core/src/main/java/org/apache/drill/exec/store/hive/HiveDrillNativeScanBatchCreator.java
+++ b/contrib/storage-hive/core/src/main/java/org/apache/drill/exec/store/hive/HiveDrillNativeScanBatchCreator.java
@@ -119,6 +119,10 @@ public class HiveDrillNativeScanBatchCreator implements BatchCreator<HiveDrillNa
         final List<Integer> rowGroupNums = getRowGroupNumbersFromFileSplit(fileSplit, parquetMetadata);
 
         for(int rowGroupNum : rowGroupNums) {
+          //DRILL-5009 : Skip the row group if the row count is zero
+          if (parquetMetadata.getBlocks().get(rowGroupNum).getRowCount() == 0) {
+            continue;
+          }
           // Drill has only ever written a single row group per file, only detect corruption
           // in the first row group
           ParquetReaderUtility.DateCorruptionStatus containsCorruptDates =

http://git-wip-us.apache.org/repos/asf/drill/blob/4b1902c0/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/Metadata.java
----------------------------------------------------------------------
diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/Metadata.java b/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/Metadata.java
index ead0a8f..04a2476 100644
--- a/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/Metadata.java
+++ b/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/Metadata.java
@@ -22,6 +22,8 @@ import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.List;
 import java.util.Map;
+import java.util.Iterator;
+
 import java.util.concurrent.ConcurrentHashMap;
 import java.util.concurrent.TimeUnit;
 
@@ -64,10 +66,12 @@ import com.fasterxml.jackson.core.JsonParser;
 import com.fasterxml.jackson.core.JsonProcessingException;
 import com.fasterxml.jackson.databind.DeserializationContext;
 import com.fasterxml.jackson.databind.DeserializationFeature;
+import com.fasterxml.jackson.databind.JsonNode;
 import com.fasterxml.jackson.databind.JsonDeserializer;
 import com.fasterxml.jackson.databind.JsonSerializer;
 import com.fasterxml.jackson.databind.KeyDeserializer;
 import com.fasterxml.jackson.databind.ObjectMapper;
+import com.fasterxml.jackson.databind.deser.std.StdDeserializer;
 import com.fasterxml.jackson.databind.SerializerProvider;
 import com.fasterxml.jackson.databind.module.SimpleModule;
 import com.fasterxml.jackson.module.afterburner.AfterburnerModule;
@@ -437,6 +441,11 @@ public class Metadata {
         length += col.getTotalSize();
       }
 
+      // DRILL-5009: Skip the RowGroup if it is empty
+      // Note we still read the schema even if there are no values in the RowGroup
+      if (rowGroup.getRowCount() == 0) {
+        continue;
+      }
       RowGroupMetadata_v3 rowGroupMeta =
           new RowGroupMetadata_v3(rowGroup.getStartingPos(), length, rowGroup.getRowCount(),
               getHostAffinity(file, rowGroup.getStartingPos(), length), columnMetadataList);
@@ -566,6 +575,19 @@ public class Metadata {
             (createMetaFilesRecursively(Path.getPathWithoutSchemeAndAuthority(p.getParent()).toString())).getLeft();
         newMetadata = true;
       }
+
+      // DRILL-5009: Remove the RowGroup if it is empty
+      List<? extends ParquetFileMetadata> files = parquetTableMetadata.getFiles();
+      for (ParquetFileMetadata file : files) {
+        List<? extends RowGroupMetadata> rowGroups = file.getRowGroups();
+        for (Iterator<? extends RowGroupMetadata> iter = rowGroups.iterator(); iter.hasNext(); ) {
+          RowGroupMetadata r = iter.next();
+          if (r.getRowCount() == 0) {
+            iter.remove();
+          }
+        }
+      }
+
     }
 
     if (newMetadata && metaContext != null) {


Mime
View raw message