drill-commits mailing list archives

From amansi...@apache.org
Subject drill git commit: DRILL-3937: Handle the case where min/max columns in metadata cache file are string or binary values.
Date Fri, 30 Oct 2015 02:00:48 GMT
Repository: drill
Updated Branches:
  refs/heads/master 1d067d26b -> e4b94a784


DRILL-3937: Handle the case where min/max columns in metadata cache file are string or binary values.

Simplify serialization and check for nulls. Remove byte array comparison for min/max since it is not needed anymore.

close apache/drill#220
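
The heart of the patch is a standard Jackson pattern: move the @JsonProperty
annotations for 'min'/'max' from the fields to getters, and have the getters
convert BINARY statistics into readable strings before serialization. Below is
a minimal, self-contained sketch of that pattern; the class and field names
are illustrative stand-ins (byte[] stands in for parquet's Binary), not
Drill's actual Metadata.ColumnMetadata:

    import com.fasterxml.jackson.annotation.JsonProperty;
    import com.fasterxml.jackson.databind.ObjectMapper;

    public class MinMaxSerializationSketch {

      public static class ColumnStats {
        // Package-private fields: only the annotated getters are serialized.
        Object min;
        Object max;
        boolean binary; // stand-in for primitiveType == PrimitiveTypeName.BINARY

        @JsonProperty("min")
        public Object getMin() {
          return (binary && min != null) ? new String((byte[]) min) : min;
        }

        @JsonProperty("max")
        public Object getMax() {
          return (binary && max != null) ? new String((byte[]) max) : max;
        }
      }

      public static void main(String[] args) throws Exception {
        ColumnStats s = new ColumnStats();
        s.binary = true;
        s.min = "1-URGENT".getBytes();
        s.max = "5-LOW".getBytes();
        // Emits {"min":"1-URGENT","max":"5-LOW"} rather than a byte-array dump.
        System.out.println(new ObjectMapper().writeValueAsString(s));
      }
    }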


Project: http://git-wip-us.apache.org/repos/asf/drill/repo
Commit: http://git-wip-us.apache.org/repos/asf/drill/commit/e4b94a78
Tree: http://git-wip-us.apache.org/repos/asf/drill/tree/e4b94a78
Diff: http://git-wip-us.apache.org/repos/asf/drill/diff/e4b94a78

Branch: refs/heads/master
Commit: e4b94a78487f844be4fe71c4b9bf88b16c7f42f7
Parents: 1d067d2
Author: Aman Sinha <asinha@maprtech.com>
Authored: Mon Oct 26 00:02:11 2015 -0700
Committer: Aman Sinha <asinha@maprtech.com>
Committed: Thu Oct 29 18:57:33 2015 -0700

----------------------------------------------------------------------
 .../drill/exec/store/parquet/Metadata.java      | 46 ++++++++++++++++++--
 .../exec/store/parquet/ParquetGroupScan.java    | 41 ++++++++++++++---
 .../store/parquet/TestParquetMetadataCache.java | 46 ++++++++++++++++++--
 3 files changed, 121 insertions(+), 12 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/drill/blob/e4b94a78/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/Metadata.java
----------------------------------------------------------------------
diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/Metadata.java b/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/Metadata.java
index 58f6d2a..522ff92 100644
--- a/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/Metadata.java
+++ b/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/Metadata.java
@@ -29,22 +29,24 @@ import com.fasterxml.jackson.databind.module.SimpleModule;
 import com.google.common.base.Stopwatch;
 import com.google.common.collect.Lists;
 import com.google.common.collect.Maps;
+
 import org.apache.drill.common.expression.SchemaPath;
 import org.apache.drill.common.expression.SchemaPath.De;
 import org.apache.drill.exec.store.TimedRunnable;
 import org.apache.drill.exec.store.dfs.DrillPathFilter;
-import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.BlockLocation;
 import org.apache.hadoop.fs.FSDataInputStream;
 import org.apache.hadoop.fs.FSDataOutputStream;
 import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
+
 import parquet.column.statistics.Statistics;
 import parquet.hadoop.ParquetFileReader;
 import parquet.hadoop.metadata.BlockMetaData;
 import parquet.hadoop.metadata.ColumnChunkMetaData;
 import parquet.hadoop.metadata.ParquetMetadata;
+import parquet.io.api.Binary;
 import parquet.schema.GroupType;
 import parquet.schema.MessageType;
 import parquet.schema.OriginalType;
@@ -475,11 +477,12 @@ public class Metadata {
     @JsonProperty
     public OriginalType originalType;
     @JsonProperty
+    public Long nulls;
+
+    // The JsonProperty annotations for these fields are associated with the getters and setters
     public Object max;
-    @JsonProperty
     public Object min;
-    @JsonProperty
-    public Long nulls;
+
 
     public ColumnMetadata() {
       super();
@@ -494,5 +497,40 @@ public class Metadata {
       this.min = min;
       this.nulls = nulls;
     }
+
+    @JsonProperty(value = "min")
+    public Object getMin() {
+      if (primitiveType == PrimitiveTypeName.BINARY && min != null) {
+        return new String(((Binary) min).getBytes());
+      }
+      return min;
+    }
+
+    @JsonProperty(value = "max")
+    public Object getMax() {
+      if (primitiveType == PrimitiveTypeName.BINARY && max != null) {
+        return new String(((Binary) max).getBytes());
+      }
+      return max;
+    }
+
+    /**
+     * Setter used during deserialization of the 'min' field of the metadata cache file.
+     * @param min
+     */
+    @JsonProperty(value = "min")
+    public void setMin(Object min) {
+      this.min = min;
+    }
+
+    /**
+     * Setter used during deserialization of the 'max' field of the metadata cache file.
+     * @param max
+     */
+    @JsonProperty(value = "max")
+    public void setMax(Object max) {
+      this.max = max;
+    }
+
   }
 }
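
Once the cache file is read back, the setters above receive whatever Jackson
deserialized, so a BINARY column's min/max comes back as a plain
java.lang.String rather than a parquet Binary. A short sketch of that
asymmetry, again with an illustrative class rather than Drill's own:

    import com.fasterxml.jackson.annotation.JsonProperty;
    import com.fasterxml.jackson.databind.ObjectMapper;

    public class CacheReadSketch {

      public static class ColumnStats {
        Object min;

        @JsonProperty("min")
        public Object getMin() { return min; }

        @JsonProperty("min")
        public void setMin(Object min) { this.min = min; }
      }

      public static void main(String[] args) throws Exception {
        ColumnStats s = new ObjectMapper()
            .readValue("{\"min\":\"1-URGENT\"}", ColumnStats.class);
        // Prints "class java.lang.String": the Binary-ness is gone after a
        // round trip, which is why ParquetGroupScan below must accept Strings.
        System.out.println(s.min.getClass());
      }
    }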

http://git-wip-us.apache.org/repos/asf/drill/blob/e4b94a78/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/ParquetGroupScan.java
----------------------------------------------------------------------
diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/ParquetGroupScan.java b/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/ParquetGroupScan.java
index af16063..a145d79 100644
--- a/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/ParquetGroupScan.java
+++ b/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/ParquetGroupScan.java
@@ -19,6 +19,7 @@ package org.apache.drill.exec.store.parquet;
 
 import java.io.IOException;
 import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.List;
@@ -28,6 +29,7 @@ import java.util.Set;
 import com.google.common.collect.ArrayListMultimap;
 import com.google.common.collect.Maps;
 import com.google.common.collect.Sets;
+
 import org.apache.drill.common.exceptions.ExecutionSetupException;
 import org.apache.drill.common.expression.SchemaPath;
 import org.apache.drill.common.logical.FormatPluginConfig;
@@ -83,8 +85,8 @@ import org.apache.drill.exec.vector.NullableVarCharVector;
 import org.apache.drill.exec.vector.ValueVector;
 import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.fs.Path;
-
 import org.joda.time.DateTimeUtils;
+
 import parquet.io.api.Binary;
 import parquet.org.codehaus.jackson.annotate.JsonCreator;
 
@@ -97,6 +99,7 @@ import com.google.common.base.Preconditions;
 import com.google.common.base.Stopwatch;
 import com.google.common.collect.ListMultimap;
 import com.google.common.collect.Lists;
+
 import parquet.schema.OriginalType;
 import parquet.schema.PrimitiveType.PrimitiveTypeName;
 
@@ -325,6 +328,15 @@ public class ParquetGroupScan extends AbstractFileGroupScan {
     Object max = columnChunkMetaData.max;
     Object min = columnChunkMetaData.min;
     return max != null && max.equals(min);
+/*
+    if (max != null && min != null) {
+      if (max instanceof byte[] && min instanceof byte[]) {
+        return Arrays.equals((byte[])max, (byte[])min);
+      }
+      return max.equals(min);
+    }
+    return false;
+*/
   }
 
   @Override
@@ -411,8 +423,17 @@ public class ParquetGroupScan extends AbstractFileGroupScan {
     }
     case VARBINARY: {
       NullableVarBinaryVector varBinaryVector = (NullableVarBinaryVector) v;
-      Binary value = (Binary) partitionValueMap.get(f).get(column);
-      byte[] bytes = value.getBytes();
+      Object s = partitionValueMap.get(f).get(column);
+      byte[] bytes;
+      if (s instanceof Binary) {
+        bytes = ((Binary) s).getBytes();
+      } else if (s instanceof String) {
+        bytes = ((String) s).getBytes();
+      } else if (s instanceof byte[]) {
+        bytes = (byte[])s;
+      } else {
+        throw new UnsupportedOperationException("Unable to create column data for type: " + type);
+      }
       varBinaryVector.getMutator().setSafe(index, bytes, 0, bytes.length);
       return;
     }
@@ -442,8 +463,17 @@ public class ParquetGroupScan extends AbstractFileGroupScan {
     }
     case VARCHAR: {
       NullableVarCharVector varCharVector = (NullableVarCharVector) v;
-      Binary value = (Binary) partitionValueMap.get(f).get(column);
-      byte[] bytes = value.getBytes();
+      Object s = partitionValueMap.get(f).get(column);
+      byte[] bytes;
+      if (s instanceof String) { // if the metadata was read from a JSON cache file it may be a string type
+        bytes = ((String) s).getBytes();
+      } else if (s instanceof Binary) {
+        bytes = ((Binary) s).getBytes();
+      } else if (s instanceof byte[]) {
+        bytes = (byte[])s;
+      } else {
+        throw new UnsupportedOperationException("Unable to create column data for type: " + type);
+      }
       varCharVector.getMutator().setSafe(index, bytes, 0, bytes.length);
       return;
     }
@@ -603,6 +633,7 @@ public class ParquetGroupScan extends AbstractFileGroupScan {
             }
             Object value = map.get(schemaPath);
             Object currentValue = column.max;
+//            Object currentValue = column.getMax();
             if (value != null) {
               if (value != currentValue) {
                 columnTypeMap.remove(schemaPath);
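
For the partition-value handling above, the same instanceof chain appears in
both the VARBINARY and VARCHAR cases; factored out, it looks like the helper
below. 'toBytes' is a hypothetical name for illustration, not a Drill API,
and the sketch assumes the same pre-Apache parquet artifact on the classpath:

    import parquet.io.api.Binary;

    public class PartitionValueSketch {

      static byte[] toBytes(Object s) {
        if (s instanceof Binary) {          // value from a freshly read parquet footer
          return ((Binary) s).getBytes();
        } else if (s instanceof String) {   // value deserialized from the JSON metadata cache
          return ((String) s).getBytes();
        } else if (s instanceof byte[]) {
          return (byte[]) s;
        }
        throw new UnsupportedOperationException(
            "Unable to create column data for value of class: " + s.getClass());
      }

      public static void main(String[] args) {
        // Both representations of the same partition value reduce to the same bytes.
        System.out.println(new String(toBytes(Binary.fromString("1-URGENT"))));
        System.out.println(new String(toBytes("1-URGENT")));
      }
    }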

http://git-wip-us.apache.org/repos/asf/drill/blob/e4b94a78/exec/java-exec/src/test/java/org/apache/drill/exec/store/parquet/TestParquetMetadataCache.java
----------------------------------------------------------------------
diff --git a/exec/java-exec/src/test/java/org/apache/drill/exec/store/parquet/TestParquetMetadataCache.java b/exec/java-exec/src/test/java/org/apache/drill/exec/store/parquet/TestParquetMetadataCache.java
index ef481e3..0b8b7d9 100644
--- a/exec/java-exec/src/test/java/org/apache/drill/exec/store/parquet/TestParquetMetadataCache.java
+++ b/exec/java-exec/src/test/java/org/apache/drill/exec/store/parquet/TestParquetMetadataCache.java
@@ -18,12 +18,9 @@
 package org.apache.drill.exec.store.parquet;
 
 import com.google.common.base.Joiner;
-import org.apache.commons.io.filefilter.IOFileFilter;
-import org.apache.drill.BaseTestQuery;
 import org.apache.drill.PlanTestBase;
 import org.apache.drill.common.util.TestTools;
 import org.apache.commons.io.FileUtils;
-import org.apache.drill.exec.store.dfs.DrillPathFilter;
 import org.apache.hadoop.fs.Path;
 import org.junit.Assert;
 import org.junit.BeforeClass;
@@ -84,6 +81,49 @@ public class TestParquetMetadataCache extends PlanTestBase {
     PlanTestBase.testPlanMatchingPatterns(query, new String[]{numFilesPattern, usedMetaPattern}, new String[] {"Filter"});
   }
 
+  @Test // DRILL-3937 (partitioning column is varchar)
+  public void testPartitionPruningWithMetadataCache_3() throws Exception {
+    String tableName = "orders_ctas_varchar";
+    test("use dfs_test.tmp");
+
+    test(String.format("create table %s (o_orderdate, o_orderpriority) partition by (o_orderpriority) "
+        + "as select o_orderdate, o_orderpriority from dfs_test.`%s/multilevel/parquet/1994/Q1`", tableName, TEST_RES_PATH));
+    test(String.format("refresh table metadata %s", tableName));
+    checkForMetadataFile(tableName);
+    String query = String.format("select * from %s where o_orderpriority = '1-URGENT'", tableName);
+    int expectedRowCount = 3;
+    int expectedNumFiles = 1;
+
+    int actualRowCount = testSql(query);
+    assertEquals(expectedRowCount, actualRowCount);
+    String numFilesPattern = "numFiles=" + expectedNumFiles;
+    String usedMetaPattern = "usedMetadataFile=true";
+
+    testPlanMatchingPatterns(query, new String[]{numFilesPattern, usedMetaPattern}, new String[] {});
+  }
+
+  @Test // DRILL-3937 (partitioning column is binary using convert_to)
+  public void testPartitionPruningWithMetadataCache_4() throws Exception {
+    String tableName = "orders_ctas_binary";
+    test("use dfs_test.tmp");
+
+    test(String.format("create table %s (o_orderdate, o_orderpriority) partition by (o_orderpriority) "
+        + "as select o_orderdate, convert_to(o_orderpriority, 'UTF8') as o_orderpriority "
+        + "from dfs_test.`%s/multilevel/parquet/1994/Q1`", tableName, TEST_RES_PATH));
+    test(String.format("refresh table metadata %s", tableName));
+    checkForMetadataFile(tableName);
+    String query = String.format("select * from %s where o_orderpriority = '1-URGENT'", tableName);
+    int expectedRowCount = 3;
+    int expectedNumFiles = 1;
+
+    int actualRowCount = testSql(query);
+    assertEquals(expectedRowCount, actualRowCount);
+    String numFilesPattern = "numFiles=" + expectedNumFiles;
+    String usedMetaPattern = "usedMetadataFile=true";
+
+    testPlanMatchingPatterns(query, new String[]{numFilesPattern, usedMetaPattern}, new String[] {});
+  }
+
   @Test
   public void testCache() throws Exception {
     String tableName = "nation_ctas";

