hive-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From sunc...@apache.org
Subject hive git commit: HIVE-15056: Support index shifting for struct fields (Chao Sun, reviewed by Ferdinand Xu)
Date Mon, 31 Oct 2016 16:15:57 GMT
Repository: hive
Updated Branches:
  refs/heads/master a7cf6549f -> 783972ad7


HIVE-15056: Support index shifting for struct fields (Chao Sun, reviewed by Ferdinand Xu)


Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/783972ad
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/783972ad
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/783972ad

Branch: refs/heads/master
Commit: 783972ad7b576003c57a00500645c9b9a6ac2b5a
Parents: a7cf654
Author: Chao Sun <sunchao@apache.org>
Authored: Mon Oct 31 09:13:45 2016 -0700
Committer: Chao Sun <sunchao@apache.org>
Committed: Mon Oct 31 09:15:38 2016 -0700

----------------------------------------------------------------------
 .../serde/ArrayWritableObjectInspector.java     |  66 ++++++++----
 .../ql/io/parquet/serde/ParquetHiveSerDe.java   | 101 ++++++++++++++++++-
 .../hive/ql/io/parquet/TestParquetSerDe.java    |  34 +++++++
 3 files changed, 182 insertions(+), 19 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hive/blob/783972ad/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/serde/ArrayWritableObjectInspector.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/serde/ArrayWritableObjectInspector.java b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/serde/ArrayWritableObjectInspector.java
index 5f852d0..8df0cc1 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/serde/ArrayWritableObjectInspector.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/serde/ArrayWritableObjectInspector.java
@@ -21,7 +21,6 @@ import java.util.List;
 import org.apache.hadoop.hive.ql.io.parquet.serde.primitive.ParquetPrimitiveInspectorFactory;
 import org.apache.hadoop.hive.serde.serdeConstants;
 import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
-import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
 import org.apache.hadoop.hive.serde2.objectinspector.SettableStructObjectInspector;
 import org.apache.hadoop.hive.serde2.objectinspector.StructField;
 import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
@@ -49,25 +48,45 @@ public class ArrayWritableObjectInspector extends SettableStructObjectInspector
   private final List<StructField> fields;
   private final HashMap<String, StructFieldImpl> fieldsByName;
 
+  // Whether this OI is for the column-level schema (as opposed to nested column fields).
+  private final boolean isRoot;
+
   public ArrayWritableObjectInspector(final StructTypeInfo rowTypeInfo) {
+    this(true, rowTypeInfo, null);
+  }
+
+  public ArrayWritableObjectInspector(StructTypeInfo originalTypeInfo, StructTypeInfo prunedTypeInfo) {
+    this(true, originalTypeInfo, prunedTypeInfo);
+  }
 
-    typeInfo = rowTypeInfo;
-    fieldNames = rowTypeInfo.getAllStructFieldNames();
-    fieldInfos = rowTypeInfo.getAllStructFieldTypeInfos();
-    fields = new ArrayList<StructField>(fieldNames.size());
-    fieldsByName = new HashMap<String, StructFieldImpl>();
+  public ArrayWritableObjectInspector(boolean isRoot,
+      StructTypeInfo originalTypeInfo, StructTypeInfo prunedTypeInfo) {
+    this.isRoot = isRoot;
+    typeInfo = originalTypeInfo;
+    fieldNames = originalTypeInfo.getAllStructFieldNames();
+    fieldInfos = originalTypeInfo.getAllStructFieldTypeInfos();
+    fields = new ArrayList<>(fieldNames.size());
+    fieldsByName = new HashMap<>();
 
     for (int i = 0; i < fieldNames.size(); ++i) {
       final String name = fieldNames.get(i);
       final TypeInfo fieldInfo = fieldInfos.get(i);
 
-      final StructFieldImpl field = new StructFieldImpl(name, getObjectInspector(fieldInfo), i);
+      StructFieldImpl field;
+      if (prunedTypeInfo != null && prunedTypeInfo.getAllStructFieldNames().indexOf(name) >= 0) {
+        int adjustedIndex = prunedTypeInfo.getAllStructFieldNames().indexOf(name);
+        TypeInfo prunedFieldInfo = prunedTypeInfo.getAllStructFieldTypeInfos().get(adjustedIndex);
+        field = new StructFieldImpl(name, getObjectInspector(fieldInfo, prunedFieldInfo), i, adjustedIndex);
+      } else {
+        field = new StructFieldImpl(name, getObjectInspector(fieldInfo, null), i, i);
+      }
       fields.add(field);
       fieldsByName.put(name.toLowerCase(), field);
     }
   }
 
-  private ObjectInspector getObjectInspector(final TypeInfo typeInfo) {
+  private ObjectInspector getObjectInspector(
+      TypeInfo typeInfo, TypeInfo prunedTypeInfo) {
     if (typeInfo.equals(TypeInfoFactory.doubleTypeInfo)) {
       return PrimitiveObjectInspectorFactory.writableDoubleObjectInspector;
     } else if (typeInfo.equals(TypeInfoFactory.booleanTypeInfo)) {
@@ -83,18 +102,20 @@ public class ArrayWritableObjectInspector extends SettableStructObjectInspector
     }  else if (typeInfo instanceof DecimalTypeInfo) {
       return PrimitiveObjectInspectorFactory.getPrimitiveWritableObjectInspector((DecimalTypeInfo) typeInfo);
     } else if (typeInfo.getCategory().equals(Category.STRUCT)) {
-      return new ArrayWritableObjectInspector((StructTypeInfo) typeInfo);
+      return new ArrayWritableObjectInspector(false, (StructTypeInfo) typeInfo, (StructTypeInfo) prunedTypeInfo);
     } else if (typeInfo.getCategory().equals(Category.LIST)) {
       final TypeInfo subTypeInfo = ((ListTypeInfo) typeInfo).getListElementTypeInfo();
-      return new ParquetHiveArrayInspector(getObjectInspector(subTypeInfo));
+      return new ParquetHiveArrayInspector(getObjectInspector(subTypeInfo, null));
     } else if (typeInfo.getCategory().equals(Category.MAP)) {
       final TypeInfo keyTypeInfo = ((MapTypeInfo) typeInfo).getMapKeyTypeInfo();
       final TypeInfo valueTypeInfo = ((MapTypeInfo) typeInfo).getMapValueTypeInfo();
       if (keyTypeInfo.equals(TypeInfoFactory.stringTypeInfo) || keyTypeInfo.equals(TypeInfoFactory.byteTypeInfo)
               || keyTypeInfo.equals(TypeInfoFactory.shortTypeInfo)) {
-        return new DeepParquetHiveMapInspector(getObjectInspector(keyTypeInfo), getObjectInspector(valueTypeInfo));
+        return new DeepParquetHiveMapInspector(getObjectInspector(keyTypeInfo, null),
+            getObjectInspector(valueTypeInfo, null));
       } else {
-        return new StandardParquetHiveMapInspector(getObjectInspector(keyTypeInfo), getObjectInspector(valueTypeInfo));
+        return new StandardParquetHiveMapInspector(getObjectInspector(keyTypeInfo, null),
+            getObjectInspector(valueTypeInfo, null));
       }
     } else if (typeInfo.equals(TypeInfoFactory.byteTypeInfo)) {
       return ParquetPrimitiveInspectorFactory.parquetByteInspector;
@@ -139,8 +160,9 @@ public class ArrayWritableObjectInspector extends SettableStructObjectInspector
     if (data instanceof ArrayWritable) {
       final ArrayWritable arr = (ArrayWritable) data;
       final StructFieldImpl structField = (StructFieldImpl) fieldRef;
-      if (structField.getIndex() < arr.get().length) {
-        return arr.get()[structField.getIndex()];
+      int index = isRoot ? structField.getIndex() : structField.adjustedIndex;
+      if (index < arr.get().length) {
+        return arr.get()[index];
       } else {
         return null;
       }
@@ -170,7 +192,7 @@ public class ArrayWritableObjectInspector extends SettableStructObjectInspector
     if (data instanceof ArrayWritable) {
       final ArrayWritable arr = (ArrayWritable) data;
       final Object[] arrWritable = arr.get();
-      return new ArrayList<Object>(Arrays.asList(arrWritable));
+      return new ArrayList<>(Arrays.asList(arrWritable));
     }
 
     //since setStructFieldData and create return a list, getStructFieldData should be able to
@@ -221,16 +243,26 @@ public class ArrayWritableObjectInspector extends SettableStructObjectInspector
     return hash;
   }
 
-  class StructFieldImpl implements StructField {
+  private class StructFieldImpl implements StructField {
 
     private final String name;
     private final ObjectInspector inspector;
     private final int index;
 
-    public StructFieldImpl(final String name, final ObjectInspector inspector, final int index) {
+    // This is the adjusted index after nested column pruning.
+    // For instance, given the struct type: s:<struct<a:int, b:boolean>>
+    // If only 's.b' is used, the pruned type is: s:<struct<b:boolean>>.
+    // Here, the index of field 'b' is changed from 1 to 0.
+    // When we look up the data from Parquet, index needs to be adjusted accordingly.
+    // Note: currently this is only used in the read path.
+    final int adjustedIndex;
+
+    public StructFieldImpl(final String name, final ObjectInspector inspector,
+        final int index, int adjustedIndex) {
       this.name = name;
       this.inspector = inspector;
       this.index = index;
+      this.adjustedIndex = adjustedIndex;
     }
 
     @Override

http://git-wip-us.apache.org/repos/asf/hive/blob/783972ad/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/serde/ParquetHiveSerDe.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/serde/ParquetHiveSerDe.java b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/serde/ParquetHiveSerDe.java
index 995b965..ef79760 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/serde/ParquetHiveSerDe.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/serde/ParquetHiveSerDe.java
@@ -15,12 +15,16 @@ package org.apache.hadoop.hive.ql.io.parquet.serde;
 
 import java.util.ArrayList;
 import java.util.Arrays;
+import java.util.HashMap;
 import java.util.List;
+import java.util.Map;
 import java.util.Properties;
 
+import com.google.common.base.Preconditions;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.hive.serde.serdeConstants;
 import org.apache.hadoop.hive.serde2.AbstractSerDe;
+import org.apache.hadoop.hive.serde2.ColumnProjectionUtils;
 import org.apache.hadoop.hive.serde2.SerDeException;
 import org.apache.hadoop.hive.serde2.SerDeSpec;
 import org.apache.hadoop.hive.serde2.SerDeStats;
@@ -108,8 +112,16 @@ public class ParquetHiveSerDe extends AbstractSerDe {
         columnTypes);
     }
     // Create row related objects
-    rowTypeInfo = TypeInfoFactory.getStructTypeInfo(columnNames, columnTypes);
-    this.objInspector = new ArrayWritableObjectInspector((StructTypeInfo) rowTypeInfo);
+    StructTypeInfo completeTypeInfo =
+        (StructTypeInfo) TypeInfoFactory.getStructTypeInfo(columnNames, columnTypes);
+    StructTypeInfo prunedTypeInfo = null;
+    if (conf != null) {
+      String prunedColumnPaths = conf.get(ColumnProjectionUtils.READ_NESTED_COLUMN_PATH_CONF_STR);
+      if (prunedColumnPaths != null) {
+        prunedTypeInfo = pruneFromPaths(completeTypeInfo, prunedColumnPaths);
+      }
+    }
+    this.objInspector = new ArrayWritableObjectInspector(completeTypeInfo, prunedTypeInfo);
 
     // Stats part
     serializedSize = 0;
@@ -163,4 +175,89 @@ public class ParquetHiveSerDe extends AbstractSerDe {
     }
     return stats;
   }
+
+  /**
+   * Given a complete struct type info and pruned paths containing selected fields
+   * from the type info, return a pruned struct type info only with the selected fields.
+   *
+   * For instance, if 'originalTypeInfo' is: s:struct<a:struct<b:int, c:boolean>, d:string>
+   *   and 'prunedPaths' is "s.a.b,s.d", then the result will be:
+   *   s:struct<a:struct<b:int>, d:string>
+   *
+   * @param originalTypeInfo the complete struct type info
+   * @param prunedPaths a string representing the pruned paths, separated by ','
+   * @return the pruned struct type info
+   */
+  private StructTypeInfo pruneFromPaths(
+      StructTypeInfo originalTypeInfo, String prunedPaths) {
+    PrunedStructTypeInfo prunedTypeInfo = new PrunedStructTypeInfo(originalTypeInfo);
+
+    String[] prunedPathList = prunedPaths.split(",");
+    for (String path : prunedPathList) {
+      pruneFromSinglePath(prunedTypeInfo, path);
+    }
+
+    return prunedTypeInfo.prune();
+  }
+
+  private void pruneFromSinglePath(PrunedStructTypeInfo prunedInfo, String path) {
+    Preconditions.checkArgument(prunedInfo != null,
+      "PrunedStructTypeInfo for path " + path + " should not be null");
+
+    int index = path.indexOf('.');
+    if (index < 0) {
+      index = path.length();
+    }
+
+    String fieldName = path.substring(0, index);
+    prunedInfo.markSelected(fieldName);
+    if (index < path.length()) {
+      pruneFromSinglePath(prunedInfo.children.get(fieldName), path.substring(index + 1));
+    }
+  }
+
+  private static class PrunedStructTypeInfo {
+    final StructTypeInfo typeInfo;
+    final Map<String, PrunedStructTypeInfo> children;
+    final boolean[] selected;
+
+    PrunedStructTypeInfo(StructTypeInfo typeInfo) {
+      this.typeInfo = typeInfo;
+      this.children = new HashMap<>();
+      this.selected = new boolean[typeInfo.getAllStructFieldTypeInfos().size()];
+      for (int i = 0; i < typeInfo.getAllStructFieldTypeInfos().size(); ++i) {
+        TypeInfo ti = typeInfo.getAllStructFieldTypeInfos().get(i);
+        if (ti.getCategory() == Category.STRUCT) {
+          this.children.put(typeInfo.getAllStructFieldNames().get(i),
+              new PrunedStructTypeInfo((StructTypeInfo) ti));
+        }
+      }
+    }
+
+    void markSelected(String fieldName) {
+      int index = typeInfo.getAllStructFieldNames().indexOf(fieldName);
+      if (index >= 0) {
+        selected[index] = true;
+      }
+    }
+
+    StructTypeInfo prune() {
+      List<String> newNames = new ArrayList<>();
+      List<TypeInfo> newTypes = new ArrayList<>();
+      List<String> oldNames = typeInfo.getAllStructFieldNames();
+      List<TypeInfo> oldTypes = typeInfo.getAllStructFieldTypeInfos();
+      for (int i = 0; i < oldNames.size(); ++i) {
+        String fn = oldNames.get(i);
+        if (selected[i]) {
+          newNames.add(fn);
+          if (children.containsKey(fn)) {
+            newTypes.add(children.get(fn).prune());
+          } else {
+            newTypes.add(oldTypes.get(i));
+          }
+        }
+      }
+      return (StructTypeInfo) TypeInfoFactory.getStructTypeInfo(newNames, newTypes);
+    }
+  }
 }

http://git-wip-us.apache.org/repos/asf/hive/blob/783972ad/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/TestParquetSerDe.java
----------------------------------------------------------------------
diff --git a/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/TestParquetSerDe.java b/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/TestParquetSerDe.java
index dbb2795..7aa293f 100644
--- a/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/TestParquetSerDe.java
+++ b/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/TestParquetSerDe.java
@@ -19,12 +19,16 @@ import junit.framework.TestCase;
 
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe;
+import org.apache.hadoop.hive.serde.serdeConstants;
+import org.apache.hadoop.hive.serde2.ColumnProjectionUtils;
 import org.apache.hadoop.hive.serde2.SerDeException;
 import org.apache.hadoop.hive.serde2.SerDeUtils;
 import org.apache.hadoop.hive.serde2.io.ByteWritable;
 import org.apache.hadoop.hive.serde2.io.DoubleWritable;
 import org.apache.hadoop.hive.serde2.io.ParquetHiveRecord;
 import org.apache.hadoop.hive.serde2.io.ShortWritable;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.StructField;
 import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
 import org.apache.hadoop.io.ArrayWritable;
 import org.apache.hadoop.io.BytesWritable;
@@ -81,6 +85,36 @@ public class TestParquetSerDe extends TestCase {
     }
   }
 
+  public void testParquetHiveSerDeComplexTypes() throws Throwable {
+    // Initialize
+    ParquetHiveSerDe serDe = new ParquetHiveSerDe();
+    Configuration conf = new Configuration();
+    Properties tblProperties = new Properties();
+
+    tblProperties.setProperty(serdeConstants.LIST_COLUMNS, "a,s");
+    tblProperties.setProperty(serdeConstants.LIST_COLUMN_TYPES, "int,struct<a:int,b:string>");
+    conf.set(ColumnProjectionUtils.READ_NESTED_COLUMN_PATH_CONF_STR, "s.b");
+
+    serDe.initialize(conf, tblProperties);
+
+    // Generate test data
+    Writable[] wb = new Writable[1];
+    wb[0] = new BytesWritable("foo".getBytes("UTF-8"));
+    Writable[] ws = new Writable[2];
+    ws[0] = null;
+    ArrayWritable awb = new ArrayWritable(Writable.class, wb);
+    ws[1] = awb;
+    ArrayWritable aws = new ArrayWritable(Writable.class, ws);
+
+    // Inspect the test data
+    StructObjectInspector soi = (StructObjectInspector) serDe.getObjectInspector();
+    StructField s = soi.getStructFieldRef("s");
+    assertEquals(awb, soi.getStructFieldData(aws, s));
+    StructObjectInspector boi = (StructObjectInspector) s.getFieldObjectInspector();
+    StructField b = boi.getStructFieldRef("b");
+    assertEquals(wb[0], boi.getStructFieldData(awb, b));
+  }
+
   private void deserializeAndSerializeLazySimple(final ParquetHiveSerDe serDe, final ArrayWritable t) throws SerDeException {
 
     // Get the row structure


Mime
View raw message