iceberg-issues mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From GitBox <...@apache.org>
Subject [GitHub] [iceberg] rdblue commented on a change in pull request #1140: ORC: skip non-iceberg columns when converting schema to Iceberg
Date Tue, 30 Jun 2020 23:02:54 GMT

rdblue commented on a change in pull request #1140:
URL: https://github.com/apache/iceberg/pull/1140#discussion_r448026091



##########
File path: orc/src/main/java/org/apache/iceberg/orc/ORCSchemaUtil.java
##########
@@ -403,114 +405,150 @@ private static boolean isRequired(TypeDescription orcType) {
         Types.NestedField.optional(icebergID, name, type);
   }
 
-  private static Types.NestedField convertOrcToIceberg(TypeDescription orcType, String name,
-                                                       TypeUtil.NextID nextID) {
+  private static class OrcToIcebergVisitor extends OrcSchemaVisitor<Optional<Types.NestedField>>
{
 
-    final int icebergID = icebergID(orcType).orElseGet(nextID::get);
-    final boolean isRequired = isRequired(orcType);
+    private final Map<Integer, OrcField> icebergToOrcMapping;
 
-    switch (orcType.getCategory()) {
-      case BOOLEAN:
-        return getIcebergType(icebergID, name, Types.BooleanType.get(), isRequired);
-      case BYTE:
-      case SHORT:
-      case INT:
-        return getIcebergType(icebergID, name, Types.IntegerType.get(), isRequired);
-      case LONG:
-        String longAttributeValue = orcType.getAttributeValue(ICEBERG_LONG_TYPE_ATTRIBUTE);
-        LongType longType = longAttributeValue == null ? LongType.LONG : LongType.valueOf(longAttributeValue);
-        switch (longType) {
-          case TIME:
-            return getIcebergType(icebergID, name, Types.TimeType.get(), isRequired);
-          case LONG:
-            return getIcebergType(icebergID, name, Types.LongType.get(), isRequired);
-          default:
-            throw new IllegalStateException("Invalid Long type found in ORC type attribute");
-        }
-      case FLOAT:
-        return getIcebergType(icebergID, name, Types.FloatType.get(), isRequired);
-      case DOUBLE:
-        return getIcebergType(icebergID, name, Types.DoubleType.get(), isRequired);
-      case STRING:
-      case CHAR:
-      case VARCHAR:
-        return getIcebergType(icebergID, name, Types.StringType.get(), isRequired);
-      case BINARY:
-        String binaryAttributeValue = orcType.getAttributeValue(ICEBERG_BINARY_TYPE_ATTRIBUTE);
-        BinaryType binaryType = binaryAttributeValue == null ? BinaryType.BINARY :
-            BinaryType.valueOf(binaryAttributeValue);
-        switch (binaryType) {
-          case UUID:
-            return getIcebergType(icebergID, name, Types.UUIDType.get(), isRequired);
-          case FIXED:
-            int fixedLength = Integer.parseInt(orcType.getAttributeValue(ICEBERG_FIELD_LENGTH));
-            return getIcebergType(icebergID, name, Types.FixedType.ofLength(fixedLength),
isRequired);
-          case BINARY:
-            return getIcebergType(icebergID, name, Types.BinaryType.get(), isRequired);
-          default:
-            throw new IllegalStateException("Invalid Binary type found in ORC type attribute");
-        }
-      case DATE:
-        return getIcebergType(icebergID, name, Types.DateType.get(), isRequired);
-      case TIMESTAMP:
-        return getIcebergType(icebergID, name, Types.TimestampType.withoutZone(), isRequired);
-      case TIMESTAMP_INSTANT:
-        return getIcebergType(icebergID, name, Types.TimestampType.withZone(), isRequired);
-      case DECIMAL:
-        return getIcebergType(icebergID, name,
-            Types.DecimalType.of(orcType.getPrecision(), orcType.getScale()),
-            isRequired);
-      case STRUCT: {
-        List<String> fieldNames = orcType.getFieldNames();
-        List<TypeDescription> fieldTypes = orcType.getChildren();
-        List<Types.NestedField> fields = new ArrayList<>(fieldNames.size());
-        for (int c = 0; c < fieldNames.size(); ++c) {
-          String childName = fieldNames.get(c);
-          TypeDescription type = fieldTypes.get(c);
-          Types.NestedField field = convertOrcToIceberg(type, childName, nextID);
-          fields.add(field);
-        }
+    OrcToIcebergVisitor(Map<Integer, OrcField> icebergToOrcMapping) {

Review comment:
       Also, if field names are passed into the `record` visitor method, why is this needed?
The visitor could use a before/after field callback to track names:
   
   ```
   private LinkedList<String> fieldNames;
   public beforeField(String name, TypeDescription type) {
      fieldNames.push(name);
   }
   
   public afterField(String name, TypeDescription type) {
     fieldNames.pop();
   }
   
   private Strign currentFieldName() {
     return fieldNames.peek();
   }
   ```

##########
File path: orc/src/main/java/org/apache/iceberg/orc/ORCSchemaUtil.java
##########
@@ -403,114 +405,150 @@ private static boolean isRequired(TypeDescription orcType) {
         Types.NestedField.optional(icebergID, name, type);
   }
 
-  private static Types.NestedField convertOrcToIceberg(TypeDescription orcType, String name,
-                                                       TypeUtil.NextID nextID) {
+  private static class OrcToIcebergVisitor extends OrcSchemaVisitor<Optional<Types.NestedField>>
{
 
-    final int icebergID = icebergID(orcType).orElseGet(nextID::get);
-    final boolean isRequired = isRequired(orcType);
+    private final Map<Integer, OrcField> icebergToOrcMapping;
 
-    switch (orcType.getCategory()) {
-      case BOOLEAN:
-        return getIcebergType(icebergID, name, Types.BooleanType.get(), isRequired);
-      case BYTE:
-      case SHORT:
-      case INT:
-        return getIcebergType(icebergID, name, Types.IntegerType.get(), isRequired);
-      case LONG:
-        String longAttributeValue = orcType.getAttributeValue(ICEBERG_LONG_TYPE_ATTRIBUTE);
-        LongType longType = longAttributeValue == null ? LongType.LONG : LongType.valueOf(longAttributeValue);
-        switch (longType) {
-          case TIME:
-            return getIcebergType(icebergID, name, Types.TimeType.get(), isRequired);
-          case LONG:
-            return getIcebergType(icebergID, name, Types.LongType.get(), isRequired);
-          default:
-            throw new IllegalStateException("Invalid Long type found in ORC type attribute");
-        }
-      case FLOAT:
-        return getIcebergType(icebergID, name, Types.FloatType.get(), isRequired);
-      case DOUBLE:
-        return getIcebergType(icebergID, name, Types.DoubleType.get(), isRequired);
-      case STRING:
-      case CHAR:
-      case VARCHAR:
-        return getIcebergType(icebergID, name, Types.StringType.get(), isRequired);
-      case BINARY:
-        String binaryAttributeValue = orcType.getAttributeValue(ICEBERG_BINARY_TYPE_ATTRIBUTE);
-        BinaryType binaryType = binaryAttributeValue == null ? BinaryType.BINARY :
-            BinaryType.valueOf(binaryAttributeValue);
-        switch (binaryType) {
-          case UUID:
-            return getIcebergType(icebergID, name, Types.UUIDType.get(), isRequired);
-          case FIXED:
-            int fixedLength = Integer.parseInt(orcType.getAttributeValue(ICEBERG_FIELD_LENGTH));
-            return getIcebergType(icebergID, name, Types.FixedType.ofLength(fixedLength),
isRequired);
-          case BINARY:
-            return getIcebergType(icebergID, name, Types.BinaryType.get(), isRequired);
-          default:
-            throw new IllegalStateException("Invalid Binary type found in ORC type attribute");
-        }
-      case DATE:
-        return getIcebergType(icebergID, name, Types.DateType.get(), isRequired);
-      case TIMESTAMP:
-        return getIcebergType(icebergID, name, Types.TimestampType.withoutZone(), isRequired);
-      case TIMESTAMP_INSTANT:
-        return getIcebergType(icebergID, name, Types.TimestampType.withZone(), isRequired);
-      case DECIMAL:
-        return getIcebergType(icebergID, name,
-            Types.DecimalType.of(orcType.getPrecision(), orcType.getScale()),
-            isRequired);
-      case STRUCT: {
-        List<String> fieldNames = orcType.getFieldNames();
-        List<TypeDescription> fieldTypes = orcType.getChildren();
-        List<Types.NestedField> fields = new ArrayList<>(fieldNames.size());
-        for (int c = 0; c < fieldNames.size(); ++c) {
-          String childName = fieldNames.get(c);
-          TypeDescription type = fieldTypes.get(c);
-          Types.NestedField field = convertOrcToIceberg(type, childName, nextID);
-          fields.add(field);
-        }
+    OrcToIcebergVisitor(Map<Integer, OrcField> icebergToOrcMapping) {

Review comment:
       Also, if field names are passed into the `record` visitor method, why is this needed?
The visitor could use a before/after field callback to track names:
   
   ```java
   private LinkedList<String> fieldNames;
   
   public beforeField(String name, TypeDescription type) {
      fieldNames.push(name);
   }
   
   public afterField(String name, TypeDescription type) {
     fieldNames.pop();
   }
   
   private Strign currentFieldName() {
     return fieldNames.peek();
   }
   ```




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscribe@iceberg.apache.org
For additional commands, e-mail: issues-help@iceberg.apache.org


Mime
View raw message