parquet-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From u..@apache.org
Subject [parquet-mr] branch master updated: PARQUET-1285: [Java] SchemaConverter should not convert from TimeUnit.SECOND and TimeUnit.NANOSECOND of Arrow (#469)
Date Mon, 07 May 2018 08:12:02 GMT
This is an automated email from the ASF dual-hosted git repository.

uwe pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/parquet-mr.git


The following commit(s) were added to refs/heads/master by this push:
     new e021734  PARQUET-1285: [Java] SchemaConverter should not convert from TimeUnit.SECOND
and TimeUnit.NANOSECOND of Arrow (#469)
e021734 is described below

commit e021734b62ea5ac273e516b4ac83727cbb99ec08
Author: Masayuki Takahashi <masayuki038@gmail.com>
AuthorDate: Mon May 7 17:11:58 2018 +0900

    PARQUET-1285: [Java] SchemaConverter should not convert from TimeUnit.SECOND and TimeUnit.NANOSECOND
of Arrow (#469)
    
    * PARQUET-1285: [Java] SchemaConverter should not convert from TimeUnit.SECOND AND TimeUnit.NANOSECOND
of Arrow
    
    Arrow's 'Time' definition is below:
    
    { "name" : "time", "unit" : "SECOND|MILLISECOND|MICROSECOND|NANOSECOND", "bitWidth": /* integer: 32 or 64 */ }
    http://arrow.apache.org/docs/metadata.html
    
    But Parquet only supports 'TIME_MILLIS' and 'TIME_MICROS'.
    https://github.com/Apache/parquet-format/blob/master/LogicalTypes.md
    
    Therefore, SchemaConverter should not convert Arrow's TimeUnit.SECOND or TimeUnit.NANOSECOND to Parquet.
    
    Author: Masayuki Takahashi <masayuki038@gmail.com>
    
    * PARQUET-1285: [Java] SchemaConverter should not convert from TimeUnit.SECOND AND TimeUnit.NANOSECOND
of Arrow
    
    Since the import statements were collected, I restored them.
    
    Author: Masayuki Takahashi <masayuki038@gmail.com>
    
    * PARQUET-1285: [Java] SchemaConverter should not convert from TimeUnit.SECOND AND TimeUnit.NANOSECOND
of Arrow
    
    Remove unnecessary updates.
    
    Author: Masayuki Takahashi <masayuki038@gmail.com>
    
    * PARQUET-1285: [Java] SchemaConverter should not convert from TimeUnit.SECOND AND TimeUnit.NANOSECOND
of Arrow
    
    Remove unnecessary package name
    
    Author: Masayuki Takahashi <masayuki038@gmail.com>
    
    * PARQUET-1285: [Java] SchemaConverter should not convert from TimeUnit.SECOND AND TimeUnit.NANOSECOND
of Arrow
    
    Add a conversion pattern from Parquet's TIME_MICROS to Arrow's MICROSECOND
    
    Author: Masayuki Takahashi <masayuki038@gmail.com>
    
    * PARQUET-1285: [Java] SchemaConverter should not convert from TimeUnit.SECOND AND TimeUnit.NANOSECOND
of Arrow
    
    Fix the position of the `expected` argument in assertEquals
    
    Author: Masayuki Takahashi <masayuki038@gmail.com>
    
    * PARQUET-1285: [Java] SchemaConverter should not convert from TimeUnit.SECOND AND TimeUnit.NANOSECOND
of Arrow
    
    Add a test to convert from Parquet's TIME_MICROS to Arrow's MICROSECOND
    
    Author: Masayuki Takahashi <masayuki038@gmail.com>
---
 .../parquet/arrow/schema/SchemaConverter.java      | 24 +++++--
 .../parquet/arrow/schema/TestSchemaConverter.java  | 79 ++++++++++++++++++++--
 2 files changed, 89 insertions(+), 14 deletions(-)

diff --git a/parquet-arrow/src/main/java/org/apache/parquet/arrow/schema/SchemaConverter.java
b/parquet-arrow/src/main/java/org/apache/parquet/arrow/schema/SchemaConverter.java
index 1d69c45..f298558 100644
--- a/parquet-arrow/src/main/java/org/apache/parquet/arrow/schema/SchemaConverter.java
+++ b/parquet-arrow/src/main/java/org/apache/parquet/arrow/schema/SchemaConverter.java
@@ -28,6 +28,7 @@ import static org.apache.parquet.schema.OriginalType.INT_64;
 import static org.apache.parquet.schema.OriginalType.INT_8;
 import static org.apache.parquet.schema.OriginalType.TIMESTAMP_MILLIS;
 import static org.apache.parquet.schema.OriginalType.TIME_MILLIS;
+import static org.apache.parquet.schema.OriginalType.TIME_MICROS;
 import static org.apache.parquet.schema.OriginalType.UINT_16;
 import static org.apache.parquet.schema.OriginalType.UINT_32;
 import static org.apache.parquet.schema.OriginalType.UINT_64;
@@ -49,6 +50,7 @@ import java.util.List;
 
 import org.apache.arrow.vector.types.DateUnit;
 import org.apache.arrow.vector.types.FloatingPointPrecision;
+import org.apache.arrow.vector.types.TimeUnit;
 import org.apache.arrow.vector.types.pojo.ArrowType;
 import org.apache.arrow.vector.types.pojo.ArrowType.ArrowTypeVisitor;
 import org.apache.arrow.vector.types.pojo.ArrowType.Binary;
@@ -245,7 +247,14 @@ public class SchemaConverter {
 
       @Override
       public TypeMapping visit(Time type) {
-        return primitive(INT32, TIME_MILLIS);
+        int bitWidth = type.getBitWidth();
+        TimeUnit timeUnit = type.getUnit();
+        if (bitWidth == 32 && timeUnit == TimeUnit.MILLISECOND) {
+          return primitive(INT32, TIME_MILLIS);
+        } else if (bitWidth == 64 && timeUnit == TimeUnit.MICROSECOND) {
+          return primitive(INT64, TIME_MICROS);
+        }
+        throw new UnsupportedOperationException("Unsupported type " + type);
       }
 
       @Override
@@ -407,11 +416,11 @@ public class SchemaConverter {
           case DATE:
             return field(new ArrowType.Date(DateUnit.DAY));
           case TIMESTAMP_MICROS:
-            return field(new ArrowType.Timestamp(org.apache.arrow.vector.types.TimeUnit.MICROSECOND,
"UTC"));
+            return field(new ArrowType.Timestamp(TimeUnit.MICROSECOND, "UTC"));
           case TIMESTAMP_MILLIS:
-            return field(new ArrowType.Timestamp(org.apache.arrow.vector.types.TimeUnit.MILLISECOND,
"UTC"));
+            return field(new ArrowType.Timestamp(TimeUnit.MILLISECOND, "UTC"));
           case TIME_MILLIS:
-            return field(new ArrowType.Time(org.apache.arrow.vector.types.TimeUnit.MILLISECOND,
32));
+            return field(new ArrowType.Time(TimeUnit.MILLISECOND, 32));
           default:
           case TIME_MICROS:
           case INT_64:
@@ -456,11 +465,12 @@ public class SchemaConverter {
           case DATE:
             return field(new ArrowType.Date(DateUnit.DAY));
           case TIMESTAMP_MICROS:
-            return field(new ArrowType.Timestamp(org.apache.arrow.vector.types.TimeUnit.MICROSECOND,
"UTC"));
+            return field(new ArrowType.Timestamp(TimeUnit.MICROSECOND, "UTC"));
           case TIMESTAMP_MILLIS:
-            return field(new ArrowType.Timestamp(org.apache.arrow.vector.types.TimeUnit.MILLISECOND,
"UTC"));
-          default:
+            return field(new ArrowType.Timestamp(TimeUnit.MILLISECOND, "UTC"));
           case TIME_MICROS:
+            return field(new ArrowType.Time(TimeUnit.MICROSECOND, 64));
+          default:
           case UTF8:
           case ENUM:
           case BSON:
diff --git a/parquet-arrow/src/test/java/org/apache/parquet/arrow/schema/TestSchemaConverter.java
b/parquet-arrow/src/test/java/org/apache/parquet/arrow/schema/TestSchemaConverter.java
index 654f773..4c3da35 100644
--- a/parquet-arrow/src/test/java/org/apache/parquet/arrow/schema/TestSchemaConverter.java
+++ b/parquet-arrow/src/test/java/org/apache/parquet/arrow/schema/TestSchemaConverter.java
@@ -28,6 +28,7 @@ import static org.apache.parquet.schema.OriginalType.INT_64;
 import static org.apache.parquet.schema.OriginalType.INT_8;
 import static org.apache.parquet.schema.OriginalType.TIMESTAMP_MILLIS;
 import static org.apache.parquet.schema.OriginalType.TIME_MILLIS;
+import static org.apache.parquet.schema.OriginalType.TIME_MICROS;
 import static org.apache.parquet.schema.OriginalType.UINT_16;
 import static org.apache.parquet.schema.OriginalType.UINT_32;
 import static org.apache.parquet.schema.OriginalType.UINT_64;
@@ -43,11 +44,12 @@ import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT64;
 
 import java.io.IOException;
 import java.util.List;
-import org.apache.arrow.vector.types.IntervalUnit;
 
-import org.apache.arrow.vector.types.UnionMode;
 import org.apache.arrow.vector.types.DateUnit;
 import org.apache.arrow.vector.types.FloatingPointPrecision;
+import org.apache.arrow.vector.types.IntervalUnit;
+import org.apache.arrow.vector.types.TimeUnit;
+import org.apache.arrow.vector.types.UnionMode;
 import org.apache.arrow.vector.types.pojo.ArrowType;
 import org.apache.arrow.vector.types.pojo.Field;
 import org.apache.arrow.vector.types.pojo.Schema;
@@ -86,7 +88,7 @@ public class TestSchemaConverter {
     field("e", new ArrowType.List(), field(null, new ArrowType.Date(DateUnit.DAY))),
     field("f", new ArrowType.FixedSizeList(1), field(null, new ArrowType.Date(DateUnit.DAY))),
     field("g", new ArrowType.FloatingPoint(FloatingPointPrecision.SINGLE)),
-    field("h", new ArrowType.Timestamp(org.apache.arrow.vector.types.TimeUnit.MILLISECOND,
"UTC")),
+    field("h", new ArrowType.Timestamp(TimeUnit.MILLISECOND, "UTC")),
     field("i", new ArrowType.Interval(IntervalUnit.DAY_TIME))
   ));
   private final MessageType complexParquetSchema = Types.buildMessage()
@@ -129,11 +131,12 @@ public class TestSchemaConverter {
     field("k1", new ArrowType.Decimal(15, 5)),
     field("k2", new ArrowType.Decimal(25, 5)),
     field("l", new ArrowType.Date(DateUnit.DAY)),
-    field("m", new ArrowType.Time(org.apache.arrow.vector.types.TimeUnit.SECOND, 32)),
-    field("n", new ArrowType.Timestamp(org.apache.arrow.vector.types.TimeUnit.MILLISECOND,
"UTC")),
+    field("m", new ArrowType.Time(TimeUnit.MILLISECOND, 32)),
+    field("n", new ArrowType.Timestamp(TimeUnit.MILLISECOND, "UTC")),
     field("o", new ArrowType.Interval(IntervalUnit.DAY_TIME)),
     field("o1", new ArrowType.Interval(IntervalUnit.YEAR_MONTH))
   ));
+
   private final MessageType allTypesParquetSchema = Types.buildMessage()
     .addField(Types.optional(BINARY).named("a"))
     .addField(Types.optionalGroup()
@@ -191,8 +194,8 @@ public class TestSchemaConverter {
     field("j1", new ArrowType.Decimal(15, 5)),
     field("j2", new ArrowType.Decimal(25, 5)),
     field("k", new ArrowType.Date(DateUnit.DAY)),
-    field("l", new ArrowType.Time(org.apache.arrow.vector.types.TimeUnit.MILLISECOND, 32)),
-    field("m", new ArrowType.Timestamp(org.apache.arrow.vector.types.TimeUnit.MILLISECOND,
"UTC"))
+    field("l", new ArrowType.Time(TimeUnit.MILLISECOND, 32)),
+    field("m", new ArrowType.Timestamp(TimeUnit.MILLISECOND, "UTC"))
   ));
 
   private final MessageType supportedTypesParquetSchema = Types.buildMessage()
@@ -348,4 +351,66 @@ public class TestSchemaConverter {
     SchemaMapping map = converter.map(paperArrowSchema, Paper.schema);
     Assert.assertEquals("p, s<r<p>, r<p>>, r<s<r<s<p, p>>,
p>>", toSummaryString(map));
   }
+
+  @Test(expected = UnsupportedOperationException.class)
+  public void testArrowTimeSecondToParquet() {
+    converter.fromArrow(new Schema(asList(
+      field("a", new ArrowType.Time(TimeUnit.SECOND, 32))
+    ))).getParquetSchema();
+  }
+
+  @Test
+  public void testArrowTimeMillisecondToParquet() {
+    MessageType expected = converter.fromArrow(new Schema(asList(
+      field("a", new ArrowType.Time(TimeUnit.MILLISECOND, 32))
+    ))).getParquetSchema();
+    Assert.assertEquals(expected, Types.buildMessage().addField(Types.optional(INT32).as(TIME_MILLIS).named("a")).named("root"));
+  }
+
+  @Test
+  public void testArrowTimeMicrosecondToParquet() {
+    MessageType expected = converter.fromArrow(new Schema(asList(
+      field("a", new ArrowType.Time(TimeUnit.MICROSECOND, 64))
+    ))).getParquetSchema();
+    Assert.assertEquals(expected, Types.buildMessage().addField(Types.optional(INT64).as(TIME_MICROS).named("a")).named("root"));
+  }
+
+  @Test(expected = UnsupportedOperationException.class)
+  public void testArrowTimeNanosecondToParquet() {
+    converter.fromArrow(new Schema(asList(
+      field("a", new ArrowType.Time(TimeUnit.NANOSECOND, 64))
+    ))).getParquetSchema();
+  }
+
+  @Test
+  public void testParquetInt32TimeMillisToArrow() {
+    MessageType parquet = Types.buildMessage()
+      .addField(Types.optional(INT32).as(TIME_MILLIS).named("a")).named("root");
+    Schema expected = new Schema(asList(
+      field("a", new ArrowType.Time(TimeUnit.MILLISECOND, 32))
+    ));
+    Assert.assertEquals(expected, converter.fromParquet(parquet).getArrowSchema());
+  }
+
+  @Test
+  public void testParquetInt64TimeMicrosToArrow() {
+    MessageType parquet = Types.buildMessage()
+      .addField(Types.optional(INT64).as(TIME_MICROS).named("a")).named("root");
+    Schema expected = new Schema(asList(
+      field("a", new ArrowType.Time(TimeUnit.MICROSECOND, 64))
+    ));
+    Assert.assertEquals(expected, converter.fromParquet(parquet).getArrowSchema());
+  }
+
+  @Test(expected = IllegalStateException.class)
+  public void testParquetInt64TimeMillisToArrow() {
+    converter.fromParquet(Types.buildMessage()
+      .addField(Types.optional(INT64).as(TIME_MILLIS).named("a")).named("root"));
+  }
+
+  @Test(expected = IllegalStateException.class)
+  public void testParquetInt32TimeMicrosToArrow() {
+    converter.fromParquet(Types.buildMessage()
+      .addField(Types.optional(INT32).as(TIME_MICROS).named("a")).named("root"));
+  }
 }

-- 
To stop receiving notification emails like this one, please contact
uwe@apache.org.

Mime
View raw message