parquet-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From b...@apache.org
Subject incubator-parquet-mr git commit: PARQUET-214: Fix Avro string regression.
Date Tue, 31 Mar 2015 23:49:34 GMT
Repository: incubator-parquet-mr
Updated Branches:
  refs/heads/master 0ab001352 -> 4ed0bdf1c


PARQUET-214: Fix Avro string regression.

At some point, parquet-avro converted string fields to binary without
the UTF8 annotation. The change in PARQUET-139 to filter the file's
schema using the requested projection causes a regression because the
annotation is not present in some file schemas, but is present in the
projection schema converted from Avro.

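This reverts the projection change to avoid a regression in a release.
Fixing the projection as in PARQUET-139 will need to be done as a
follow-up.

This also adds a fallback that reads the Avro schema from the older
"avro.schema" metadata key, which older files used in place of
"parquet.avro.schema".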

Author: Ryan Blue <blue@apache.org>

Closes #142 from rdblue/PARQUET-214-fix-avro-regression and squashes the following commits:

71e0207 [Ryan Blue] PARQUET-214: Add support for old avro.schema property.
95148f9 [Ryan Blue] PARQUET-214: Revert Schema projection change from PARQUET-139.


Project: http://git-wip-us.apache.org/repos/asf/incubator-parquet-mr/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-parquet-mr/commit/4ed0bdf1
Tree: http://git-wip-us.apache.org/repos/asf/incubator-parquet-mr/tree/4ed0bdf1
Diff: http://git-wip-us.apache.org/repos/asf/incubator-parquet-mr/diff/4ed0bdf1

Branch: refs/heads/master
Commit: 4ed0bdf1c73fd82d3080d15085675de96d5be0aa
Parents: 0ab0013
Author: Ryan Blue <blue@apache.org>
Authored: Tue Mar 31 16:49:30 2015 -0700
Committer: Ryan Blue <blue@apache.org>
Committed: Tue Mar 31 16:49:30 2015 -0700

----------------------------------------------------------------------
 .../main/java/parquet/avro/AvroReadSupport.java |   9 ++--
 .../parquet/avro/TestBackwardCompatibility.java |  51 +++++++++++++++++++
 .../src/test/resources/strings-2.parquet        | Bin 0 -> 282 bytes
 3 files changed, 57 insertions(+), 3 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-parquet-mr/blob/4ed0bdf1/parquet-avro/src/main/java/parquet/avro/AvroReadSupport.java
----------------------------------------------------------------------
diff --git a/parquet-avro/src/main/java/parquet/avro/AvroReadSupport.java b/parquet-avro/src/main/java/parquet/avro/AvroReadSupport.java
index eacd369..9df3363 100644
--- a/parquet-avro/src/main/java/parquet/avro/AvroReadSupport.java
+++ b/parquet-avro/src/main/java/parquet/avro/AvroReadSupport.java
@@ -39,6 +39,8 @@ public class AvroReadSupport<T extends IndexedRecord> extends ReadSupport<T>
{
   private static final String AVRO_READ_SCHEMA = "parquet.avro.read.schema";
 
   static final String AVRO_SCHEMA_METADATA_KEY = "parquet.avro.schema";
+  // older files were written with the schema in this metadata key
+  static final String OLD_AVRO_SCHEMA_METADATA_KEY = "avro.schema";
   private static final String AVRO_READ_SCHEMA_METADATA_KEY = "avro.read.schema";
 
   public static String AVRO_DATA_SUPPLIER = "parquet.avro.data.supplier";
@@ -79,9 +81,7 @@ public class AvroReadSupport<T extends IndexedRecord> extends ReadSupport<T>
{
       metadata = new LinkedHashMap<String, String>();
       metadata.put(AVRO_READ_SCHEMA_METADATA_KEY, avroReadSchema);
     }
-    // use getSchemaForRead because it checks that the requested schema is a
-    // subset of the columns in the file schema
-    return new ReadContext(getSchemaForRead(fileSchema, projection), metadata);
+    return new ReadContext(projection, metadata);
   }
 
   @Override
@@ -97,6 +97,9 @@ public class AvroReadSupport<T extends IndexedRecord> extends ReadSupport<T>
{
     } else if (keyValueMetaData.get(AVRO_SCHEMA_METADATA_KEY) != null) {
       // use the Avro schema from the file metadata if present
       avroSchema = new Schema.Parser().parse(keyValueMetaData.get(AVRO_SCHEMA_METADATA_KEY));
+    } else if (keyValueMetaData.get(OLD_AVRO_SCHEMA_METADATA_KEY) != null) {
+      // use the Avro schema from the file metadata if present
+      avroSchema = new Schema.Parser().parse(keyValueMetaData.get(OLD_AVRO_SCHEMA_METADATA_KEY));
     } else {
       // default to converting the Parquet schema into an Avro schema
       avroSchema = new AvroSchemaConverter(configuration).convert(parquetSchema);

http://git-wip-us.apache.org/repos/asf/incubator-parquet-mr/blob/4ed0bdf1/parquet-avro/src/test/java/parquet/avro/TestBackwardCompatibility.java
----------------------------------------------------------------------
diff --git a/parquet-avro/src/test/java/parquet/avro/TestBackwardCompatibility.java b/parquet-avro/src/test/java/parquet/avro/TestBackwardCompatibility.java
new file mode 100644
index 0000000..4e614b5
--- /dev/null
+++ b/parquet-avro/src/test/java/parquet/avro/TestBackwardCompatibility.java
@@ -0,0 +1,51 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package parquet.avro;
+
+import com.google.common.io.Resources;
+import java.io.IOException;
+import org.apache.avro.generic.GenericRecord;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.junit.Assert;
+import org.junit.Test;
+import parquet.hadoop.ParquetReader;
+
+public class TestBackwardCompatibility {
+
+  @Test
+  public void testStringCompatibility() throws IOException {
+    // some older versions of Parquet used avro.schema instead of
+    // parquet.avro.schema and didn't annotate binary with UTF8 when the type
+    // was converted from an Avro string. this validates that the old read
+    // schema is recognized and used to read the file as expected.
+    Path testFile = new Path(Resources.getResource("strings-2.parquet").getFile());
+    Configuration conf = new Configuration();
+    ParquetReader<GenericRecord> reader = AvroParquetReader
+        .builder(new AvroReadSupport<GenericRecord>(), testFile)
+        .withConf(conf)
+        .build();
+    GenericRecord r;
+    while ((r = reader.read()) != null) {
+      Assert.assertTrue("Should read value into a String",
+          r.get("text") instanceof String);
+    }
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/incubator-parquet-mr/blob/4ed0bdf1/parquet-avro/src/test/resources/strings-2.parquet
----------------------------------------------------------------------
diff --git a/parquet-avro/src/test/resources/strings-2.parquet b/parquet-avro/src/test/resources/strings-2.parquet
new file mode 100644
index 0000000..3b1c94a
Binary files /dev/null and b/parquet-avro/src/test/resources/strings-2.parquet differ


Mime
View raw message