Repository: incubator-parquet-mr
Updated Branches:
refs/heads/master 0ab001352 -> 4ed0bdf1c
PARQUET-214: Fix Avro string regression.
At some point, parquet-avro converted string fields to binary without
the UTF8 annotation. The change in PARQUET-139 to filter the file's
schema using the requested projection causes a regression because the
annotation is not present in some file schemas, but is present in the
projection schema converted from Avro.
This reverts the projection change to avoid a regression in a release.
Fixing the projection as in PARQUET-139 will need to be done as a
follow-up.
Author: Ryan Blue <blue@apache.org>
Closes #142 from rdblue/PARQUET-214-fix-avro-regression and squashes the following commits:
71e0207 [Ryan Blue] PARQUET-214: Add support for old avro.schema property.
95148f9 [Ryan Blue] PARQUET-214: Revert Schema projection change from PARQUET-139.
Project: http://git-wip-us.apache.org/repos/asf/incubator-parquet-mr/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-parquet-mr/commit/4ed0bdf1
Tree: http://git-wip-us.apache.org/repos/asf/incubator-parquet-mr/tree/4ed0bdf1
Diff: http://git-wip-us.apache.org/repos/asf/incubator-parquet-mr/diff/4ed0bdf1
Branch: refs/heads/master
Commit: 4ed0bdf1c73fd82d3080d15085675de96d5be0aa
Parents: 0ab0013
Author: Ryan Blue <blue@apache.org>
Authored: Tue Mar 31 16:49:30 2015 -0700
Committer: Ryan Blue <blue@apache.org>
Committed: Tue Mar 31 16:49:30 2015 -0700
----------------------------------------------------------------------
.../main/java/parquet/avro/AvroReadSupport.java | 9 ++--
.../parquet/avro/TestBackwardCompatibility.java | 51 +++++++++++++++++++
.../src/test/resources/strings-2.parquet | Bin 0 -> 282 bytes
3 files changed, 57 insertions(+), 3 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-parquet-mr/blob/4ed0bdf1/parquet-avro/src/main/java/parquet/avro/AvroReadSupport.java
----------------------------------------------------------------------
diff --git a/parquet-avro/src/main/java/parquet/avro/AvroReadSupport.java b/parquet-avro/src/main/java/parquet/avro/AvroReadSupport.java
index eacd369..9df3363 100644
--- a/parquet-avro/src/main/java/parquet/avro/AvroReadSupport.java
+++ b/parquet-avro/src/main/java/parquet/avro/AvroReadSupport.java
@@ -39,6 +39,8 @@ public class AvroReadSupport<T extends IndexedRecord> extends ReadSupport<T>
{
private static final String AVRO_READ_SCHEMA = "parquet.avro.read.schema";
static final String AVRO_SCHEMA_METADATA_KEY = "parquet.avro.schema";
+ // older files were written with the schema in this metadata key
+ static final String OLD_AVRO_SCHEMA_METADATA_KEY = "avro.schema";
private static final String AVRO_READ_SCHEMA_METADATA_KEY = "avro.read.schema";
public static String AVRO_DATA_SUPPLIER = "parquet.avro.data.supplier";
@@ -79,9 +81,7 @@ public class AvroReadSupport<T extends IndexedRecord> extends ReadSupport<T>
{
metadata = new LinkedHashMap<String, String>();
metadata.put(AVRO_READ_SCHEMA_METADATA_KEY, avroReadSchema);
}
- // use getSchemaForRead because it checks that the requested schema is a
- // subset of the columns in the file schema
- return new ReadContext(getSchemaForRead(fileSchema, projection), metadata);
+ return new ReadContext(projection, metadata);
}
@Override
@@ -97,6 +97,9 @@ public class AvroReadSupport<T extends IndexedRecord> extends ReadSupport<T>
{
} else if (keyValueMetaData.get(AVRO_SCHEMA_METADATA_KEY) != null) {
// use the Avro schema from the file metadata if present
avroSchema = new Schema.Parser().parse(keyValueMetaData.get(AVRO_SCHEMA_METADATA_KEY));
+ } else if (keyValueMetaData.get(OLD_AVRO_SCHEMA_METADATA_KEY) != null) {
+ // fall back to the Avro schema stored under the older metadata key
+ avroSchema = new Schema.Parser().parse(keyValueMetaData.get(OLD_AVRO_SCHEMA_METADATA_KEY));
} else {
// default to converting the Parquet schema into an Avro schema
avroSchema = new AvroSchemaConverter(configuration).convert(parquetSchema);
http://git-wip-us.apache.org/repos/asf/incubator-parquet-mr/blob/4ed0bdf1/parquet-avro/src/test/java/parquet/avro/TestBackwardCompatibility.java
----------------------------------------------------------------------
diff --git a/parquet-avro/src/test/java/parquet/avro/TestBackwardCompatibility.java b/parquet-avro/src/test/java/parquet/avro/TestBackwardCompatibility.java
new file mode 100644
index 0000000..4e614b5
--- /dev/null
+++ b/parquet-avro/src/test/java/parquet/avro/TestBackwardCompatibility.java
@@ -0,0 +1,51 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package parquet.avro;
+
+import com.google.common.io.Resources;
+import java.io.IOException;
+import org.apache.avro.generic.GenericRecord;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.junit.Assert;
+import org.junit.Test;
+import parquet.hadoop.ParquetReader;
+
+public class TestBackwardCompatibility {
+
+ @Test
+ public void testStringCompatibility() throws IOException {
+ // some older versions of Parquet used avro.schema instead of
+ // parquet.avro.schema and didn't annotate binary with UTF8 when the type
+ // was converted from an Avro string. this validates that the schema stored
+ // under the old key is recognized and used to read the file as expected.
+ Path testFile = new Path(Resources.getResource("strings-2.parquet").getFile());
+ Configuration conf = new Configuration();
+ ParquetReader<GenericRecord> reader = AvroParquetReader
+ .builder(new AvroReadSupport<GenericRecord>(), testFile)
+ .withConf(conf)
+ .build();
+ GenericRecord r;
+ while ((r = reader.read()) != null) {
+ Assert.assertTrue("Should read value into a String",
+ r.get("text") instanceof String);
+ }
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/incubator-parquet-mr/blob/4ed0bdf1/parquet-avro/src/test/resources/strings-2.parquet
----------------------------------------------------------------------
diff --git a/parquet-avro/src/test/resources/strings-2.parquet b/parquet-avro/src/test/resources/strings-2.parquet
new file mode 100644
index 0000000..3b1c94a
Binary files /dev/null and b/parquet-avro/src/test/resources/strings-2.parquet differ
|