Return-Path: X-Original-To: apmail-crunch-commits-archive@www.apache.org Delivered-To: apmail-crunch-commits-archive@www.apache.org Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by minotaur.apache.org (Postfix) with SMTP id C78C5102DA for ; Fri, 3 Jan 2014 02:53:13 +0000 (UTC) Received: (qmail 25705 invoked by uid 500); 3 Jan 2014 02:53:13 -0000 Delivered-To: apmail-crunch-commits-archive@crunch.apache.org Received: (qmail 25669 invoked by uid 500); 3 Jan 2014 02:53:13 -0000 Mailing-List: contact commits-help@crunch.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@crunch.apache.org Delivered-To: mailing list commits@crunch.apache.org Received: (qmail 25660 invoked by uid 99); 3 Jan 2014 02:53:13 -0000 Received: from tyr.zones.apache.org (HELO tyr.zones.apache.org) (140.211.11.114) by apache.org (qpsmtpd/0.29) with ESMTP; Fri, 03 Jan 2014 02:53:13 +0000 Received: by tyr.zones.apache.org (Postfix, from userid 65534) id 5D5269151D6; Fri, 3 Jan 2014 02:53:13 +0000 (UTC) Content-Type: text/plain; charset="us-ascii" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit From: mkwhit@apache.org To: commits@crunch.apache.org Message-Id: <6ee416764ad04118993f54d93aa53b27@git.apache.org> X-Mailer: ASF-Git Admin Mailer Subject: git commit: CRUNCH-316: Integration test for SafeAvroSerialization and ArrayIndexOutOfBoundsException Date: Fri, 3 Jan 2014 02:53:13 +0000 (UTC) Updated Branches: refs/heads/master b4cad9454 -> 64c20ad9c CRUNCH-316: Integration test for SafeAvroSerialization and ArrayIndexOutOfBoundsException Signed-off-by: Micah Whitacre Project: http://git-wip-us.apache.org/repos/asf/crunch/repo Commit: http://git-wip-us.apache.org/repos/asf/crunch/commit/64c20ad9 Tree: http://git-wip-us.apache.org/repos/asf/crunch/tree/64c20ad9 Diff: http://git-wip-us.apache.org/repos/asf/crunch/diff/64c20ad9 Branch: refs/heads/master Commit: 64c20ad9c2361786c225d874561511ec62dd408d Parents: b4cad94 Author: Ben 
Roling Authored: Thu Jan 2 16:55:53 2014 -0600 Committer: Micah Whitacre Committed: Thu Jan 2 20:30:47 2014 -0600 ---------------------------------------------------------------------- .../types/avro/SafeAvroSerializationIT.java | 159 +++++++++++++++++++ crunch-core/src/it/resources/CRUNCH-316.avsc | 39 +++++ 2 files changed, 198 insertions(+) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/crunch/blob/64c20ad9/crunch-core/src/it/java/org/apache/crunch/types/avro/SafeAvroSerializationIT.java ---------------------------------------------------------------------- diff --git a/crunch-core/src/it/java/org/apache/crunch/types/avro/SafeAvroSerializationIT.java b/crunch-core/src/it/java/org/apache/crunch/types/avro/SafeAvroSerializationIT.java new file mode 100644 index 0000000..1bbade9 --- /dev/null +++ b/crunch-core/src/it/java/org/apache/crunch/types/avro/SafeAvroSerializationIT.java @@ -0,0 +1,159 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+*/ +package org.apache.crunch.types.avro; + +import static org.junit.Assert.assertEquals; + +import java.io.File; +import java.io.IOException; +import java.io.Serializable; +import java.util.Collection; + +import org.apache.avro.Schema; +import org.apache.avro.file.DataFileWriter; +import org.apache.avro.generic.GenericData; +import org.apache.avro.generic.GenericData.Record; +import org.apache.avro.generic.GenericDatumWriter; +import org.apache.avro.generic.GenericRecord; +import org.apache.avro.io.DatumWriter; +import org.apache.crunch.MapFn; +import org.apache.crunch.PTable; +import org.apache.crunch.Pair; +import org.apache.crunch.Pipeline; +import org.apache.crunch.impl.mr.MRPipeline; +import org.apache.crunch.io.At; +import org.apache.crunch.test.TemporaryPath; +import org.apache.crunch.test.TemporaryPaths; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; +import org.junit.Rule; +import org.junit.Test; + +@SuppressWarnings("serial") +public class SafeAvroSerializationIT implements Serializable { + @Rule + public transient TemporaryPath tmpDir = TemporaryPaths.create(); + + /** + * Test to prove CRUNCH-316 has been fixed + */ + @Test + public void testMapBufferTooSmallException() throws IOException { + Configuration configuration = tmpDir.getDefaultConfiguration(); + + // small io.sort.mb to make the test run faster with less resources + configuration.set("io.sort.mb", "1"); + + Pipeline pipeline = new MRPipeline(SafeAvroSerializationIT.class, + configuration); + + Schema schema = new Schema.Parser().parse(tmpDir + .copyResourceFile("CRUNCH-316.avsc")); + + PTable<String, GenericData.Record> leftSide = pipeline.read( + At.avroFile( + new Path(populateLeftSide(schema).getAbsolutePath()), + Avros.generics(schema))).by( + new MapFn<GenericData.Record, String>() { + @Override + public String map(GenericData.Record input) { + return (String) input.get("tag").toString(); + } + }, Avros.strings()); + + PTable<String, String> rightSide = pipeline.read( + At.avroFile(new 
Path(populateRightSide().getAbsolutePath()), + Avros.strings())).by(new MapFn<String, String>() { + @Override + public String map(String input) { + return input; + } + }, Avros.strings()); + + PTable<String, Pair<Record, String>> joinedTable = leftSide + .join(rightSide); + + // if CRUNCH-316 isn't fixed, this will result in an + // ArrayIndexOutOfBoundsException in the reduce + Collection<Pair<String, Pair<Record, String>>> joinRows = joinedTable + .asCollection().getValue(); + + assertEquals(1, joinRows.size()); + Pair<String, Pair<Record, String>> firstRow = joinRows.iterator() + .next(); + assertEquals("c", firstRow.first()); + assertEquals("c", firstRow.second().first().get("tag").toString()); + assertEquals(createString('c', 40), + firstRow.second().first().get("data1").toString()); + assertEquals(null, firstRow.second().first().get("data2")); + assertEquals("c", firstRow.second().second()); + } + + private File populateLeftSide(Schema schema) throws IOException { + File file = tmpDir.getFile("leftSide.avro"); + DatumWriter<GenericRecord> datumWriter = new GenericDatumWriter<GenericRecord>( + schema); + DataFileWriter<GenericRecord> dataFileWriter = new DataFileWriter<GenericRecord>( + datumWriter); + dataFileWriter.create(schema, file); + + GenericRecord record = new GenericData.Record(schema); + + // RECORD 1 + record.put("tag", "b"); + record.put("data1", createString('b', 996100)); + + // buffer space has to run out on a write of less than 512 bytes for the + // issue to occur + record.put("data2", createString('b', 250)); + + dataFileWriter.append(record); + + // RECORD 2 -- this record will be corrupted with overflow from RECORD 1 + record.put("tag", "c"); + record.put("data1", createString('c', 40)); + record.put("data2", null); + dataFileWriter.append(record); + + dataFileWriter.close(); + return file; + } + + private File populateRightSide() throws IOException { + File file = tmpDir.getFile("rightSide.avro"); + DatumWriter<String> datumWriter = new GenericDatumWriter<String>(Avros + .strings().getSchema()); + DataFileWriter<String> dataFileWriter = new DataFileWriter<String>( + datumWriter); + dataFileWriter.create(Avros.strings().getSchema(), 
file); + + // will join successfully to RECORD 2 from left side + dataFileWriter.append("c"); + + dataFileWriter.close(); + return file; + } + + private static String createString(Character ch, int len) { + StringBuilder buffer = new StringBuilder(len); + for (int i = 0; i < len; i++) { + buffer.append(ch); + } + return buffer.toString(); + } +} http://git-wip-us.apache.org/repos/asf/crunch/blob/64c20ad9/crunch-core/src/it/resources/CRUNCH-316.avsc ---------------------------------------------------------------------- diff --git a/crunch-core/src/it/resources/CRUNCH-316.avsc b/crunch-core/src/it/resources/CRUNCH-316.avsc new file mode 100644 index 0000000..178fa94 --- /dev/null +++ b/crunch-core/src/it/resources/CRUNCH-316.avsc @@ -0,0 +1,39 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. +*/ +{ + "name": "rec", + "namespace": "crunch", + "type": "record", + "fields": [ + { + "name": "tag", + "type": "string" + }, + { + "name": "data1", + "type": "string" + }, + { + "name": "data2", + "type": [ + "string", + "null" + ] + } + ] +}