Return-Path: X-Original-To: archive-asf-public-internal@cust-asf2.ponee.io Delivered-To: archive-asf-public-internal@cust-asf2.ponee.io Received: from cust-asf.ponee.io (cust-asf.ponee.io [163.172.22.183]) by cust-asf2.ponee.io (Postfix) with ESMTP id CC154200CEB for ; Sat, 29 Jul 2017 01:25:29 +0200 (CEST) Received: by cust-asf.ponee.io (Postfix) id CA60816DB9A; Fri, 28 Jul 2017 23:25:29 +0000 (UTC) Delivered-To: archive-asf-public@cust-asf.ponee.io Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by cust-asf.ponee.io (Postfix) with SMTP id 9FA9916DB96 for ; Sat, 29 Jul 2017 01:25:27 +0200 (CEST) Received: (qmail 20545 invoked by uid 500); 28 Jul 2017 23:25:26 -0000 Mailing-List: contact commits-help@parquet.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@parquet.apache.org Delivered-To: mailing list commits@parquet.apache.org Received: (qmail 20486 invoked by uid 99); 28 Jul 2017 23:25:26 -0000 Received: from git1-us-west.apache.org (HELO git1-us-west.apache.org) (140.211.11.23) by apache.org (qpsmtpd/0.29) with ESMTP; Fri, 28 Jul 2017 23:25:26 +0000 Received: by git1-us-west.apache.org (ASF Mail Server at git1-us-west.apache.org, from userid 33) id 57827F32C3; Fri, 28 Jul 2017 23:25:26 +0000 (UTC) Content-Type: text/plain; charset="us-ascii" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit From: blue@apache.org To: commits@parquet.apache.org Date: Fri, 28 Jul 2017 23:25:27 -0000 Message-Id: <1955db177fb54728b98a030942a9e18f@git.apache.org> In-Reply-To: References: X-Mailer: ASF-Git Admin Mailer Subject: [2/4] parquet-mr git commit: PARQUET-777: Add Parquet CLI. 
archived-at: Fri, 28 Jul 2017 23:25:30 -0000 http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/ddbeb4dd/parquet-cli/src/main/java/org/apache/parquet/cli/util/Codecs.java ---------------------------------------------------------------------- diff --git a/parquet-cli/src/main/java/org/apache/parquet/cli/util/Codecs.java b/parquet-cli/src/main/java/org/apache/parquet/cli/util/Codecs.java new file mode 100644 index 0000000..06f12fd --- /dev/null +++ b/parquet-cli/src/main/java/org/apache/parquet/cli/util/Codecs.java @@ -0,0 +1,50 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.parquet.cli.util; + +import org.apache.avro.file.CodecFactory; +import org.apache.parquet.hadoop.metadata.CompressionCodecName; + +import java.util.Locale; + +public class Codecs { + public static CompressionCodecName parquetCodec(String codec) { + try { + return CompressionCodecName.valueOf(codec.toUpperCase(Locale.ENGLISH)); + } catch (IllegalArgumentException e) { + throw new IllegalArgumentException("Unknown compression codec: " + codec); + } + } + + public static CodecFactory avroCodec(String codec) { + CompressionCodecName parquetCodec = parquetCodec(codec); + switch (parquetCodec) { + case UNCOMPRESSED: + return CodecFactory.nullCodec(); + case SNAPPY: + return CodecFactory.snappyCodec(); + case GZIP: + return CodecFactory.deflateCodec(9); + default: + throw new IllegalArgumentException( + "Codec incompatible with Avro: " + codec); + } + } +} http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/ddbeb4dd/parquet-cli/src/main/java/org/apache/parquet/cli/util/Expressions.java ---------------------------------------------------------------------- diff --git a/parquet-cli/src/main/java/org/apache/parquet/cli/util/Expressions.java b/parquet-cli/src/main/java/org/apache/parquet/cli/util/Expressions.java new file mode 100644 index 0000000..61f632a --- /dev/null +++ b/parquet-cli/src/main/java/org/apache/parquet/cli/util/Expressions.java @@ -0,0 +1,391 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.parquet.cli.util; + +import com.google.common.base.Objects; +import com.google.common.base.Preconditions; +import com.google.common.collect.Lists; +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericData; +import org.apache.avro.generic.GenericRecord; +import org.apache.avro.util.Utf8; +import java.util.Collection; +import java.util.List; +import java.util.Map; +import java.util.regex.Pattern; + + +public class Expressions { + private static final Pattern NUMERIC_RE = Pattern.compile("^\\d+$"); + + public static Object select(Schema schema, Object datum, String path) { + return select(schema, datum, Lists.newArrayList(parse(path))); + } + + @SuppressWarnings("unchecked") + private static Object select(Schema schema, Object datum, List tokens) { + if (tokens.isEmpty()) { + return datum; + } + + Preconditions.checkArgument(tokens.size() == 1, "Cannot return multiple values"); + PathExpr token = tokens.get(0); + + switch (schema.getType()) { + case RECORD: + if (!(datum instanceof GenericRecord) && "json".equals(schema.getName())) { + // skip the placeholder record schema + return select(schema.getField("value").schema(), datum, tokens); + } + Preconditions.checkArgument(token.type == PathExpr.Type.FIELD, + "Cannot dereference records"); + Preconditions.checkArgument(datum instanceof GenericRecord, + "Not a record: %s", datum); + GenericRecord record = (GenericRecord) datum; + Schema.Field field = schema.getField(token.value); + Preconditions.checkArgument(field != null, + "No such field '%s' in schema: 
%s", token.value, schema); + return select(field.schema(), record.get(token.value), token.children); + + case MAP: + Preconditions.checkArgument(datum instanceof Map, + "Not a map: %s", datum); + Map map = (Map) datum; + Object value = map.get(token.value); + if (value == null) { + // try with a Utf8 + value = map.get(new Utf8(token.value)); + } + return select(schema.getValueType(), value, token.children); + + case ARRAY: + Preconditions.checkArgument(token.type == PathExpr.Type.DEREF, + "Cannot access fields of an array"); + Preconditions.checkArgument(datum instanceof Collection, + "Not an array: %s", datum); + Preconditions.checkArgument(NUMERIC_RE.matcher(token.value).matches(), + "Not an array index: %s", token.value); + List list = (List) datum; + return select(schema.getElementType(), list.get(Integer.parseInt(token.value)), + token.children); + + case UNION: + int branch = GenericData.get().resolveUnion(schema, datum); + return select(schema.getTypes().get(branch), datum, tokens); + + default: + throw new IllegalArgumentException("Cannot access child of primitive value: " + datum); + } + } + + /** + * a.2.b[3]["key"] + * * optional (union with null) should be ignored + * * unions should match by position number or short name (e.g. 2, user) + * * fields should match by name + * * arrays are dereferenced by position [n] => schema is the element schema + * * maps are dereferenced by key => schema is the value schema + */ + public static Schema filterSchema(Schema schema, String... 
fieldPaths) { + return filterSchema(schema, Lists.newArrayList(fieldPaths)); + } + + public static Schema filterSchema(Schema schema, List fieldPaths) { + if (fieldPaths == null) { + return schema; + } + List paths = merge(Lists.newArrayList(fieldPaths)); + return filter(schema, paths); + } + + private static PathExpr parse(String path) { + PathExpr expr = null; + PathExpr last = null; + boolean inDeref = false; + boolean afterDeref = false; + int valueStart = 0; + for (int i = 0; i < path.length(); i += 1) { + switch (path.charAt(i)) { + case '.': + Preconditions.checkState(valueStart != i || afterDeref, "Empty reference: ''"); + if (!inDeref) { + if (valueStart != i) { + PathExpr current = PathExpr.field(path.substring(valueStart, i)); + if (last != null) { + last.children.add(current); + } else { + expr = current; + } + last = current; + } + valueStart = i + 1; + afterDeref = false; + } + break; + case '[': + Preconditions.checkState(!inDeref, "Cannot nest [ within []"); + Preconditions.checkState(valueStart != i || afterDeref, "Empty reference: ''"); + if (valueStart != i) { + PathExpr current = PathExpr.field(path.substring(valueStart, i)); + if (last != null) { + last.children.add(current); + } else { + expr = current; + } + last = current; + } + valueStart = i + 1; + inDeref = true; + afterDeref = false; + break; + case ']': + Preconditions.checkState(inDeref, "Cannot use ] without a starting ["); + Preconditions.checkState(valueStart != i, "Empty reference: ''"); + PathExpr current = PathExpr.deref(path.substring(valueStart, i)); + if (last != null) { + last.children.add(current); + } else { + expr = current; + } + last = current; + valueStart = i + 1; + inDeref = false; + afterDeref = true; + break; + default: + Preconditions.checkState(!afterDeref, "Fields after [] must start with ."); + } + } + Preconditions.checkState(!inDeref, "Fields after [ must end with ]"); + if (valueStart < path.length()) { + PathExpr current = 
PathExpr.field(path.substring(valueStart, path.length())); + if (last != null) { + last.children.add(current); + } else { + expr = current; + } + } + return expr; + } + + private static List merge(List fields) { + List paths = Lists.newArrayList(); + for (String field : fields) { + merge(paths, parse(field)); + } + return paths; + } + + private static List merge(List tokens, PathExpr toAdd) { + boolean merged = false; + for (PathExpr token : tokens) { + if ((token.type == toAdd.type) && + (token.type == PathExpr.Type.DEREF || token.value.equals(toAdd.value))) { + for (PathExpr child : toAdd.children) { + merge(token.children, child); + } + merged = true; + } + } + if (!merged) { + tokens.add(toAdd); + } + return tokens; + } + + private static Schema filter(Schema schema, List exprs) { + if (exprs.isEmpty()) { + return schema; + } + + switch (schema.getType()) { + case RECORD: + List fields = Lists.newArrayList(); + for (PathExpr expr : exprs) { + Schema.Field field = schema.getField(expr.value); + Preconditions.checkArgument(field != null, + "Cannot find field '%s' in schema: %s", expr.value, schema); + fields.add(new Schema.Field(expr.value, filter(field.schema(), expr.children), + field.doc(), field.defaultVal(), field.order())); + } + return Schema.createRecord(schema.getName(), + schema.getDoc(), schema.getNamespace(), schema.isError(), fields); + + case UNION: + // Ignore schemas that are a union with null because there is another token + if (schema.getTypes().size() == 2) { + if (schema.getTypes().get(0).getType() == Schema.Type.NULL) { + return filter(schema.getTypes().get(1), exprs); + } else if (schema.getTypes().get(1).getType() == Schema.Type.NULL) { + return filter(schema.getTypes().get(0), exprs); + } + } + + List schemas = Lists.newArrayList(); + for (PathExpr expr : exprs) { + schemas.add(filter(schema, expr)); + } + + if (schemas.size() > 1) { + return Schema.createUnion(schemas); + } else { + return schemas.get(0); + } + + case MAP: + 
Preconditions.checkArgument(exprs.size() == 1, + "Cannot find multiple children of map schema: %s", schema); + return filter(schema, exprs.get(0)); + + case ARRAY: + Preconditions.checkArgument(exprs.size() == 1, + "Cannot find multiple children of array schema: %s", schema); + return filter(schema, exprs.get(0)); + + default: + throw new IllegalArgumentException(String.format( + "Cannot find child of primitive schema: %s", schema)); + } + } + + private static Schema filter(Schema schema, PathExpr expr) { + if (expr == null) { + return schema; + } + + switch (schema.getType()) { + case RECORD: + Preconditions.checkArgument(expr.type == PathExpr.Type.FIELD, + "Cannot index a record: [%s]", expr.value); + Schema.Field field = schema.getField(expr.value); + if (field != null) { + return filter(field.schema(), expr.children); + } else { + throw new IllegalArgumentException(String.format( + "Cannot find field '%s' in schema: %s", expr.value, schema.toString(true))); + } + + case MAP: + return Schema.createMap(filter(schema.getValueType(), expr.children)); + + case ARRAY: + Preconditions.checkArgument(expr.type == PathExpr.Type.DEREF, + "Cannot find field '%s' in an array", expr.value); + Preconditions.checkArgument(NUMERIC_RE.matcher(expr.value).matches(), + "Cannot index array by non-numeric value '%s'", expr.value); + return Schema.createArray(filter(schema.getElementType(), expr.children)); + + case UNION: + // TODO: this should only return something if the type can match rather than explicitly + // accessing parts of a union. when selecting data, unions are ignored. 
+ Preconditions.checkArgument(expr.type == PathExpr.Type.DEREF, + "Cannot find field '%s' in a union", expr.value); + List options = schema.getTypes(); + if (NUMERIC_RE.matcher(expr.value).matches()) { + // look up the option by position + int i = Integer.parseInt(expr.value); + if (i < options.size()) { + return filter(options.get(i), expr.children); + } + } else { + // look up the option by name + for (Schema option : options) { + if (expr.value.equalsIgnoreCase(option.getName())) { + return filter(option, expr.children); + } + } + } + throw new IllegalArgumentException(String.format( + "Invalid union index '%s' for schema: %s", expr.value, schema)); + + default: + throw new IllegalArgumentException(String.format( + "Cannot find '%s' in primitive schema: %s", expr.value, schema)); + } + } + + private static class PathExpr { + enum Type { + DEREF, + FIELD + } + + static PathExpr deref(String value) { + return new PathExpr(Type.DEREF, value); + } + + static PathExpr deref(String value, PathExpr child) { + return new PathExpr(Type.DEREF, value, Lists.newArrayList(child)); + } + + static PathExpr field(String value) { + return new PathExpr(Type.FIELD, value); + } + + static PathExpr field(String value, PathExpr child) { + return new PathExpr(Type.FIELD, value, Lists.newArrayList(child)); + } + + private final Type type; + private final String value; + private final List children; + + PathExpr(Type type, String value) { + this.type = type; + this.value = value; + this.children = Lists.newArrayList(); + } + + PathExpr(Type type, String value, List children) { + this.type = type; + this.value = value; + this.children = children; + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + + PathExpr pathExpr = (PathExpr) o; + + if (type != pathExpr.type) return false; + if (value != null ? !value.equals(pathExpr.value) : pathExpr.value != null) return false; + return children != null ? 
children.equals(pathExpr.children) : pathExpr.children == null; + } + + @Override + public int hashCode() { + int result = type != null ? type.hashCode() : 0; + result = 31 * result + (value != null ? value.hashCode() : 0); + result = 31 * result + (children != null ? children.hashCode() : 0); + return result; + } + + @Override + public String toString() { + return Objects.toStringHelper(this) + .add("type", type) + .add("value", value) + .add("children", children) + .toString(); + } + } +} http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/ddbeb4dd/parquet-cli/src/main/java/org/apache/parquet/cli/util/Formats.java ---------------------------------------------------------------------- diff --git a/parquet-cli/src/main/java/org/apache/parquet/cli/util/Formats.java b/parquet-cli/src/main/java/org/apache/parquet/cli/util/Formats.java new file mode 100644 index 0000000..6895182 --- /dev/null +++ b/parquet-cli/src/main/java/org/apache/parquet/cli/util/Formats.java @@ -0,0 +1,47 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.parquet.cli.util; + +import java.io.IOException; +import java.io.InputStream; +import java.util.Arrays; + +public class Formats { + public enum Format { + PARQUET, + AVRO, + SEQUENCE, + TEXT + } + + public static Format detectFormat(InputStream stream) throws IOException { + byte[] first3 = new byte[3]; + stream.read(first3); + if (Arrays.equals(first3, new byte[]{'P', 'A', 'R'})) { + return Format.PARQUET; + } else if (Arrays.equals(first3, new byte[]{'O', 'b', 'j'})) { + return Format.AVRO; + } else if (Arrays.equals(first3, new byte[]{'S', 'E', 'Q'})) { + return Format.SEQUENCE; + } else { + return Format.TEXT; + } + } +} http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/ddbeb4dd/parquet-cli/src/main/java/org/apache/parquet/cli/util/GetClassLoader.java ---------------------------------------------------------------------- diff --git a/parquet-cli/src/main/java/org/apache/parquet/cli/util/GetClassLoader.java b/parquet-cli/src/main/java/org/apache/parquet/cli/util/GetClassLoader.java new file mode 100644 index 0000000..1cacbd5 --- /dev/null +++ b/parquet-cli/src/main/java/org/apache/parquet/cli/util/GetClassLoader.java @@ -0,0 +1,39 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.parquet.cli.util; + +import java.net.URL; +import java.net.URLClassLoader; +import java.security.PrivilegedAction; +import java.util.List; + +public class GetClassLoader implements PrivilegedAction { + private final URL[] urls; + + public GetClassLoader(List urls) { + this.urls = urls.toArray(new URL[urls.size()]); + } + + @Override + public ClassLoader run() { + return new URLClassLoader( + urls, Thread.currentThread().getContextClassLoader()); + } +} http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/ddbeb4dd/parquet-cli/src/main/java/org/apache/parquet/cli/util/RecordException.java ---------------------------------------------------------------------- diff --git a/parquet-cli/src/main/java/org/apache/parquet/cli/util/RecordException.java b/parquet-cli/src/main/java/org/apache/parquet/cli/util/RecordException.java new file mode 100644 index 0000000..f7e7b6c --- /dev/null +++ b/parquet-cli/src/main/java/org/apache/parquet/cli/util/RecordException.java @@ -0,0 +1,53 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.parquet.cli.util; + +/** + * Exception to signal that a record could not be read or written. + */ +public class RecordException extends RuntimeException { + public RecordException(String message) { + super(message); + } + + public RecordException(String message, Throwable cause) { + super(message, cause); + } + + /** + * Precondition-style validation that throws a {@link RecordException}. + * + * @param isValid + * {@code true} if valid, {@code false} if an exception should be + * thrown + * @param message + * A String message for the exception. + */ + public static void check(boolean isValid, String message, Object... args) { + if (!isValid) { + String[] argStrings = new String[args.length]; + for (int i = 0; i < args.length; i += 1) { + argStrings[i] = String.valueOf(args[i]); + } + throw new RecordException( + String.format(String.valueOf(message), (Object[]) argStrings)); + } + } +} http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/ddbeb4dd/parquet-cli/src/main/java/org/apache/parquet/cli/util/RuntimeIOException.java ---------------------------------------------------------------------- diff --git a/parquet-cli/src/main/java/org/apache/parquet/cli/util/RuntimeIOException.java b/parquet-cli/src/main/java/org/apache/parquet/cli/util/RuntimeIOException.java new file mode 100644 index 0000000..e723319 --- /dev/null +++ b/parquet-cli/src/main/java/org/apache/parquet/cli/util/RuntimeIOException.java @@ -0,0 +1,31 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.parquet.cli.util; + +import java.io.IOException; + +/** + * RuntimeException wrapper for IOExceptions + */ +public class RuntimeIOException extends RuntimeException { + public RuntimeIOException(String message, IOException cause) { + super(message, cause); + } +} http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/ddbeb4dd/parquet-cli/src/main/java/org/apache/parquet/cli/util/Schemas.java ---------------------------------------------------------------------- diff --git a/parquet-cli/src/main/java/org/apache/parquet/cli/util/Schemas.java b/parquet-cli/src/main/java/org/apache/parquet/cli/util/Schemas.java new file mode 100644 index 0000000..877c7cc --- /dev/null +++ b/parquet-cli/src/main/java/org/apache/parquet/cli/util/Schemas.java @@ -0,0 +1,498 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.parquet.cli.util; + +import com.google.common.base.Objects; +import com.google.common.base.Preconditions; +import com.google.common.collect.ImmutableList; +import com.google.common.collect.Lists; +import com.google.common.collect.Sets; +import com.google.common.io.Closeables; +import org.apache.parquet.cli.json.AvroJson; +import org.apache.avro.Schema; +import org.apache.avro.file.DataFileStream; +import org.apache.avro.generic.GenericDatumReader; +import org.apache.avro.generic.GenericRecord; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.parquet.avro.AvroSchemaConverter; +import org.apache.parquet.hadoop.ParquetFileReader; +import org.apache.parquet.hadoop.metadata.ParquetMetadata; +import org.codehaus.jackson.node.NullNode; +import java.io.IOException; +import java.io.InputStream; +import java.net.URI; +import java.util.Collection; +import java.util.Iterator; +import java.util.List; +import java.util.Set; + +public class Schemas { + + public static Schema fromAvsc(InputStream in) throws IOException { + // the parser has state, so use a new one each time + return new Schema.Parser().parse(in); + } + + public static Schema fromAvro(InputStream in) throws IOException { + GenericDatumReader datumReader = + new GenericDatumReader(); + DataFileStream stream = null; + boolean threw = true; + + try { + stream = new DataFileStream<>(in, datumReader); + Schema schema = stream.getSchema(); + threw = false; + return schema; + } finally { + Closeables.close(stream, threw); + } + } + + public static Schema fromParquet(Configuration conf, URI location) throws IOException { + Path path = new Path(location); + FileSystem fs = path.getFileSystem(conf); + + ParquetMetadata footer = ParquetFileReader.readFooter(fs.getConf(), path); + + String schemaString = 
footer.getFileMetaData() + .getKeyValueMetaData().get("parquet.avro.schema"); + if (schemaString == null) { + // try the older property + schemaString = footer.getFileMetaData() + .getKeyValueMetaData().get("avro.schema"); + } + + if (schemaString != null) { + return new Schema.Parser().parse(schemaString); + } else { + return new AvroSchemaConverter() + .convert(footer.getFileMetaData().getSchema()); + } + } + + public static Schema fromJSON(String name, InputStream in) throws IOException { + return AvroJson.inferSchema(in, name, 20); + } + + /** + * Returns whether null is allowed by the schema. + * + * @param schema a Schema + * @return true if schema allows the value to be null + */ + public static boolean nullOk(Schema schema) { + if (Schema.Type.NULL == schema.getType()) { + return true; + } else if (Schema.Type.UNION == schema.getType()) { + for (Schema possible : schema.getTypes()) { + if (nullOk(possible)) { + return true; + } + } + } + return false; + } + + /** + * Merges {@link Schema} instances if they are compatible. + *

+ * Schemas are incompatible if:
+ * <ul>
+ * <li>The {@link Schema.Type} does not match.</li>
+ * <li>For record schemas, the record name does not match</li>
+ * <li>For enum schemas, the enum name does not match</li>
+ * </ul>
+ * <p>

+ * Map value, array element, and record field types types will use unions if + * necessary, and union schemas are merged recursively. + * + * @param schemas a set of {@code Schema} instances to merge + * @return a merged {@code Schema} + * @throws IllegalStateException if the schemas are not compatible + */ + public static Schema merge(Iterable schemas) { + Iterator iter = schemas.iterator(); + if (!iter.hasNext()) { + return null; + } + Schema result = iter.next(); + while (iter.hasNext()) { + result = merge(result, iter.next()); + } + return result; + } + + /** + * Merges {@link Schema} instances and creates a union of schemas if any are + * incompatible. + *

+ * Schemas are incompatible if:
+ * <ul>
+ * <li>The {@link Schema.Type} does not match.</li>
+ * <li>For record schemas, the record name does not match</li>
+ * <li>For enum schemas, the enum name does not match</li>
+ * </ul>
+ * <p>

+ * Map value, array element, and record field types types will use unions if + * necessary, and union schemas are merged recursively. + * + * @param schemas a set of {@code Schema} instances to merge + * @return a combined {@code Schema} + */ + public static Schema mergeOrUnion(Iterable schemas) { + Iterator iter = schemas.iterator(); + if (!iter.hasNext()) { + return null; + } + Schema result = iter.next(); + while (iter.hasNext()) { + result = mergeOrUnion(result, iter.next()); + } + return result; + } + + /** + * Merges two {@link Schema} instances if they are compatible. + *

+ * Two schemas are incompatible if:
+ * <ul>
+ * <li>The {@link Schema.Type} does not match.</li>
+ * <li>For record schemas, the record name does not match</li>
+ * <li>For enum schemas, the enum name does not match</li>
+ * </ul>
+ * <p>

+ * Map value and array element types will use unions if necessary, and union + * schemas are merged recursively. + * + * @param left a {@code Schema} + * @param right a {@code Schema} + * @return a merged {@code Schema} + * @throws IllegalStateException if the schemas are not compatible + */ + public static Schema merge(Schema left, Schema right) { + Schema merged = mergeOnly(left, right); + Preconditions.checkState(merged != null, + "Cannot merge %s and %s", left, right); + return merged; + } + + /** + * Merges two {@link Schema} instances or returns {@code null}. + *

+ * The two schemas are merged if they are the same type. Records are merged
+ * if the two records have the same name or have no names but have a
+ * significant number of shared fields.
+ * <p>

+ * @see {@link #mergeOrUnion} to return a union when a merge is not possible. + * + * @param left a {@code Schema} + * @param right a {@code Schema} + * @return a {@code Schema} for both types + */ + private static Schema mergeOrUnion(Schema left, Schema right) { + Schema merged = mergeOnly(left, right); + if (merged != null) { + return merged; + } + return union(left, right); + } + + /** + * Creates a union of two {@link Schema} instances. + *

+ * If either {@code Schema} is a union, this will attempt to merge the other
+ * schema with the types contained in that union before adding more types to
+ * the union that is produced.
+ * <p>

+ * If both schemas are not unions, no merge is attempted. + * + * @param left a {@code Schema} + * @param right a {@code Schema} + * @return a UNION schema of the to {@code Schema} instances + */ + private static Schema union(Schema left, Schema right) { + if (left.getType() == Schema.Type.UNION) { + if (right.getType() == Schema.Type.UNION) { + // combine the unions by adding each type in right individually + Schema combined = left; + for (Schema type : right.getTypes()) { + combined = union(combined, type); + } + return combined; + + } else { + boolean notMerged = true; + // combine a union with a non-union by checking if each type will merge + List types = Lists.newArrayList(); + Iterator schemas = left.getTypes().iterator(); + // try to merge each type and stop when one succeeds + while (schemas.hasNext()) { + Schema next = schemas.next(); + Schema merged = mergeOnly(next, right); + if (merged != null) { + types.add(merged); + notMerged = false; + break; + } else { + // merge didn't work, add the type + types.add(next); + } + } + // add the remaining types from the left union + while (schemas.hasNext()) { + types.add(schemas.next()); + } + + if (notMerged) { + types.add(right); + } + + return Schema.createUnion(types); + } + } else if (right.getType() == Schema.Type.UNION) { + return union(right, left); + } + + return Schema.createUnion(ImmutableList.of(left, right)); + } + + /** + * Merges two {@link Schema} instances or returns {@code null}. + *

+ * The two schemas are merged if they are the same type. Records are merged + * if the two records have the same name or have no names but have a + * significant number of shared fields. + *

+ * @see {@link #mergeOrUnion} to return a union when a merge is not possible. + * + * @param left a {@code Schema} + * @param right a {@code Schema} + * @return a merged {@code Schema} or {@code null} if merging is not possible + */ + private static Schema mergeOnly(Schema left, Schema right) { + if (Objects.equal(left, right)) { + return left; + } + + // handle primitive type promotion; doesn't promote integers to floats + switch (left.getType()) { + case INT: + if (right.getType() == Schema.Type.LONG) { + return right; + } + break; + case LONG: + if (right.getType() == Schema.Type.INT) { + return left; + } + break; + case FLOAT: + if (right.getType() == Schema.Type.DOUBLE) { + return right; + } + break; + case DOUBLE: + if (right.getType() == Schema.Type.FLOAT) { + return left; + } + } + + // any other cases where the types don't match must be combined by a union + if (left.getType() != right.getType()) { + return null; + } + + switch (left.getType()) { + case UNION: + return union(left, right); + case RECORD: + if (left.getName() == null && right.getName() == null && + fieldSimilarity(left, right) < SIMILARITY_THRESH) { + return null; + } else if (!Objects.equal(left.getName(), right.getName())) { + return null; + } + + Schema combinedRecord = Schema.createRecord( + coalesce(left.getName(), right.getName()), + coalesce(left.getDoc(), right.getDoc()), + coalesce(left.getNamespace(), right.getNamespace()), + false + ); + combinedRecord.setFields(mergeFields(left, right)); + + return combinedRecord; + + case MAP: + return Schema.createMap( + mergeOrUnion(left.getValueType(), right.getValueType())); + + case ARRAY: + return Schema.createArray( + mergeOrUnion(left.getElementType(), right.getElementType())); + + case ENUM: + if (!Objects.equal(left.getName(), right.getName())) { + return null; + } + Set<String> symbols = Sets.newLinkedHashSet(); + symbols.addAll(left.getEnumSymbols()); + symbols.addAll(right.getEnumSymbols()); + return Schema.createEnum( + left.getName(), +
coalesce(left.getDoc(), right.getDoc()), + coalesce(left.getNamespace(), right.getNamespace()), + ImmutableList.copyOf(symbols) + ); + + default: + // all primitives are handled before the switch by the equality check. + // schemas that reach this point are not primitives and also not any of + // the above known types. + throw new UnsupportedOperationException( + "Unknown schema type: " + left.getType()); + } + } + + private static final Schema NULL = Schema.create(Schema.Type.NULL); + private static final NullNode NULL_DEFAULT = NullNode.getInstance(); + + /** + * Returns a union {@link Schema} of NULL and the given {@code schema}. + *

+ * A NULL schema is always the first type in the union so that a null default + * value can be set. + * + * @param schema a {@code Schema} + * @return a union of null and the given schema + */ + private static Schema nullableForDefault(Schema schema) { + if (schema.getType() == Schema.Type.NULL) { + return schema; + } + + if (schema.getType() != Schema.Type.UNION) { + return Schema.createUnion(ImmutableList.of(NULL, schema)); + } + + if (schema.getTypes().get(0).getType() == Schema.Type.NULL) { + return schema; + } + + List<Schema> types = Lists.newArrayList(); + types.add(NULL); + for (Schema type : schema.getTypes()) { + if (type.getType() != Schema.Type.NULL) { + types.add(type); + } + } + + return Schema.createUnion(types); + } + + private static List<Schema.Field> mergeFields(Schema left, Schema right) { + List<Schema.Field> fields = Lists.newArrayList(); + for (Schema.Field leftField : left.getFields()) { + Schema.Field rightField = right.getField(leftField.name()); + if (rightField != null) { + fields.add(new Schema.Field( + leftField.name(), + mergeOrUnion(leftField.schema(), rightField.schema()), + coalesce(leftField.doc(), rightField.doc()), + coalesce(leftField.defaultValue(), rightField.defaultValue()) + )); + } else { + if (leftField.defaultValue() != null) { + fields.add(copy(leftField)); + } else { + fields.add(new Schema.Field( + leftField.name(), nullableForDefault(leftField.schema()), + leftField.doc(), NULL_DEFAULT + )); + } + } + } + + for (Schema.Field rightField : right.getFields()) { + if (left.getField(rightField.name()) == null) { + if (rightField.defaultValue() != null) { + fields.add(copy(rightField)); + } else { + fields.add(new Schema.Field( + rightField.name(), nullableForDefault(rightField.schema()), + rightField.doc(), NULL_DEFAULT + )); + } + } + } + + return fields; + } + + /** + * Creates a new field with the same name, schema, doc, and default value as + * the incoming field. + *

+ * Fields cannot be used in more than one record (not Immutable?). + */ + public static Schema.Field copy(Schema.Field field) { + return new Schema.Field( + field.name(), field.schema(), field.doc(), field.defaultValue()); + } + + private static float fieldSimilarity(Schema left, Schema right) { + // check whether the unnamed records appear to be the same record + Set<String> leftNames = names(left.getFields()); + Set<String> rightNames = names(right.getFields()); + int common = Sets.intersection(leftNames, rightNames).size(); + float leftRatio = ((float) common) / ((float) leftNames.size()); + float rightRatio = ((float) common) / ((float) rightNames.size()); + return hmean(leftRatio, rightRatio); + } + + private static Set<String> names(Collection<Schema.Field> fields) { + Set<String> names = Sets.newHashSet(); + for (Schema.Field field : fields) { + names.add(field.name()); + } + return names; + } + + private static final float SIMILARITY_THRESH = 0.3f; + private static float hmean(float left, float right) { + return (left + right) > 0.0f ? (2.0f * left * right) / (left + right) : 0.0f; // 0/0 is NaN, which would slip past the threshold check + } + + /** + * Returns the first non-null object that is passed in. + */ + @SafeVarargs + private static <E> E coalesce(E... objects) { + for (E object : objects) { + if (object != null) { + return object; + } + } + return null; + } +} http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/ddbeb4dd/parquet-cli/src/main/java/org/apache/parquet/cli/util/SeekableFSDataInputStream.java ---------------------------------------------------------------------- diff --git a/parquet-cli/src/main/java/org/apache/parquet/cli/util/SeekableFSDataInputStream.java b/parquet-cli/src/main/java/org/apache/parquet/cli/util/SeekableFSDataInputStream.java new file mode 100644 index 0000000..8a8b41e --- /dev/null +++ b/parquet-cli/src/main/java/org/apache/parquet/cli/util/SeekableFSDataInputStream.java @@ -0,0 +1,76 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements.
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.parquet.cli.util; + +import org.apache.avro.file.SeekableInput; +import org.apache.hadoop.fs.FSDataInputStream; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import java.io.IOException; +import java.io.InputStream; + +/** + * A wrapper for FSDataInputStream that implements Avro's SeekableInput. 
+ */ +public class SeekableFSDataInputStream extends InputStream implements SeekableInput { + private final FSDataInputStream in; + private final FileStatus stat; + + public SeekableFSDataInputStream(FileSystem fs, Path file) throws IOException { + this.stat = fs.getFileStatus(file); // stat first: a failure here must not leak an already-open stream + this.in = fs.open(file); + } + + @Override + public void seek(long p) throws IOException { + in.seek(p); + } + + @Override + public long tell() throws IOException { + return in.getPos(); + } + + @Override + public long length() throws IOException { + return stat.getLen(); + } + + @Override + public int read(byte[] b) throws IOException { + return in.read(b); + } + + @Override + public int read() throws IOException { + return in.read(); + } + + @Override + public int read(byte[] b, int off, int len) throws IOException { + return in.read(b, off, len); + } + + @Override + public void close() throws IOException { + in.close(); + } +} http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/ddbeb4dd/parquet-cli/src/main/resources/META-INF/LICENSE ---------------------------------------------------------------------- diff --git a/parquet-cli/src/main/resources/META-INF/LICENSE b/parquet-cli/src/main/resources/META-INF/LICENSE new file mode 100644 index 0000000..2b581f8 --- /dev/null +++ b/parquet-cli/src/main/resources/META-INF/LICENSE @@ -0,0 +1,348 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity.
For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + +-------------------------------------------------------------------------------- + +This product depends on Apache Thrift and includes it in this binary artifact. + +Copyright: 2006-2010 The Apache Software Foundation. +Home page: https://thrift.apache.org/ +License: http://www.apache.org/licenses/LICENSE-2.0 + +-------------------------------------------------------------------------------- + +This product depends on SLF4J and includes SLF4J in this binary artifact. SLF4J +is a simple logging facade for Java. + +Copyright: 2004-2013 QOS.ch. 
+Home page: http://www.slf4j.org/ +License: http://slf4j.org/license.html (MIT license) + +The following is the SLF4J license (MIT): + + Copyright (c) 2004-2013 QOS.ch + All rights reserved. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +-------------------------------------------------------------------------------- + +This project includes code from Daniel Lemire's JavaFastPFOR project in this +binary artifact. The "Lemire" bit packing classes produced by parquet-generator +are derived from the JavaFastPFOR project. + +Copyright: 2013 Daniel Lemire +Home page: http://lemire.me/en/ +Project page: https://github.com/lemire/JavaFastPFOR +License: Apache License Version 2.0 http://www.apache.org/licenses/LICENSE-2.0 + +-------------------------------------------------------------------------------- + +This product depends on Apache Avro and includes it in this binary artifact. + +Copyright: 2010-2016 The Apache Software Foundation. 
+Home page: https://avro.apache.org/ +License: http://www.apache.org/licenses/LICENSE-2.0 + +-------------------------------------------------------------------------------- + +This product depends on fastutil and includes it in this binary artifact. +Fastutil provides type-specific collection implementations. + +Copyright: 2002-2014 Sebastiano Vigna +Home page: http://fasutil.di.unimi.it/ +License: http://www.apache.org/licenses/LICENSE-2.0.html + +-------------------------------------------------------------------------------- + +This product depends on Jackson and includes it in this binary artifact. +Jackson is a high-performance JSON processor. + +Copyright: 2007-2015 Tatu Saloranta and other contributors +Home page: http://jackson.codehaus.org/ +Home page: http://wiki.fasterxml.com/JacksonHome +License: http://www.apache.org/licenses/LICENSE-2.0.txt + +-------------------------------------------------------------------------------- + +This product depends on snappy-java and includes it in this binary artifact. +Snappy is a fast compression codec that aims for high speeds and reasonable +compression, developed by Google. + +Copyright: 2011 Taro L. Saito and other contributors +Home page: http://www.xerial.org/ +License: http://www.apache.org/licenses/LICENSE-2.0.txt + +-------------------------------------------------------------------------------- + +This product depends on Apache Commons and includes commons-codec, +commons-pool, and commons-compress in this binary artifact. + +Copyright: 2002-2015 The Apache Software Foundation. +Home page: https://commons.apache.org/proper/commons-codec/ +Home page: https://commons.apache.org/proper/commons-pool/ +License: http://www.apache.org/licenses/LICENSE-2.0 + +Commons Compress includes files derived from the LZMA SDK, version 9.20 (C/ and +CPP/7zip/), in the package org.apache.commons.compress.archivers.sevenz: + +| LZMA SDK is placed in the public domain. 
(http://www.7-zip.org/sdk.html) + +-------------------------------------------------------------------------------- + +This product depends on Google guava and includes it in this binary artifact. + +Copyright: 2010-2015 The Guava Authors +Home page: https://github.com/google/guava +License: http://www.apache.org/licenses/LICENSE-2.0 + +-------------------------------------------------------------------------------- + +This product depends on JCommander and includes it in this binary artifact. + +Copyright: Copyright 2012, Cedric Beust and contributors +Home page: http://jcommander.org +License: https://github.com/cbeust/jcommander/blob/master/license.txt + +-------------------------------------------------------------------------------- + +This product depends on OpenCSV and includes it in this binary artifact. + +Copyright: 2006 Glen Smith and contributors +Home page: http://opencsv.sourceforge.net/ +License: http://www.apache.org/licenses/LICENSE-2.0 + +---------------------------------------------------------------------- + +License for paranamer, included in this binary artifact: + +Copyright (c) 2006 Paul Hammant & ThoughtWorks Inc +All rights reserved. + +| Redistribution and use in source and binary forms, with or without +| modification, are permitted provided that the following conditions +| are met: +| 1. Redistributions of source code must retain the above copyright +| notice, this list of conditions and the following disclaimer. +| 2. Redistributions in binary form must reproduce the above copyright +| notice, this list of conditions and the following disclaimer in the +| documentation and/or other materials provided with the distribution. +| 3. Neither the name of the copyright holders nor the names of its +| contributors may be used to endorse or promote products derived from +| this software without specific prior written permission. 
+| +| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +| ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +| CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +| SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +| INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +| CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +| ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +| THE POSSIBILITY OF SUCH DAMAGE. + +---------------------------------------------------------------------- + +License for xz compression, included in this binary artifact: + +Home page: http://tukaani.org/xz/java.html + +| This Java implementation of XZ has been put into the public domain, thus you +| can do whatever you want with it. All the files in the package have been +| written by Lasse Collin, but some files are heavily based on public domain code +| written by Igor Pavlov. + http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/ddbeb4dd/parquet-cli/src/main/resources/META-INF/NOTICE ---------------------------------------------------------------------- diff --git a/parquet-cli/src/main/resources/META-INF/NOTICE b/parquet-cli/src/main/resources/META-INF/NOTICE new file mode 100644 index 0000000..f90733d --- /dev/null +++ b/parquet-cli/src/main/resources/META-INF/NOTICE @@ -0,0 +1,45 @@ + +Apache Parquet MR +Copyright 2016 The Apache Software Foundation + +This product includes software developed at +The Apache Software Foundation (http://www.apache.org/). + +-------------------------------------------------------------------------------- + +This project includes code from Kite, developed at Cloudera, Inc. 
with +the following copyright notice: + +| Copyright 2013 Cloudera Inc. +| +| Licensed under the Apache License, Version 2.0 (the "License"); +| you may not use this file except in compliance with the License. +| You may obtain a copy of the License at +| +| http://www.apache.org/licenses/LICENSE-2.0 +| +| Unless required by applicable law or agreed to in writing, software +| distributed under the License is distributed on an "AS IS" BASIS, +| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +| See the License for the specific language governing permissions and +| limitations under the License. + +-------------------------------------------------------------------------------- + +This project includes code from Netflix, Inc. with the following copyright +notice: + +| Copyright 2016 Netflix, Inc. +| +| Licensed under the Apache License, Version 2.0 (the "License"); +| you may not use this file except in compliance with the License. +| You may obtain a copy of the License at +| +| http://www.apache.org/licenses/LICENSE-2.0 +| +| Unless required by applicable law or agreed to in writing, software +| distributed under the License is distributed on an "AS IS" BASIS, +| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +| See the License for the specific language governing permissions and +| limitations under the License. + http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/ddbeb4dd/parquet-cli/src/main/resources/cli-logging.properties ---------------------------------------------------------------------- diff --git a/parquet-cli/src/main/resources/cli-logging.properties b/parquet-cli/src/main/resources/cli-logging.properties new file mode 100644 index 0000000..7391985 --- /dev/null +++ b/parquet-cli/src/main/resources/cli-logging.properties @@ -0,0 +1,51 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# + +# debug log4j configuration +#log4j.debug=true + +# by default, log anything but cli console to component logger +log4j.rootLogger = WARN, component + +# Set the appender named console to be a ConsoleAppender +log4j.appender.console=org.apache.log4j.ConsoleAppender + +# CLI console output +log4j.logger.org.apache.parquet.cli=INFO, console +log4j.additivity.org.apache.parquet.cli=false + +# Define the layout for console appender +log4j.appender.console.layout=org.apache.log4j.PatternLayout +log4j.appender.console.layout.ConversionPattern=%m%n + +# Change to turn on component logging +log4j.appender.component=org.apache.log4j.varia.NullAppender + +# Define the layout for component appender +log4j.appender.component.layout=org.apache.log4j.PatternLayout +log4j.appender.component.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p :: %m [%C]%n + +# silence native code warnings +log4j.logger.org.apache.hadoop.util.NativeCodeLoader=ERROR + +log4j.logger.org.apache.parquet.CorruptStatistics=ERROR + +# set up logging levels for MR +log4j.logger.org.apache.hadoop.mapred.LocalJobRunner=WARN, console +log4j.logger.org.apache.hadoop.mapreduce.Job=INFO, console http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/ddbeb4dd/parquet-common/src/main/java/org/apache/parquet/Exceptions.java 
---------------------------------------------------------------------- diff --git a/parquet-common/src/main/java/org/apache/parquet/Exceptions.java b/parquet-common/src/main/java/org/apache/parquet/Exceptions.java new file mode 100644 index 0000000..bdd531c --- /dev/null +++ b/parquet-common/src/main/java/org/apache/parquet/Exceptions.java @@ -0,0 +1,34 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.parquet; + +public class Exceptions { + /** + * If the given throwable is an instance of E, throw it as an E. 
+ */ + public static <E extends Exception> void throwIfInstance(Throwable t, + Class<E> excClass) + throws E { + if (excClass.isAssignableFrom(t.getClass())) { + // the throwable is already an exception, so throw it + throw excClass.cast(t); + } + } +} http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/ddbeb4dd/parquet-common/src/main/java/org/apache/parquet/util/DynConstructors.java ---------------------------------------------------------------------- diff --git a/parquet-common/src/main/java/org/apache/parquet/util/DynConstructors.java b/parquet-common/src/main/java/org/apache/parquet/util/DynConstructors.java new file mode 100644 index 0000000..e1dddf1 --- /dev/null +++ b/parquet-common/src/main/java/org/apache/parquet/util/DynConstructors.java @@ -0,0 +1,273 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.parquet.util; + +import org.apache.parquet.Preconditions; +import java.lang.reflect.Constructor; +import java.lang.reflect.InvocationTargetException; +import java.security.AccessController; +import java.security.PrivilegedAction; +import java.util.HashMap; +import java.util.Map; + +import static org.apache.parquet.Exceptions.throwIfInstance; + +public class DynConstructors { + public static class Ctor<C> extends DynMethods.UnboundMethod { + private final Constructor<C> ctor; + private final Class<? extends C> constructed; + + private Ctor(Constructor<C> constructor, Class<? extends C> constructed) { + super(null, "newInstance"); + this.ctor = constructor; + this.constructed = constructed; + } + + public Class<? extends C> getConstructedClass() { + return constructed; + } + + public C newInstanceChecked(Object... args) throws Exception { + try { + return ctor.newInstance(args); + } catch (InstantiationException e) { + throw e; + } catch (IllegalAccessException e) { + throw e; + } catch (InvocationTargetException e) { + throwIfInstance(e.getCause(), Exception.class); + throwIfInstance(e.getCause(), RuntimeException.class); + throw new RuntimeException(e.getCause()); + } + } + + public C newInstance(Object... args) { + try { + return newInstanceChecked(args); + } catch (Exception e) { + throwIfInstance(e, RuntimeException.class); + throw new RuntimeException(e); + } + } + + @Override + @SuppressWarnings("unchecked") + public <R> R invoke(Object target, Object... args) { + Preconditions.checkArgument(target == null, + "Invalid call to constructor: target must be null"); + return (R) newInstance(args); + } + + @Override + @SuppressWarnings("unchecked") + public <R> R invokeChecked(Object target, Object... 
args) throws Exception { + Preconditions.checkArgument(target == null, + "Invalid call to constructor: target must be null"); + return (R) newInstanceChecked(args); + } + + @Override + public DynMethods.BoundMethod bind(Object receiver) { + throw new IllegalStateException("Cannot bind constructors"); + } + + @Override + public boolean isStatic() { + return true; + } + + @Override + public String toString() { + return getClass().getSimpleName() + + "(constructor=" + ctor + ", class=" + constructed + ")"; + } + } + + public static class Builder { + private final Class<?> baseClass; + private ClassLoader loader = Thread.currentThread().getContextClassLoader(); + private Ctor ctor = null; + private Map<String, Throwable> problems = new HashMap<String, Throwable>(); + + public Builder(Class<?> baseClass) { + this.baseClass = baseClass; + } + + public Builder() { + this.baseClass = null; + } + + /** + * Set the {@link ClassLoader} used to lookup classes by name. + * <p>

+ * If not set, the current thread's ClassLoader is used. + * + * @param loader a ClassLoader + * @return this Builder for method chaining + */ + public Builder loader(ClassLoader loader) { + this.loader = loader; + return this; + } + + public Builder impl(String className, Class<?>... types) { + // don't do any work if an implementation has been found + if (ctor != null) { + return this; + } + + try { + Class<?> targetClass = Class.forName(className, true, loader); + impl(targetClass, types); + } catch (NoClassDefFoundError e) { + // cannot load this implementation + problems.put(className, e); + } catch (ClassNotFoundException e) { + // not the right implementation + problems.put(className, e); + } + return this; + } + + public <T> Builder impl(Class<T> targetClass, Class<?>... types) { + // don't do any work if an implementation has been found + if (ctor != null) { + return this; + } + + try { + ctor = new Ctor<T>(targetClass.getConstructor(types), targetClass); + } catch (NoSuchMethodException e) { + // not the right implementation + problems.put(methodName(targetClass, types), e); + } + return this; + } + + public Builder hiddenImpl(Class<?>... types) { + hiddenImpl(baseClass, types); + return this; + } + + @SuppressWarnings("unchecked") + public Builder hiddenImpl(String className, Class<?>... types) { + // don't do any work if an implementation has been found + if (ctor != null) { + return this; + } + + try { + Class targetClass = Class.forName(className, true, loader); + hiddenImpl(targetClass, types); + } catch (NoClassDefFoundError e) { + // cannot load this implementation + problems.put(className, e); + } catch (ClassNotFoundException e) { + // not the right implementation + problems.put(className, e); + } + return this; + } + + public <T> Builder hiddenImpl(Class<T> targetClass, Class<?>... 
types) { + // don't do any work if an implementation has been found + if (ctor != null) { + return this; + } + + try { + Constructor hidden = targetClass.getDeclaredConstructor(types); + AccessController.doPrivileged(new MakeAccessible(hidden)); + ctor = new Ctor(hidden, targetClass); + } catch (SecurityException e) { + // unusable + problems.put(methodName(targetClass, types), e); + } catch (NoSuchMethodException e) { + // not the right implementation + problems.put(methodName(targetClass, types), e); + } + return this; + } + + @SuppressWarnings("unchecked") + public <C> Ctor<C> buildChecked() throws NoSuchMethodException { + if (ctor != null) { + return ctor; + } + throw new NoSuchMethodException("Cannot find constructor for " + + baseClass + "\n" + formatProblems(problems)); + } + + @SuppressWarnings("unchecked") + public <C> Ctor<C> build() { + if (ctor != null) { + return ctor; + } + throw new RuntimeException("Cannot find constructor for " + + baseClass + "\n" + formatProblems(problems)); + } + } + + private static class MakeAccessible implements PrivilegedAction<Void> { + private Constructor<?> hidden; + + public MakeAccessible(Constructor<?> hidden) { + this.hidden = hidden; + } + + @Override + public Void run() { + hidden.setAccessible(true); + return null; + } + } + + private static String formatProblems(Map<String, Throwable> problems) { + StringBuilder sb = new StringBuilder(); + boolean first = true; + for (Map.Entry<String, Throwable> problem : problems.entrySet()) { + if (first) { + first = false; + } else { + sb.append("\n"); + } + sb.append("\tMissing ").append(problem.getKey()).append(" [") + .append(problem.getValue().getClass().getName()).append(": ") + .append(problem.getValue().getMessage()).append("]"); + } + return sb.toString(); + } + + private static String methodName(Class<?> targetClass, Class<?>... 
types) { + StringBuilder sb = new StringBuilder(); + sb.append(targetClass.getName()).append("("); + boolean first = true; + for (Class<?> type : types) { + if (first) { + first = false; + } else { + sb.append(","); + } + sb.append(type.getName()); + } + sb.append(")"); + return sb.toString(); + } +}