Return-Path: X-Original-To: archive-asf-public-internal@cust-asf2.ponee.io Delivered-To: archive-asf-public-internal@cust-asf2.ponee.io Received: from cust-asf.ponee.io (cust-asf.ponee.io [163.172.22.183]) by cust-asf2.ponee.io (Postfix) with ESMTP id 8729F200C24 for ; Thu, 19 Jan 2017 05:01:57 +0100 (CET) Received: by cust-asf.ponee.io (Postfix) id 84B68160B43; Thu, 19 Jan 2017 04:01:57 +0000 (UTC) Delivered-To: archive-asf-public@cust-asf.ponee.io Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by cust-asf.ponee.io (Postfix) with SMTP id D516D160B44 for ; Thu, 19 Jan 2017 05:01:56 +0100 (CET) Received: (qmail 47644 invoked by uid 500); 19 Jan 2017 04:01:56 -0000 Mailing-List: contact dev-help@orc.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@orc.apache.org Delivered-To: mailing list dev@orc.apache.org Received: (qmail 47596 invoked by uid 99); 19 Jan 2017 04:01:55 -0000 Received: from git1-us-west.apache.org (HELO git1-us-west.apache.org) (140.211.11.23) by apache.org (qpsmtpd/0.29) with ESMTP; Thu, 19 Jan 2017 04:01:55 +0000 Received: by git1-us-west.apache.org (ASF Mail Server at git1-us-west.apache.org, from userid 33) id 9B342DFA43; Thu, 19 Jan 2017 04:01:55 +0000 (UTC) From: prasanthj To: dev@orc.apache.org Reply-To: dev@orc.apache.org References: In-Reply-To: Subject: [GitHub] orc pull request #84: ORC-132. Implement a merge file method and fix the num... Content-Type: text/plain Message-Id: <20170119040155.9B342DFA43@git1-us-west.apache.org> Date: Thu, 19 Jan 2017 04:01:55 +0000 (UTC) archived-at: Thu, 19 Jan 2017 04:01:57 -0000 Github user prasanthj commented on a diff in the pull request: https://github.com/apache/orc/pull/84#discussion_r96788146 --- Diff: java/core/src/java/org/apache/orc/OrcFile.java --- @@ -642,4 +671,187 @@ public static Writer createWriter(Path path, return new WriterImpl(fs, path, opts); } + /** + * Do we understand the version in the reader? 
+ * @param path the path of the file + * @param reader the ORC file reader + * @return is the version understood by this writer? + */ + static boolean understandFormat(Path path, Reader reader) { + if (reader.getFileVersion() == Version.FUTURE) { + LOG.info("Can't merge {} because it has a future version.", path); + return false; + } + if (reader.getWriterVersion() == WriterVersion.FUTURE) { + LOG.info("Can't merge {} because it has a future writerVersion.", path); + return false; + } + return true; + } + + /** + * Is the new reader compatible with the file that is being written? + * @param schema the writer schema + * @param fileVersion the writer fileVersion + * @param writerVersion the writer writerVersion + * @param compressionSize the compression buffer size + * @param rowIndexStride the row index stride + * @param compression the compression that was used + * @param userMetadata the user metadata + * @param path the new path name for warning messages + * @param reader the new reader + * @return is the reader compatible with the previous ones? 
+ */ + static boolean readerIsCompatible(TypeDescription schema, + Version fileVersion, + WriterVersion writerVersion, + int compressionSize, + int rowIndexStride, + CompressionKind compression, + Map userMetadata, + Path path, + Reader reader) { + // now we have to check compatibility + if (!reader.getSchema().equals(schema)) { + LOG.info("Can't merge {} because of different schemas {} vs {}", + path, reader.getSchema(), schema); + return false; + } + if (reader.getCompressionKind() != compression) { + LOG.info("Can't merge {} because of different compression {} vs {}", + path, reader.getCompressionKind(), compression); + return false; + } + if (compression != CompressionKind.NONE && + reader.getCompressionSize() != compressionSize) { + LOG.info("Can't merge {} because of different compression sizes {} vs {}", + path, reader.getCompressionSize(), compressionSize); + return false; + } + if (reader.getFileVersion() != fileVersion) { + LOG.info("Can't merge {} because of different file versions {} vs {}", + path, reader.getFileVersion(), fileVersion); + return false; + } + if (reader.getWriterVersion() != writerVersion) { + LOG.info("Can't merge {} because of different writer versions {} vs {}", + path, reader.getFileVersion(), fileVersion); + return false; + } + if (reader.getRowIndexStride() != rowIndexStride) { + LOG.info("Can't merge {} because of different row index strides {} vs {}", + path, reader.getRowIndexStride(), rowIndexStride); + return false; + } + for(String key: reader.getMetadataKeys()) { + if (userMetadata.containsKey(key)) { + ByteBuffer currentValue = userMetadata.get(key); + ByteBuffer newValue = reader.getMetadataValue(key); + if (!newValue.equals(currentValue)) { + LOG.info("Can't merge {} because of different user metadata {}", path, + key); + return false; + } + } + } + return true; + } + + static void mergeMetadata(Map metadata, + Reader reader) { + for(String key: reader.getMetadataKeys()) { + metadata.put(key, 
reader.getMetadataValue(key)); + } + } + + /** + * Merges multiple ORC files that all have the same schema to produce + * a single ORC file. + * The merge will reject files that aren't compatible with the merged file + * so the output list may be shorter than the input list. + * The stripes are copied as serialized byte buffers. + * The user metadata are merged and files that disagree on the value + * associated with a key will be rejected. + * + * @param outputPath the output file + * @param options the options for writing with although the options related + * to the input files' encodings are overridden + * @param inputFiles the list of files to merge + * @return the list of files that were successfully merged + * @throws IOException + */ + public static List mergeFiles(Path outputPath, + WriterOptions options, + Path... inputFiles) throws IOException { --- End diff -- Should this take a List instead of varargs? Or add another method that accepts a List and converts it to varargs. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes to enable it, or if the feature is enabled but not working, please contact infrastructure at infrastructure@apache.org or file a JIRA ticket with INFRA. ---