orc-dev mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From prasanthj <...@git.apache.org>
Subject [GitHub] orc pull request #84: ORC-132. Implement a merge file method and fix the num...
Date Thu, 19 Jan 2017 04:01:55 GMT
Github user prasanthj commented on a diff in the pull request:

    https://github.com/apache/orc/pull/84#discussion_r96788146
  
    --- Diff: java/core/src/java/org/apache/orc/OrcFile.java ---
    @@ -642,4 +671,187 @@ public static Writer createWriter(Path path,
         return new WriterImpl(fs, path, opts);
       }
     
    +  /**
    +   * Do we understand the version in the reader?
    +   * @param path the path of the file
    +   * @param reader the ORC file reader
    +   * @return is the version understood by this writer?
    +   */
    +  static boolean understandFormat(Path path, Reader reader) {
    +    if (reader.getFileVersion() == Version.FUTURE) {
    +      LOG.info("Can't merge {} because it has a future version.", path);
    +      return false;
    +    }
    +    if (reader.getWriterVersion() == WriterVersion.FUTURE) {
    +      LOG.info("Can't merge {} because it has a future writerVersion.", path);
    +      return false;
    +    }
    +    return true;
    +  }
    +
    +  /**
    +   * Is the new reader compatible with the file that is being written?
    +   * @param schema the writer schema
    +   * @param fileVersion the writer fileVersion
    +   * @param writerVersion the writer writerVersion
    +   * @param compressionSize the compression buffer size
    +   * @param rowIndexStride the row index stride
    +   * @param compression the compression that was used
    +   * @param userMetadata the user metadata
    +   * @param path the new path name for warning messages
    +   * @param reader the new reader
    +   * @return is the reader compatible with the previous ones?
    +   */
    +  static boolean readerIsCompatible(TypeDescription schema,
    +                                    Version fileVersion,
    +                                    WriterVersion writerVersion,
    +                                    int compressionSize,
    +                                    int rowIndexStride,
    +                                    CompressionKind compression,
    +                                    Map<String, ByteBuffer> userMetadata,
    +                                    Path path,
    +                                    Reader reader) {
    +    // now we have to check compatibility
    +    if (!reader.getSchema().equals(schema)) {
    +      LOG.info("Can't merge {} because of different schemas {} vs {}",
    +          path, reader.getSchema(), schema);
    +      return false;
    +    }
    +    if (reader.getCompressionKind() != compression) {
    +      LOG.info("Can't merge {} because of different compression {} vs {}",
    +          path, reader.getCompressionKind(), compression);
    +      return false;
    +    }
    +    if (compression != CompressionKind.NONE &&
    +        reader.getCompressionSize() != compressionSize) {
    +      LOG.info("Can't merge {} because of different compression sizes {} vs {}",
    +          path, reader.getCompressionSize(), compressionSize);
    +      return false;
    +    }
    +    if (reader.getFileVersion() != fileVersion) {
    +      LOG.info("Can't merge {} because of different file versions {} vs {}",
    +          path, reader.getFileVersion(), fileVersion);
    +      return false;
    +    }
    +    if (reader.getWriterVersion() != writerVersion) {
    +      LOG.info("Can't merge {} because of different writer versions {} vs {}",
    +          path, reader.getFileVersion(), fileVersion);
    +      return false;
    +    }
    +    if (reader.getRowIndexStride() != rowIndexStride) {
    +      LOG.info("Can't merge {} because of different row index strides {} vs {}",
    +          path, reader.getRowIndexStride(), rowIndexStride);
    +      return false;
    +    }
    +    for(String key: reader.getMetadataKeys()) {
    +      if (userMetadata.containsKey(key)) {
    +        ByteBuffer currentValue = userMetadata.get(key);
    +        ByteBuffer newValue = reader.getMetadataValue(key);
    +        if (!newValue.equals(currentValue)) {
    +          LOG.info("Can't merge {} because of different user metadata {}", path,
    +              key);
    +          return false;
    +        }
    +      }
    +    }
    +    return true;
    +  }
    +
    +  static void mergeMetadata(Map<String,ByteBuffer> metadata,
    +                            Reader reader) {
    +    for(String key: reader.getMetadataKeys()) {
    +      metadata.put(key, reader.getMetadataValue(key));
    +    }
    +  }
    +
    +  /**
    +   * Merges multiple ORC files that all have the same schema to produce
    +   * a single ORC file.
    +   * The merge will reject files that aren't compatible with the merged file
    +   * so the output list may be shorter than the input list.
    +   * The stripes are copied as serialized byte buffers.
    +   * The user metadata are merged and files that disagree on the value
    +   * associated with a key will be rejected.
    +   *
    +   * @param outputPath the output file
    +   * @param options the options for writing with although the options related
    +   *                to the input files' encodings are overridden
    +   * @param inputFiles the list of files to merge
    +   * @return the list of files that were successfully merged
    +   * @throws IOException
    +   */
    +  public static List<Path> mergeFiles(Path outputPath,
    +                                      WriterOptions options,
    +                                      Path... inputFiles) throws IOException {
    --- End diff --
    
    Should this take a List instead of varargs? Or could we add another method that accepts a List and converts it to varargs?


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and would like it, or if the feature is enabled but not working, please
contact infrastructure at infrastructure@apache.org or file a JIRA ticket
with INFRA.
---

Mime
View raw message