orc-dev mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From xndai <...@git.apache.org>
Subject [GitHub] orc pull request #122: ORC-192 Implement zlib compression stream
Date Mon, 22 May 2017 19:08:09 GMT
Github user xndai commented on a diff in the pull request:

    https://github.com/apache/orc/pull/122#discussion_r117822601
  
    --- Diff: c++/src/Compression.cc ---
    @@ -636,6 +884,33 @@ DIAGNOSTIC_POP
         return static_cast<uint64_t>(result);
       }
     
    +  std::unique_ptr<BufferedOutputStream>
    +     createCompressor(
    +                      CompressionKind kind,
    +                      OutputStream * outStream,
    +                      CompressionStrategy strategy,
    +                      uint64_t bufferCapacity,
    +                      uint64_t blockSize,
    +                      MemoryPool& pool) {
    +    switch (static_cast<int64_t>(kind)) {
    +    case CompressionKind_NONE: {
    +      return std::unique_ptr<BufferedOutputStream>
    +        (new BufferedOutputStream(pool, outStream, bufferCapacity, blockSize));
    +    }
    +    case CompressionKind_ZLIB: {
    +      int level = (strategy == CompressionStrategy_SPEED) ? -1 : 9;
    --- End diff --
    
    According to https://orc.apache.org/docs/hive-config.html, there are only two compression
strategies defined: SPEED and COMPRESSION. I also checked the Java implementation: SPEED maps to
zlib level Z_BEST_SPEED + 1, and COMPRESSION maps to Z_DEFAULT_COMPRESSION. I will do the
same for C++.
    
    Java implementation for your reference -
    
    WriterImpl.java
    
    `
        CompressionCodec result = physicalWriter.getCompressionCodec();
        if (result != null) {
          switch (kind) {
            case BLOOM_FILTER:
            case DATA:
            case DICTIONARY_DATA:
            case BLOOM_FILTER_UTF8:
              if (compressionStrategy == OrcFile.CompressionStrategy.SPEED) {
                result = result.modify(EnumSet.of(CompressionCodec.Modifier.FAST,
                    CompressionCodec.Modifier.TEXT));
              } else {
                result = result.modify(EnumSet.of(CompressionCodec.Modifier.DEFAULT,
                    CompressionCodec.Modifier.TEXT));
              }
              break;
            case LENGTH:
            case DICTIONARY_COUNT:
            case PRESENT:
            case ROW_INDEX:
            case SECONDARY:
              // easily compressed using the fastest modes
              result = result.modify(EnumSet.of(CompressionCodec.Modifier.FASTEST,
                  CompressionCodec.Modifier.BINARY));
              break;
            default:
              LOG.info("Missing ORC compression modifiers for " + kind);
              break;
          }
        }
    
    `
    
    ZlibCodec.java
    
    `
    public CompressionCodec modify(/* @Nullable */ EnumSet<Modifier> modifiers) {
    
        if (modifiers == null) {
          return this;
        }
    
        int l = this.level;
        int s = this.strategy;
    
        for (Modifier m : modifiers) {
          switch (m) {
          case BINARY:
            /* filtered == less LZ77, more huffman */
            s = Deflater.FILTERED;
            break;
          case TEXT:
            s = Deflater.DEFAULT_STRATEGY;
            break;
          case FASTEST:
            // deflate_fast looking for 8 byte patterns
            l = Deflater.BEST_SPEED;
            break;
          case FAST:
            // deflate_fast looking for 16 byte patterns
            l = Deflater.BEST_SPEED + 1;
            break;
          case DEFAULT:
            // deflate_slow looking for 128 byte patterns
            l = Deflater.DEFAULT_COMPRESSION;
            break;
          default:
            break;
          }
        }
        return new ZlibCodec(l, s);
      }
    `


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastructure@apache.org or file a JIRA ticket
with INFRA.
---

Mime
View raw message