parquet-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From w...@apache.org
Subject parquet-cpp git commit: PARQUET-769: Add support for Brotli compression
Date Sat, 26 Nov 2016 19:30:22 GMT
Repository: parquet-cpp
Updated Branches:
  refs/heads/master 3e0e5da1c -> 1219fa48f


PARQUET-769: Add support for Brotli compression

Author: Uwe L. Korn <uwelk@xhochy.com>

Closes #194 from xhochy/PARQUET-769 and squashes the following commits:

aad390f [Uwe L. Korn] Pass buffer sizes also as in parameter
9847171 [Uwe L. Korn] make format
855250d [Uwe L. Korn] make format
40e93de [Uwe L. Korn] Add FindBrotli
47b9d03 [Uwe L. Korn] PARQUET-769: Add support for Brotli compression


Project: http://git-wip-us.apache.org/repos/asf/parquet-cpp/repo
Commit: http://git-wip-us.apache.org/repos/asf/parquet-cpp/commit/1219fa48
Tree: http://git-wip-us.apache.org/repos/asf/parquet-cpp/tree/1219fa48
Diff: http://git-wip-us.apache.org/repos/asf/parquet-cpp/diff/1219fa48

Branch: refs/heads/master
Commit: 1219fa48ff8193829cd5ac5cf64b012de527eb24
Parents: 3e0e5da
Author: Uwe L. Korn <uwelk@xhochy.com>
Authored: Sat Nov 26 14:30:12 2016 -0500
Committer: Wes McKinney <wes.mckinney@twosigma.com>
Committed: Sat Nov 26 14:30:12 2016 -0500

----------------------------------------------------------------------
 CMakeLists.txt                            |  14 ++++
 cmake_modules/FindBrotli.cmake            | 105 +++++++++++++++++++++++++
 src/parquet/column/column-writer-test.cc  |  10 +++
 src/parquet/compression/brotli-codec.cc   |  53 +++++++++++++
 src/parquet/compression/codec-test.cc     |   4 +
 src/parquet/compression/codec.cc          |   2 +-
 src/parquet/compression/codec.h           |  14 ++++
 src/parquet/file/file-deserialize-test.cc |   3 +-
 src/parquet/file/file-serialize-test.cc   |   4 +
 thirdparty/build_thirdparty.sh            |   9 +++
 thirdparty/download_thirdparty.sh         |   5 ++
 thirdparty/set_thirdparty_env.sh          |   1 +
 thirdparty/versions.sh                    |   4 +
 13 files changed, 226 insertions(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/1219fa48/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/CMakeLists.txt b/CMakeLists.txt
index f0e14ae..a9fe089 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -294,6 +294,16 @@ include_directories(SYSTEM ${SNAPPY_INCLUDE_DIR})
 add_library(snappystatic STATIC IMPORTED)
 set_target_properties(snappystatic PROPERTIES IMPORTED_LOCATION ${SNAPPY_STATIC_LIB})
 
+## Brotli
+find_package(Brotli REQUIRED)
+include_directories(SYSTEM ${BROTLI_INCLUDE_DIR})
+add_library(brotlistatic_enc STATIC IMPORTED)
+set_target_properties(brotlistatic_enc PROPERTIES IMPORTED_LOCATION ${BROTLI_LIBRARY_ENC})
+add_library(brotlistatic_dec STATIC IMPORTED)
+set_target_properties(brotlistatic_dec PROPERTIES IMPORTED_LOCATION ${BROTLI_LIBRARY_DEC})
+add_library(brotlistatic_common STATIC IMPORTED)
+set_target_properties(brotlistatic_common PROPERTIES IMPORTED_LOCATION ${BROTLI_LIBRARY_COMMON})
+
 ## ZLIB
 find_package(ZLIB REQUIRED)
 include_directories(SYSTEM ${ZLIB_INCLUDE_DIRS})
@@ -512,6 +522,7 @@ set(LIBPARQUET_SRCS
   src/parquet/column/statistics.cc
 
   src/parquet/compression/codec.cc
+  src/parquet/compression/brotli-codec.cc
   src/parquet/compression/snappy-codec.cc
   src/parquet/compression/gzip-codec.cc
 
@@ -539,6 +550,9 @@ set(LIBPARQUET_LINK_LIBS
 
 set(LIBPARQUET_PRIVATE_LINK_LIBS
   parquet_thrift
+  brotlistatic_dec
+  brotlistatic_enc
+  brotlistatic_common
   snappystatic
   thriftstatic
   zlibstatic

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/1219fa48/cmake_modules/FindBrotli.cmake
----------------------------------------------------------------------
diff --git a/cmake_modules/FindBrotli.cmake b/cmake_modules/FindBrotli.cmake
new file mode 100644
index 0000000..9df15ee
--- /dev/null
+++ b/cmake_modules/FindBrotli.cmake
@@ -0,0 +1,105 @@
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Tries to find Brotli headers and libraries.
+#
+# Usage of this module as follows:
+#
+#  find_package(Brotli)
+#
+# Variables used by this module, they can change the default behaviour and need
+# to be set before calling find_package:
+#
+#  Brotli_HOME - When set, this path is inspected instead of standard library
+#                locations as the root of the Brotli installation.
+#                The environment variable BROTLI_HOME overrides this variable.
+#
+# This module defines
+#  BROTLI_INCLUDE_DIR, directory containing headers
+#  BROTLI_LIBRARY_ENC, BROTLI_LIBRARY_DEC, BROTLI_LIBRARY_COMMON, paths to the
+#    encoder, decoder and common libraries; BROTLI_LIBRARIES lists all three
+#  BROTLI_LIBS, directory containing brotli libraries
+#  BROTLI_STATIC_LIB, list of the expected static library paths
+#  BROTLI_SHARED_LIB, list of the expected shared library paths
+#  BROTLI_FOUND, whether brotli has been found
+
+if( NOT "$ENV{BROTLI_HOME}" STREQUAL "")
+    file( TO_CMAKE_PATH "$ENV{BROTLI_HOME}" _native_path )
+    list( APPEND _brotli_roots ${_native_path} )
+elseif ( Brotli_HOME )
+    list( APPEND _brotli_roots ${Brotli_HOME} )
+endif()
+
+# Try the parameterized roots, if they exist
+if ( _brotli_roots )
+    find_path( BROTLI_INCLUDE_DIR NAMES brotli/decode.h
+        PATHS ${_brotli_roots} NO_DEFAULT_PATH
+        PATH_SUFFIXES "include" )
+    find_library( BROTLI_LIBRARY_ENC NAMES brotlienc
+        PATHS ${_brotli_roots} NO_DEFAULT_PATH
+        PATH_SUFFIXES "lib/${CMAKE_LIBRARY_ARCHITECTURE}" "lib" )
+    find_library( BROTLI_LIBRARY_DEC NAMES brotlidec
+        PATHS ${_brotli_roots} NO_DEFAULT_PATH
+        PATH_SUFFIXES "lib/${CMAKE_LIBRARY_ARCHITECTURE}" "lib" )
+    find_library( BROTLI_LIBRARY_COMMON NAMES brotlicommon
+        PATHS ${_brotli_roots} NO_DEFAULT_PATH
+        PATH_SUFFIXES "lib/${CMAKE_LIBRARY_ARCHITECTURE}" "lib" )
+else ()
+    # Otherwise look for the same header and libraries in the default locations
+    find_path( BROTLI_INCLUDE_DIR NAMES brotli/decode.h )
+    find_library( BROTLI_LIBRARY_ENC NAMES brotlienc )
+    find_library( BROTLI_LIBRARY_DEC NAMES brotlidec )
+    find_library( BROTLI_LIBRARY_COMMON NAMES brotlicommon )
+endif ()
+
+set(BROTLI_LIBRARIES ${BROTLI_LIBRARY_ENC} ${BROTLI_LIBRARY_DEC}
+    ${BROTLI_LIBRARY_COMMON})
+
+# Every component must be found; a partially found list would still be truthy
+if (BROTLI_INCLUDE_DIR AND BROTLI_LIBRARY_ENC AND BROTLI_LIBRARY_DEC
+    AND BROTLI_LIBRARY_COMMON)
+  set(BROTLI_FOUND TRUE)
+  get_filename_component( BROTLI_LIBS ${BROTLI_LIBRARY_ENC} PATH )
+  set(BROTLI_LIB_NAME libbrotli)
+  set(BROTLI_STATIC_LIB
+      ${BROTLI_LIBS}/${BROTLI_LIB_NAME}enc.a
+      ${BROTLI_LIBS}/${BROTLI_LIB_NAME}dec.a
+      ${BROTLI_LIBS}/${BROTLI_LIB_NAME}common.a)
+  set(BROTLI_SHARED_LIB
+      ${BROTLI_LIBS}/${BROTLI_LIB_NAME}enc${CMAKE_SHARED_LIBRARY_SUFFIX}
+      ${BROTLI_LIBS}/${BROTLI_LIB_NAME}dec${CMAKE_SHARED_LIBRARY_SUFFIX}
+      ${BROTLI_LIBS}/${BROTLI_LIB_NAME}common${CMAKE_SHARED_LIBRARY_SUFFIX})
+else ()
+  set(BROTLI_FOUND FALSE)
+endif ()
+
+if (BROTLI_FOUND AND NOT Brotli_FIND_QUIETLY)
+  message(STATUS "Found the Brotli library: ${BROTLI_LIBRARIES}")
+elseif (NOT BROTLI_FOUND)
+  if ( _brotli_roots )
+    set(BROTLI_ERR_MSG "Could not find the Brotli library. Looked in ${_brotli_roots}.")
+  else ()
+    set(BROTLI_ERR_MSG "Could not find the Brotli library. Looked in system search paths.")
+  endif ()
+  # REQUIRED must abort the configure step even when QUIET was requested
+  if (Brotli_FIND_REQUIRED)
+    message(FATAL_ERROR "${BROTLI_ERR_MSG}")
+  elseif (NOT Brotli_FIND_QUIETLY)
+    message(STATUS "${BROTLI_ERR_MSG}")
+  endif ()
+endif ()
+
+mark_as_advanced(
+  BROTLI_INCLUDE_DIR
+  BROTLI_LIBRARY_ENC BROTLI_LIBRARY_DEC BROTLI_LIBRARY_COMMON
+)

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/1219fa48/src/parquet/column/column-writer-test.cc
----------------------------------------------------------------------
diff --git a/src/parquet/column/column-writer-test.cc b/src/parquet/column/column-writer-test.cc
index 0a20ac1..5a65175 100644
--- a/src/parquet/column/column-writer-test.cc
+++ b/src/parquet/column/column-writer-test.cc
@@ -259,6 +259,11 @@ TYPED_TEST(TestPrimitiveWriter, RequiredPlainWithSnappyCompression) {
       Encoding::PLAIN, Compression::SNAPPY, false, false, LARGE_SIZE);
 }
 
+TYPED_TEST(TestPrimitiveWriter, RequiredPlainWithBrotliCompression) {
+  this->TestRequiredWithSettings(
+      Encoding::PLAIN, Compression::BROTLI, false, false, LARGE_SIZE);
+}
+
 TYPED_TEST(TestPrimitiveWriter, RequiredPlainWithGzipCompression) {
   this->TestRequiredWithSettings(
       Encoding::PLAIN, Compression::GZIP, false, false, LARGE_SIZE);
@@ -274,6 +279,11 @@ TYPED_TEST(TestPrimitiveWriter, RequiredPlainWithStatsAndSnappyCompression) {
       Encoding::PLAIN, Compression::SNAPPY, false, true, LARGE_SIZE);
 }
 
+TYPED_TEST(TestPrimitiveWriter, RequiredPlainWithStatsAndBrotliCompression) {
+  this->TestRequiredWithSettings(
+      Encoding::PLAIN, Compression::BROTLI, false, true, LARGE_SIZE);
+}
+
 TYPED_TEST(TestPrimitiveWriter, RequiredPlainWithStatsAndGzipCompression) {
   this->TestRequiredWithSettings(
       Encoding::PLAIN, Compression::GZIP, false, true, LARGE_SIZE);

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/1219fa48/src/parquet/compression/brotli-codec.cc
----------------------------------------------------------------------
diff --git a/src/parquet/compression/brotli-codec.cc b/src/parquet/compression/brotli-codec.cc
new file mode 100644
index 0000000..24ff230
--- /dev/null
+++ b/src/parquet/compression/brotli-codec.cc
@@ -0,0 +1,53 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <cstdint>
+#include <cstdlib>
+#include <brotli/decode.h>
+#include <brotli/encode.h>
+
+#include "parquet/compression/codec.h"
+#include "parquet/exception.h"
+
+namespace parquet {
+
+void BrotliCodec::Decompress(
+    int64_t input_len, const uint8_t* input, int64_t output_len, uint8_t* output_buffer) {
+  size_t output_size = output_len;
+  if (BrotliDecoderDecompress(input_len, input, &output_size, output_buffer) !=
+      BROTLI_DECODER_RESULT_SUCCESS) {
+    throw parquet::ParquetException("Corrupt brotli compressed data.");
+  }
+}
+
+int64_t BrotliCodec::MaxCompressedLen(int64_t input_len, const uint8_t* input) {
+  return BrotliEncoderMaxCompressedSize(input_len);
+}
+
+int64_t BrotliCodec::Compress(int64_t input_len, const uint8_t* input,
+    int64_t output_buffer_len, uint8_t* output_buffer) {
+  size_t output_len = output_buffer_len;
+  // TODO: Make quality configurable. We use 8 as a default as it is the best
+  //       trade-off for Parquet workload
+  if (BrotliEncoderCompress(8, BROTLI_DEFAULT_WINDOW, BROTLI_DEFAULT_MODE, input_len,
+          input, &output_len, output_buffer) == BROTLI_FALSE) {
+    throw parquet::ParquetException("Brotli compression failure.");
+  }
+  return output_len;
+}
+
+}  // namespace parquet

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/1219fa48/src/parquet/compression/codec-test.cc
----------------------------------------------------------------------
diff --git a/src/parquet/compression/codec-test.cc b/src/parquet/compression/codec-test.cc
index 2f7cc1a..f2be84b 100644
--- a/src/parquet/compression/codec-test.cc
+++ b/src/parquet/compression/codec-test.cc
@@ -73,6 +73,10 @@ TEST(TestCompressors, Snappy) {
   CheckCodec<SnappyCodec>();
 }
 
+TEST(TestCompressors, Brotli) {
+  CheckCodec<BrotliCodec>();
+}
+
 TEST(TestCompressors, GZip) {
   CheckCodec<GZipCodec>();
 }

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/1219fa48/src/parquet/compression/codec.cc
----------------------------------------------------------------------
diff --git a/src/parquet/compression/codec.cc b/src/parquet/compression/codec.cc
index f5aaefd..a7e5fba 100644
--- a/src/parquet/compression/codec.cc
+++ b/src/parquet/compression/codec.cc
@@ -38,7 +38,7 @@ std::unique_ptr<Codec> Codec::Create(Compression::type codec_type) {
       ParquetException::NYI("LZO codec not implemented");
       break;
     case Compression::BROTLI:
-      ParquetException::NYI("BROTLI codec not implemented");
+      result.reset(new BrotliCodec());
       break;
     default:
       ParquetException::NYI("Unrecognized codec");

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/1219fa48/src/parquet/compression/codec.h
----------------------------------------------------------------------
diff --git a/src/parquet/compression/codec.h b/src/parquet/compression/codec.h
index ca823c5..e803a8c 100644
--- a/src/parquet/compression/codec.h
+++ b/src/parquet/compression/codec.h
@@ -59,6 +59,20 @@ class SnappyCodec : public Codec {
   virtual const char* name() const { return "snappy"; }
 };
 
+// Brotli codec.
+class BrotliCodec : public Codec {
+ public:
+  void Decompress(int64_t input_len, const uint8_t* input, int64_t output_len,
+      uint8_t* output_buffer) override;
+
+  int64_t Compress(int64_t input_len, const uint8_t* input,
+      int64_t output_buffer_len, uint8_t* output_buffer) override;
+
+  int64_t MaxCompressedLen(int64_t input_len, const uint8_t* input) override;
+
+  const char* name() const override { return "brotli"; }
+};
+
 // GZip codec.
 class GZipCodec : public Codec {
  public:

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/1219fa48/src/parquet/file/file-deserialize-test.cc
----------------------------------------------------------------------
diff --git a/src/parquet/file/file-deserialize-test.cc b/src/parquet/file/file-deserialize-test.cc
index 8f832df..5d97cd9 100644
--- a/src/parquet/file/file-deserialize-test.cc
+++ b/src/parquet/file/file-deserialize-test.cc
@@ -165,7 +165,8 @@ TEST_F(TestPageSerde, TestFailLargePageHeaders) {
 }
 
 TEST_F(TestPageSerde, Compression) {
-  Compression::type codec_types[2] = {Compression::GZIP, Compression::SNAPPY};
+  Compression::type codec_types[3] = {
+      Compression::GZIP, Compression::SNAPPY, Compression::BROTLI};
 
   // This is a dummy number
   data_page_header_.num_values = 32;

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/1219fa48/src/parquet/file/file-serialize-test.cc
----------------------------------------------------------------------
diff --git a/src/parquet/file/file-serialize-test.cc b/src/parquet/file/file-serialize-test.cc
index 42a73c9..3a11cd8 100644
--- a/src/parquet/file/file-serialize-test.cc
+++ b/src/parquet/file/file-serialize-test.cc
@@ -119,6 +119,10 @@ TYPED_TEST(TestSerialize, SmallFileSnappy) {
   this->FileSerializeTest(Compression::SNAPPY);
 }
 
+TYPED_TEST(TestSerialize, SmallFileBrotli) {
+  this->FileSerializeTest(Compression::BROTLI);
+}
+
 TYPED_TEST(TestSerialize, SmallFileGzip) {
   this->FileSerializeTest(Compression::GZIP);
 }

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/1219fa48/thirdparty/build_thirdparty.sh
----------------------------------------------------------------------
diff --git a/thirdparty/build_thirdparty.sh b/thirdparty/build_thirdparty.sh
index 4a91516..727e722 100755
--- a/thirdparty/build_thirdparty.sh
+++ b/thirdparty/build_thirdparty.sh
@@ -33,6 +33,7 @@ else
   for arg in "$@"; do
     case $arg in
       "arrow")      F_ARROW=1 ;;
+      "brotli")     F_BROTLI=1 ;;
       "zlib")       F_ZLIB=1 ;;
       "gbenchmark") F_GBENCHMARK=1 ;;
       "gtest")      F_GTEST=1 ;;
@@ -144,5 +145,13 @@ if [ -n "$F_ALL" -o -n "$F_ARROW" ]; then
     # :
 fi
 
+# build brotli
+if [ -n "$F_ALL" -o -n "$F_BROTLI" ]; then
+    cd $TP_DIR/$BROTLI_BASEDIR
+    cmake -DCMAKE_INSTALL_PREFIX=$PREFIX -DBUILD_SHARED_LIBS=OFF .
+    make -j$PARALLEL install
+    # :
+fi
+
 echo "---------------------"
 echo "Thirdparty dependencies built and installed into $PREFIX successfully"

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/1219fa48/thirdparty/download_thirdparty.sh
----------------------------------------------------------------------
diff --git a/thirdparty/download_thirdparty.sh b/thirdparty/download_thirdparty.sh
index 3483321..4831bbf 100755
--- a/thirdparty/download_thirdparty.sh
+++ b/thirdparty/download_thirdparty.sh
@@ -42,6 +42,11 @@ if [ ! -d ${ARROW_BASEDIR} ]; then
   download_extract_and_cleanup $ARROW_URL
 fi
 
+if [ ! -d ${BROTLI_BASEDIR} ]; then
+  echo "Fetching brotli"
+  download_extract_and_cleanup $BROTLI_URL
+fi
+
 if [ ! -d ${SNAPPY_BASEDIR} ]; then
   echo "Fetching snappy"
   download_extract_and_cleanup $SNAPPY_URL

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/1219fa48/thirdparty/set_thirdparty_env.sh
----------------------------------------------------------------------
diff --git a/thirdparty/set_thirdparty_env.sh b/thirdparty/set_thirdparty_env.sh
index 547ed54..e8a6068 100644
--- a/thirdparty/set_thirdparty_env.sh
+++ b/thirdparty/set_thirdparty_env.sh
@@ -25,6 +25,7 @@ if [ -z "$THIRDPARTY_DIR" ]; then
 fi
 
 export ARROW_HOME=$THIRDPARTY_DIR/installed
+export BROTLI_HOME=$THIRDPARTY_DIR/installed
 export SNAPPY_HOME=$THIRDPARTY_DIR/installed
 export ZLIB_HOME=$THIRDPARTY_DIR/installed
 # build script doesn't support building thrift on OSX

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/1219fa48/thirdparty/versions.sh
----------------------------------------------------------------------
diff --git a/thirdparty/versions.sh b/thirdparty/versions.sh
index 855b6f7..ff5644e 100755
--- a/thirdparty/versions.sh
+++ b/thirdparty/versions.sh
@@ -19,6 +19,10 @@ ARROW_VERSION="d946e7917d55cb220becd6469ae93430f2e60764"
 ARROW_URL="https://github.com/apache/arrow/archive/${ARROW_VERSION}.tar.gz"
 ARROW_BASEDIR="arrow-${ARROW_VERSION}"
 
+BROTLI_VERSION="5db62dcc9d386579609540cdf8869e95ad334bbd"
+BROTLI_URL="https://github.com/google/brotli/archive/${BROTLI_VERSION}.tar.gz"
+BROTLI_BASEDIR="brotli-${BROTLI_VERSION}"
+
 SNAPPY_VERSION=1.1.3
 SNAPPY_URL="https://github.com/google/snappy/releases/download/${SNAPPY_VERSION}/snappy-${SNAPPY_VERSION}.tar.gz"
 SNAPPY_BASEDIR=snappy-$SNAPPY_VERSION


Mime
View raw message