parquet-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From w...@apache.org
Subject parquet-cpp git commit: PARQUET-512: Add Google benchmark for performance testing
Date Mon, 02 May 2016 00:53:14 GMT
Repository: parquet-cpp
Updated Branches:
  refs/heads/master 674dbb39c -> ff14d97ef


PARQUET-512: Add Google benchmark for performance testing

Based on @emkornfield 's work in https://github.com/apache/arrow/pull/29

Author: Uwe L. Korn <uwelk@xhochy.com>

Closes #93 from xhochy/parquet-512 and squashes the following commits:

ebc10d2 [Uwe L. Korn] Fix signed/unsigned comparison
684dbc6 [Uwe L. Korn] Fix c&p bug
5a8e239 [Uwe L. Korn] Build benchmarks but don't run them in Travis
e7dc34c [Uwe L. Korn] Remove Arrow references
f6b02da [Uwe L. Korn] PARQUET-512: Add Google benchmark for performance testing


Project: http://git-wip-us.apache.org/repos/asf/parquet-cpp/repo
Commit: http://git-wip-us.apache.org/repos/asf/parquet-cpp/commit/ff14d97e
Tree: http://git-wip-us.apache.org/repos/asf/parquet-cpp/tree/ff14d97e
Diff: http://git-wip-us.apache.org/repos/asf/parquet-cpp/diff/ff14d97e

Branch: refs/heads/master
Commit: ff14d97ef46168a7c505c554c0335f91b351f0a4
Parents: 674dbb3
Author: Uwe L. Korn <uwelk@xhochy.com>
Authored: Sun May 1 17:53:07 2016 -0700
Committer: Wes McKinney <wesm@apache.org>
Committed: Sun May 1 17:53:07 2016 -0700

----------------------------------------------------------------------
 .travis.yml                                 |   2 +-
 CMakeLists.txt                              |  84 ++++++++++++++-
 README.md                                   |  22 +++-
 build-support/run-test.sh                   | 132 +++++++++++++++++------
 ci/travis_script_cpp.sh                     |   4 +-
 cmake_modules/FindGBenchmark.cmake          |  88 +++++++++++++++
 src/parquet/column/CMakeLists.txt           |   2 +
 src/parquet/column/column-io-benchmark.cc   | 119 ++++++++++++++++++++
 src/parquet/encodings/CMakeLists.txt        |   1 +
 src/parquet/encodings/encoding-benchmark.cc |  87 +++++++++++++++
 src/parquet/util/CMakeLists.txt             |  14 +++
 src/parquet/util/benchmark_main.cc          |  24 +++++
 thirdparty/build_thirdparty.sh              |  24 ++++-
 thirdparty/download_thirdparty.sh           |   5 +
 thirdparty/set_thirdparty_env.sh            |   1 +
 thirdparty/versions.sh                      |   5 +
 16 files changed, 573 insertions(+), 41 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/ff14d97e/.travis.yml
----------------------------------------------------------------------
diff --git a/.travis.yml b/.travis.yml
index 87ee43b..8fe291c 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -28,7 +28,7 @@ matrix:
     os: linux
     before_script:
     - source $TRAVIS_BUILD_DIR/ci/before_script_travis.sh
-    - cmake -DCMAKE_CXX_FLAGS="-Werror" -DPARQUET_TEST_MEMCHECK=ON -DPARQUET_GENERATE_COVERAGE=1
$TRAVIS_BUILD_DIR
+    - cmake -DCMAKE_CXX_FLAGS="-Werror" -DPARQUET_TEST_MEMCHECK=ON -DPARQUET_BUILD_BENCHMARKS=ON
-DPARQUET_GENERATE_COVERAGE=1 $TRAVIS_BUILD_DIR
     - export PARQUET_TEST_DATA=$TRAVIS_BUILD_DIR/data
   - compiler: clang
     os: linux

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/ff14d97e/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 56e9dea..39f7585 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -74,6 +74,9 @@ if ("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}")
   option(PARQUET_USE_SSE
 	"Build with SSE4 optimizations"
 	OFF)
+  option(PARQUET_BUILD_BENCHMARKS
+	"Build the libparquet benchmark suite"
+    OFF)
   option(PARQUET_BUILD_TESTS
 	"Build the libparquet test suite"
 	ON)
@@ -103,6 +106,60 @@ else()
 endif()
 
 ############################################################
+# Benchmarking
+############################################################
+# Add a new micro benchmark, with or without an executable that should be built.
+# If benchmarks are enabled then they will be run alongside unit tests with ctest.
+# Use 'make runbenchmark' or 'make unittest' to build/run only the benchmarks or
+# only the unit tests, respectively.
+#
+# REL_BENCHMARK_NAME is the name of the benchmark app. It may be a single component
+# (e.g. monotime-benchmark) or contain additional components (e.g.
+# net/net_util-benchmark). Either way, the last component must be a globally
+# unique name.
+
+# The benchmark will be registered as a unit test with ctest with a label
+# of 'benchmark'.
+#
+# Arguments after the test name will be passed to set_tests_properties().
+function(ADD_PARQUET_BENCHMARK REL_BENCHMARK_NAME)
+  # Benchmarks are opt-in: do nothing unless PARQUET_BUILD_BENCHMARKS is ON.
+  if(NOT PARQUET_BUILD_BENCHMARKS)
+    return()
+  endif()
+  # Strip directory components and the extension to get the target/test name.
+  get_filename_component(BENCHMARK_NAME ${REL_BENCHMARK_NAME} NAME_WE)
+
+  if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/${REL_BENCHMARK_NAME}.cc")
+    # This benchmark has a corresponding .cc file, set it up as an executable.
+    set(BENCHMARK_PATH "${EXECUTABLE_OUTPUT_PATH}/${BENCHMARK_NAME}")
+    add_executable(${BENCHMARK_NAME} "${REL_BENCHMARK_NAME}.cc")
+    target_link_libraries(${BENCHMARK_NAME} ${PARQUET_BENCHMARK_LINK_LIBS})
+    # The 'runbenchmark' target must build this executable before it runs.
+    add_dependencies(runbenchmark ${BENCHMARK_NAME})
+    set(NO_COLOR "--color_print=false")
+  else()
+    # No executable, just invoke the benchmark (probably a script) directly.
+    set(BENCHMARK_PATH "${CMAKE_CURRENT_SOURCE_DIR}/${REL_BENCHMARK_NAME}")
+    set(NO_COLOR "")
+  endif()
+
+  # run-test.sh arguments: <log root> <run type> <executable> [executable args...].
+  # Paths are quoted so directories containing spaces stay a single argument;
+  # NO_COLOR is left unquoted on purpose so an empty value adds no argument.
+  add_test(${BENCHMARK_NAME}
+    "${BUILD_SUPPORT_DIR}/run-test.sh" "${CMAKE_BINARY_DIR}" benchmark "${BENCHMARK_PATH}" ${NO_COLOR})
+  set_tests_properties(${BENCHMARK_NAME} PROPERTIES LABELS "benchmark")
+  if(ARGN)
+    # Extra arguments are forwarded as additional test properties.
+    set_tests_properties(${BENCHMARK_NAME} PROPERTIES ${ARGN})
+  endif()
+endfunction()
+
+# A wrapper for add_dependencies() that is a no-op when PARQUET_BUILD_BENCHMARKS is OFF.
+#
+# REL_BENCHMARK_NAME is reduced to its last path component without extension,
+# which is the benchmark target name; the remaining arguments are the targets
+# that benchmark target depends on.
+function(ADD_PARQUET_BENCHMARK_DEPENDENCIES REL_BENCHMARK_NAME)
+  if(NOT PARQUET_BUILD_BENCHMARKS)
+    return()
+  endif()
+  # The output variable must be BENCHMARK_NAME, since that is what is expanded below.
+  get_filename_component(BENCHMARK_NAME ${REL_BENCHMARK_NAME} NAME_WE)
+
+  add_dependencies(${BENCHMARK_NAME} ${ARGN})
+endfunction()
+
+############################################################
 # Testing
 ############################################################
 
@@ -113,6 +170,9 @@ endif()
 # net/net_util-test). Either way, the last component must be a globally
 # unique name.
 #
+# The unit test is added with a label of "unittest" to support filtering with
+# ctest.
+#
 # Arguments after the test name will be passed to set_tests_properties().
 function(ADD_PARQUET_TEST REL_TEST_NAME)
   if(NOT PARQUET_BUILD_TESTS)
@@ -124,6 +184,7 @@ function(ADD_PARQUET_TEST REL_TEST_NAME)
     # This test has a corresponding .cc file, set it up as an executable.
     set(TEST_PATH "${EXECUTABLE_OUTPUT_PATH}/${TEST_NAME}")
     add_executable(${TEST_NAME} "${REL_TEST_NAME}.cc")
+    add_dependencies(unittest ${TEST_NAME})
 
 	if(APPLE)
 	  # On OS X / Thrift >= 0.9.2, tr1/tuple.h is not in libc++
@@ -149,8 +210,9 @@ function(ADD_PARQUET_TEST REL_TEST_NAME)
 	  valgrind --tool=memcheck --leak-check=full --error-exitcode=1 ${TEST_PATH})
   else()
 	add_test(${TEST_NAME}
-      ${BUILD_SUPPORT_DIR}/run-test.sh ${TEST_PATH})
+        ${BUILD_SUPPORT_DIR}/run-test.sh ${CMAKE_BINARY_DIR} test ${TEST_PATH})
   endif()
+  set_tests_properties(${TEST_NAME} PROPERTIES LABELS "unittest")
   if(ARGN)
     set_tests_properties(${TEST_NAME} PROPERTIES ${ARGN})
   endif()
@@ -213,11 +275,26 @@ add_library(zlibstatic STATIC IMPORTED)
 set_target_properties(zlibstatic PROPERTIES IMPORTED_LOCATION ${ZLIB_STATIC_LIB})
 
 ## GTest
+add_custom_target(unittest ctest -L unittest)
 find_package(GTest REQUIRED)
 include_directories(SYSTEM ${GTEST_INCLUDE_DIR})
 add_library(gtest STATIC IMPORTED)
 set_target_properties(gtest PROPERTIES IMPORTED_LOCATION ${GTEST_STATIC_LIB})
 
+## Google Benchmark
+# Without a GBENCHMARK_HOME environment variable, fall back to the thirdparty
+# install prefix. FindGBenchmark.cmake reads the CMake variable GBenchmark_HOME
+# (CMake variable names are case-sensitive), so that spelling is set as well.
+if ("$ENV{GBENCHMARK_HOME}" STREQUAL "")
+  set(GBENCHMARK_HOME ${THIRDPARTY_DIR}/installed)
+  set(GBenchmark_HOME ${GBENCHMARK_HOME})
+endif()
+
+if(PARQUET_BUILD_BENCHMARKS)
+  # 'make runbenchmark' runs only the ctest entries labelled "benchmark".
+  add_custom_target(runbenchmark ctest -L benchmark)
+  find_package(GBenchmark REQUIRED)
+  include_directories(SYSTEM ${GBENCHMARK_INCLUDE_DIR})
+  message(STATUS "GBenchmark static library: ${GBENCHMARK_STATIC_LIB}")
+  # Expose the static archive as an imported target so benchmarks can link it by name.
+  add_library(gbenchmark STATIC IMPORTED)
+  set_target_properties(gbenchmark PROPERTIES IMPORTED_LOCATION ${GBENCHMARK_STATIC_LIB})
+endif()
+
 # Thrift requires these definitions for some types that we use
 add_definitions(-DHAVE_INTTYPES_H -DHAVE_NETINET_IN_H -DHAVE_NETDB_H)
 add_definitions(-fPIC)
@@ -332,6 +409,11 @@ set(PARQUET_MIN_TEST_LIBS
 set(PARQUET_TEST_LINK_LIBS ${PARQUET_MIN_TEST_LIBS})
 
 #############################################################
+# Benchmark linking
+
+set(PARQUET_BENCHMARK_LINK_LIBS parquet parquet_benchmark_main)
+
+#############################################################
 # Code coverage
 
 # Adapted from Apache Kudu (incubating)

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/ff14d97e/README.md
----------------------------------------------------------------------
diff --git a/README.md b/README.md
index a38b555..cf48ff7 100644
--- a/README.md
+++ b/README.md
@@ -24,6 +24,7 @@
 - zlib
 - thrift 0.7+ [install instructions](https://thrift.apache.org/docs/install/)
 - googletest 1.7.0 (cannot be installed with package managers)
+- Google Benchmark (only required if building benchmarks)
 
 You can install these dependencies using a package manager or using the
 `thirdparty/` scripts in this repository. On Homebrew, you can run:
@@ -87,7 +88,7 @@ This library uses Google's `googletest` unit test framework. After building
 with `make`, you can run the test suite by running
 
 ```
-ctest
+make unittest
 ```
 
 The test suite relies on an environment variable `PARQUET_TEST_DATA` pointing
@@ -107,6 +108,19 @@ you can use valgrind with ctest to look for memory leaks:
 valgrind --tool=memcheck --leak-check=yes ctest
 ```
 
+## Building/Running benchmarks
+
+Follow the directions for simple build except run cmake
+with the `-DPARQUET_BUILD_BENCHMARKS` parameter set correctly:
+
+    cmake -DPARQUET_BUILD_BENCHMARKS=ON ..
+
+and instead of make unittest run either `make; ctest` to run both unit tests
+and benchmarks or `make runbenchmark` to run only the benchmark tests.
+
+Benchmark logs will be placed in the build directory under `build/benchmark-logs`.
+
+
 ## Out-of-source builds
 
 parquet-cpp supports out of source builds. For example:
@@ -116,7 +130,7 @@ mkdir test-build
 cd test-build
 cmake ..
 make
-ctest
+ctest -L unittest
 ```
 
 By using out-of-source builds you can preserve your current build state in case
@@ -172,7 +186,7 @@ mkdir coverage-build
 cd coverage-build
 cmake -DPARQUET_GENERATE_COVERAGE=1
 make -j$PARALLEL
-ctest
+ctest -L unittest
 ```
 
 The `gcov` artifacts are not located in a place that works well with either
@@ -205,4 +219,4 @@ coveralls -t $PARQUET_CPP_COVERAGE_TOKEN --gcov-options '\-l' -r $PARQUET_ROOT
-
 
 
 Note that `gcov` throws off artifacts from the STL, so I excluded my toolchain
-root stored in `$NATIVE_TOOLCHAIN` to avoid a cluttered coverage report.
\ No newline at end of file
+root stored in `$NATIVE_TOOLCHAIN` to avoid a cluttered coverage report.

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/ff14d97e/build-support/run-test.sh
----------------------------------------------------------------------
diff --git a/build-support/run-test.sh b/build-support/run-test.sh
index 889e2a2..7c3b570 100755
--- a/build-support/run-test.sh
+++ b/build-support/run-test.sh
@@ -20,15 +20,23 @@
 # Script which wraps running a test and redirects its output to a
 # test log directory.
 #
-# If PARQUET_COMPRESS_TEST_OUTPUT is non-empty, then the logs will be
-# gzip-compressed while they are written.
+# Arguments:
+#    $1 - Base path for logs/artifacts.
+#    $2 - type of test (e.g. test or benchmark)
+#    $3 - path to executable
+#    $ARGN - arguments for executable
+#
 
+OUTPUT_ROOT=$1
+shift
 ROOT=$(cd $(dirname $BASH_SOURCE)/..; pwd)
 
-TEST_LOGDIR=$ROOT/build/test-logs
+TEST_LOGDIR=$OUTPUT_ROOT/build/$1-logs
 mkdir -p $TEST_LOGDIR
 
-TEST_DEBUGDIR=$ROOT/build/test-debug
+RUN_TYPE=$1
+shift
+TEST_DEBUGDIR=$OUTPUT_ROOT/build/$RUN_TYPE-debug
 mkdir -p $TEST_DEBUGDIR
 
 TEST_DIRNAME=$(cd $(dirname $1); pwd)
@@ -37,11 +45,8 @@ shift
 TEST_EXECUTABLE="$TEST_DIRNAME/$TEST_FILENAME"
 TEST_NAME=$(echo $TEST_FILENAME | perl -pe 's/\..+?$//') # Remove path and extension (if
any).
 
-TEST_EXECUTION_ATTEMPTS=1
-
-
 # We run each test in its own subdir to avoid core file related races.
-TEST_WORKDIR=$ROOT/build/test-work/$TEST_NAME
+TEST_WORKDIR=$OUTPUT_ROOT/build/test-work/$TEST_NAME
 mkdir -p $TEST_WORKDIR
 pushd $TEST_WORKDIR >/dev/null || exit 1
 rm -f *
@@ -51,40 +56,57 @@ set -o pipefail
 LOGFILE=$TEST_LOGDIR/$TEST_NAME.txt
 XMLFILE=$TEST_LOGDIR/$TEST_NAME.xml
 
-# Remove both the compressed and uncompressed output, so the developer
-# doesn't accidentally get confused and read output from a prior test
-# run.
+TEST_EXECUTION_ATTEMPTS=1
+
+# Remove any output (uncompressed or compressed) left over from a prior run, so the
+# developer doesn't accidentally get confused and read stale output.
 rm -f $LOGFILE $LOGFILE.gz
 
-if [ -n "$PARQUET_COMPRESS_TEST_OUTPUT" ] && [ "$PARQUET_COMPRESS_TEST_OUTPUT" -ne
0 ] ; then
-  pipe_cmd=gzip
-  LOGFILE=${LOGFILE}.gz
-else
-  pipe_cmd=cat
-fi
+pipe_cmd=cat
 
 # Allow for collecting core dumps.
 PARQUET_TEST_ULIMIT_CORE=${PARQUET_TEST_ULIMIT_CORE:-0}
 ulimit -c $PARQUET_TEST_ULIMIT_CORE
 
-# Run the actual test.
-for ATTEMPT_NUMBER in $(seq 1 $TEST_EXECUTION_ATTEMPTS) ; do
-  if [ $ATTEMPT_NUMBER -lt $TEST_EXECUTION_ATTEMPTS ]; then
-    # If the test fails, the test output may or may not be left behind,
-    # depending on whether the test cleaned up or exited immediately. Either
-    # way we need to clean it up. We do this by comparing the data directory
-    # contents before and after the test runs, and deleting anything new.
-    #
-    # The comm program requires that its two inputs be sorted.
-    TEST_TMPDIR_BEFORE=$(find $TEST_TMPDIR -maxdepth 1 -type d | sort)
+
+function setup_sanitizers() {
+  # Sets environment variables for the different sanitizers (this configures how the
+  # run_test function works).
+
+  # Configure TSAN (ignored if this isn't a TSAN build).
+  #
+  # Deadlock detection (new in clang 3.5) is disabled because:
+  # 1. The clang 3.5 deadlock detector crashes in some unit tests. It
+  #    needs compiler-rt commits c4c3dfd, 9a8efe3, and possibly others.
+  # 2. Many unit tests report lock-order-inversion warnings; they should be
+  #    fixed before reenabling the detector.
+  TSAN_OPTIONS="$TSAN_OPTIONS detect_deadlocks=0"
+  TSAN_OPTIONS="$TSAN_OPTIONS suppressions=$ROOT/build-support/tsan-suppressions.txt"
+  TSAN_OPTIONS="$TSAN_OPTIONS history_size=7"
+  export TSAN_OPTIONS
+
+  # Enable leak detection even under LLVM 3.4, where it was disabled by default.
+  # This flag only takes effect when running an ASAN build.
+  ASAN_OPTIONS="$ASAN_OPTIONS detect_leaks=1"
+  export ASAN_OPTIONS
+
+  # Set up suppressions for LeakSanitizer
+  LSAN_OPTIONS="$LSAN_OPTIONS suppressions=$ROOT/build-support/lsan-suppressions.txt"
+  export LSAN_OPTIONS
+
+  # Suppressions require symbolization. We'll default to using the symbolizer in
+  # thirdparty.
+  if [ -z "$ASAN_SYMBOLIZER_PATH" ]; then
+    export ASAN_SYMBOLIZER_PATH=$(find $NATIVE_TOOLCHAIN/llvm-3.7.0/bin -name llvm-symbolizer)
   fi
+}
+
+function run_test() {
+  # Run gtest style tests with sanitizers if they are setup appropriately.
 
   # gtest won't overwrite old junit test files, resulting in a build failure
   # even when retries are successful.
   rm -f $XMLFILE
 
-  echo "Running $TEST_NAME, redirecting output into $LOGFILE" \
-    "(attempt ${ATTEMPT_NUMBER}/$TEST_EXECUTION_ATTEMPTS)"
   $TEST_EXECUTABLE "$@" 2>&1 \
     | $ROOT/build-support/stacktrace_addr2line.pl $TEST_EXECUTABLE \
     | $pipe_cmd > $LOGFILE
@@ -104,6 +126,46 @@ for ATTEMPT_NUMBER in $(seq 1 $TEST_EXECUTION_ATTEMPTS) ; do
     STATUS=1
     rm -f $XMLFILE
   fi
+}
+
+function post_process_tests() {
+  # If we have a LeakSanitizer report, and XML reporting is configured, add a new test
+  # case result to the XML file for the leak report. Otherwise Jenkins won't show
+  # us which tests had LSAN errors.
+  if zgrep --silent "ERROR: LeakSanitizer: detected memory leaks" $LOGFILE ; then
+      echo Test had memory leaks. Editing XML
+      perl -p -i -e '
+      if (m#</testsuite>#) {
+        print "<testcase name=\"LeakSanitizer\" status=\"run\" classname=\"LSAN\">\n";
+        print "  <failure message=\"LeakSanitizer failed\" type=\"\">\n";
+        print "    See txt log file for details\n";
+        print "  </failure>\n";
+        print "</testcase>\n";
+      }' $XMLFILE
+  fi
+}
+
+function run_other() {
+  # Generic run function for test-like executables that aren't actually gtest.
+  $TEST_EXECUTABLE "$@" 2>&1 | $pipe_cmd > $LOGFILE  # stdout and stderr go through $pipe_cmd into $LOGFILE
+  STATUS=$?  # exit status of the pipeline; read by the retry loop that calls this function
+}
+
+if [ $RUN_TYPE = "test" ]; then
+    setup_sanitizers
+fi
+
+# Run the actual test.
+for ATTEMPT_NUMBER in $(seq 1 $TEST_EXECUTION_ATTEMPTS) ; do
+  if [ $ATTEMPT_NUMBER -lt $TEST_EXECUTION_ATTEMPTS ]; then
+    # If the test fails, the test output may or may not be left behind,
+    # depending on whether the test cleaned up or exited immediately. Either
+    # way we need to clean it up. We do this by comparing the data directory
+    # contents before and after the test runs, and deleting anything new.
+    #
+    # The comm program requires that its two inputs be sorted.
+    TEST_TMPDIR_BEFORE=$(find $TEST_TMPDIR -maxdepth 1 -type d | sort)
+  fi
 
   if [ $ATTEMPT_NUMBER -lt $TEST_EXECUTION_ATTEMPTS ]; then
     # Now delete any new test output.
@@ -123,7 +185,13 @@ for ATTEMPT_NUMBER in $(seq 1 $TEST_EXECUTION_ATTEMPTS) ; do
       fi
     done
   fi
-
+  echo "Running $TEST_NAME, redirecting output into $LOGFILE" \
+    "(attempt ${ATTEMPT_NUMBER}/$TEST_EXECUTION_ATTEMPTS)"
+  # Forward the remaining arguments verbatim: "$@" (unlike $*) keeps arguments
+  # that contain whitespace intact for the executable.
+  if [ "$RUN_TYPE" = "test" ]; then
+    run_test "$@"
+  else
+    run_other "$@"
+  fi
   if [ "$STATUS" -eq "0" ]; then
     break
   elif [ "$ATTEMPT_NUMBER" -lt "$TEST_EXECUTION_ATTEMPTS" ]; then
@@ -132,6 +200,10 @@ for ATTEMPT_NUMBER in $(seq 1 $TEST_EXECUTION_ATTEMPTS) ; do
   fi
 done
 
+if [ $RUN_TYPE = "test" ]; then
+  post_process_tests
+fi
+
 # Capture and compress core file and binary.
 COREFILES=$(ls | grep ^core)
 if [ -n "$COREFILES" ]; then

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/ff14d97e/ci/travis_script_cpp.sh
----------------------------------------------------------------------
diff --git a/ci/travis_script_cpp.sh b/ci/travis_script_cpp.sh
index c8c0ac0..16291a9 100755
--- a/ci/travis_script_cpp.sh
+++ b/ci/travis_script_cpp.sh
@@ -14,13 +14,13 @@ fi
 
 if [ $TRAVIS_OS_NAME == "linux" ]; then
   make -j4 || exit 1
-  ctest || { cat $TRAVIS_BUILD_DIR/parquet-build/Testing/Temporary/LastTest.log; exit 1;
}
+  ctest -L unittest || { cat $TRAVIS_BUILD_DIR/parquet-build/Testing/Temporary/LastTest.log;
exit 1; }
   sudo pip install cpp_coveralls
   export PARQUET_ROOT=$TRAVIS_BUILD_DIR
   $TRAVIS_BUILD_DIR/ci/upload_coverage.sh
 else
   make -j4 || exit 1
-  ctest || { cat $TRAVIS_BUILD_DIR/parquet-build/Testing/Temporary/LastTest.log; exit 1;
}
+  ctest -L unittest || { cat $TRAVIS_BUILD_DIR/parquet-build/Testing/Temporary/LastTest.log;
exit 1; }
 fi
 
 popd

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/ff14d97e/cmake_modules/FindGBenchmark.cmake
----------------------------------------------------------------------
diff --git a/cmake_modules/FindGBenchmark.cmake b/cmake_modules/FindGBenchmark.cmake
new file mode 100644
index 0000000..3e46a60
--- /dev/null
+++ b/cmake_modules/FindGBenchmark.cmake
@@ -0,0 +1,88 @@
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Tries to find Google benchmark headers and libraries.
+#
+# Usage of this module as follows:
+#
+#  find_package(GBenchmark)
+#
+# Variables used by this module, they can change the default behaviour and need
+# to be set before calling find_package:
+#
+#  GBenchmark_HOME - When set, this path is inspected instead of standard library
+#                    locations as the root of the benchmark installation.
+#                    The environment variable GBENCHMARK_HOME overrides this variable.
+#
+# This module defines
+#  GBENCHMARK_INCLUDE_DIR, directory containing benchmark header directory
+#  GBENCHMARK_LIBS, directory containing benchmark libraries
+#  GBENCHMARK_STATIC_LIB, path to libbenchmark.a
+#  GBENCHMARK_FOUND, whether gbenchmark has been found
+
+# Collect candidate installation roots. The GBENCHMARK_HOME environment variable
+# takes precedence over the GBenchmark_HOME CMake variable.
+if( NOT "$ENV{GBENCHMARK_HOME}" STREQUAL "")
+    file( TO_CMAKE_PATH "$ENV{GBENCHMARK_HOME}" _native_path )
+    list( APPEND _gbenchmark_roots ${_native_path} )
+elseif ( GBenchmark_HOME )
+    list( APPEND _gbenchmark_roots ${GBenchmark_HOME} )
+endif()
+
+# Try the parameterized roots, if they exist
+if ( _gbenchmark_roots )
+    find_path( GBENCHMARK_INCLUDE_DIR NAMES benchmark/benchmark.h
+        PATHS ${_gbenchmark_roots} NO_DEFAULT_PATH
+        PATH_SUFFIXES "include" )
+    find_library( GBENCHMARK_LIBRARIES NAMES benchmark
+        PATHS ${_gbenchmark_roots} NO_DEFAULT_PATH
+        PATH_SUFFIXES "lib" )
+else ()
+    # Fall back to the default search paths, looking for the same header
+    # (benchmark/benchmark.h) as the parameterized search above.
+    find_path( GBENCHMARK_INCLUDE_DIR NAMES benchmark/benchmark.h )
+    find_library( GBENCHMARK_LIBRARIES NAMES benchmark )
+endif ()
+
+
+if (GBENCHMARK_INCLUDE_DIR AND GBENCHMARK_LIBRARIES)
+  set(GBENCHMARK_FOUND TRUE)
+  # Derive the library directory and the static archive path from the located library.
+  get_filename_component( GBENCHMARK_LIBS ${GBENCHMARK_LIBRARIES} PATH )
+  set(GBENCHMARK_LIB_NAME libbenchmark)
+  set(GBENCHMARK_STATIC_LIB ${GBENCHMARK_LIBS}/${GBENCHMARK_LIB_NAME}.a)
+else ()
+  set(GBENCHMARK_FOUND FALSE)
+endif ()
+
+if (GBENCHMARK_FOUND)
+  if (NOT GBenchmark_FIND_QUIETLY)
+    message(STATUS "Found the GBenchmark library: ${GBENCHMARK_LIBRARIES}")
+  endif ()
+else ()
+  set(GBENCHMARK_ERR_MSG "Could not find the GBenchmark library. Looked in")
+  if ( _gbenchmark_roots )
+    set(GBENCHMARK_ERR_MSG "${GBENCHMARK_ERR_MSG} ${_gbenchmark_roots}.")
+  else ()
+    set(GBENCHMARK_ERR_MSG "${GBENCHMARK_ERR_MSG} system search paths.")
+  endif ()
+  # A REQUIRED package that is missing is always fatal, even when QUIET was
+  # requested; QUIET only silences the informational message.
+  if (GBenchmark_FIND_REQUIRED)
+    message(FATAL_ERROR "${GBENCHMARK_ERR_MSG}")
+  elseif (NOT GBenchmark_FIND_QUIETLY)
+    message(STATUS "${GBENCHMARK_ERR_MSG}")
+  endif ()
+endif ()
+
+mark_as_advanced(
+  GBENCHMARK_INCLUDE_DIR
+  GBENCHMARK_LIBS
+  GBENCHMARK_LIBRARIES
+  GBENCHMARK_STATIC_LIB
+)

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/ff14d97e/src/parquet/column/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/src/parquet/column/CMakeLists.txt b/src/parquet/column/CMakeLists.txt
index e11c7a8..ace0072 100644
--- a/src/parquet/column/CMakeLists.txt
+++ b/src/parquet/column/CMakeLists.txt
@@ -27,3 +27,5 @@ ADD_PARQUET_TEST(column-reader-test)
 ADD_PARQUET_TEST(column-writer-test)
 ADD_PARQUET_TEST(levels-test)
 ADD_PARQUET_TEST(scanner-test)
+
+ADD_PARQUET_BENCHMARK(column-io-benchmark)

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/ff14d97e/src/parquet/column/column-io-benchmark.cc
----------------------------------------------------------------------
diff --git a/src/parquet/column/column-io-benchmark.cc b/src/parquet/column/column-io-benchmark.cc
new file mode 100644
index 0000000..8007ed5
--- /dev/null
+++ b/src/parquet/column/column-io-benchmark.cc
@@ -0,0 +1,119 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "benchmark/benchmark.h"
+
+#include "parquet/file/reader-internal.h"
+#include "parquet/file/writer-internal.h"
+#include "parquet/column/reader.h"
+#include "parquet/column/writer.h"
+#include "parquet/util/input.h"
+
+namespace parquet {
+
+using format::ColumnChunk;
+using schema::PrimitiveNode;
+
+namespace benchmark {
+
+std::unique_ptr<Int64Writer> BuildWriter(int64_t output_size, OutputStream* dst,
+    ColumnChunk* metadata, ColumnDescriptor* schema) {
+  std::unique_ptr<SerializedPageWriter> pager(
+      new SerializedPageWriter(dst, Compression::UNCOMPRESSED, metadata));
+  return std::unique_ptr<Int64Writer>(
+      new Int64Writer(schema, std::move(pager), output_size));
+}
+
+std::shared_ptr<ColumnDescriptor> Int64Schema(Repetition::type repetition) {
+  auto node = PrimitiveNode::Make("int64", repetition, Type::INT64);
+  return std::make_shared<ColumnDescriptor>(
+      node, repetition != Repetition::REQUIRED, repetition == Repetition::REPEATED);
+}
+
+template <Repetition::type repetition>
+static void BM_WriteInt64Column(::benchmark::State& state) {
+  format::ColumnChunk metadata;
+  std::vector<int64_t> values(state.range_x(), 128);
+  std::vector<int16_t> definition_levels(state.range_x(), 1);
+  std::vector<int16_t> repetition_levels(state.range_x(), 0);
+  std::shared_ptr<ColumnDescriptor> schema = Int64Schema(repetition);
+
+  while (state.KeepRunning()) {
+    InMemoryOutputStream dst;
+    std::unique_ptr<Int64Writer> writer =
+        BuildWriter(state.range_x(), &dst, &metadata, schema.get());
+    writer->WriteBatch(
+        values.size(), definition_levels.data(), repetition_levels.data(), values.data());
+    writer->Close();
+  }
+}
+
+BENCHMARK_TEMPLATE(BM_WriteInt64Column, Repetition::REQUIRED)->Range(1024, 65536);
+
+BENCHMARK_TEMPLATE(BM_WriteInt64Column, Repetition::OPTIONAL)->Range(1024, 65536);
+
+BENCHMARK_TEMPLATE(BM_WriteInt64Column, Repetition::REPEATED)->Range(1024, 65536);
+
+std::unique_ptr<Int64Reader> BuildReader(
+    std::shared_ptr<Buffer>& buffer, ColumnDescriptor* schema) {
+  std::unique_ptr<InMemoryInputStream> source(new InMemoryInputStream(buffer));
+  std::unique_ptr<SerializedPageReader> page_reader(
+      new SerializedPageReader(std::move(source), Compression::UNCOMPRESSED));
+  return std::unique_ptr<Int64Reader>(new Int64Reader(schema, std::move(page_reader)));
+}
+
+template <Repetition::type repetition>
+static void BM_ReadInt64Column(::benchmark::State& state) {
+  format::ColumnChunk metadata;
+  std::vector<int64_t> values(state.range_x(), 128);
+  std::vector<int16_t> definition_levels(state.range_x(), 1);
+  std::vector<int16_t> repetition_levels(state.range_x(), 0);
+  std::shared_ptr<ColumnDescriptor> schema = Int64Schema(repetition);
+
+  InMemoryOutputStream dst;
+  std::unique_ptr<Int64Writer> writer =
+      BuildWriter(state.range_x(), &dst, &metadata, schema.get());
+  writer->WriteBatch(
+      values.size(), definition_levels.data(), repetition_levels.data(), values.data());
+  writer->Close();
+
+  std::shared_ptr<Buffer> src = dst.GetBuffer();
+  std::vector<int64_t> values_out(state.range_y());
+  std::vector<int16_t> definition_levels_out(state.range_y());
+  std::vector<int16_t> repetition_levels_out(state.range_y());
+  while (state.KeepRunning()) {
+    std::unique_ptr<Int64Reader> reader = BuildReader(src, schema.get());
+    int64_t values_read = 0;
+    for (size_t i = 0; i < values.size(); i += values_read) {
+      reader->ReadBatch(values_out.size(), definition_levels_out.data(),
+          repetition_levels_out.data(), values_out.data(), &values_read);
+    }
+  }
+}
+
+BENCHMARK_TEMPLATE(BM_ReadInt64Column, Repetition::REQUIRED)
+    ->RangePair(1024, 65536, 1, 1024);
+
+BENCHMARK_TEMPLATE(BM_ReadInt64Column, Repetition::OPTIONAL)
+    ->RangePair(1024, 65536, 1, 1024);
+
+BENCHMARK_TEMPLATE(BM_ReadInt64Column, Repetition::REPEATED)
+    ->RangePair(1024, 65536, 1, 1024);
+
+}  // namespace benchmark
+
+}  // namespace parquet

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/ff14d97e/src/parquet/encodings/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/src/parquet/encodings/CMakeLists.txt b/src/parquet/encodings/CMakeLists.txt
index eb4cc3c..00565b2 100644
--- a/src/parquet/encodings/CMakeLists.txt
+++ b/src/parquet/encodings/CMakeLists.txt
@@ -27,3 +27,4 @@ install(FILES
   DESTINATION include/parquet/encodings)
 
 ADD_PARQUET_TEST(encoding-test)
+ADD_PARQUET_BENCHMARK(encoding-benchmark)

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/ff14d97e/src/parquet/encodings/encoding-benchmark.cc
----------------------------------------------------------------------
diff --git a/src/parquet/encodings/encoding-benchmark.cc b/src/parquet/encodings/encoding-benchmark.cc
new file mode 100644
index 0000000..92bc29e
--- /dev/null
+++ b/src/parquet/encodings/encoding-benchmark.cc
@@ -0,0 +1,87 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "benchmark/benchmark.h"
+
+#include "parquet/encodings/plain-encoding.h"
+
+namespace parquet {
+
+namespace benchmark {
+
+static void BM_PlainEncodingBoolean(::benchmark::State& state) {
+  std::vector<bool> values(state.range_x(), 64);
+  PlainEncoder<BooleanType> encoder(nullptr);
+
+  while (state.KeepRunning()) {
+    InMemoryOutputStream dst;
+    encoder.Encode(values, values.size(), &dst);
+  }
+}
+
+BENCHMARK(BM_PlainEncodingBoolean)->Range(1024, 65536);
+
+// Benchmarks plain decoding of range_x() boolean values. The input is encoded
+// once up front; only decoder construction, SetData and Decode are inside the
+// timed loop.
+static void BM_PlainDecodingBoolean(::benchmark::State& state) {
+  // Every element is initialised from 64, which converts to true.
+  std::vector<bool> values(state.range_x(), 64);
+  // Owning output buffer: released automatically on every exit path, so no
+  // manual delete[] is needed.
+  std::unique_ptr<bool[]> output(new bool[state.range_x()]);
+  PlainEncoder<BooleanType> encoder(nullptr);
+  InMemoryOutputStream dst;
+  encoder.Encode(values, values.size(), &dst);
+  std::shared_ptr<Buffer> buf = dst.GetBuffer();
+
+  while (state.KeepRunning()) {
+    PlainDecoder<BooleanType> decoder(nullptr);
+    decoder.SetData(values.size(), buf->data(), buf->size());
+    decoder.Decode(output.get(), values.size());
+  }
+}
+
+BENCHMARK(BM_PlainDecodingBoolean)->Range(1024, 65536);
+
+static void BM_PlainEncodingInt64(::benchmark::State& state) {
+  std::vector<int64_t> values(state.range_x(), 64);
+  PlainEncoder<Int64Type> encoder(nullptr);
+
+  while (state.KeepRunning()) {
+    InMemoryOutputStream dst;
+    encoder.Encode(values.data(), values.size(), &dst);
+  }
+}
+
+BENCHMARK(BM_PlainEncodingInt64)->Range(1024, 65536);
+
+static void BM_PlainDecodingInt64(::benchmark::State& state) {
+  std::vector<int64_t> values(state.range_x(), 64);
+  PlainEncoder<Int64Type> encoder(nullptr);
+  InMemoryOutputStream dst;
+  encoder.Encode(values.data(), values.size(), &dst);
+  std::shared_ptr<Buffer> buf = dst.GetBuffer();
+
+  while (state.KeepRunning()) {
+    PlainDecoder<Int64Type> decoder(nullptr);
+    decoder.SetData(values.size(), buf->data(), buf->size());
+    decoder.Decode(values.data(), values.size());
+  }
+}
+
+BENCHMARK(BM_PlainDecodingInt64)->Range(1024, 65536);
+
+}  // namespace benchmark
+
+}  // namespace parquet

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/ff14d97e/src/parquet/util/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/src/parquet/util/CMakeLists.txt b/src/parquet/util/CMakeLists.txt
index b4faaa1..171c054 100644
--- a/src/parquet/util/CMakeLists.txt
+++ b/src/parquet/util/CMakeLists.txt
@@ -63,6 +63,20 @@ if(PARQUET_BUILD_TESTS)
   endif()
 endif()
 
+if (PARQUET_BUILD_BENCHMARKS)  # the target below exists only when benchmarks are enabled
+  add_library(parquet_benchmark_main benchmark_main.cc)
+  if (APPLE)  # Apple: link gbenchmark only, no explicit pthread
+    target_link_libraries(parquet_benchmark_main
+      gbenchmark
+    )
+  else()  # all other platforms: also link pthread explicitly
+    target_link_libraries(parquet_benchmark_main
+      gbenchmark
+      pthread
+    )
+  endif()
+endif()
+
 ADD_PARQUET_TEST(bit-util-test)
 ADD_PARQUET_TEST(buffer-test)
 ADD_PARQUET_TEST(input-output-test)

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/ff14d97e/src/parquet/util/benchmark_main.cc
----------------------------------------------------------------------
diff --git a/src/parquet/util/benchmark_main.cc b/src/parquet/util/benchmark_main.cc
new file mode 100644
index 0000000..c9739af
--- /dev/null
+++ b/src/parquet/util/benchmark_main.cc
@@ -0,0 +1,24 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "benchmark/benchmark.h"
+
+int main(int argc, char** argv) {  // entry point compiled into parquet_benchmark_main
+  benchmark::Initialize(&argc, argv);  // hands the command-line arguments to the library
+  benchmark::RunSpecifiedBenchmarks();
+  return 0;  // exit status is always 0
+}

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/ff14d97e/thirdparty/build_thirdparty.sh
----------------------------------------------------------------------
diff --git a/thirdparty/build_thirdparty.sh b/thirdparty/build_thirdparty.sh
index 5f00055..b637a36 100755
--- a/thirdparty/build_thirdparty.sh
+++ b/thirdparty/build_thirdparty.sh
@@ -17,6 +17,7 @@ else
     case $arg in
       "lz4")        F_LZ4=1 ;;
       "zlib")       F_ZLIB=1 ;;
+      "gbenchmark") F_GBENCHMARK=1 ;;
       "gtest")      F_GTEST=1 ;;
       "snappy")     F_SNAPPY=1 ;;
       "thrift")     F_THRIFT=1 ;;
@@ -55,17 +56,34 @@ if [ -n "$F_ALL" -o -n "$F_SNAPPY" ]; then
   make -j$PARALLEL install
 fi
 
+STANDARD_DARWIN_FLAGS="-std=c++11 -stdlib=libc++"
+
 # build googletest
+GOOGLETEST_ERROR="failed for googletest!"
 if [ -n "$F_ALL" -o -n "$F_GTEST" ]; then
   cd $TP_DIR/$GTEST_BASEDIR
 
   if [[ "$OSTYPE" == "darwin"* ]]; then
-    cmake -DCMAKE_CXX_FLAGS="-fPIC -std=c++11 -stdlib=libc++ -DGTEST_USE_OWN_TR1_TUPLE=1 -Wno-unused-value -Wno-ignored-attributes"
+    CXXFLAGS=-fPIC cmake -DCMAKE_CXX_FLAGS="$STANDARD_DARWIN_FLAGS -DGTEST_USE_OWN_TR1_TUPLE=1 -Wno-unused-value -Wno-ignored-attributes" || { echo "cmake $GOOGLETEST_ERROR" ; exit  1; }
   else
-    CXXFLAGS=-fPIC cmake -DCMAKE_INSTALL_PREFIX:PATH=$PREFIX .
+    CXXFLAGS=-fPIC cmake . || { echo "cmake $GOOGLETEST_ERROR"; exit  1; }
+  fi
+
+  make VERBOSE=1 || { echo "Make $GOOGLETEST_ERROR" ; exit  1; }
+fi
+
+# build google benchmark
+GBENCHMARK_ERROR="failed for google benchmark"
+if [ -n "$F_ALL" -o -n "$F_GBENCHMARK" ]; then
+  cd $TP_DIR/$GBENCHMARK_BASEDIR
+
+  CMAKE_CXX_FLAGS="--std=c++11"
+  if [[ "$OSTYPE" == "darwin"* ]]; then
+    CMAKE_CXX_FLAGS=$STANDARD_DARWIN_FLAGS
   fi
+  cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=$PREFIX -DCMAKE_CXX_FLAGS="-fPIC $CMAKE_CXX_FLAGS" . || { echo "cmake $GBENCHMARK_ERROR" ; exit 1; }
 
-  make
+  make VERBOSE=1 install || { echo "make $GBENCHMARK_ERROR" ; exit 1; }
 fi
 
 # build lz4

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/ff14d97e/thirdparty/download_thirdparty.sh
----------------------------------------------------------------------
diff --git a/thirdparty/download_thirdparty.sh b/thirdparty/download_thirdparty.sh
index 1ea2eba..a0bd14d 100755
--- a/thirdparty/download_thirdparty.sh
+++ b/thirdparty/download_thirdparty.sh
@@ -29,6 +29,11 @@ if [ ! -d ${GTEST_BASEDIR} ]; then
   download_extract_and_cleanup $GTEST_URL
 fi
 
+if [ ! -d ${GBENCHMARK_BASEDIR} ]; then  # only fetch when the directory is missing
+  echo "Fetching gbenchmark"
+  download_extract_and_cleanup $GBENCHMARK_URL
+fi
+
 if [ ! -d ${THRIFT_BASEDIR} ]; then
   echo "Fetching thrift"
   download_extract_and_cleanup $THRIFT_URL

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/ff14d97e/thirdparty/set_thirdparty_env.sh
----------------------------------------------------------------------
diff --git a/thirdparty/set_thirdparty_env.sh b/thirdparty/set_thirdparty_env.sh
index 72b7074..52b705d 100644
--- a/thirdparty/set_thirdparty_env.sh
+++ b/thirdparty/set_thirdparty_env.sh
@@ -16,3 +16,4 @@ if [ "$(uname)" != "Darwin" ]; then
 fi
 
 export GTEST_HOME=$THIRDPARTY_DIR/$GTEST_BASEDIR
+export GBENCHMARK_HOME=$THIRDPARTY_DIR/installed

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/ff14d97e/thirdparty/versions.sh
----------------------------------------------------------------------
diff --git a/thirdparty/versions.sh b/thirdparty/versions.sh
index 8c22265..8380580 100755
--- a/thirdparty/versions.sh
+++ b/thirdparty/versions.sh
@@ -10,6 +10,11 @@ THRIFT_VERSION=0.9.1
 THRIFT_URL="http://archive.apache.org/dist/thrift/${THRIFT_VERSION}/thrift-${THRIFT_VERSION}.tar.gz"
 THRIFT_BASEDIR=thrift-$THRIFT_VERSION
 
+
+GBENCHMARK_VERSION=1.0.0
+GBENCHMARK_URL="https://github.com/google/benchmark/archive/v${GBENCHMARK_VERSION}.tar.gz"
+GBENCHMARK_BASEDIR=benchmark-$GBENCHMARK_VERSION
+
 GTEST_VERSION=1.7.0
 GTEST_URL="https://github.com/google/googletest/archive/release-${GTEST_VERSION}.tar.gz"
 GTEST_BASEDIR=googletest-release-$GTEST_VERSION


Mime
View raw message