From commits-return-1365-archive-asf-public=cust-asf.ponee.io@parquet.apache.org Wed Aug 1 15:34:40 2018 Return-Path: X-Original-To: archive-asf-public@cust-asf.ponee.io Delivered-To: archive-asf-public@cust-asf.ponee.io Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by mx-eu-01.ponee.io (Postfix) with SMTP id 928E3180634 for ; Wed, 1 Aug 2018 15:34:39 +0200 (CEST) Received: (qmail 42864 invoked by uid 500); 1 Aug 2018 13:34:38 -0000 Mailing-List: contact commits-help@parquet.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@parquet.apache.org Delivered-To: mailing list commits@parquet.apache.org Received: (qmail 42855 invoked by uid 99); 1 Aug 2018 13:34:38 -0000 Received: from ec2-52-202-80-70.compute-1.amazonaws.com (HELO gitbox.apache.org) (52.202.80.70) by apache.org (qpsmtpd/0.29) with ESMTP; Wed, 01 Aug 2018 13:34:38 +0000 Received: by gitbox.apache.org (ASF Mail Server at gitbox.apache.org, from userid 33) id 1C48C81EB6; Wed, 1 Aug 2018 13:34:38 +0000 (UTC) Date: Wed, 01 Aug 2018 13:34:38 +0000 To: "commits@parquet.apache.org" Subject: [parquet-cpp] branch master updated: PARQUET-1357: FormatStatValue truncates binary statistics on zero character MIME-Version: 1.0 Content-Type: text/plain; charset=utf-8 Content-Transfer-Encoding: 8bit Message-ID: <153313047801.7507.758615735228597912@gitbox.apache.org> From: uwe@apache.org X-Git-Host: gitbox.apache.org X-Git-Repo: parquet-cpp X-Git-Refname: refs/heads/master X-Git-Reftype: branch X-Git-Oldrev: 49062489f575659c4325f96260284e10e46ed11b X-Git-Newrev: 646e2258172112036e3c4c2e6541b0f86b5fb35f X-Git-Rev: 646e2258172112036e3c4c2e6541b0f86b5fb35f X-Git-NotificationType: ref_changed_plus_diff X-Git-Multimail-Version: 1.5.dev Auto-Submitted: auto-generated This is an automated email from the ASF dual-hosted git repository. uwe pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/parquet-cpp.git The following commit(s) were added to refs/heads/master by this push: new 646e225 PARQUET-1357: FormatStatValue truncates binary statistics on zero character 646e225 is described below commit 646e2258172112036e3c4c2e6541b0f86b5fb35f Author: Korn, Uwe AuthorDate: Wed Aug 1 15:34:32 2018 +0200 PARQUET-1357: FormatStatValue truncates binary statistics on zero character Author: Korn, Uwe Closes #479 from xhochy/PARQUET-1357 and squashes the following commits: 4135976 [Korn, Uwe] Add macro for deprecations e0e4946 [Korn, Uwe] Update to clang-format 6.0 0073b56 [Korn, Uwe] PARQUET-1357: FormatStatValue truncates binary statistics on zero character --- cmake_modules/FindClangTools.cmake | 26 +++++++++++++++++++++----- src/parquet/printer.cc | 9 ++++----- src/parquet/types-test.cc | 27 +++++++++++++++++++++++++++ src/parquet/types.cc | 35 +++++++++++++++++++++++++++++++++++ src/parquet/types.h | 6 ++++++ src/parquet/util/macros.h | 15 +++++++++++++++ 6 files changed, 108 insertions(+), 10 deletions(-) diff --git a/cmake_modules/FindClangTools.cmake b/cmake_modules/FindClangTools.cmake index e9221ff..215a5cd 100644 --- a/cmake_modules/FindClangTools.cmake +++ b/cmake_modules/FindClangTools.cmake @@ -30,6 +30,12 @@ # CLANG_FORMAT_BIN, The path to the clang format binary # CLANG_TIDY_FOUND, Whether clang format was found +if (DEFINED ENV{HOMEBREW_PREFIX}) + set(HOMEBREW_PREFIX "${ENV{HOMEBREW_PREFIX}") +else() + set(HOMEBREW_PREFIX "/usr/local") +endif() + find_program(CLANG_TIDY_BIN NAMES clang-tidy-4.0 clang-tidy-3.9 @@ -37,7 +43,7 @@ find_program(CLANG_TIDY_BIN clang-tidy-3.7 clang-tidy-3.6 clang-tidy - PATHS ${ClangTools_PATH} $ENV{CLANG_TOOLS_PATH} /usr/local/bin /usr/bin + PATHS ${ClangTools_PATH} $ENV{CLANG_TOOLS_PATH} /usr/local/bin /usr/bin "${HOMEBREW_PREFIX}/bin" NO_DEFAULT_PATH ) @@ -55,7 +61,7 @@ if (CLANG_FORMAT_VERSION) PATHS ${ClangTools_PATH} $ENV{CLANG_TOOLS_PATH} - /usr/local/bin /usr/bin + /usr/local/bin /usr/bin "${HOMEBREW_PREFIX}/bin" NO_DEFAULT_PATH ) @@ -67,16 +73,26 @@ if (CLANG_FORMAT_VERSION) if ("${CLANG_FORMAT_MINOR_VERSION}" STREQUAL "0") find_program(CLANG_FORMAT_BIN NAMES clang-format - PATHS /usr/local/opt/llvm@${CLANG_FORMAT_MAJOR_VERSION}/bin + PATHS "${HOMEBREW_PREFIX}/opt/llvm@${CLANG_FORMAT_MAJOR_VERSION}/bin" NO_DEFAULT_PATH ) else() find_program(CLANG_FORMAT_BIN NAMES clang-format - PATHS /usr/local/opt/llvm@${CLANG_FORMAT_VERSION}/bin + PATHS "${HOMEBREW_PREFIX}/opt/llvm@${CLANG_FORMAT_VERSION}/bin" NO_DEFAULT_PATH ) endif() + + if ("${CLANG_FORMAT_BIN}" STREQUAL "CLANG_FORMAT_BIN-NOTFOUND") + # binary was still not found, look into Cellar + file(GLOB CLANG_FORMAT_PATH "${HOMEBREW_PREFIX}/Cellar/llvm/${CLANG_FORMAT_VERSION}.*") + find_program(CLANG_FORMAT_BIN + NAMES clang-format + PATHS "${CLANG_FORMAT_PATH}/bin" + NO_DEFAULT_PATH + ) + endif() endif() else() find_program(CLANG_FORMAT_BIN @@ -86,7 +102,7 @@ else() clang-format-3.7 clang-format-3.6 clang-format - PATHS ${ClangTools_PATH} $ENV{CLANG_TOOLS_PATH} /usr/local/bin /usr/bin + PATHS ${ClangTools_PATH} $ENV{CLANG_TOOLS_PATH} /usr/local/bin /usr/bin "${HOMEBREW_PREFIX}/bin" NO_DEFAULT_PATH ) endif() diff --git a/src/parquet/printer.cc b/src/parquet/printer.cc index 88b5528..3f18a5c 100644 --- a/src/parquet/printer.cc +++ b/src/parquet/printer.cc @@ -84,8 +84,8 @@ void ParquetFilePrinter::DebugPrint(std::ostream& stream, std::list selecte std::string min = stats->EncodeMin(), max = stats->EncodeMax(); stream << ", Null Values: " << stats->null_count() << ", Distinct Values: " << stats->distinct_count() << std::endl - << " Max: " << FormatStatValue(descr->physical_type(), max.c_str()) - << ", Min: " << FormatStatValue(descr->physical_type(), min.c_str()); + << " Max: " << FormatStatValue(descr->physical_type(), max) + << ", Min: " << FormatStatValue(descr->physical_type(), min); } else { stream << " Statistics Not Set"; } @@ -207,9 +207,8 @@ void ParquetFilePrinter::JSONPrint(std::ostream& stream, std::list selected std::string min = stats->EncodeMin(), max = stats->EncodeMax(); stream << "\"NumNulls\": \"" << stats->null_count() << "\", " << "\"DistinctValues\": \"" << stats->distinct_count() << "\", " - << "\"Max\": \"" << FormatStatValue(descr->physical_type(), max.c_str()) - << "\", " - << "\"Min\": \"" << FormatStatValue(descr->physical_type(), min.c_str()) + << "\"Max\": \"" << FormatStatValue(descr->physical_type(), max) << "\", " + << "\"Min\": \"" << FormatStatValue(descr->physical_type(), min) << "\" },"; } else { stream << "\"False\","; diff --git a/src/parquet/types-test.cc b/src/parquet/types-test.cc index 4e75982..6b184e3 100644 --- a/src/parquet/types-test.cc +++ b/src/parquet/types-test.cc @@ -62,54 +62,81 @@ TEST(TestLogicalTypeToString, LogicalTypes) { } TEST(TypePrinter, StatisticsTypes) { +#if !(defined(_WIN32) || defined(__CYGWIN__)) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" +#endif std::string smin; std::string smax; int32_t int_min = 1024; int32_t int_max = 2048; smin = std::string(reinterpret_cast(&int_min), sizeof(int32_t)); smax = std::string(reinterpret_cast(&int_max), sizeof(int32_t)); + ASSERT_STREQ("1024", FormatStatValue(Type::INT32, smin).c_str()); ASSERT_STREQ("1024", FormatStatValue(Type::INT32, smin.c_str()).c_str()); + ASSERT_STREQ("2048", FormatStatValue(Type::INT32, smax).c_str()); ASSERT_STREQ("2048", FormatStatValue(Type::INT32, smax.c_str()).c_str()); int64_t int64_min = 10240000000000; int64_t int64_max = 20480000000000; smin = std::string(reinterpret_cast(&int64_min), sizeof(int64_t)); smax = std::string(reinterpret_cast(&int64_max), sizeof(int64_t)); + ASSERT_STREQ("10240000000000", FormatStatValue(Type::INT64, smin).c_str()); ASSERT_STREQ("10240000000000", FormatStatValue(Type::INT64, smin.c_str()).c_str()); + ASSERT_STREQ("20480000000000", FormatStatValue(Type::INT64, smax).c_str()); ASSERT_STREQ("20480000000000", FormatStatValue(Type::INT64, smax.c_str()).c_str()); float float_min = 1.024f; float float_max = 2.048f; smin = std::string(reinterpret_cast(&float_min), sizeof(float)); smax = std::string(reinterpret_cast(&float_max), sizeof(float)); + ASSERT_STREQ("1.024", FormatStatValue(Type::FLOAT, smin).c_str()); ASSERT_STREQ("1.024", FormatStatValue(Type::FLOAT, smin.c_str()).c_str()); + ASSERT_STREQ("2.048", FormatStatValue(Type::FLOAT, smax).c_str()); ASSERT_STREQ("2.048", FormatStatValue(Type::FLOAT, smax.c_str()).c_str()); double double_min = 1.0245; double double_max = 2.0489; smin = std::string(reinterpret_cast(&double_min), sizeof(double)); smax = std::string(reinterpret_cast(&double_max), sizeof(double)); + ASSERT_STREQ("1.0245", FormatStatValue(Type::DOUBLE, smin).c_str()); ASSERT_STREQ("1.0245", FormatStatValue(Type::DOUBLE, smin.c_str()).c_str()); + ASSERT_STREQ("2.0489", FormatStatValue(Type::DOUBLE, smax).c_str()); ASSERT_STREQ("2.0489", FormatStatValue(Type::DOUBLE, smax.c_str()).c_str()); Int96 Int96_min = {{1024, 2048, 4096}}; Int96 Int96_max = {{2048, 4096, 8192}}; smin = std::string(reinterpret_cast(&Int96_min), sizeof(Int96)); smax = std::string(reinterpret_cast(&Int96_max), sizeof(Int96)); + ASSERT_STREQ("1024 2048 4096", FormatStatValue(Type::INT96, smin).c_str()); ASSERT_STREQ("1024 2048 4096", FormatStatValue(Type::INT96, smin.c_str()).c_str()); + ASSERT_STREQ("2048 4096 8192", FormatStatValue(Type::INT96, smax).c_str()); ASSERT_STREQ("2048 4096 8192", FormatStatValue(Type::INT96, smax.c_str()).c_str()); smin = std::string("abcdef"); smax = std::string("ijklmnop"); + ASSERT_STREQ("abcdef", FormatStatValue(Type::BYTE_ARRAY, smin).c_str()); ASSERT_STREQ("abcdef", FormatStatValue(Type::BYTE_ARRAY, smin.c_str()).c_str()); + ASSERT_STREQ("ijklmnop", FormatStatValue(Type::BYTE_ARRAY, smax).c_str()); ASSERT_STREQ("ijklmnop", FormatStatValue(Type::BYTE_ARRAY, smax.c_str()).c_str()); + // PARQUET-1357: FormatStatValue truncates binary statistics on zero character + smax.push_back('\0'); + ASSERT_EQ(smax, FormatStatValue(Type::BYTE_ARRAY, smax)); + // This fails, thus the call to FormatStatValue(.., const char*) was deprecated. + // ASSERT_EQ(smax, FormatStatValue(Type::BYTE_ARRAY, smax.c_str())); + smin = std::string("abcdefgh"); smax = std::string("ijklmnop"); + ASSERT_STREQ("abcdefgh", FormatStatValue(Type::FIXED_LEN_BYTE_ARRAY, smin).c_str()); ASSERT_STREQ("abcdefgh", FormatStatValue(Type::FIXED_LEN_BYTE_ARRAY, smin.c_str()).c_str()); + ASSERT_STREQ("ijklmnop", FormatStatValue(Type::FIXED_LEN_BYTE_ARRAY, smax).c_str()); ASSERT_STREQ("ijklmnop", FormatStatValue(Type::FIXED_LEN_BYTE_ARRAY, smax.c_str()).c_str()); +#if !(defined(_WIN32) || defined(__CYGWIN__)) +#pragma GCC diagnostic pop +#endif } } // namespace parquet diff --git a/src/parquet/types.cc b/src/parquet/types.cc index 79bc5d1..3120963 100644 --- a/src/parquet/types.cc +++ b/src/parquet/types.cc @@ -24,6 +24,41 @@ namespace parquet { +std::string FormatStatValue(Type::type parquet_type, const std::string& val) { + std::stringstream result; + switch (parquet_type) { + case Type::BOOLEAN: + result << reinterpret_cast(val.c_str())[0]; + break; + case Type::INT32: + result << reinterpret_cast(val.c_str())[0]; + break; + case Type::INT64: + result << reinterpret_cast(val.c_str())[0]; + break; + case Type::DOUBLE: + result << reinterpret_cast(val.c_str())[0]; + break; + case Type::FLOAT: + result << reinterpret_cast(val.c_str())[0]; + break; + case Type::INT96: { + auto const i32_val = reinterpret_cast(val.c_str()); + result << i32_val[0] << " " << i32_val[1] << " " << i32_val[2]; + break; + } + case Type::BYTE_ARRAY: { + return val; + } + case Type::FIXED_LEN_BYTE_ARRAY: { + return val; + } + default: + break; + } + return result.str(); +} + std::string FormatStatValue(Type::type parquet_type, const char* val) { std::stringstream result; switch (parquet_type) { diff --git a/src/parquet/types.h b/src/parquet/types.h index 04cfc4b..0f4cfc2 100644 --- a/src/parquet/types.h +++ b/src/parquet/types.h @@ -27,6 +27,7 @@ #include "arrow/util/macros.h" +#include "parquet/util/macros.h" #include "parquet/util/visibility.h" namespace parquet { @@ -292,6 +293,11 @@ PARQUET_EXPORT std::string LogicalTypeToString(LogicalType::type t); PARQUET_EXPORT std::string TypeToString(Type::type t); +PARQUET_EXPORT std::string FormatStatValue(Type::type parquet_type, + const std::string& val); + +/// \deprecated Since 1.5.0 +PARQUET_DEPRECATED("Use std::string instead of char* as input") PARQUET_EXPORT std::string FormatStatValue(Type::type parquet_type, const char* val); PARQUET_EXPORT int GetTypeByteSize(Type::type t); diff --git a/src/parquet/util/macros.h b/src/parquet/util/macros.h index 0d172b1..c28b2fa 100644 --- a/src/parquet/util/macros.h +++ b/src/parquet/util/macros.h @@ -68,4 +68,19 @@ #define FRIEND_TEST(test_case_name, test_name) \ friend class test_case_name##_##test_name##_Test +// clang-format off +// [[deprecated]] is only available in C++14, use this for the time being +// This macro takes an optional deprecation message +#if __cplusplus <= 201103L +# ifdef __GNUC__ +# define PARQUET_DEPRECATED(...) __attribute__((deprecated(__VA_ARGS__))) +# elif defined(_MSC_VER) +# define PARQUET_DEPRECATED(...) __declspec(deprecated(__VA_ARGS__)) +# else +# define PARQUET_DEPRECATED(...) +# endif +#else +# define PARQUET_DEPRECATED(...) [[deprecated(__VA_ARGS__)]] +#endif + #endif // PARQUET_UTIL_MACROS_H