parquet-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From w...@apache.org
Subject parquet-cpp git commit: PARQUET-882: Improve Application Version parsing
Date Tue, 21 Feb 2017 14:22:58 GMT
Repository: parquet-cpp
Updated Branches:
  refs/heads/master 16395d04d -> 3527224ba


PARQUET-882: Improve Application Version parsing

Author: Deepak Majeti <deepak.majeti@hpe.com>

Closes #248 from majetideepak/PARQUET-882 and squashes the following commits:

da2daff [Deepak Majeti] minor comment fixes
d5526a8 [Deepak Majeti] Use boost:regex
2ef6afb [Deepak Majeti] Improve regex
1d58ee0 [Deepak Majeti] review comments
bf7d494 [Deepak Majeti] improve comments
d8435c6 [Deepak Majeti] Add test
a17f973 [Deepak Majeti] Add Regular Expressions


Project: http://git-wip-us.apache.org/repos/asf/parquet-cpp/repo
Commit: http://git-wip-us.apache.org/repos/asf/parquet-cpp/commit/3527224b
Tree: http://git-wip-us.apache.org/repos/asf/parquet-cpp/tree/3527224b
Diff: http://git-wip-us.apache.org/repos/asf/parquet-cpp/diff/3527224b

Branch: refs/heads/master
Commit: 3527224ba4e9dd917f2f0ad730816dfa01b6eb7e
Parents: 16395d0
Author: Deepak Majeti <deepak.majeti@hpe.com>
Authored: Tue Feb 21 09:22:51 2017 -0500
Committer: Wes McKinney <wes.mckinney@twosigma.com>
Committed: Tue Feb 21 09:22:51 2017 -0500

----------------------------------------------------------------------
 .travis.yml                             |  1 +
 CMakeLists.txt                          | 15 ++++++++
 cmake_modules/ThirdpartyToolchain.cmake | 28 +++++++++++++--
 src/parquet/compression.cc              |  2 +-
 src/parquet/file/file-metadata-test.cc  | 20 +++++++++--
 src/parquet/file/metadata.cc            | 51 ++++++++++++++++------------
 src/parquet/file/metadata.h             | 41 +++++++++++++++-------
 7 files changed, 118 insertions(+), 40 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/3527224b/.travis.yml
----------------------------------------------------------------------
diff --git a/.travis.yml b/.travis.yml
index 14df7f3..52f0f03 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -10,6 +10,7 @@ addons:
     - valgrind
     - libboost-dev
     - libboost-filesystem-dev
+    - libboost-regex-dev
     - libboost-system-dev
     - libboost-program-options-dev
     - libboost-test-dev

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/3527224b/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 52a44e1..bb24c14 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -76,6 +76,9 @@ if ("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}")
   option(PARQUET_BUILD_BENCHMARKS
     "Build the libparquet benchmark suite"
     OFF)
+  option(PARQUET_BOOST_USE_SHARED
+    "Rely on boost shared libraries where relevant"
+    ON)
   option(PARQUET_BUILD_TESTS
     "Build the libparquet test suite"
     ON)
@@ -434,6 +437,17 @@ if ("${PARQUET_GENERATE_COVERAGE}")
 endif()
 
 #############################################################
+# Boost linkage
+
+if (PARQUET_BOOST_USE_SHARED)
+  set(BOOST_LINK_LIBS
+    boost_shared_regex)
+else()
+  set(BOOST_LINK_LIBS
+    boost_static_regex)
+endif()
+
+#############################################################
 # Apache Arrow linkage
 
 if ("${PARQUET_ARROW_LINKAGE}" STREQUAL "shared")
@@ -547,6 +561,7 @@ set(BUNDLED_STATIC_LIBS
 # Shared library linked libs
 set(LIBPARQUET_PRIVATE_LINK_LIBS
   ${ARROW_LINK_LIBS}
+  ${BOOST_LINK_LIBS}
   ${BUNDLED_STATIC_LIBS}
 )
 

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/3527224b/cmake_modules/ThirdpartyToolchain.cmake
----------------------------------------------------------------------
diff --git a/cmake_modules/ThirdpartyToolchain.cmake b/cmake_modules/ThirdpartyToolchain.cmake
index 4b943b5..cebec75 100644
--- a/cmake_modules/ThirdpartyToolchain.cmake
+++ b/cmake_modules/ThirdpartyToolchain.cmake
@@ -25,13 +25,35 @@ set(BROTLI_VERSION "5db62dcc9d386579609540cdf8869e95ad334bbd")
 set(ARROW_VERSION "fa8d27f314b7c21c611d1c5caaa9b32ae0cb2b06")
 
 # find boost headers and libs
+# Find shared Boost libraries.
 set(Boost_DEBUG TRUE)
 set(Boost_USE_MULTITHREADED ON)
-find_package(Boost REQUIRED)
-include_directories(SYSTEM ${Boost_INCLUDE_DIRS})
-set(LIBS ${LIBS} ${Boost_LIBRARIES})
+set(Boost_USE_STATIC_LIBS ON)
+find_package(Boost COMPONENTS regex REQUIRED)
+if ("${CMAKE_BUILD_TYPE}" STREQUAL "DEBUG")
+  set(BOOST_STATIC_REGEX_LIBRARY ${Boost_REGEX_LIBRARY_DEBUG})
+else()
+  set(BOOST_STATIC_REGEX_LIBRARY ${Boost_REGEX_LIBRARY_RELEASE})
+endif()
+
+# Find static Boost libraries.
+set(Boost_USE_STATIC_LIBS OFF)
+find_package(Boost COMPONENTS regex REQUIRED)
+if ("${CMAKE_BUILD_TYPE}" STREQUAL "DEBUG")
+  set(BOOST_SHARED_REGEX_LIBRARY ${Boost_REGEX_LIBRARY_DEBUG})
+else()
+  set(BOOST_SHARED_REGEX_LIBRARY ${Boost_REGEX_LIBRARY_RELEASE})
+endif()
+
 message(STATUS "Boost include dir: " ${Boost_INCLUDE_DIRS})
 message(STATUS "Boost libraries: " ${Boost_LIBRARIES})
+add_library(boost_static_regex STATIC IMPORTED)
+set_target_properties(boost_static_regex PROPERTIES IMPORTED_LOCATION ${BOOST_STATIC_REGEX_LIBRARY})
+add_library(boost_shared_regex SHARED IMPORTED)
+set_target_properties(boost_shared_regex PROPERTIES IMPORTED_LOCATION ${BOOST_SHARED_REGEX_LIBRARY})
+
+include_directories(SYSTEM ${Boost_INCLUDE_DIRS})
+set(LIBS ${LIBS} ${Boost_LIBRARIES})
 
 string(TOUPPER ${CMAKE_BUILD_TYPE} UPPERCASE_BUILD_TYPE)
 # Set -fPIC on all external projects and include the main CXX_FLAGS

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/3527224b/src/parquet/compression.cc
----------------------------------------------------------------------
diff --git a/src/parquet/compression.cc b/src/parquet/compression.cc
index 97b5c17..c71f161 100644
--- a/src/parquet/compression.cc
+++ b/src/parquet/compression.cc
@@ -189,7 +189,7 @@ int64_t GZipCodec::Compress(
     if (ret == Z_OK) {
       // will return Z_OK (and stream.msg NOT set) if stream.avail_out is too
       // small
-      throw ParquetException("zlib deflate failed, output buffer to small");
+      throw ParquetException("zlib deflate failed, output buffer too small");
     }
     std::stringstream ss;
     ss << "zlib deflate failed: " << stream_.msg;

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/3527224b/src/parquet/file/file-metadata-test.cc
----------------------------------------------------------------------
diff --git a/src/parquet/file/file-metadata-test.cc b/src/parquet/file/file-metadata-test.cc
index 74d4406..7f71955 100644
--- a/src/parquet/file/file-metadata-test.cc
+++ b/src/parquet/file/file-metadata-test.cc
@@ -186,17 +186,33 @@ TEST(ApplicationVersion, Basics) {
   ApplicationVersion version1("parquet-mr version 1.8.0");
   ApplicationVersion version2("parquet-cpp version 1.0.0");
   ApplicationVersion version3("");
+  ApplicationVersion version4("parquet-mr version 1.5.0ab-cdh5.5.0+cd (build abcd)");
+  ApplicationVersion version5("parquet-mr");
 
-  ASSERT_EQ("parquet-mr", version.application);
+  ASSERT_EQ("parquet-mr", version.application_);
   ASSERT_EQ(1, version.version.major);
   ASSERT_EQ(7, version.version.minor);
   ASSERT_EQ(9, version.version.patch);
 
-  ASSERT_EQ("parquet-cpp", version2.application);
+  ASSERT_EQ("parquet-cpp", version2.application_);
   ASSERT_EQ(1, version2.version.major);
   ASSERT_EQ(0, version2.version.minor);
   ASSERT_EQ(0, version2.version.patch);
 
+  ASSERT_EQ("parquet-mr", version4.application_);
+  ASSERT_EQ("abcd", version4.build_);
+  ASSERT_EQ(1, version4.version.major);
+  ASSERT_EQ(5, version4.version.minor);
+  ASSERT_EQ(0, version4.version.patch);
+  ASSERT_EQ("ab", version4.version.unknown);
+  ASSERT_EQ("cdh5.5.0", version4.version.pre_release);
+  ASSERT_EQ("cd", version4.version.build_info);
+
+  ASSERT_EQ("parquet-mr", version5.application_);
+  ASSERT_EQ(0, version5.version.major);
+  ASSERT_EQ(0, version5.version.minor);
+  ASSERT_EQ(0, version5.version.patch);
+
   ASSERT_EQ(true, version.VersionLt(version1));
 
   ASSERT_FALSE(version1.HasCorrectStatistics(Type::INT96));

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/3527224b/src/parquet/file/metadata.cc
----------------------------------------------------------------------
diff --git a/src/parquet/file/metadata.cc b/src/parquet/file/metadata.cc
index 42942a5..5c68e3c 100644
--- a/src/parquet/file/metadata.cc
+++ b/src/parquet/file/metadata.cc
@@ -27,6 +27,8 @@
 #include "parquet/util/memory.h"
 
 #include <boost/algorithm/string.hpp>
+#include <boost/regex.hpp>
+
 
 namespace parquet {
 
@@ -474,29 +476,36 @@ void FileMetaData::WriteTo(OutputStream* dst) {
 }
 
 ApplicationVersion::ApplicationVersion(const std::string& created_by) {
-  namespace ba = boost::algorithm;
+  boost::regex app_regex{ApplicationVersion::APPLICATION_FORMAT};
+  boost::regex ver_regex{ApplicationVersion::VERSION_FORMAT};
+  boost::smatch app_matches;
+  boost::smatch ver_matches;
 
   std::string created_by_lower = created_by;
   std::transform(created_by_lower.begin(), created_by_lower.end(),
       created_by_lower.begin(), ::tolower);
 
-  std::vector<std::string> tokens;
-  ba::split(tokens, created_by_lower, ba::is_any_of(" "), ba::token_compress_on);
-  // Boost always creates at least one token
-  DCHECK_GT(tokens.size(), 0);
-  application = tokens[0];
-
-  if (tokens.size() >= 3 && tokens[1] == "version") {
-    std::string version_string = tokens[2];
-    // Ignore any trailing nodextra characters
-    int n = version_string.find_first_not_of("0123456789.");
-    std::string version_string_trimmed = version_string.substr(0, n);
-
-    std::vector<std::string> version_tokens;
-    ba::split(version_tokens, version_string_trimmed, ba::is_any_of("."));
-    version.major = version_tokens.size() >= 1 ? atoi(version_tokens[0].c_str()) : 0;
-    version.minor = version_tokens.size() >= 2 ? atoi(version_tokens[1].c_str()) : 0;
-    version.patch = version_tokens.size() >= 3 ? atoi(version_tokens[2].c_str()) : 0;
+  bool app_success = boost::regex_match(created_by_lower, app_matches, app_regex);
+  bool ver_success = false;
+  std::string version_str;
+
+  if (app_success && app_matches.size() >= 4) {
+    // first match is the entire string. sub-matches start from second.
+    application_ = app_matches[1];
+    version_str = app_matches[3];
+    build_ = app_matches[4];
+    ver_success = boost::regex_match(version_str, ver_matches, ver_regex);
+  } else {
+      application_ = "unknown";
+  }
+
+  if (ver_success && ver_matches.size() >= 7) {
+    version.major = atoi(ver_matches[1].str().c_str());
+    version.minor = atoi(ver_matches[2].str().c_str());
+    version.patch = atoi(ver_matches[3].str().c_str());
+    version.unknown = ver_matches[4].str();
+    version.pre_release = ver_matches[5].str();
+    version.build_info = ver_matches[6].str();
   } else {
     version.major = 0;
     version.minor = 0;
@@ -505,7 +514,7 @@ ApplicationVersion::ApplicationVersion(const std::string& created_by)
{
 }
 
 bool ApplicationVersion::VersionLt(const ApplicationVersion& other_version) const {
-  if (application != other_version.application) return false;
+  if (application_ != other_version.application_) return false;
 
   if (version.major < other_version.version.major) return true;
   if (version.major > other_version.version.major) return false;
@@ -517,7 +526,7 @@ bool ApplicationVersion::VersionLt(const ApplicationVersion& other_version)
cons
 }
 
 bool ApplicationVersion::VersionEq(const ApplicationVersion& other_version) const {
-  return application == other_version.application &&
+  return application_ == other_version.application_ &&
          version.major == other_version.version.major &&
          version.minor == other_version.version.minor &&
          version.patch == other_version.version.patch;
@@ -537,7 +546,7 @@ bool ApplicationVersion::HasCorrectStatistics(Type::type col_type) const
{
 
   // created_by is not populated, which could have been caused by
   // parquet-mr during the same time as PARQUET-251, see PARQUET-297
-  if (application == "unknown") { return true; }
+  if (application_ == "unknown") { return true; }
 
   // PARQUET-251
   if (VersionLt(PARQUET_251_FIXED_VERSION)) { return false; }

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/3527224b/src/parquet/file/metadata.h
----------------------------------------------------------------------
diff --git a/src/parquet/file/metadata.h b/src/parquet/file/metadata.h
index 97793a1..057c5b1 100644
--- a/src/parquet/file/metadata.h
+++ b/src/parquet/file/metadata.h
@@ -45,32 +45,47 @@ enum SortOrder { SIGNED, UNSIGNED, UNKNOWN };
 
 class ApplicationVersion {
  public:
-  /// Known Versions with Issues
+  // Known Versions with Issues
   static const ApplicationVersion PARQUET_251_FIXED_VERSION;
   static const ApplicationVersion PARQUET_816_FIXED_VERSION;
-
-  /// Application that wrote the file. e.g. "IMPALA"
-  std::string application;
-
-  /// Version of the application that wrote the file, expressed in three parts
-  /// (<major>.<minor>.<patch>). Unspecified parts default to 0, and extra
parts are
-  /// ignored. e.g.:
-  /// "1.2.3"    => {1, 2, 3}
-  /// "1.2"      => {1, 2, 0}
-  /// "1.2-cdh5" => {1, 2, 0}
+  // Regular expression for the version format
+  // major . minor . patch unknown - prerelease.x + build info
+  // Eg: 1.5.0ab-cdh5.5.0+cd
+  static constexpr char const* VERSION_FORMAT =
+      "^(\\d+)\\.(\\d+)\\.(\\d+)([^-+]*)?(?:-([^+]*))?(?:\\+(.*))?$";
+  // Regular expression for the application format
+  // application_name version VERSION_FORMAT (build build_name)
+  // Eg: parquet-cpp version 1.5.0ab-xyz5.5.0+cd (build abcd)
+  static constexpr char const* APPLICATION_FORMAT =
+      "(.*?)\\s*(?:(version\\s*(?:([^(]*?)\\s*(?:\\(\\s*build\\s*([^)]*?)\\s*\\))?)?)?)";
+
+  // Application that wrote the file. e.g. "IMPALA"
+  std::string application_;
+  // Build name
+  std::string build_;
+
+  // Version of the application that wrote the file, expressed as
+  // (<major>.<minor>.<patch>). Unmatched parts default to 0.
+  // "1.2.3"    => {1, 2, 3}
+  // "1.2"      => {0, 0, 0}
+  // "1.2-cdh5" => {0, 0, 0}
+  // TODO (majetideepak): Implement support for pre_release
   struct {
     int major;
     int minor;
     int patch;
+    std::string unknown;
+    std::string pre_release;
+    std::string build_info;
   } version;
 
   ApplicationVersion() {}
   explicit ApplicationVersion(const std::string& created_by);
 
-  /// Returns true if version is strictly less than other_version
+  // Returns true if version is strictly less than other_version
   bool VersionLt(const ApplicationVersion& other_version) const;
 
-  /// Returns true if version is strictly less than other_version
+  // Returns true if version is strictly less than other_version
   bool VersionEq(const ApplicationVersion& other_version) const;
 
   // Checks if the Version has the correct statistics for a given column


Mime
View raw message