parquet-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From w...@apache.org
Subject parquet-cpp git commit: PARQUET-793: Do not return incorrect statistics
Date Sun, 12 Feb 2017 00:33:44 GMT
Repository: parquet-cpp
Updated Branches:
  refs/heads/master 5e59bc5c6 -> 1a6d22b7d


PARQUET-793: Do not return incorrect statistics

Author: Deepak Majeti <deepak.majeti@hpe.com>

Closes #239 from majetideepak/FixStats and squashes the following commits:

948c9b9 [Deepak Majeti] fix failing test
ea0693f [Deepak Majeti] review comments
dbb8569 [Deepak Majeti] minor fix
75d392a [Deepak Majeti] format
e89f85f [Deepak Majeti] stats test and comments
800e239 [Deepak Majeti] added tests
92f31cd [Deepak Majeti] Return only Signed stats
feeeb88 [Deepak Majeti] Update Version and add checkCorrectStatistics


Project: http://git-wip-us.apache.org/repos/asf/parquet-cpp/repo
Commit: http://git-wip-us.apache.org/repos/asf/parquet-cpp/commit/1a6d22b7
Tree: http://git-wip-us.apache.org/repos/asf/parquet-cpp/tree/1a6d22b7
Diff: http://git-wip-us.apache.org/repos/asf/parquet-cpp/diff/1a6d22b7

Branch: refs/heads/master
Commit: 1a6d22b7d9e430d3e4ba93fcf63431f0bc2daf69
Parents: 5e59bc5
Author: Deepak Majeti <deepak.majeti@hpe.com>
Authored: Sat Feb 11 19:33:38 2017 -0500
Committer: Wes McKinney <wes.mckinney@twosigma.com>
Committed: Sat Feb 11 19:33:38 2017 -0500

----------------------------------------------------------------------
 src/parquet/column/statistics-test.cc  |  46 +++++++
 src/parquet/file/file-metadata-test.cc |  24 +++-
 src/parquet/file/metadata.cc           | 190 ++++++++++++++++++++++------
 src/parquet/file/metadata.h            |  87 ++++++++-----
 src/parquet/file/reader-internal.cc    |   4 +-
 5 files changed, 271 insertions(+), 80 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/1a6d22b7/src/parquet/column/statistics-test.cc
----------------------------------------------------------------------
diff --git a/src/parquet/column/statistics-test.cc b/src/parquet/column/statistics-test.cc
index dcaad45..d631d98 100644
--- a/src/parquet/column/statistics-test.cc
+++ b/src/parquet/column/statistics-test.cc
@@ -32,6 +32,7 @@
 #include "parquet/file/reader.h"
 #include "parquet/file/writer.h"
 #include "parquet/schema.h"
+#include "parquet/thrift.h"
 #include "parquet/types.h"
 #include "parquet/util/memory.h"
 
@@ -167,6 +168,7 @@ class TestRowGroupStatistics : public PrimitiveTypedTest<TestType>
{
     auto file_reader = ParquetFileReader::Open(source);
     auto rg_reader = file_reader->RowGroup(0);
     auto column_chunk = rg_reader->metadata()->ColumnChunk(0);
+    if (!column_chunk->is_stats_set()) return;
     std::shared_ptr<RowGroupStatistics> stats = column_chunk->statistics();
     // check values after serialization + deserialization
     ASSERT_EQ(null_count, stats->null_count());
@@ -308,5 +310,49 @@ TYPED_TEST(TestNumericRowGroupStatistics, Merge) {
   this->TestMerge();
 }
 
+TEST(CorruptStatistics, Basics) {
+  ApplicationVersion version("parquet-mr version 1.8.0");
+  SchemaDescriptor schema;
+  schema::NodePtr node;
+  std::vector<schema::NodePtr> fields;
+  // Test Physical Types
+  fields.push_back(schema::PrimitiveNode::Make(
+      "col1", Repetition::OPTIONAL, Type::INT32, LogicalType::NONE));
+  fields.push_back(schema::PrimitiveNode::Make(
+      "col2", Repetition::OPTIONAL, Type::BYTE_ARRAY, LogicalType::NONE));
+  // Test Logical Types
+  fields.push_back(schema::PrimitiveNode::Make(
+      "col3", Repetition::OPTIONAL, Type::INT32, LogicalType::DATE));
+  fields.push_back(schema::PrimitiveNode::Make(
+      "col4", Repetition::OPTIONAL, Type::INT32, LogicalType::UINT_32));
+  fields.push_back(schema::PrimitiveNode::Make("col5", Repetition::OPTIONAL,
+      Type::FIXED_LEN_BYTE_ARRAY, LogicalType::INTERVAL, 12));
+  fields.push_back(schema::PrimitiveNode::Make(
+      "col6", Repetition::OPTIONAL, Type::BYTE_ARRAY, LogicalType::UTF8));
+  node = schema::GroupNode::Make("schema", Repetition::REQUIRED, fields);
+  schema.Init(node);
+
+  format::ColumnChunk col_chunk;
+  col_chunk.meta_data.__isset.statistics = true;
+  auto column_chunk1 = ColumnChunkMetaData::Make(
+      reinterpret_cast<const uint8_t*>(&col_chunk), schema.Column(0), &version);
+  ASSERT_TRUE(column_chunk1->is_stats_set());
+  auto column_chunk2 = ColumnChunkMetaData::Make(
+      reinterpret_cast<const uint8_t*>(&col_chunk), schema.Column(1), &version);
+  ASSERT_FALSE(column_chunk2->is_stats_set());
+  auto column_chunk3 = ColumnChunkMetaData::Make(
+      reinterpret_cast<const uint8_t*>(&col_chunk), schema.Column(2), &version);
+  ASSERT_TRUE(column_chunk3->is_stats_set());
+  auto column_chunk4 = ColumnChunkMetaData::Make(
+      reinterpret_cast<const uint8_t*>(&col_chunk), schema.Column(3), &version);
+  ASSERT_FALSE(column_chunk4->is_stats_set());
+  auto column_chunk5 = ColumnChunkMetaData::Make(
+      reinterpret_cast<const uint8_t*>(&col_chunk), schema.Column(4), &version);
+  ASSERT_FALSE(column_chunk5->is_stats_set());
+  auto column_chunk6 = ColumnChunkMetaData::Make(
+      reinterpret_cast<const uint8_t*>(&col_chunk), schema.Column(5), &version);
+  ASSERT_FALSE(column_chunk6->is_stats_set());
+}
+
 }  // namespace test
 }  // namespace parquet

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/1a6d22b7/src/parquet/file/file-metadata-test.cc
----------------------------------------------------------------------
diff --git a/src/parquet/file/file-metadata-test.cc b/src/parquet/file/file-metadata-test.cc
index 0edc34f..74d4406 100644
--- a/src/parquet/file/file-metadata-test.cc
+++ b/src/parquet/file/file-metadata-test.cc
@@ -181,13 +181,29 @@ TEST(Metadata, TestV1Version) {
   ASSERT_EQ(ParquetVersion::PARQUET_1_0, f_accessor->version());
 }
 
-TEST(FileVersion, Basics) {
-  FileMetaData::Version version("parquet-mr version 1.2.8");
+TEST(ApplicationVersion, Basics) {
+  ApplicationVersion version("parquet-mr version 1.7.9");
+  ApplicationVersion version1("parquet-mr version 1.8.0");
+  ApplicationVersion version2("parquet-cpp version 1.0.0");
+  ApplicationVersion version3("");
 
   ASSERT_EQ("parquet-mr", version.application);
   ASSERT_EQ(1, version.version.major);
-  ASSERT_EQ(2, version.version.minor);
-  ASSERT_EQ(8, version.version.patch);
+  ASSERT_EQ(7, version.version.minor);
+  ASSERT_EQ(9, version.version.patch);
+
+  ASSERT_EQ("parquet-cpp", version2.application);
+  ASSERT_EQ(1, version2.version.major);
+  ASSERT_EQ(0, version2.version.minor);
+  ASSERT_EQ(0, version2.version.patch);
+
+  ASSERT_EQ(true, version.VersionLt(version1));
+
+  ASSERT_FALSE(version1.HasCorrectStatistics(Type::INT96));
+  ASSERT_TRUE(version.HasCorrectStatistics(Type::INT32));
+  ASSERT_FALSE(version.HasCorrectStatistics(Type::BYTE_ARRAY));
+  ASSERT_TRUE(version1.HasCorrectStatistics(Type::BYTE_ARRAY));
+  ASSERT_TRUE(version3.HasCorrectStatistics(Type::FIXED_LEN_BYTE_ARRAY));
 }
 
 }  // namespace metadata

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/1a6d22b7/src/parquet/file/metadata.cc
----------------------------------------------------------------------
diff --git a/src/parquet/file/metadata.cc b/src/parquet/file/metadata.cc
index de7c4e4..42942a5 100644
--- a/src/parquet/file/metadata.cc
+++ b/src/parquet/file/metadata.cc
@@ -30,6 +30,62 @@
 
 namespace parquet {
 
+const ApplicationVersion ApplicationVersion::PARQUET_251_FIXED_VERSION =
+    ApplicationVersion("parquet-mr version 1.8.0");
+const ApplicationVersion ApplicationVersion::PARQUET_816_FIXED_VERSION =
+    ApplicationVersion("parquet-mr version 1.2.9");
+
+// Return the Sort Order of the Parquet Physical Types
+SortOrder default_sort_order(Type::type primitive) {
+  switch (primitive) {
+    case Type::BOOLEAN:
+    case Type::INT32:
+    case Type::INT64:
+    case Type::FLOAT:
+    case Type::DOUBLE:
+      return SortOrder::SIGNED;
+    case Type::BYTE_ARRAY:
+    case Type::FIXED_LEN_BYTE_ARRAY:
+    case Type::INT96:  // only used for timestamp, which uses unsigned values
+      return SortOrder::UNSIGNED;
+  }
+  return SortOrder::UNKNOWN;
+}
+
+// Return the SortOrder of the Parquet Types using Logical or Physical Types
+SortOrder get_sort_order(LogicalType::type converted, Type::type primitive) {
+  if (converted == LogicalType::NONE) return default_sort_order(primitive);
+  switch (converted) {
+    case LogicalType::INT_8:
+    case LogicalType::INT_16:
+    case LogicalType::INT_32:
+    case LogicalType::INT_64:
+    case LogicalType::DATE:
+    case LogicalType::TIME_MICROS:
+    case LogicalType::TIME_MILLIS:
+    case LogicalType::TIMESTAMP_MICROS:
+    case LogicalType::TIMESTAMP_MILLIS:
+      return SortOrder::SIGNED;
+    case LogicalType::UINT_8:
+    case LogicalType::UINT_16:
+    case LogicalType::UINT_32:
+    case LogicalType::UINT_64:
+    case LogicalType::ENUM:
+    case LogicalType::UTF8:
+    case LogicalType::BSON:
+    case LogicalType::JSON:
+      return SortOrder::UNSIGNED;
+    case LogicalType::DECIMAL:
+    case LogicalType::LIST:
+    case LogicalType::MAP:
+    case LogicalType::MAP_KEY_VALUE:
+    case LogicalType::INTERVAL:
+    case LogicalType::NONE:  // required instead of default
+      return SortOrder::UNKNOWN;
+  }
+  return SortOrder::UNKNOWN;
+}
+
 template <typename DType>
 static std::shared_ptr<RowGroupStatistics> MakeTypedColumnStats(
     const format::ColumnMetaData& metadata, const ColumnDescriptor* descr) {
@@ -66,14 +122,14 @@ std::shared_ptr<RowGroupStatistics> MakeColumnStats(
 // ColumnChunk metadata
 class ColumnChunkMetaData::ColumnChunkMetaDataImpl {
  public:
-  explicit ColumnChunkMetaDataImpl(
-      const format::ColumnChunk* column, const ColumnDescriptor* descr)
-      : column_(column), descr_(descr) {
+  explicit ColumnChunkMetaDataImpl(const format::ColumnChunk* column,
+      const ColumnDescriptor* descr, const ApplicationVersion* writer_version)
+      : column_(column), descr_(descr), writer_version_(writer_version) {
     const format::ColumnMetaData& meta_data = column->meta_data;
     for (auto encoding : meta_data.encodings) {
       encodings_.push_back(FromThrift(encoding));
     }
-    if (meta_data.__isset.statistics) { stats_ = MakeColumnStats(meta_data, descr_); }
+    stats_ = nullptr;
   }
   ~ColumnChunkMetaDataImpl() {}
 
@@ -82,7 +138,7 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl {
   inline const std::string& file_path() const { return column_->file_path; }
 
   // column metadata
-  inline Type::type type() { return FromThrift(column_->meta_data.type); }
+  inline Type::type type() const { return FromThrift(column_->meta_data.type); }
 
   inline int64_t num_values() const { return column_->meta_data.num_values; }
 
@@ -90,9 +146,26 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl {
     return std::make_shared<schema::ColumnPath>(column_->meta_data.path_in_schema);
   }
 
-  inline bool is_stats_set() const { return column_->meta_data.__isset.statistics; }
+  // Check if statistics are set and are valid
+  // 1) Must be set in the metadata
+  // 2) Statistics must not be corrupted
+  // 3) parquet-mr and parquet-cpp write statistics by SIGNED order comparison.
+  //    The statistics are corrupted if the type requires UNSIGNED order comparison.
+  //    Eg: UTF8
+  inline bool is_stats_set() const {
+    DCHECK(writer_version_ != nullptr);
+    return column_->meta_data.__isset.statistics &&
+           writer_version_->HasCorrectStatistics(type()) &&
+           SortOrder::SIGNED ==
+               get_sort_order(descr_->logical_type(), descr_->physical_type());
+  }
 
-  inline std::shared_ptr<RowGroupStatistics> statistics() const { return stats_; }
+  inline std::shared_ptr<RowGroupStatistics> statistics() const {
+    if (stats_ == nullptr && is_stats_set()) {
+      stats_ = MakeColumnStats(column_->meta_data, descr_);
+    }
+    return stats_;
+  }
 
   inline Compression::type compression() const {
     return FromThrift(column_->meta_data.codec);
@@ -123,21 +196,24 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl {
   }
 
  private:
-  std::shared_ptr<RowGroupStatistics> stats_;
+  mutable std::shared_ptr<RowGroupStatistics> stats_;
   std::vector<Encoding::type> encodings_;
   const format::ColumnChunk* column_;
   const ColumnDescriptor* descr_;
+  const ApplicationVersion* writer_version_;
 };
 
-std::unique_ptr<ColumnChunkMetaData> ColumnChunkMetaData::Make(
-    const uint8_t* metadata, const ColumnDescriptor* descr) {
-  return std::unique_ptr<ColumnChunkMetaData>(new ColumnChunkMetaData(metadata, descr));
+std::unique_ptr<ColumnChunkMetaData> ColumnChunkMetaData::Make(const uint8_t* metadata,
+    const ColumnDescriptor* descr, const ApplicationVersion* writer_version) {
+  return std::unique_ptr<ColumnChunkMetaData>(
+      new ColumnChunkMetaData(metadata, descr, writer_version));
 }
 
-ColumnChunkMetaData::ColumnChunkMetaData(
-    const uint8_t* metadata, const ColumnDescriptor* descr)
+ColumnChunkMetaData::ColumnChunkMetaData(const uint8_t* metadata,
+    const ColumnDescriptor* descr, const ApplicationVersion* writer_version)
     : impl_{std::unique_ptr<ColumnChunkMetaDataImpl>(new ColumnChunkMetaDataImpl(
-          reinterpret_cast<const format::ColumnChunk*>(metadata), descr))} {}
+          reinterpret_cast<const format::ColumnChunk*>(metadata), descr,
+          writer_version))} {}
 ColumnChunkMetaData::~ColumnChunkMetaData() {}
 
 // column chunk
@@ -205,9 +281,9 @@ int64_t ColumnChunkMetaData::total_compressed_size() const {
 // row-group metadata
 class RowGroupMetaData::RowGroupMetaDataImpl {
  public:
-  explicit RowGroupMetaDataImpl(
-      const format::RowGroup* row_group, const SchemaDescriptor* schema)
-      : row_group_(row_group), schema_(schema) {}
+  explicit RowGroupMetaDataImpl(const format::RowGroup* row_group,
+      const SchemaDescriptor* schema, const ApplicationVersion* writer_version)
+      : row_group_(row_group), schema_(schema), writer_version_(writer_version) {}
   ~RowGroupMetaDataImpl() {}
 
   inline int num_columns() const { return row_group_->columns.size(); }
@@ -226,23 +302,27 @@ class RowGroupMetaData::RowGroupMetaDataImpl {
       throw ParquetException(ss.str());
     }
     return ColumnChunkMetaData::Make(
-        reinterpret_cast<const uint8_t*>(&row_group_->columns[i]), schema_->Column(i));
+        reinterpret_cast<const uint8_t*>(&row_group_->columns[i]), schema_->Column(i),
+        writer_version_);
   }
 
  private:
   const format::RowGroup* row_group_;
   const SchemaDescriptor* schema_;
+  const ApplicationVersion* writer_version_;
 };
 
-std::unique_ptr<RowGroupMetaData> RowGroupMetaData::Make(
-    const uint8_t* metadata, const SchemaDescriptor* schema) {
-  return std::unique_ptr<RowGroupMetaData>(new RowGroupMetaData(metadata, schema));
+std::unique_ptr<RowGroupMetaData> RowGroupMetaData::Make(const uint8_t* metadata,
+    const SchemaDescriptor* schema, const ApplicationVersion* writer_version) {
+  return std::unique_ptr<RowGroupMetaData>(
+      new RowGroupMetaData(metadata, schema, writer_version));
 }
 
-RowGroupMetaData::RowGroupMetaData(
-    const uint8_t* metadata, const SchemaDescriptor* schema)
+RowGroupMetaData::RowGroupMetaData(const uint8_t* metadata,
+    const SchemaDescriptor* schema, const ApplicationVersion* writer_version)
     : impl_{std::unique_ptr<RowGroupMetaDataImpl>(new RowGroupMetaDataImpl(
-          reinterpret_cast<const format::RowGroup*>(metadata), schema))} {}
+          reinterpret_cast<const format::RowGroup*>(metadata), schema, writer_version))}
{
+}
 RowGroupMetaData::~RowGroupMetaData() {}
 
 int RowGroupMetaData::num_columns() const {
@@ -277,9 +357,9 @@ class FileMetaData::FileMetaDataImpl {
     metadata_len_ = *metadata_len;
 
     if (metadata_->__isset.created_by) {
-      writer_version_ = FileMetaData::Version(metadata_->created_by);
+      writer_version_ = ApplicationVersion(metadata_->created_by);
     } else {
-      writer_version_ = FileMetaData::Version("unknown 0.0.0");
+      writer_version_ = ApplicationVersion("unknown 0.0.0");
     }
 
     InitSchema();
@@ -294,7 +374,7 @@ class FileMetaData::FileMetaDataImpl {
   inline const std::string& created_by() const { return metadata_->created_by; }
   inline int num_schema_elements() const { return metadata_->schema.size(); }
 
-  const FileMetaData::Version& writer_version() const { return writer_version_; }
+  const ApplicationVersion& writer_version() const { return writer_version_; }
 
   void WriteTo(OutputStream* dst) { SerializeThriftMsg(metadata_.get(), 1024, dst); }
 
@@ -306,7 +386,8 @@ class FileMetaData::FileMetaDataImpl {
       throw ParquetException(ss.str());
     }
     return RowGroupMetaData::Make(
-        reinterpret_cast<const uint8_t*>(&metadata_->row_groups[i]), &schema_);
+        reinterpret_cast<const uint8_t*>(&metadata_->row_groups[i]), &schema_,
+        &writer_version_);
   }
 
   const SchemaDescriptor* schema() const { return &schema_; }
@@ -321,7 +402,7 @@ class FileMetaData::FileMetaDataImpl {
     schema_.Init(converter.Convert());
   }
   SchemaDescriptor schema_;
-  FileMetaData::Version writer_version_;
+  ApplicationVersion writer_version_;
 };
 
 std::shared_ptr<FileMetaData> FileMetaData::Make(
@@ -372,7 +453,7 @@ ParquetVersion::type FileMetaData::version() const {
   return ParquetVersion::PARQUET_1_0;
 }
 
-const FileMetaData::Version& FileMetaData::writer_version() const {
+const ApplicationVersion& FileMetaData::writer_version() const {
   return impl_->writer_version();
 }
 
@@ -392,7 +473,7 @@ void FileMetaData::WriteTo(OutputStream* dst) {
   return impl_->WriteTo(dst);
 }
 
-FileMetaData::Version::Version(const std::string& created_by) {
+ApplicationVersion::ApplicationVersion(const std::string& created_by) {
   namespace ba = boost::algorithm;
 
   std::string created_by_lower = created_by;
@@ -423,18 +504,45 @@ FileMetaData::Version::Version(const std::string& created_by) {
   }
 }
 
-bool FileMetaData::Version::VersionLt(int major, int minor, int patch) const {
-  if (version.major < major) return true;
-  if (version.major > major) return false;
-  DCHECK_EQ(version.major, major);
-  if (version.minor < minor) return true;
-  if (version.minor > minor) return false;
-  DCHECK_EQ(version.minor, minor);
-  return version.patch < patch;
+bool ApplicationVersion::VersionLt(const ApplicationVersion& other_version) const {
+  if (application != other_version.application) return false;
+
+  if (version.major < other_version.version.major) return true;
+  if (version.major > other_version.version.major) return false;
+  DCHECK_EQ(version.major, other_version.version.major);
+  if (version.minor < other_version.version.minor) return true;
+  if (version.minor > other_version.version.minor) return false;
+  DCHECK_EQ(version.minor, other_version.version.minor);
+  return version.patch < other_version.version.patch;
+}
+
+bool ApplicationVersion::VersionEq(const ApplicationVersion& other_version) const {
+  return application == other_version.application &&
+         version.major == other_version.version.major &&
+         version.minor == other_version.version.minor &&
+         version.patch == other_version.version.patch;
 }
 
-bool FileMetaData::Version::VersionEq(int major, int minor, int patch) const {
-  return version.major == major && version.minor == minor && version.patch
== patch;
+// Reference:
+// parquet-mr/parquet-column/src/main/java/org/apache/parquet/CorruptStatistics.java
+// PARQUET-686 has more disussion on statistics
+bool ApplicationVersion::HasCorrectStatistics(Type::type col_type) const {
+  // None of the current tools write INT96 Statistics correctly
+  if (col_type == Type::INT96) return false;
+
+  // Statistics of other types are OK
+  if (col_type != Type::FIXED_LEN_BYTE_ARRAY && col_type != Type::BYTE_ARRAY) {
+    return true;
+  }
+
+  // created_by is not populated, which could have been caused by
+  // parquet-mr during the same time as PARQUET-251, see PARQUET-297
+  if (application == "unknown") { return true; }
+
+  // PARQUET-251
+  if (VersionLt(PARQUET_251_FIXED_VERSION)) { return false; }
+
+  return true;
 }
 
 // MetaData Builders

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/1a6d22b7/src/parquet/file/metadata.h
----------------------------------------------------------------------
diff --git a/src/parquet/file/metadata.h b/src/parquet/file/metadata.h
index eab7fc6..97793a1 100644
--- a/src/parquet/file/metadata.h
+++ b/src/parquet/file/metadata.h
@@ -32,11 +32,56 @@
 
 namespace parquet {
 
+// Reference:
+// parquet-mr/parquet-hadoop/src/main/java/org/apache/parquet/
+//                            format/converter/ParquetMetadataConverter.java
+// Sort order for page and column statistics. Types are associated with sort
+// orders (e.g., UTF8 columns should use UNSIGNED) and column stats are
+// aggregated using a sort order. As of parquet-format version 2.3.1, the
+// order used to aggregate stats is always SIGNED and is not stored in the
+// Parquet file. These stats are discarded for types that need unsigned.
+// See PARQUET-686.
+enum SortOrder { SIGNED, UNSIGNED, UNKNOWN };
+
+class ApplicationVersion {
+ public:
+  /// Known Versions with Issues
+  static const ApplicationVersion PARQUET_251_FIXED_VERSION;
+  static const ApplicationVersion PARQUET_816_FIXED_VERSION;
+
+  /// Application that wrote the file. e.g. "IMPALA"
+  std::string application;
+
+  /// Version of the application that wrote the file, expressed in three parts
+  /// (<major>.<minor>.<patch>). Unspecified parts default to 0, and extra
parts are
+  /// ignored. e.g.:
+  /// "1.2.3"    => {1, 2, 3}
+  /// "1.2"      => {1, 2, 0}
+  /// "1.2-cdh5" => {1, 2, 0}
+  struct {
+    int major;
+    int minor;
+    int patch;
+  } version;
+
+  ApplicationVersion() {}
+  explicit ApplicationVersion(const std::string& created_by);
+
+  /// Returns true if version is strictly less than other_version
+  bool VersionLt(const ApplicationVersion& other_version) const;
+
+  /// Returns true if version is strictly less than other_version
+  bool VersionEq(const ApplicationVersion& other_version) const;
+
+  // Checks if the Version has the correct statistics for a given column
+  bool HasCorrectStatistics(Type::type primitive) const;
+};
+
 class PARQUET_EXPORT ColumnChunkMetaData {
  public:
   // API convenience to get a MetaData accessor
-  static std::unique_ptr<ColumnChunkMetaData> Make(
-      const uint8_t* metadata, const ColumnDescriptor* descr);
+  static std::unique_ptr<ColumnChunkMetaData> Make(const uint8_t* metadata,
+      const ColumnDescriptor* descr, const ApplicationVersion* writer_version = NULL);
 
   ~ColumnChunkMetaData();
 
@@ -60,7 +105,8 @@ class PARQUET_EXPORT ColumnChunkMetaData {
   int64_t total_uncompressed_size() const;
 
  private:
-  explicit ColumnChunkMetaData(const uint8_t* metadata, const ColumnDescriptor* descr);
+  explicit ColumnChunkMetaData(const uint8_t* metadata, const ColumnDescriptor* descr,
+      const ApplicationVersion* writer_version = NULL);
   // PIMPL Idiom
   class ColumnChunkMetaDataImpl;
   std::unique_ptr<ColumnChunkMetaDataImpl> impl_;
@@ -69,8 +115,8 @@ class PARQUET_EXPORT ColumnChunkMetaData {
 class PARQUET_EXPORT RowGroupMetaData {
  public:
   // API convenience to get a MetaData accessor
-  static std::unique_ptr<RowGroupMetaData> Make(
-      const uint8_t* metadata, const SchemaDescriptor* schema);
+  static std::unique_ptr<RowGroupMetaData> Make(const uint8_t* metadata,
+      const SchemaDescriptor* schema, const ApplicationVersion* writer_version = NULL);
 
   ~RowGroupMetaData();
 
@@ -83,7 +129,8 @@ class PARQUET_EXPORT RowGroupMetaData {
   std::unique_ptr<ColumnChunkMetaData> ColumnChunk(int i) const;
 
  private:
-  explicit RowGroupMetaData(const uint8_t* metadata, const SchemaDescriptor* schema);
+  explicit RowGroupMetaData(const uint8_t* metadata, const SchemaDescriptor* schema,
+      const ApplicationVersion* writer_version = NULL);
   // PIMPL Idiom
   class RowGroupMetaDataImpl;
   std::unique_ptr<RowGroupMetaDataImpl> impl_;
@@ -93,32 +140,6 @@ class FileMetaDataBuilder;
 
 class PARQUET_EXPORT FileMetaData {
  public:
-  struct Version {
-    /// Application that wrote the file. e.g. "IMPALA"
-    std::string application;
-
-    /// Version of the application that wrote the file, expressed in three parts
-    /// (<major>.<minor>.<patch>). Unspecified parts default to 0, and
extra parts are
-    /// ignored. e.g.:
-    /// "1.2.3"    => {1, 2, 3}
-    /// "1.2"      => {1, 2, 0}
-    /// "1.2-cdh5" => {1, 2, 0}
-    struct {
-      int major;
-      int minor;
-      int patch;
-    } version;
-
-    Version() {}
-    explicit Version(const std::string& created_by);
-
-    /// Returns true if version is strictly less than <major>.<minor>.<patch>
-    bool VersionLt(int major, int minor = 0, int patch = 0) const;
-
-    /// Returns true if version is equal to <major>.<minor>.<patch>
-    bool VersionEq(int major, int minor, int patch) const;
-  };
-
   // API convenience to get a MetaData accessor
   static std::shared_ptr<FileMetaData> Make(
       const uint8_t* serialized_metadata, uint32_t* metadata_len);
@@ -135,7 +156,7 @@ class PARQUET_EXPORT FileMetaData {
   int num_schema_elements() const;
   std::unique_ptr<RowGroupMetaData> RowGroup(int i) const;
 
-  const Version& writer_version() const;
+  const ApplicationVersion& writer_version() const;
 
   void WriteTo(OutputStream* dst);
 

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/1a6d22b7/src/parquet/file/reader-internal.cc
----------------------------------------------------------------------
diff --git a/src/parquet/file/reader-internal.cc b/src/parquet/file/reader-internal.cc
index 4eb40b4..e981e36 100644
--- a/src/parquet/file/reader-internal.cc
+++ b/src/parquet/file/reader-internal.cc
@@ -183,8 +183,8 @@ std::unique_ptr<PageReader> SerializedRowGroup::GetColumnPageReader(int
i) {
   std::unique_ptr<InputStream> stream;
 
   // PARQUET-816 workaround for old files created by older parquet-mr
-  const FileMetaData::Version& version = file_metadata_->writer_version();
-  if (version.application == "parquet-mr" && version.VersionLt(1, 2, 9)) {
+  const ApplicationVersion& version = file_metadata_->writer_version();
+  if (version.VersionLt(ApplicationVersion::PARQUET_816_FIXED_VERSION)) {
     // The Parquet MR writer had a bug in 1.2.8 and below where it didn't include the
     // dictionary page header size in total_compressed_size and total_uncompressed_size
     // (see IMPALA-694). We add padding to compensate.


Mime
View raw message