parquet-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From w...@apache.org
Subject parquet-cpp git commit: PARQUET-556: Extend RowGroupStatistics to include "min" "max" statistics
Date Sun, 13 Mar 2016 17:50:45 GMT
Repository: parquet-cpp
Updated Branches:
  refs/heads/master e20cfa495 -> c6d204f79


PARQUET-556: Extend RowGroupStatistics to include "min" "max" statistics

Also includes a patch to extend the RowGroupReader API with num_rows().

Author: Deepak Majeti <deepak.majeti@hpe.com>

Closes #76 from majetideepak/PARQUET-556 and squashes the following commits:

7f2b036 [Deepak Majeti] modified min max Statistics to pointers
4059821 [Deepak Majeti] added a test
7e02810 [Deepak Majeti] PARQUET:556


Project: http://git-wip-us.apache.org/repos/asf/parquet-cpp/repo
Commit: http://git-wip-us.apache.org/repos/asf/parquet-cpp/commit/c6d204f7
Tree: http://git-wip-us.apache.org/repos/asf/parquet-cpp/tree/c6d204f7
Diff: http://git-wip-us.apache.org/repos/asf/parquet-cpp/diff/c6d204f7

Branch: refs/heads/master
Commit: c6d204f79bc29cef187ab7cd2fdd5f23350651b1
Parents: e20cfa4
Author: Deepak Majeti <deepak.majeti@hpe.com>
Authored: Sun Mar 13 10:50:26 2016 -0700
Committer: Wes McKinney <wesm@apache.org>
Committed: Sun Mar 13 10:50:26 2016 -0700

----------------------------------------------------------------------
 src/parquet/file/reader-internal.cc | 6 ++++++
 src/parquet/file/reader-internal.h  | 1 +
 src/parquet/file/reader.cc          | 9 ++++++++-
 src/parquet/file/reader.h           | 4 ++++
 src/parquet/reader-test.cc          | 5 +++++
 5 files changed, 24 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c6d204f7/src/parquet/file/reader-internal.cc
----------------------------------------------------------------------
diff --git a/src/parquet/file/reader-internal.cc b/src/parquet/file/reader-internal.cc
index c571c72..89e8298 100644
--- a/src/parquet/file/reader-internal.cc
+++ b/src/parquet/file/reader-internal.cc
@@ -154,6 +154,10 @@ std::shared_ptr<Page> SerializedPageReader::NextPage() {
 // ----------------------------------------------------------------------
 // SerializedRowGroup
 
+int64_t SerializedRowGroup::num_rows() const {
+  return metadata_->num_rows;
+}
+
 int SerializedRowGroup::num_columns() const {
   return metadata_->columns.size();
 }
@@ -187,6 +191,8 @@ RowGroupStatistics SerializedRowGroup::GetColumnStats(int i) {
   result.num_values = meta_data.num_values;
   result.null_count = meta_data.statistics.null_count;
   result.distinct_count = meta_data.statistics.distinct_count;
+  result.max = &meta_data.statistics.max;
+  result.min = &meta_data.statistics.min;
 
   return result;
 }

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c6d204f7/src/parquet/file/reader-internal.h
----------------------------------------------------------------------
diff --git a/src/parquet/file/reader-internal.h b/src/parquet/file/reader-internal.h
index a398cb3..b62f249 100644
--- a/src/parquet/file/reader-internal.h
+++ b/src/parquet/file/reader-internal.h
@@ -76,6 +76,7 @@ class SerializedRowGroup : public RowGroupReader::Contents {
       metadata_(metadata) {}
 
   virtual int num_columns() const;
+  virtual int64_t num_rows() const;
   virtual std::unique_ptr<PageReader> GetColumnPageReader(int i);
   virtual RowGroupStatistics GetColumnStats(int i);
 

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c6d204f7/src/parquet/file/reader.cc
----------------------------------------------------------------------
diff --git a/src/parquet/file/reader.cc b/src/parquet/file/reader.cc
index beace09..2937f9e 100644
--- a/src/parquet/file/reader.cc
+++ b/src/parquet/file/reader.cc
@@ -49,6 +49,10 @@ int RowGroupReader::num_columns() const {
   return contents_->num_columns();
 }
 
+int64_t RowGroupReader::num_rows() const {
+  return contents_->num_rows();
+}
+
 std::shared_ptr<ColumnReader> RowGroupReader::Column(int i) {
   // TODO: boundschecking
   const ColumnDescriptor* descr = schema_->Column(i);
@@ -153,9 +157,12 @@ void ParquetFileReader::DebugPrint(std::ostream& stream, bool print_values) {
       RowGroupStatistics stats = group_reader->GetColumnStats(i);
 
       stream << "Column " << i << ": "
-             << stats.num_values << " rows, "
+             << group_reader->num_rows() << " rows, "
+             << stats.num_values << " values, "
              << stats.null_count << " null values, "
              << stats.distinct_count << " distinct values, "
+             << *stats.max << " max, "
+             << *stats.min << " min, "
              << std::endl;
     }
 

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c6d204f7/src/parquet/file/reader.h
----------------------------------------------------------------------
diff --git a/src/parquet/file/reader.h b/src/parquet/file/reader.h
index 18b052a..436d1e8 100644
--- a/src/parquet/file/reader.h
+++ b/src/parquet/file/reader.h
@@ -34,6 +34,8 @@ struct RowGroupStatistics {
   int64_t num_values;
   int64_t null_count;
   int64_t distinct_count;
+  const std::string* min;
+  const std::string* max;
 };
 
 class RowGroupReader {
@@ -41,6 +43,7 @@ class RowGroupReader {
   // Forward declare the PIMPL
   struct Contents {
     virtual int num_columns() const = 0;
+    virtual int64_t num_rows() const = 0;
     virtual RowGroupStatistics GetColumnStats(int i) = 0;
     virtual std::unique_ptr<PageReader> GetColumnPageReader(int i) = 0;
   };
@@ -51,6 +54,7 @@ class RowGroupReader {
   // column. Ownership is shared with the RowGroupReader.
   std::shared_ptr<ColumnReader> Column(int i);
   int num_columns() const;
+  int64_t num_rows() const;
 
   RowGroupStatistics GetColumnStats(int i) const;
 

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c6d204f7/src/parquet/reader-test.cc
----------------------------------------------------------------------
diff --git a/src/parquet/reader-test.cc b/src/parquet/reader-test.cc
index 2c69ce1..c273487 100644
--- a/src/parquet/reader-test.cc
+++ b/src/parquet/reader-test.cc
@@ -68,6 +68,11 @@ TEST_F(TestAllTypesPlain, TestBatchRead) {
   int32_t values[4];
 
   // This file only has 8 rows
+  ASSERT_EQ(8, reader_->num_rows());
+  // This file only has 1 row group
+  ASSERT_EQ(1, reader_->num_row_groups());
+  // This row group must have 8 rows
+  ASSERT_EQ(8, group->num_rows());
 
   ASSERT_TRUE(col->HasNext());
   int64_t values_read;


Mime
View raw message