parquet-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From w...@apache.org
Subject parquet-cpp git commit: PARQUET-809: Add SchemaDescriptor::Equals method
Date Thu, 05 Jan 2017 17:21:11 GMT
Repository: parquet-cpp
Updated Branches:
  refs/heads/master 378f335c1 -> 52d36960e


PARQUET-809: Add SchemaDescriptor::Equals method

Adds Equals methods to SchemaDescriptor and ColumnDescriptor to make it simpler to compare file metadata.

Author: Wes McKinney <wes.mckinney@twosigma.com>

Closes #214 from wesm/PARQUET-809 and squashes the following commits:

691e5bc [Wes McKinney] Add SchemaDescriptor::Equals method


Project: http://git-wip-us.apache.org/repos/asf/parquet-cpp/repo
Commit: http://git-wip-us.apache.org/repos/asf/parquet-cpp/commit/52d36960
Tree: http://git-wip-us.apache.org/repos/asf/parquet-cpp/tree/52d36960
Diff: http://git-wip-us.apache.org/repos/asf/parquet-cpp/diff/52d36960

Branch: refs/heads/master
Commit: 52d36960ef46a497089bd35b73eada8a689ab6d9
Parents: 378f335
Author: Wes McKinney <wes.mckinney@twosigma.com>
Authored: Thu Jan 5 12:21:05 2017 -0500
Committer: Wes McKinney <wes.mckinney@twosigma.com>
Committed: Thu Jan 5 12:21:05 2017 -0500

----------------------------------------------------------------------
 src/parquet/schema/descriptor.cc             | 20 +++++++++
 src/parquet/schema/descriptor.h              |  4 ++
 src/parquet/schema/schema-descriptor-test.cc | 54 +++++++++++++++++++++++
 3 files changed, 78 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/52d36960/src/parquet/schema/descriptor.cc
----------------------------------------------------------------------
diff --git a/src/parquet/schema/descriptor.cc b/src/parquet/schema/descriptor.cc
index 4d46204..c5250d1 100644
--- a/src/parquet/schema/descriptor.cc
+++ b/src/parquet/schema/descriptor.cc
@@ -47,6 +47,20 @@ void SchemaDescriptor::Init(const NodePtr& schema) {
   }
 }
 
+bool SchemaDescriptor::Equals(const SchemaDescriptor& other) const {
+  if (this->num_columns() != other.num_columns()) {
+    return false;
+  }
+
+  for (int i = 0; i < this->num_columns(); ++i) {
+    if (!this->Column(i)->Equals(*other.Column(i))) {
+      return false;
+    }
+  }
+
+  return true;
+}
+
 void SchemaDescriptor::BuildTree(const NodePtr& node, int16_t max_def_level,
     int16_t max_rep_level, const NodePtr& base) {
   if (node->is_optional()) {
@@ -82,6 +96,12 @@ ColumnDescriptor::ColumnDescriptor(const schema::NodePtr& node,
   primitive_node_ = static_cast<const PrimitiveNode*>(node_.get());
 }
 
+bool ColumnDescriptor::Equals(const ColumnDescriptor& other) const {
+  return primitive_node_->Equals(other.primitive_node_) &&
+    max_repetition_level() == other.max_repetition_level() &&
+    max_definition_level() == other.max_definition_level();
+}
+
 const ColumnDescriptor* SchemaDescriptor::Column(int i) const {
   DCHECK(i >= 0 && i < static_cast<int>(leaves_.size()));
   return &leaves_[i];

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/52d36960/src/parquet/schema/descriptor.h
----------------------------------------------------------------------
diff --git a/src/parquet/schema/descriptor.h b/src/parquet/schema/descriptor.h
index 1673d5f..ae7b60e 100644
--- a/src/parquet/schema/descriptor.h
+++ b/src/parquet/schema/descriptor.h
@@ -42,6 +42,8 @@ class PARQUET_EXPORT ColumnDescriptor {
   ColumnDescriptor(const schema::NodePtr& node, int16_t max_definition_level,
       int16_t max_repetition_level, const SchemaDescriptor* schema_descr = nullptr);
 
+  bool Equals(const ColumnDescriptor& other) const;
+
   int16_t max_definition_level() const { return max_definition_level_; }
 
   int16_t max_repetition_level() const { return max_repetition_level_; }
@@ -97,6 +99,8 @@ class PARQUET_EXPORT SchemaDescriptor {
 
   const ColumnDescriptor* Column(int i) const;
 
+  bool Equals(const SchemaDescriptor& other) const;
+
   // The number of physical columns appearing in the file
   int num_columns() const { return leaves_.size(); }
 

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/52d36960/src/parquet/schema/schema-descriptor-test.cc
----------------------------------------------------------------------
diff --git a/src/parquet/schema/schema-descriptor-test.cc b/src/parquet/schema/schema-descriptor-test.cc
index eeaec5b..467d63c 100644
--- a/src/parquet/schema/schema-descriptor-test.cc
+++ b/src/parquet/schema/schema-descriptor-test.cc
@@ -71,6 +71,60 @@ TEST_F(TestSchemaDescriptor, InitNonGroup) {
   ASSERT_THROW(descr_.Init(node), ParquetException);
 }
 
+TEST_F(TestSchemaDescriptor, Equals) {
+  NodePtr schema;
+
+  NodePtr inta = Int32("a", Repetition::REQUIRED);
+  NodePtr intb = Int64("b", Repetition::OPTIONAL);
+  NodePtr intb2 = Int64("b2", Repetition::OPTIONAL);
+  NodePtr intc = ByteArray("c", Repetition::REPEATED);
+
+  NodePtr item1 = Int64("item1", Repetition::REQUIRED);
+  NodePtr item2 = Boolean("item2", Repetition::OPTIONAL);
+  NodePtr item3 = Int32("item3", Repetition::REPEATED);
+  NodePtr list(GroupNode::Make(
+      "records", Repetition::REPEATED, {item1, item2, item3}, LogicalType::LIST));
+
+  NodePtr bag(GroupNode::Make("bag", Repetition::OPTIONAL, {list}));
+  NodePtr bag2(GroupNode::Make("bag", Repetition::REQUIRED, {list}));
+
+  SchemaDescriptor descr1;
+  descr1.Init(GroupNode::Make("schema", Repetition::REPEATED,
+          {inta, intb, intc, bag}));
+
+  ASSERT_TRUE(descr1.Equals(descr1));
+
+  SchemaDescriptor descr2;
+  descr2.Init(GroupNode::Make("schema", Repetition::REPEATED,
+          {inta, intb, intc, bag2}));
+  ASSERT_FALSE(descr1.Equals(descr2));
+
+  SchemaDescriptor descr3;
+  descr3.Init(GroupNode::Make("schema", Repetition::REPEATED,
+          {inta, intb2, intc, bag}));
+  ASSERT_FALSE(descr1.Equals(descr3));
+
+  // Robust to name of parent node
+  SchemaDescriptor descr4;
+  descr4.Init(GroupNode::Make("SCHEMA", Repetition::REPEATED,
+          {inta, intb, intc, bag}));
+  ASSERT_TRUE(descr1.Equals(descr4));
+
+  SchemaDescriptor descr5;
+  descr5.Init(GroupNode::Make("schema", Repetition::REPEATED,
+          {inta, intb, intc, bag, intb2}));
+  ASSERT_FALSE(descr1.Equals(descr5));
+
+  // Different max repetition / definition levels
+  ColumnDescriptor col1(inta, 5, 1);
+  ColumnDescriptor col2(inta, 6, 1);
+  ColumnDescriptor col3(inta, 5, 2);
+
+  ASSERT_TRUE(col1.Equals(col1));
+  ASSERT_FALSE(col1.Equals(col2));
+  ASSERT_FALSE(col1.Equals(col3));
+}
+
 TEST_F(TestSchemaDescriptor, BuildTree) {
   NodeVector fields;
   NodePtr schema;


Mime
View raw message