parquet-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From w...@apache.org
Subject parquet-cpp git commit: PARQUET-583: Parquet to Thrift schema conversion
Date Tue, 19 Apr 2016 09:01:57 GMT
Repository: parquet-cpp
Updated Branches:
  refs/heads/master 198df4dca -> 49a5c1a8c


PARQUET-583: Parquet to Thrift schema conversion

Depends on #86

Author: Uwe L. Korn <uwelk@xhochy.com>

Closes #87 from xhochy/parquet-583 and squashes the following commits:

9f3f050 [Uwe L. Korn] Incorporate feedback
86aed44 [Uwe L. Korn] PARQUET-583: Parquet to Thrift schema conversion


Project: http://git-wip-us.apache.org/repos/asf/parquet-cpp/repo
Commit: http://git-wip-us.apache.org/repos/asf/parquet-cpp/commit/49a5c1a8
Tree: http://git-wip-us.apache.org/repos/asf/parquet-cpp/tree/49a5c1a8
Diff: http://git-wip-us.apache.org/repos/asf/parquet-cpp/diff/49a5c1a8

Branch: refs/heads/master
Commit: 49a5c1a8c5e61bc4d43d5927767023d94d9d22f5
Parents: 198df4d
Author: Uwe L. Korn <uwelk@xhochy.com>
Authored: Tue Apr 19 11:01:46 2016 +0200
Committer: Wes McKinney <wesm@apache.org>
Committed: Tue Apr 19 11:01:46 2016 +0200

----------------------------------------------------------------------
 src/parquet/schema/converter.cc             | 39 +++++++++++++
 src/parquet/schema/converter.h              |  4 +-
 src/parquet/schema/printer.cc               |  6 +-
 src/parquet/schema/schema-converter-test.cc | 73 ++++++++++++++++++++----
 src/parquet/schema/schema-types-test.cc     |  8 +--
 src/parquet/schema/test-util.h              | 10 +++-
 src/parquet/schema/types.cc                 | 37 ++++++++++++
 src/parquet/schema/types.h                  | 15 +++++
 8 files changed, 172 insertions(+), 20 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/49a5c1a8/src/parquet/schema/converter.cc
----------------------------------------------------------------------
diff --git a/src/parquet/schema/converter.cc b/src/parquet/schema/converter.cc
index 703f38d..bece340 100644
--- a/src/parquet/schema/converter.cc
+++ b/src/parquet/schema/converter.cc
@@ -83,6 +83,45 @@ std::shared_ptr<SchemaDescriptor> FromParquet(const std::vector<SchemaElement>&
   return descr;
 }
 
+void ToParquet(const GroupNode* schema, std::vector<format::SchemaElement>* out) {
+  SchemaFlattener flattener(schema, out);
+  flattener.Flatten();
+}
+
+class SchemaVisitor : public Node::ConstVisitor {
+ public:
+  explicit SchemaVisitor(std::vector<format::SchemaElement>* elements)
+     : elements_(elements) {}
+  virtual ~SchemaVisitor() {}
+
+  void Visit(const Node* node) override {
+    format::SchemaElement element;
+    node->ToParquet(&element);
+    // Override field_id here as we can get user-generated Nodes without a valid id
+    element.__set_field_id(elements_->size());
+    elements_->push_back(element);
+
+    if (node->is_group()) {
+      const GroupNode* group_node = static_cast<const GroupNode*>(node);
+      for (int i = 0; i < group_node->field_count(); ++i) {
+        group_node->field(i)->VisitConst(this);
+      }
+    }
+  }
+
+ private:
+  std::vector<format::SchemaElement>* elements_;
+};
+
+SchemaFlattener::SchemaFlattener(const GroupNode* schema,
+        std::vector<format::SchemaElement>* out)
+    : root_(schema), elements_(out) {}
+
+void SchemaFlattener::Flatten() {
+  SchemaVisitor visitor(elements_);
+  root_->VisitConst(&visitor);
+}
+
 } // namespace schema
 
 } // namespace parquet

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/49a5c1a8/src/parquet/schema/converter.h
----------------------------------------------------------------------
diff --git a/src/parquet/schema/converter.h b/src/parquet/schema/converter.h
index 8d2b556..2742b98 100644
--- a/src/parquet/schema/converter.h
+++ b/src/parquet/schema/converter.h
@@ -80,9 +80,11 @@ class SchemaFlattener {
  public:
   SchemaFlattener(const GroupNode* schema, std::vector<format::SchemaElement>* out);
 
+  void Flatten();
+
  private:
   const GroupNode* root_;
-  std::vector<format::SchemaElement>* schema_;
+  std::vector<format::SchemaElement>* elements_;
 };
 
 } // namespace schema

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/49a5c1a8/src/parquet/schema/printer.cc
----------------------------------------------------------------------
diff --git a/src/parquet/schema/printer.cc b/src/parquet/schema/printer.cc
index dc6ba5b..6de696b 100644
--- a/src/parquet/schema/printer.cc
+++ b/src/parquet/schema/printer.cc
@@ -27,14 +27,14 @@ namespace parquet {
 
 namespace schema {
 
-class SchemaPrinter : public Node::Visitor {
+class SchemaPrinter : public Node::ConstVisitor {
  public:
   explicit SchemaPrinter(std::ostream& stream, int indent_width) :
       stream_(stream),
       indent_(0),
       indent_width_(2) {}
 
-  virtual void Visit(const Node* node);
+  void Visit(const Node* node) override;
 
  private:
   void Visit(const PrimitiveNode* node);
@@ -108,7 +108,7 @@ void SchemaPrinter::Visit(const GroupNode* node) {
 
   indent_ += indent_width_;
   for (int i = 0; i < node->field_count(); ++i) {
-    node->field(i)->Visit(this);
+    node->field(i)->VisitConst(this);
   }
   indent_ -= indent_width_;
   Indent();

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/49a5c1a8/src/parquet/schema/schema-converter-test.cc
----------------------------------------------------------------------
diff --git a/src/parquet/schema/schema-converter-test.cc b/src/parquet/schema/schema-converter-test.cc
index 239aff0..55b8439 100644
--- a/src/parquet/schema/schema-converter-test.cc
+++ b/src/parquet/schema/schema-converter-test.cc
@@ -82,21 +82,21 @@ bool check_for_parent_consistency(const GroupNode* node) {
 TEST_F(TestSchemaConverter, NestedExample) {
   SchemaElement elt;
   std::vector<SchemaElement> elements;
-  elements.push_back(NewGroup(name_, FieldRepetitionType::REPEATED, 2));
+  elements.push_back(NewGroup(name_, FieldRepetitionType::REPEATED, 2, 0));
 
   // A primitive one
   elements.push_back(NewPrimitive("a", FieldRepetitionType::REQUIRED,
-          format::Type::INT32));
+          format::Type::INT32, 1));
 
   // A group
-  elements.push_back(NewGroup("bag", FieldRepetitionType::OPTIONAL, 1));
+  elements.push_back(NewGroup("bag", FieldRepetitionType::OPTIONAL, 1, 2));
 
   // 3-level list encoding, by hand
-  elt = NewGroup("b", FieldRepetitionType::REPEATED, 1);
+  elt = NewGroup("b", FieldRepetitionType::REPEATED, 1, 3);
   elt.__set_converted_type(ConvertedType::LIST);
   elements.push_back(elt);
   elements.push_back(NewPrimitive("item", FieldRepetitionType::OPTIONAL,
-          format::Type::INT64));
+          format::Type::INT64, 4));
 
   Convert(&elements[0], elements.size());
 
@@ -127,19 +127,19 @@ TEST_F(TestSchemaConverter, InvalidRoot) {
 
   SchemaElement elements[2];
   elements[0] = NewPrimitive("not-a-group", FieldRepetitionType::REQUIRED,
-      format::Type::INT32);
+      format::Type::INT32, 0);
   ASSERT_THROW(Convert(elements, 2), ParquetException);
 
   // While the Parquet spec indicates that the root group should have REPEATED
   // repetition type, some implementations may return REQUIRED or OPTIONAL
   // groups as the first element. These tests check that this is okay as a
   // practicality matter.
-  elements[0] = NewGroup("not-repeated", FieldRepetitionType::REQUIRED, 1);
+  elements[0] = NewGroup("not-repeated", FieldRepetitionType::REQUIRED, 1, 0);
   elements[1] = NewPrimitive("a", FieldRepetitionType::REQUIRED,
-      format::Type::INT32);
+      format::Type::INT32, 1);
   Convert(elements, 2);
 
-  elements[0] = NewGroup("not-repeated", FieldRepetitionType::OPTIONAL, 1);
+  elements[0] = NewGroup("not-repeated", FieldRepetitionType::OPTIONAL, 1, 0);
   Convert(elements, 2);
 }
 
@@ -147,13 +147,66 @@ TEST_F(TestSchemaConverter, NotEnoughChildren) {
   // Throw a ParquetException, but don't core dump or anything
   SchemaElement elt;
   std::vector<SchemaElement> elements;
-  elements.push_back(NewGroup(name_, FieldRepetitionType::REPEATED, 2));
+  elements.push_back(NewGroup(name_, FieldRepetitionType::REPEATED, 2, 0));
   ASSERT_THROW(Convert(&elements[0], 1), ParquetException);
 }
 
 // ----------------------------------------------------------------------
 // Schema tree flatten / unflatten
 
+class TestSchemaFlatten : public ::testing::Test {
+ public:
+  void setUp() {
+    name_ = "parquet_schema";
+  }
+
+  void Flatten(const GroupNode* schema) {
+    ToParquet(schema, &elements_);
+  }
+
+ protected:
+  std::string name_;
+  std::vector<format::SchemaElement> elements_;
+};
+
+TEST_F(TestSchemaFlatten, NestedExample) {
+  SchemaElement elt;
+  std::vector<SchemaElement> elements;
+  elements.push_back(NewGroup(name_, FieldRepetitionType::REPEATED, 2, 0));
+
+  // A primitive one
+  elements.push_back(NewPrimitive("a", FieldRepetitionType::REQUIRED,
+          format::Type::INT32, 1));
+
+  // A group
+  elements.push_back(NewGroup("bag", FieldRepetitionType::OPTIONAL, 1, 2));
+
+  // 3-level list encoding, by hand
+  elt = NewGroup("b", FieldRepetitionType::REPEATED, 1, 3);
+  elt.__set_converted_type(ConvertedType::LIST);
+  elements.push_back(elt);
+  elements.push_back(NewPrimitive("item", FieldRepetitionType::OPTIONAL,
+          format::Type::INT64, 4));
+
+  // Construct the schema
+  NodeVector fields;
+  fields.push_back(Int32("a", Repetition::REQUIRED));
+
+  // 3-level list encoding
+  NodePtr item = Int64("item");
+  NodePtr list(GroupNode::Make("b", Repetition::REPEATED, {item}, LogicalType::LIST));
+  NodePtr bag(GroupNode::Make("bag", Repetition::OPTIONAL, {list}));
+  fields.push_back(bag);
+
+  NodePtr schema = GroupNode::Make(name_, Repetition::REPEATED, fields);
+
+  Flatten(static_cast<GroupNode*>(schema.get()));
+  ASSERT_EQ(elements_.size(), elements.size());
+  for (size_t i = 0; i < elements_.size(); i++) {
+    ASSERT_EQ(elements_[i], elements[i]);
+  }
+}
+
 } // namespace schema
 
 } // namespace parquet

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/49a5c1a8/src/parquet/schema/schema-types-test.cc
----------------------------------------------------------------------
diff --git a/src/parquet/schema/schema-types-test.cc b/src/parquet/schema/schema-types-test.cc
index 909dd4d..8215613 100644
--- a/src/parquet/schema/schema-types-test.cc
+++ b/src/parquet/schema/schema-types-test.cc
@@ -112,7 +112,7 @@ TEST_F(TestPrimitiveNode, Attrs) {
 
 TEST_F(TestPrimitiveNode, FromParquet) {
   SchemaElement elt = NewPrimitive(name_, FieldRepetitionType::OPTIONAL,
-      format::Type::INT32);
+      format::Type::INT32, 0);
   Convert(&elt);
   ASSERT_EQ(name_, prim_node_->name());
   ASSERT_EQ(id_, prim_node_->id());
@@ -121,7 +121,7 @@ TEST_F(TestPrimitiveNode, FromParquet) {
   ASSERT_EQ(LogicalType::NONE, prim_node_->logical_type());
 
   // Test a logical type
-  elt = NewPrimitive(name_, FieldRepetitionType::REQUIRED, format::Type::BYTE_ARRAY);
+  elt = NewPrimitive(name_, FieldRepetitionType::REQUIRED, format::Type::BYTE_ARRAY, 0);
   elt.__set_converted_type(ConvertedType::UTF8);
 
   Convert(&elt);
@@ -131,7 +131,7 @@ TEST_F(TestPrimitiveNode, FromParquet) {
 
   // FIXED_LEN_BYTE_ARRAY
   elt = NewPrimitive(name_, FieldRepetitionType::OPTIONAL,
-      format::Type::FIXED_LEN_BYTE_ARRAY);
+      format::Type::FIXED_LEN_BYTE_ARRAY, 0);
   elt.__set_type_length(16);
 
   Convert(&elt);
@@ -143,7 +143,7 @@ TEST_F(TestPrimitiveNode, FromParquet) {
 
   // ConvertedType::Decimal
   elt = NewPrimitive(name_, FieldRepetitionType::OPTIONAL,
-      format::Type::FIXED_LEN_BYTE_ARRAY);
+      format::Type::FIXED_LEN_BYTE_ARRAY, 0);
   elt.__set_converted_type(ConvertedType::DECIMAL);
   elt.__set_type_length(6);
   elt.__set_scale(2);

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/49a5c1a8/src/parquet/schema/test-util.h
----------------------------------------------------------------------
diff --git a/src/parquet/schema/test-util.h b/src/parquet/schema/test-util.h
index 2049ce0..25dacb0 100644
--- a/src/parquet/schema/test-util.h
+++ b/src/parquet/schema/test-util.h
@@ -36,22 +36,28 @@ namespace parquet {
 namespace schema {
 
 static inline SchemaElement NewPrimitive(const std::string& name,
-    FieldRepetitionType::type repetition, format::Type::type type) {
+    FieldRepetitionType::type repetition, format::Type::type type, int id = 0) {
   SchemaElement result;
   result.__set_name(name);
   result.__set_repetition_type(repetition);
   result.__set_type(type);
   result.__set_num_children(0);
+  result.__set_field_id(id);
+  // Set default (non-set) values
+  result.__set_type_length(-1);
+  result.__set_precision(-1);
+  result.__set_scale(-1);
 
   return result;
 }
 
 static inline SchemaElement NewGroup(const std::string& name,
-    FieldRepetitionType::type repetition, int num_children) {
+    FieldRepetitionType::type repetition, int num_children, int id = 0) {
   SchemaElement result;
   result.__set_name(name);
   result.__set_repetition_type(repetition);
   result.__set_num_children(num_children);
+  result.__set_field_id(id);
 
   return result;
 }

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/49a5c1a8/src/parquet/schema/types.cc
----------------------------------------------------------------------
diff --git a/src/parquet/schema/types.cc b/src/parquet/schema/types.cc
index 50e63f2..f08fb41 100644
--- a/src/parquet/schema/types.cc
+++ b/src/parquet/schema/types.cc
@@ -203,6 +203,10 @@ void PrimitiveNode::Visit(Node::Visitor* visitor) {
   visitor->Visit(this);
 }
 
+void PrimitiveNode::VisitConst(Node::ConstVisitor* visitor) const {
+  visitor->Visit(this);
+}
+
 // ----------------------------------------------------------------------
 // Group node
 
@@ -232,6 +236,10 @@ void GroupNode::Visit(Node::Visitor* visitor) {
   visitor->Visit(this);
 }
 
+void GroupNode::VisitConst(Node::ConstVisitor* visitor) const {
+  visitor->Visit(this);
+}
+
 // ----------------------------------------------------------------------
 // Node construction from Parquet metadata
 
@@ -280,6 +288,35 @@ std::unique_ptr<Node> PrimitiveNode::FromParquet(const void* opaque_element,
   return std::unique_ptr<Node>(result.release());
 }
 
+void GroupNode::ToParquet(void* opaque_element) const {
+  format::SchemaElement* element =
+    static_cast<format::SchemaElement*>(opaque_element);
+  element->__set_name(name_);
+  element->__set_num_children(field_count());
+  element->__set_repetition_type(ToThrift(repetition_));
+  if (logical_type_ != LogicalType::NONE) {
+    element->__set_converted_type(ToThrift(logical_type_));
+  }
+  // FIXME: SchemaFlattener does this for us: element->__set_field_id(id_);
+}
+
+void PrimitiveNode::ToParquet(void* opaque_element) const {
+  format::SchemaElement* element =
+    static_cast<format::SchemaElement*>(opaque_element);
+
+  element->__set_name(name_);
+  element->__set_num_children(0);
+  element->__set_repetition_type(ToThrift(repetition_));
+  if (logical_type_ != LogicalType::NONE) {
+    element->__set_converted_type(ToThrift(logical_type_));
+  }
+  element->__set_type(ToThrift(physical_type_));
+  // FIXME: SchemaFlattener does this for us: element->__set_field_id(id_);
+  element->__set_type_length(type_length_);
+  element->__set_precision(decimal_metadata_.precision);
+  element->__set_scale(decimal_metadata_.scale);
+}
+
 } // namespace schema
 
 } // namespace parquet

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/49a5c1a8/src/parquet/schema/types.h
----------------------------------------------------------------------
diff --git a/src/parquet/schema/types.h b/src/parquet/schema/types.h
index e87e18e..4131d24 100644
--- a/src/parquet/schema/types.h
+++ b/src/parquet/schema/types.h
@@ -163,15 +163,26 @@ class Node {
     return parent_;
   }
 
+  // ToParquet takes an opaque void* out-parameter to avoid exporting
+  // parquet::SchemaElement into the public API
+  virtual void ToParquet(void* opaque_element) const = 0;
+
   // Node::Visitor abstract class for walking schemas with the visitor pattern
   class Visitor {
    public:
     virtual ~Visitor() {}
 
+    virtual void Visit(Node* node) = 0;
+  };
+  class ConstVisitor {
+   public:
+    virtual ~ConstVisitor() {}
+
     virtual void Visit(const Node* node) = 0;
   };
 
   virtual void Visit(Visitor* visitor) = 0;
+  virtual void VisitConst(ConstVisitor* visitor) const = 0;
 
  protected:
   friend class GroupNode;
@@ -224,7 +235,9 @@ class PrimitiveNode : public Node {
     return decimal_metadata_;
   }
 
+  void ToParquet(void* opaque_element) const override;
   virtual void Visit(Visitor* visitor);
+  void VisitConst(ConstVisitor* visitor) const override;
 
  private:
   PrimitiveNode(const std::string& name, Repetition::type repetition,
@@ -278,7 +291,9 @@ class GroupNode : public Node {
     return fields_.size();
   }
 
+  void ToParquet(void* opaque_element) const override;
   virtual void Visit(Visitor* visitor);
+  void VisitConst(ConstVisitor* visitor) const override;
 
  private:
   GroupNode(const std::string& name, Repetition::type repetition,


Mime
View raw message