Return-Path: X-Original-To: apmail-parquet-commits-archive@minotaur.apache.org Delivered-To: apmail-parquet-commits-archive@minotaur.apache.org Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by minotaur.apache.org (Postfix) with SMTP id 22A9019CD0 for ; Tue, 19 Apr 2016 09:01:59 +0000 (UTC) Received: (qmail 4808 invoked by uid 500); 19 Apr 2016 09:01:58 -0000 Delivered-To: apmail-parquet-commits-archive@parquet.apache.org Received: (qmail 4771 invoked by uid 500); 19 Apr 2016 09:01:58 -0000 Mailing-List: contact commits-help@parquet.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@parquet.apache.org Delivered-To: mailing list commits@parquet.apache.org Received: (qmail 4760 invoked by uid 99); 19 Apr 2016 09:01:58 -0000 Received: from git1-us-west.apache.org (HELO git1-us-west.apache.org) (140.211.11.23) by apache.org (qpsmtpd/0.29) with ESMTP; Tue, 19 Apr 2016 09:01:58 +0000 Received: by git1-us-west.apache.org (ASF Mail Server at git1-us-west.apache.org, from userid 33) id 01F3ADFE04; Tue, 19 Apr 2016 09:01:57 +0000 (UTC) Content-Type: text/plain; charset="us-ascii" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit From: wesm@apache.org To: commits@parquet.apache.org Message-Id: X-Mailer: ASF-Git Admin Mailer Subject: parquet-cpp git commit: PARQUET-583: Parquet to Thrift schema conversion Date: Tue, 19 Apr 2016 09:01:57 +0000 (UTC) Repository: parquet-cpp Updated Branches: refs/heads/master 198df4dca -> 49a5c1a8c PARQUET-583: Parquet to Thrift schema conversion Depends on #86 Author: Uwe L. Korn Closes #87 from xhochy/parquet-583 and squashes the following commits: 9f3f050 [Uwe L. Korn] Incorporate feedback 86aed44 [Uwe L. 
Korn] PARQUET-583: Parquet to Thrift schema conversion Project: http://git-wip-us.apache.org/repos/asf/parquet-cpp/repo Commit: http://git-wip-us.apache.org/repos/asf/parquet-cpp/commit/49a5c1a8 Tree: http://git-wip-us.apache.org/repos/asf/parquet-cpp/tree/49a5c1a8 Diff: http://git-wip-us.apache.org/repos/asf/parquet-cpp/diff/49a5c1a8 Branch: refs/heads/master Commit: 49a5c1a8c5e61bc4d43d5927767023d94d9d22f5 Parents: 198df4d Author: Uwe L. Korn Authored: Tue Apr 19 11:01:46 2016 +0200 Committer: Wes McKinney Committed: Tue Apr 19 11:01:46 2016 +0200 ---------------------------------------------------------------------- src/parquet/schema/converter.cc | 39 +++++++++++++ src/parquet/schema/converter.h | 4 +- src/parquet/schema/printer.cc | 6 +- src/parquet/schema/schema-converter-test.cc | 73 ++++++++++++++++++++---- src/parquet/schema/schema-types-test.cc | 8 +-- src/parquet/schema/test-util.h | 10 +++- src/parquet/schema/types.cc | 37 ++++++++++++ src/parquet/schema/types.h | 15 +++++ 8 files changed, 172 insertions(+), 20 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/49a5c1a8/src/parquet/schema/converter.cc ---------------------------------------------------------------------- diff --git a/src/parquet/schema/converter.cc b/src/parquet/schema/converter.cc index 703f38d..bece340 100644 --- a/src/parquet/schema/converter.cc +++ b/src/parquet/schema/converter.cc @@ -83,6 +83,45 @@ std::shared_ptr FromParquet(const std::vector& return descr; } +void ToParquet(const GroupNode* schema, std::vector* out) { + SchemaFlattener flattener(schema, out); + flattener.Flatten(); +} + +class SchemaVisitor : public Node::ConstVisitor { + public: + explicit SchemaVisitor(std::vector* elements) + : elements_(elements) {} + virtual ~SchemaVisitor() {} + + void Visit(const Node* node) override { + format::SchemaElement element; + node->ToParquet(&element); + // Override field_id here as we can 
get user-generated Nodes without a valid id + element.__set_field_id(elements_->size()); + elements_->push_back(element); + + if (node->is_group()) { + const GroupNode* group_node = static_cast(node); + for (int i = 0; i < group_node->field_count(); ++i) { + group_node->field(i)->VisitConst(this); + } + } + } + + private: + std::vector* elements_; +}; + +SchemaFlattener::SchemaFlattener(const GroupNode* schema, + std::vector* out) + : root_(schema), elements_(out) {} + +void SchemaFlattener::Flatten() { + SchemaVisitor visitor(elements_); + root_->VisitConst(&visitor); +} + } // namespace schema } // namespace parquet http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/49a5c1a8/src/parquet/schema/converter.h ---------------------------------------------------------------------- diff --git a/src/parquet/schema/converter.h b/src/parquet/schema/converter.h index 8d2b556..2742b98 100644 --- a/src/parquet/schema/converter.h +++ b/src/parquet/schema/converter.h @@ -80,9 +80,11 @@ class SchemaFlattener { public: SchemaFlattener(const GroupNode* schema, std::vector* out); + void Flatten(); + private: const GroupNode* root_; - std::vector* schema_; + std::vector* elements_; }; } // namespace schema http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/49a5c1a8/src/parquet/schema/printer.cc ---------------------------------------------------------------------- diff --git a/src/parquet/schema/printer.cc b/src/parquet/schema/printer.cc index dc6ba5b..6de696b 100644 --- a/src/parquet/schema/printer.cc +++ b/src/parquet/schema/printer.cc @@ -27,14 +27,14 @@ namespace parquet { namespace schema { -class SchemaPrinter : public Node::Visitor { +class SchemaPrinter : public Node::ConstVisitor { public: explicit SchemaPrinter(std::ostream& stream, int indent_width) : stream_(stream), indent_(0), indent_width_(2) {} - virtual void Visit(const Node* node); + void Visit(const Node* node) override; private: void Visit(const PrimitiveNode* node); @@ -108,7 +108,7 @@ void 
SchemaPrinter::Visit(const GroupNode* node) { indent_ += indent_width_; for (int i = 0; i < node->field_count(); ++i) { - node->field(i)->Visit(this); + node->field(i)->VisitConst(this); } indent_ -= indent_width_; Indent(); http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/49a5c1a8/src/parquet/schema/schema-converter-test.cc ---------------------------------------------------------------------- diff --git a/src/parquet/schema/schema-converter-test.cc b/src/parquet/schema/schema-converter-test.cc index 239aff0..55b8439 100644 --- a/src/parquet/schema/schema-converter-test.cc +++ b/src/parquet/schema/schema-converter-test.cc @@ -82,21 +82,21 @@ bool check_for_parent_consistency(const GroupNode* node) { TEST_F(TestSchemaConverter, NestedExample) { SchemaElement elt; std::vector elements; - elements.push_back(NewGroup(name_, FieldRepetitionType::REPEATED, 2)); + elements.push_back(NewGroup(name_, FieldRepetitionType::REPEATED, 2, 0)); // A primitive one elements.push_back(NewPrimitive("a", FieldRepetitionType::REQUIRED, - format::Type::INT32)); + format::Type::INT32, 1)); // A group - elements.push_back(NewGroup("bag", FieldRepetitionType::OPTIONAL, 1)); + elements.push_back(NewGroup("bag", FieldRepetitionType::OPTIONAL, 1, 2)); // 3-level list encoding, by hand - elt = NewGroup("b", FieldRepetitionType::REPEATED, 1); + elt = NewGroup("b", FieldRepetitionType::REPEATED, 1, 3); elt.__set_converted_type(ConvertedType::LIST); elements.push_back(elt); elements.push_back(NewPrimitive("item", FieldRepetitionType::OPTIONAL, - format::Type::INT64)); + format::Type::INT64, 4)); Convert(&elements[0], elements.size()); @@ -127,19 +127,19 @@ TEST_F(TestSchemaConverter, InvalidRoot) { SchemaElement elements[2]; elements[0] = NewPrimitive("not-a-group", FieldRepetitionType::REQUIRED, - format::Type::INT32); + format::Type::INT32, 0); ASSERT_THROW(Convert(elements, 2), ParquetException); // While the Parquet spec indicates that the root group should have REPEATED // repetition 
type, some implementations may return REQUIRED or OPTIONAL // groups as the first element. These tests check that this is okay as a // practicality matter. - elements[0] = NewGroup("not-repeated", FieldRepetitionType::REQUIRED, 1); + elements[0] = NewGroup("not-repeated", FieldRepetitionType::REQUIRED, 1, 0); elements[1] = NewPrimitive("a", FieldRepetitionType::REQUIRED, - format::Type::INT32); + format::Type::INT32, 1); Convert(elements, 2); - elements[0] = NewGroup("not-repeated", FieldRepetitionType::OPTIONAL, 1); + elements[0] = NewGroup("not-repeated", FieldRepetitionType::OPTIONAL, 1, 0); Convert(elements, 2); } @@ -147,13 +147,66 @@ TEST_F(TestSchemaConverter, NotEnoughChildren) { // Throw a ParquetException, but don't core dump or anything SchemaElement elt; std::vector elements; - elements.push_back(NewGroup(name_, FieldRepetitionType::REPEATED, 2)); + elements.push_back(NewGroup(name_, FieldRepetitionType::REPEATED, 2, 0)); ASSERT_THROW(Convert(&elements[0], 1), ParquetException); } // ---------------------------------------------------------------------- // Schema tree flatten / unflatten +class TestSchemaFlatten : public ::testing::Test { + public: + void setUp() { + name_ = "parquet_schema"; + } + + void Flatten(const GroupNode* schema) { + ToParquet(schema, &elements_); + } + + protected: + std::string name_; + std::vector elements_; +}; + +TEST_F(TestSchemaFlatten, NestedExample) { + SchemaElement elt; + std::vector elements; + elements.push_back(NewGroup(name_, FieldRepetitionType::REPEATED, 2, 0)); + + // A primitive one + elements.push_back(NewPrimitive("a", FieldRepetitionType::REQUIRED, + format::Type::INT32, 1)); + + // A group + elements.push_back(NewGroup("bag", FieldRepetitionType::OPTIONAL, 1, 2)); + + // 3-level list encoding, by hand + elt = NewGroup("b", FieldRepetitionType::REPEATED, 1, 3); + elt.__set_converted_type(ConvertedType::LIST); + elements.push_back(elt); + elements.push_back(NewPrimitive("item", FieldRepetitionType::OPTIONAL, 
+ format::Type::INT64, 4)); + + // Construct the schema + NodeVector fields; + fields.push_back(Int32("a", Repetition::REQUIRED)); + + // 3-level list encoding + NodePtr item = Int64("item"); + NodePtr list(GroupNode::Make("b", Repetition::REPEATED, {item}, LogicalType::LIST)); + NodePtr bag(GroupNode::Make("bag", Repetition::OPTIONAL, {list})); + fields.push_back(bag); + + NodePtr schema = GroupNode::Make(name_, Repetition::REPEATED, fields); + + Flatten(static_cast(schema.get())); + ASSERT_EQ(elements_.size(), elements.size()); + for (size_t i = 0; i < elements_.size(); i++) { + ASSERT_EQ(elements_[i], elements[i]); + } +} + } // namespace schema } // namespace parquet http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/49a5c1a8/src/parquet/schema/schema-types-test.cc ---------------------------------------------------------------------- diff --git a/src/parquet/schema/schema-types-test.cc b/src/parquet/schema/schema-types-test.cc index 909dd4d..8215613 100644 --- a/src/parquet/schema/schema-types-test.cc +++ b/src/parquet/schema/schema-types-test.cc @@ -112,7 +112,7 @@ TEST_F(TestPrimitiveNode, Attrs) { TEST_F(TestPrimitiveNode, FromParquet) { SchemaElement elt = NewPrimitive(name_, FieldRepetitionType::OPTIONAL, - format::Type::INT32); + format::Type::INT32, 0); Convert(&elt); ASSERT_EQ(name_, prim_node_->name()); ASSERT_EQ(id_, prim_node_->id()); @@ -121,7 +121,7 @@ TEST_F(TestPrimitiveNode, FromParquet) { ASSERT_EQ(LogicalType::NONE, prim_node_->logical_type()); // Test a logical type - elt = NewPrimitive(name_, FieldRepetitionType::REQUIRED, format::Type::BYTE_ARRAY); + elt = NewPrimitive(name_, FieldRepetitionType::REQUIRED, format::Type::BYTE_ARRAY, 0); elt.__set_converted_type(ConvertedType::UTF8); Convert(&elt); @@ -131,7 +131,7 @@ TEST_F(TestPrimitiveNode, FromParquet) { // FIXED_LEN_BYTE_ARRAY elt = NewPrimitive(name_, FieldRepetitionType::OPTIONAL, - format::Type::FIXED_LEN_BYTE_ARRAY); + format::Type::FIXED_LEN_BYTE_ARRAY, 0); 
elt.__set_type_length(16); Convert(&elt); @@ -143,7 +143,7 @@ TEST_F(TestPrimitiveNode, FromParquet) { // ConvertedType::Decimal elt = NewPrimitive(name_, FieldRepetitionType::OPTIONAL, - format::Type::FIXED_LEN_BYTE_ARRAY); + format::Type::FIXED_LEN_BYTE_ARRAY, 0); elt.__set_converted_type(ConvertedType::DECIMAL); elt.__set_type_length(6); elt.__set_scale(2); http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/49a5c1a8/src/parquet/schema/test-util.h ---------------------------------------------------------------------- diff --git a/src/parquet/schema/test-util.h b/src/parquet/schema/test-util.h index 2049ce0..25dacb0 100644 --- a/src/parquet/schema/test-util.h +++ b/src/parquet/schema/test-util.h @@ -36,22 +36,28 @@ namespace parquet { namespace schema { static inline SchemaElement NewPrimitive(const std::string& name, - FieldRepetitionType::type repetition, format::Type::type type) { + FieldRepetitionType::type repetition, format::Type::type type, int id = 0) { SchemaElement result; result.__set_name(name); result.__set_repetition_type(repetition); result.__set_type(type); result.__set_num_children(0); + result.__set_field_id(id); + // Set default (non-set) values + result.__set_type_length(-1); + result.__set_precision(-1); + result.__set_scale(-1); return result; } static inline SchemaElement NewGroup(const std::string& name, - FieldRepetitionType::type repetition, int num_children) { + FieldRepetitionType::type repetition, int num_children, int id = 0) { SchemaElement result; result.__set_name(name); result.__set_repetition_type(repetition); result.__set_num_children(num_children); + result.__set_field_id(id); return result; } http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/49a5c1a8/src/parquet/schema/types.cc ---------------------------------------------------------------------- diff --git a/src/parquet/schema/types.cc b/src/parquet/schema/types.cc index 50e63f2..f08fb41 100644 --- a/src/parquet/schema/types.cc +++ b/src/parquet/schema/types.cc @@ 
-203,6 +203,10 @@ void PrimitiveNode::Visit(Node::Visitor* visitor) { visitor->Visit(this); } +void PrimitiveNode::VisitConst(Node::ConstVisitor* visitor) const { + visitor->Visit(this); +} + // ---------------------------------------------------------------------- // Group node @@ -232,6 +236,10 @@ void GroupNode::Visit(Node::Visitor* visitor) { visitor->Visit(this); } +void GroupNode::VisitConst(Node::ConstVisitor* visitor) const { + visitor->Visit(this); +} + // ---------------------------------------------------------------------- // Node construction from Parquet metadata @@ -280,6 +288,35 @@ std::unique_ptr PrimitiveNode::FromParquet(const void* opaque_element, return std::unique_ptr(result.release()); } +void GroupNode::ToParquet(void* opaque_element) const { + format::SchemaElement* element = + static_cast(opaque_element); + element->__set_name(name_); + element->__set_num_children(field_count()); + element->__set_repetition_type(ToThrift(repetition_)); + if (logical_type_ != LogicalType::NONE) { + element->__set_converted_type(ToThrift(logical_type_)); + } + // FIXME: SchemaFlattener does this for us: element->__set_field_id(id_); +} + +void PrimitiveNode::ToParquet(void* opaque_element) const { + format::SchemaElement* element = + static_cast(opaque_element); + + element->__set_name(name_); + element->__set_num_children(0); + element->__set_repetition_type(ToThrift(repetition_)); + if (logical_type_ != LogicalType::NONE) { + element->__set_converted_type(ToThrift(logical_type_)); + } + element->__set_type(ToThrift(physical_type_)); + // FIXME: SchemaFlattener does this for us: element->__set_field_id(id_); + element->__set_type_length(type_length_); + element->__set_precision(decimal_metadata_.precision); + element->__set_scale(decimal_metadata_.scale); +} + } // namespace schema } // namespace parquet http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/49a5c1a8/src/parquet/schema/types.h 
---------------------------------------------------------------------- diff --git a/src/parquet/schema/types.h b/src/parquet/schema/types.h index e87e18e..4131d24 100644 --- a/src/parquet/schema/types.h +++ b/src/parquet/schema/types.h @@ -163,15 +163,26 @@ class Node { return parent_; } + // ToParquet returns an opaque void* to avoid exporting + // parquet::SchemaElement into the public API + virtual void ToParquet(void* opaque_element) const = 0; + // Node::Visitor abstract class for walking schemas with the visitor pattern class Visitor { public: virtual ~Visitor() {} + virtual void Visit(Node* node) = 0; + }; + class ConstVisitor { + public: + virtual ~ConstVisitor() {} + virtual void Visit(const Node* node) = 0; }; virtual void Visit(Visitor* visitor) = 0; + virtual void VisitConst(ConstVisitor* visitor) const = 0; protected: friend class GroupNode; @@ -224,7 +235,9 @@ class PrimitiveNode : public Node { return decimal_metadata_; } + void ToParquet(void* opaque_element) const override; virtual void Visit(Visitor* visitor); + void VisitConst(ConstVisitor* visitor) const override; private: PrimitiveNode(const std::string& name, Repetition::type repetition, @@ -278,7 +291,9 @@ class GroupNode : public Node { return fields_.size(); } + void ToParquet(void* opaque_element) const override; virtual void Visit(Visitor* visitor); + void VisitConst(ConstVisitor* visitor) const override; private: GroupNode(const std::string& name, Repetition::type repetition,