parquet-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From jul...@apache.org
Subject [1/2] parquet-cpp git commit: PARQUET-442: Nested schema conversion, Thrift struct decoupling, dump-schema utility
Date Sat, 06 Feb 2016 02:01:48 GMT
Repository: parquet-cpp
Updated Branches:
  refs/heads/master c0eec9a59 -> 04d75c7cb


http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/04d75c7c/src/parquet/schema/schema-types-test.cc
----------------------------------------------------------------------
diff --git a/src/parquet/schema/schema-types-test.cc b/src/parquet/schema/schema-types-test.cc
new file mode 100644
index 0000000..72d38c0
--- /dev/null
+++ b/src/parquet/schema/schema-types-test.cc
@@ -0,0 +1,231 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <cstdint>
+#include <string>
+#include <vector>
+
+#include <gtest/gtest.h>
+#include "parquet/util/test-common.h"
+
+#include "parquet/schema/types.h"
+#include "parquet/schema/test-util.h"
+
+using std::string;
+using std::vector;
+
+namespace parquet_cpp {
+
+namespace schema {
+
+// ----------------------------------------------------------------------
+// Primitive node
+
+class TestPrimitiveNode : public ::testing::Test {
+ public:
+  // googletest only invokes the hook spelled SetUp(); a setUp() method never runs
+  void SetUp() override {
+    name_ = "name";
+    id_ = 5;
+  }
+  // Builds node_ from the Thrift element using id_, caching the downcast pointer
+  void Convert(const parquet::SchemaElement* element) {
+    node_ = PrimitiveNode::FromParquet(element, id_);
+    ASSERT_TRUE(node_->is_primitive());
+    prim_node_ = static_cast<const PrimitiveNode*>(node_.get());
+  }
+
+ protected:
+  std::string name_;
+  const PrimitiveNode* prim_node_;
+
+  int id_;
+  std::unique_ptr<Node> node_;
+};
+
+TEST_F(TestPrimitiveNode, Attrs) {
+  PrimitiveNode node1("foo", Repetition::REPEATED, Type::INT32);
+
+  PrimitiveNode node2("bar", Repetition::OPTIONAL, Type::BYTE_ARRAY,
+      LogicalType::UTF8);
+
+  ASSERT_EQ("foo", node1.name());
+
+  ASSERT_TRUE(node1.is_primitive());
+  ASSERT_FALSE(node1.is_group());
+
+  ASSERT_EQ(Repetition::REPEATED, node1.repetition());
+  ASSERT_EQ(Repetition::OPTIONAL, node2.repetition());
+
+  ASSERT_EQ(Node::PRIMITIVE, node1.node_type());
+
+  ASSERT_EQ(Type::INT32, node1.physical_type());
+  ASSERT_EQ(Type::BYTE_ARRAY, node2.physical_type());
+
+  // logical types
+  ASSERT_EQ(LogicalType::NONE, node1.logical_type());
+  ASSERT_EQ(LogicalType::UTF8, node2.logical_type());
+
+  // repetition
+  node1 = PrimitiveNode("foo", Repetition::REQUIRED, Type::INT32);
+  node2 = PrimitiveNode("foo", Repetition::OPTIONAL, Type::INT32);
+  PrimitiveNode node3("foo", Repetition::REPEATED, Type::INT32);
+
+  ASSERT_TRUE(node1.is_required());
+
+  ASSERT_TRUE(node2.is_optional());
+  ASSERT_FALSE(node2.is_required());
+
+  ASSERT_TRUE(node3.is_repeated());
+  ASSERT_FALSE(node3.is_optional());
+}
+
+TEST_F(TestPrimitiveNode, FromParquet) {
+  SchemaElement elt = NewPrimitive(name_, FieldRepetitionType::OPTIONAL,
+      parquet::Type::INT32);
+  Convert(&elt);
+  ASSERT_EQ(name_, prim_node_->name());
+  ASSERT_EQ(id_, prim_node_->id());
+  ASSERT_EQ(Repetition::OPTIONAL, prim_node_->repetition());
+  ASSERT_EQ(Type::INT32, prim_node_->physical_type());
+  ASSERT_EQ(LogicalType::NONE, prim_node_->logical_type());
+
+  // Test a logical type
+  elt = NewPrimitive(name_, FieldRepetitionType::REQUIRED, parquet::Type::BYTE_ARRAY);
+  elt.__set_converted_type(ConvertedType::UTF8);
+
+  Convert(&elt);
+  ASSERT_EQ(Repetition::REQUIRED, prim_node_->repetition());
+  ASSERT_EQ(Type::BYTE_ARRAY, prim_node_->physical_type());
+  ASSERT_EQ(LogicalType::UTF8, prim_node_->logical_type());
+
+  // FIXED_LEN_BYTE_ARRAY
+  elt = NewPrimitive(name_, FieldRepetitionType::OPTIONAL,
+      parquet::Type::FIXED_LEN_BYTE_ARRAY);
+  elt.__set_type_length(16);
+
+  Convert(&elt);
+  ASSERT_EQ(name_, prim_node_->name());
+  ASSERT_EQ(id_, prim_node_->id());
+  ASSERT_EQ(Repetition::OPTIONAL, prim_node_->repetition());
+  ASSERT_EQ(Type::FIXED_LEN_BYTE_ARRAY, prim_node_->physical_type());
+  ASSERT_EQ(16, prim_node_->type_length());
+
+  // ConvertedType::Decimal
+  elt = NewPrimitive(name_, FieldRepetitionType::OPTIONAL,
+      parquet::Type::FIXED_LEN_BYTE_ARRAY);
+  elt.__set_converted_type(ConvertedType::DECIMAL);
+  elt.__set_type_length(6);
+  elt.__set_scale(12);
+  elt.__set_precision(2);
+
+  Convert(&elt);
+  ASSERT_EQ(Type::FIXED_LEN_BYTE_ARRAY, prim_node_->physical_type());
+  ASSERT_EQ(LogicalType::DECIMAL, prim_node_->logical_type());
+  ASSERT_EQ(6, prim_node_->type_length());
+  ASSERT_EQ(12, prim_node_->decimal_metadata().scale);
+  ASSERT_EQ(2, prim_node_->decimal_metadata().precision);
+}
+
+TEST_F(TestPrimitiveNode, Equals) {
+  PrimitiveNode node1("foo", Repetition::REQUIRED, Type::INT32);
+  PrimitiveNode node2("foo", Repetition::REQUIRED, Type::INT64);
+  PrimitiveNode node3("bar", Repetition::REQUIRED, Type::INT32);
+  PrimitiveNode node4("foo", Repetition::OPTIONAL, Type::INT32);
+  PrimitiveNode node5("foo", Repetition::REQUIRED, Type::INT32);
+
+  ASSERT_TRUE(node1.Equals(&node1));
+  ASSERT_FALSE(node1.Equals(&node2));
+  ASSERT_FALSE(node1.Equals(&node3));
+  ASSERT_FALSE(node1.Equals(&node4));
+  ASSERT_TRUE(node1.Equals(&node5));
+
+  PrimitiveNode flba1("foo", Repetition::REQUIRED, Type::FIXED_LEN_BYTE_ARRAY);
+  flba1.SetTypeLength(12);
+
+  PrimitiveNode flba2("foo", Repetition::REQUIRED, Type::FIXED_LEN_BYTE_ARRAY);
+  flba2.SetTypeLength(12);
+
+  PrimitiveNode flba3("foo", Repetition::REQUIRED, Type::FIXED_LEN_BYTE_ARRAY);
+  flba3.SetTypeLength(16);
+
+  ASSERT_TRUE(flba1.Equals(&flba2));
+  ASSERT_FALSE(flba1.Equals(&flba3));
+}
+
+// ----------------------------------------------------------------------
+// Group node
+
+class TestGroupNode : public ::testing::Test {
+ public:
+  NodeVector Fields1() {
+    NodeVector fields;
+
+    fields.push_back(Int32("one", Repetition::REQUIRED));
+    fields.push_back(Int64("two"));
+    fields.push_back(Double("three"));
+
+    return fields;
+  }
+};
+
+TEST_F(TestGroupNode, Attrs) {
+  NodeVector fields = Fields1();
+
+  GroupNode node1("foo", Repetition::REPEATED, fields);
+  GroupNode node2("bar", Repetition::OPTIONAL, fields, LogicalType::LIST);
+
+  ASSERT_EQ("foo", node1.name());
+
+  ASSERT_TRUE(node1.is_group());
+  ASSERT_FALSE(node1.is_primitive());
+
+  ASSERT_EQ(fields.size(), node1.field_count());
+
+  ASSERT_TRUE(node1.is_repeated());
+  ASSERT_TRUE(node2.is_optional());
+
+  ASSERT_EQ(Repetition::REPEATED, node1.repetition());
+  ASSERT_EQ(Repetition::OPTIONAL, node2.repetition());
+
+  ASSERT_EQ(Node::GROUP, node1.node_type());
+
+  // logical types
+  ASSERT_EQ(LogicalType::NONE, node1.logical_type());
+  ASSERT_EQ(LogicalType::LIST, node2.logical_type());
+}
+
+TEST_F(TestGroupNode, Equals) {
+  NodeVector f1 = Fields1();
+  NodeVector f2 = Fields1();
+
+  GroupNode group1("group", Repetition::REPEATED, f1);
+  GroupNode group2("group", Repetition::REPEATED, f2);
+  GroupNode group3("group2", Repetition::REPEATED, f2);
+
+  // This is copied in the GroupNode ctor, so this is okay
+  f2.push_back(Float("four", Repetition::OPTIONAL));
+  GroupNode group4("group", Repetition::REPEATED, f2);
+
+  ASSERT_TRUE(group1.Equals(&group2));
+  ASSERT_FALSE(group1.Equals(&group3));
+
+  ASSERT_FALSE(group1.Equals(&group4));
+}
+
+} // namespace schema
+
+} // namespace parquet_cpp

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/04d75c7c/src/parquet/schema/test-util.h
----------------------------------------------------------------------
diff --git a/src/parquet/schema/test-util.h b/src/parquet/schema/test-util.h
new file mode 100644
index 0000000..5593abd
--- /dev/null
+++ b/src/parquet/schema/test-util.h
@@ -0,0 +1,63 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// This module defines helper functions that build Thrift SchemaElement
+// objects for primitive and group nodes, for use in the schema unit
+// tests.
+
+#ifndef PARQUET_SCHEMA_TEST_UTIL_H
+#define PARQUET_SCHEMA_TEST_UTIL_H
+
+#include <string>
+
+#include "parquet/schema/types.h"
+#include "parquet/thrift/parquet_types.h"
+
+using parquet::ConvertedType;
+using parquet::FieldRepetitionType;
+using parquet::SchemaElement;
+
+namespace parquet_cpp {
+
+namespace schema {
+
+static inline SchemaElement NewPrimitive(const std::string& name,
+    FieldRepetitionType::type repetition, parquet::Type::type type) {
+  SchemaElement result;
+  result.__set_name(name);
+  result.__set_repetition_type(repetition);
+  result.__set_type(type);
+  result.__set_num_children(0);
+
+  return result;
+}
+
+static inline SchemaElement NewGroup(const std::string& name,
+    FieldRepetitionType::type repetition, size_t num_children) {
+  SchemaElement result;
+  result.__set_name(name);
+  result.__set_repetition_type(repetition);
+  result.__set_num_children(num_children);
+
+  return result;
+}
+
+} // namespace schema
+
+} // namespace parquet_cpp
+
+#endif // PARQUET_SCHEMA_TEST_UTIL_H

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/04d75c7c/src/parquet/schema/types.cc
----------------------------------------------------------------------
diff --git a/src/parquet/schema/types.cc b/src/parquet/schema/types.cc
new file mode 100644
index 0000000..e088eed
--- /dev/null
+++ b/src/parquet/schema/types.cc
@@ -0,0 +1,163 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "parquet/schema/types.h"
+
+#include <memory>
+
+#include "parquet/thrift/parquet_types.h"
+
+namespace parquet_cpp {
+
+namespace schema {
+
+// ----------------------------------------------------------------------
+// Base node
+
+bool Node::EqualsInternal(const Node* other) const {
+  return type_ == other->type_ &&
+    name_ == other->name_ &&
+    repetition_ == other->repetition_ &&
+    logical_type_ == other->logical_type_;
+}
+
+// ----------------------------------------------------------------------
+// Primitive node
+
+bool PrimitiveNode::EqualsInternal(const PrimitiveNode* other) const {
+  if (physical_type_ != other->physical_type_) {
+    return false;
+  } else if (logical_type_ == LogicalType::DECIMAL) {
+    // TODO(wesm): metadata
+    ParquetException::NYI("comparing decimals");
+    return false;
+  } else if (physical_type_ == Type::FIXED_LEN_BYTE_ARRAY) {
+    return type_length_ == other->type_length_;
+  }
+  return true;
+}
+
+bool PrimitiveNode::Equals(const Node* other) const {
+  if (!Node::EqualsInternal(other)) {
+    return false;
+  }
+  return EqualsInternal(static_cast<const PrimitiveNode*>(other));
+}
+
+void PrimitiveNode::Visit(Node::Visitor* visitor) {
+  visitor->Visit(this);
+}
+
+// ----------------------------------------------------------------------
+// Group node
+
+bool GroupNode::EqualsInternal(const GroupNode* other) const {
+  if (this == other) {
+    return true;
+  }
+  if (this->field_count() != other->field_count()) {
+    return false;
+  }
+  for (size_t i = 0; i < this->field_count(); ++i) {
+    if (!this->field(i)->Equals(other->field(i).get())) {
+      return false;
+    }
+  }
+  return true;
+}
+
+bool GroupNode::Equals(const Node* other) const {
+  if (!Node::EqualsInternal(other)) {
+    return false;
+  }
+  return EqualsInternal(static_cast<const GroupNode*>(other));
+}
+
+void GroupNode::Visit(Node::Visitor* visitor) {
+  visitor->Visit(this);
+}
+
+// ----------------------------------------------------------------------
+// Node construction from Parquet metadata
+
+static Type::type ConvertEnum(parquet::Type::type type) {
+  return static_cast<Type::type>(type);
+}
+
+static LogicalType::type ConvertEnum(parquet::ConvertedType::type type) {
+  // NONE is item 0, hence +1. NOTE(review): assumes Thrift ids are contiguous; confirm
+  return static_cast<LogicalType::type>(static_cast<int>(type) + 1);
+}
+
+static Repetition::type ConvertEnum(parquet::FieldRepetitionType::type type) {
+  return static_cast<Repetition::type>(type);
+}
+
+struct NodeParams {
+  explicit NodeParams(const std::string& name) :
+      name(name) {}
+
+  const std::string& name;
+  Repetition::type repetition;
+  LogicalType::type logical_type;
+};
+
+static inline NodeParams GetNodeParams(const parquet::SchemaElement* element) {
+  NodeParams params(element->name);
+
+  params.repetition = ConvertEnum(element->repetition_type);
+  if (element->__isset.converted_type) {
+    params.logical_type = ConvertEnum(element->converted_type);
+  } else {
+    params.logical_type = LogicalType::NONE;
+  }
+  return params;
+}
+
+std::unique_ptr<Node> GroupNode::FromParquet(const void* opaque_element, int node_id,
+    const NodeVector& fields) {
+  const parquet::SchemaElement* element =
+    static_cast<const parquet::SchemaElement*>(opaque_element);
+  NodeParams params = GetNodeParams(element);
+  return std::unique_ptr<Node>(new GroupNode(params.name, params.repetition, fields,
+          params.logical_type, node_id));
+}
+
+// Builds a PrimitiveNode from an opaque parquet::SchemaElement*, tagged with node_id
+std::unique_ptr<Node> PrimitiveNode::FromParquet(const void* opaque_element,
+    int node_id) {
+  const parquet::SchemaElement* element =
+    static_cast<const parquet::SchemaElement*>(opaque_element);
+  NodeParams params = GetNodeParams(element);
+  std::unique_ptr<PrimitiveNode> result = std::unique_ptr<PrimitiveNode>(
+      new PrimitiveNode(params.name, params.repetition,
+          ConvertEnum(element->type), params.logical_type, node_id));
+  // The byte width is only recorded for FIXED_LEN_BYTE_ARRAY columns
+  if (element->type == parquet::Type::FIXED_LEN_BYTE_ARRAY) {
+    result->SetTypeLength(element->type_length);
+  }
+  // DECIMAL can annotate other physical types too, so keep scale/precision for all
+  if (params.logical_type == LogicalType::DECIMAL) {
+    result->SetDecimalMetadata(element->scale, element->precision);
+  }
+
+  // Return as unique_ptr to the base type
+  return std::unique_ptr<Node>(result.release());
+}
+
+} // namespace schema
+
+} // namespace parquet_cpp

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/04d75c7c/src/parquet/schema/types.h
----------------------------------------------------------------------
diff --git a/src/parquet/schema/types.h b/src/parquet/schema/types.h
new file mode 100644
index 0000000..82db233
--- /dev/null
+++ b/src/parquet/schema/types.h
@@ -0,0 +1,303 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// This module contains the logical parquet-cpp types (independent of Thrift
+// structures), schema nodes, and related type tools
+
+#ifndef PARQUET_SCHEMA_TYPES_H
+#define PARQUET_SCHEMA_TYPES_H
+
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "parquet/exception.h"
+#include "parquet/types.h"
+#include "parquet/util/macros.h"
+
+namespace parquet_cpp {
+
+namespace schema {
+
+// List encodings: using the terminology from Impala to define different styles
+// of representing logical lists (a.k.a. ARRAY types) in Parquet schemas. Since
+// the converted type named in the Parquet metadata is ConvertedType::LIST we
+// use that terminology here. It also helps distinguish from the *_ARRAY
+// primitive types.
+//
+// One-level encoding: Only allows required lists with required cells
+//   repeated value_type name
+//
+// Two-level encoding: Enables optional lists with only required cells
+//   <required/optional> group list
+//     repeated value_type item
+//
+// Three-level encoding: Enables optional lists with optional cells
+//   <required/optional> group bag
+//     repeated group list
+//       <required/optional> value_type item
+//
+// 2- and 1-level encoding are respectively equivalent to 3-level encoding with
+// the non-repeated nodes set to required.
+//
+// The "official" encoding recommended in the Parquet spec is the 3-level, and
+// we use that as the default when creating list types. For semantic completeness
+// we allow the other two. Since all types of encodings will occur "in the
+// wild" we need to be able to interpret the associated definition levels in
+// the context of the actual encoding used in the file.
+//
+// NB: Some Parquet writers may not set ConvertedType::LIST on the repeated
+// SchemaElement, which could make things challenging if we are trying to infer
+// that a sequence of nodes semantically represents an array according to one
+// of these encodings (versus a struct containing an array). We should refuse
+// the temptation to guess, as they say.
+struct ListEncoding {
+  enum type {
+    ONE_LEVEL,
+    TWO_LEVEL,
+    THREE_LEVEL
+  };
+};
+
+struct DecimalMetadata {
+  int32_t scale;
+  int32_t precision;
+};
+
+// Base class for logical schema types. A type has a name, repetition level,
+// and optionally a logical type (ConvertedType in Parquet metadata parlance)
+class Node {
+ public:
+  enum type {
+    PRIMITIVE,
+    GROUP
+  };
+
+  Node(Node::type type, const std::string& name,
+      Repetition::type repetition,
+      LogicalType::type logical_type = LogicalType::NONE,
+      int id = -1) :
+      type_(type),
+      name_(name),
+      repetition_(repetition),
+      logical_type_(logical_type),
+      id_(id) {}
+
+  virtual ~Node() {}
+
+  bool is_primitive() const {
+    return type_ == Node::PRIMITIVE;
+  }
+
+  bool is_group() const {
+    return type_ == Node::GROUP;
+  }
+
+  bool is_optional() const {
+    return repetition_ == Repetition::OPTIONAL;
+  }
+
+  bool is_repeated() const {
+    return repetition_ == Repetition::REPEATED;
+  }
+
+  bool is_required() const {
+    return repetition_ == Repetition::REQUIRED;
+  }
+
+  virtual bool Equals(const Node* other) const = 0;
+
+  const std::string& name() const {
+    return name_;
+  }
+
+  Node::type node_type() const {
+    return type_;
+  }
+
+  Repetition::type repetition() const {
+    return repetition_;
+  }
+
+  LogicalType::type logical_type() const {
+    return logical_type_;
+  }
+
+  int id() const {
+    return id_;
+  }
+
+  // Node::Visitor abstract class for walking schemas with the visitor pattern
+  class Visitor {
+   public:
+    virtual ~Visitor() {}
+
+    virtual void Visit(const Node* node) = 0;
+  };
+
+  virtual void Visit(Visitor* visitor) = 0;
+
+ protected:
+  Node::type type_;
+  std::string name_;
+  Repetition::type repetition_;
+  LogicalType::type logical_type_;
+  int id_;
+
+  bool EqualsInternal(const Node* other) const;
+};
+
+// Save our breath all over the place with these typedefs
+typedef std::shared_ptr<Node> NodePtr;
+typedef std::vector<NodePtr> NodeVector;
+
+// A type that is one of the primitive Parquet storage types. In addition to
+// the other type metadata (name, repetition level, logical type), also has the
+// physical storage type and their type-specific metadata (byte width, decimal
+// parameters)
+class PrimitiveNode : public Node {
+ public:
+  // FromParquet accepts an opaque void* to avoid exporting
+  // parquet::SchemaElement into the public API
+  static std::unique_ptr<Node> FromParquet(const void* opaque_element, int id);
+
+  static inline NodePtr Make(const std::string& name,
+      Repetition::type repetition, Type::type type,
+      LogicalType::type logical_type = LogicalType::NONE) {
+    return NodePtr(new PrimitiveNode(name, repetition, type, logical_type));
+  }
+
+  // Alternate constructor for FIXED_LEN_BYTE_ARRAY (FLBA)
+  static inline NodePtr MakeFLBA(const std::string& name,
+      Repetition::type repetition, Type::type type,
+      int32_t type_length,
+      LogicalType::type logical_type = LogicalType::NONE) {
+    NodePtr result = Make(name, repetition, type, logical_type);
+    static_cast<PrimitiveNode*>(result.get())->SetTypeLength(type_length);
+    return result;
+  }
+
+  virtual bool Equals(const Node* other) const;
+
+  Type::type physical_type() const {
+    return physical_type_;
+  }
+
+  int32_t type_length() const {
+    return type_length_;
+  }
+
+  const DecimalMetadata& decimal_metadata() const {
+    return decimal_metadata_;
+  }
+
+  virtual void Visit(Visitor* visitor);
+
+ private:
+  // Zero-initializes type_length_ and decimal_metadata_ so their accessors never
+  // read indeterminate values before SetTypeLength/SetDecimalMetadata are called
+  PrimitiveNode(const std::string& name, Repetition::type repetition,
+      Type::type type, LogicalType::type logical_type = LogicalType::NONE,
+      int id = -1) : Node(Node::PRIMITIVE, name, repetition, logical_type, id),
+      physical_type_(type), type_length_(0), decimal_metadata_() {}
+
+  Type::type physical_type_;
+  int32_t type_length_;
+  DecimalMetadata decimal_metadata_;
+
+  // For FIXED_LEN_BYTE_ARRAY
+  void SetTypeLength(int32_t length) {
+    type_length_ = length;
+  }
+
+
+  // For Decimal logical type: Precision and scale
+  void SetDecimalMetadata(int32_t scale, int32_t precision) {
+    decimal_metadata_.scale = scale;
+    decimal_metadata_.precision = precision;
+  }
+
+  bool EqualsInternal(const PrimitiveNode* other) const;
+
+  FRIEND_TEST(TestPrimitiveNode, Attrs);
+  FRIEND_TEST(TestPrimitiveNode, Equals);
+  FRIEND_TEST(TestPrimitiveNode, FromParquet);
+};
+
+class GroupNode : public Node {
+ public:
+  // Like PrimitiveNode, GroupNode::FromParquet accepts an opaque void* to avoid exporting
+  // parquet::SchemaElement into the public API
+  static std::unique_ptr<Node> FromParquet(const void* opaque_element, int id,
+      const NodeVector& fields);
+
+  static inline NodePtr Make(const std::string& name,
+      Repetition::type repetition, const NodeVector& fields,
+      LogicalType::type logical_type = LogicalType::NONE) {
+    return NodePtr(new GroupNode(name, repetition, fields, logical_type));
+  }
+
+  virtual bool Equals(const Node* other) const;
+
+  const NodePtr& field(size_t i) const {
+    return fields_[i];
+  }
+
+  size_t field_count() const {
+    return fields_.size();
+  }
+
+  virtual void Visit(Visitor* visitor);
+
+ private:
+  GroupNode(const std::string& name, Repetition::type repetition,
+      const NodeVector& fields,
+      LogicalType::type logical_type = LogicalType::NONE,
+      int id = -1) :
+      Node(Node::GROUP, name, repetition, logical_type, id),
+      fields_(fields) {}
+
+  NodeVector fields_;
+  bool EqualsInternal(const GroupNode* other) const;
+
+  FRIEND_TEST(TestGroupNode, Attrs);
+  FRIEND_TEST(TestGroupNode, Equals);
+};
+
+// ----------------------------------------------------------------------
+// Convenience primitive type factory functions
+
+#define PRIMITIVE_FACTORY(FuncName, TYPE)                       \
+  static inline NodePtr FuncName(const std::string& name,       \
+      Repetition::type repetition = Repetition::OPTIONAL) {     \
+    return PrimitiveNode::Make(name, repetition, Type::TYPE);   \
+  }
+
+PRIMITIVE_FACTORY(Boolean, BOOLEAN);
+PRIMITIVE_FACTORY(Int32, INT32);
+PRIMITIVE_FACTORY(Int64, INT64);
+PRIMITIVE_FACTORY(Int96, INT96);
+PRIMITIVE_FACTORY(Float, FLOAT);
+PRIMITIVE_FACTORY(Double, DOUBLE);
+PRIMITIVE_FACTORY(ByteArray, BYTE_ARRAY);
+
+} // namespace schema
+
+} // namespace parquet_cpp
+
+#endif // PARQUET_SCHEMA_TYPES_H

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/04d75c7c/src/parquet/types.h
----------------------------------------------------------------------
diff --git a/src/parquet/types.h b/src/parquet/types.h
index f39e3a2..2d15cad 100644
--- a/src/parquet/types.h
+++ b/src/parquet/types.h
@@ -24,11 +24,110 @@
 #include <sstream>
 #include <string>
 
-#include "parquet/thrift/parquet_types.h"
 #include "parquet/util/compiler-util.h"
 
 namespace parquet_cpp {
 
+// ----------------------------------------------------------------------
+// Metadata enums to match Thrift metadata
+//
+// The reason we maintain our own enums is to avoid transitive dependency on
+// the compiled Thrift headers (and thus thrift/Thrift.h) for users of the
+// public API. After building parquet-cpp, you should not need to include
+// Thrift headers in your application. This means some boilerplate to convert
+// between our types and Parquet's Thrift types.
+//
+// We can also add special values like NONE to distinguish between metadata
+// values being set and not set. As an example consider ConvertedType and
+// CompressionCodec
+
+// Mirrors parquet::Type
+struct Type {
+  enum type {
+    BOOLEAN = 0,
+    INT32 = 1,
+    INT64 = 2,
+    INT96 = 3,
+    FLOAT = 4,
+    DOUBLE = 5,
+    BYTE_ARRAY = 6,
+    FIXED_LEN_BYTE_ARRAY = 7
+  };
+};
+
+// Mirrors parquet::ConvertedType
+struct LogicalType {
+  enum type {
+    NONE,
+    UTF8,
+    MAP,
+    MAP_KEY_VALUE,
+    LIST,
+    ENUM,
+    DECIMAL,
+    DATE,
+    TIME_MILLIS,
+    TIMESTAMP_MILLIS,
+    UINT_8,
+    UINT_16,
+    UINT_32,
+    UINT_64,
+    INT_8,
+    INT_16,
+    INT_32,
+    INT_64,
+    JSON,
+    BSON,
+    INTERVAL
+  };
+};
+
+// Mirrors parquet::FieldRepetitionType
+struct Repetition {
+  enum type {
+    REQUIRED = 0,
+    OPTIONAL = 1,
+    REPEATED = 2
+  };
+};
+
+// Data encodings. Mirrors parquet::Encoding
+struct Encoding {
+  enum type {
+    PLAIN = 0,
+    PLAIN_DICTIONARY = 2,
+    RLE = 3,
+    BIT_PACKED = 4,
+    DELTA_BINARY_PACKED = 5,
+    DELTA_LENGTH_BYTE_ARRAY = 6,
+    DELTA_BYTE_ARRAY = 7,
+    RLE_DICTIONARY = 8
+  };
+};
+
+// Compression, mirrors parquet::CompressionCodec
+struct Compression {
+  enum type {
+    NONE,
+    UNCOMPRESSED,
+    SNAPPY,
+    GZIP,
+    LZO
+  };
+};
+
+// parquet::PageType
+struct PageType {
+  enum type {
+    DATA_PAGE,
+    INDEX_PAGE,
+    DICTIONARY_PAGE,
+    DATA_PAGE_V2
+  };
+};
+
+// ----------------------------------------------------------------------
+
 struct ByteArray {
   uint32_t len;
   const uint8_t* ptr;
@@ -80,72 +179,64 @@ struct type_traits {
 };
 
 template <>
-struct type_traits<parquet::Type::BOOLEAN> {
+struct type_traits<Type::BOOLEAN> {
   typedef bool value_type;
-  static constexpr parquet::Type::type parquet_type = parquet::Type::BOOLEAN;
   static constexpr size_t value_byte_size = 1;
 
   static constexpr const char* printf_code = "d";
 };
 
 template <>
-struct type_traits<parquet::Type::INT32> {
+struct type_traits<Type::INT32> {
   typedef int32_t value_type;
-  static constexpr parquet::Type::type parquet_type = parquet::Type::INT32;
 
   static constexpr size_t value_byte_size = 4;
   static constexpr const char* printf_code = "d";
 };
 
 template <>
-struct type_traits<parquet::Type::INT64> {
+struct type_traits<Type::INT64> {
   typedef int64_t value_type;
-  static constexpr parquet::Type::type parquet_type = parquet::Type::INT64;
 
   static constexpr size_t value_byte_size = 8;
   static constexpr const char* printf_code = "ld";
 };
 
 template <>
-struct type_traits<parquet::Type::INT96> {
+struct type_traits<Type::INT96> {
   typedef Int96 value_type;
-  static constexpr parquet::Type::type parquet_type = parquet::Type::INT96;
 
   static constexpr size_t value_byte_size = 12;
   static constexpr const char* printf_code = "s";
 };
 
 template <>
-struct type_traits<parquet::Type::FLOAT> {
+struct type_traits<Type::FLOAT> {
   typedef float value_type;
-  static constexpr parquet::Type::type parquet_type = parquet::Type::FLOAT;
 
   static constexpr size_t value_byte_size = 4;
   static constexpr const char* printf_code = "f";
 };
 
 template <>
-struct type_traits<parquet::Type::DOUBLE> {
+struct type_traits<Type::DOUBLE> {
   typedef double value_type;
-  static constexpr parquet::Type::type parquet_type = parquet::Type::DOUBLE;
 
   static constexpr size_t value_byte_size = 8;
   static constexpr const char* printf_code = "lf";
 };
 
 template <>
-struct type_traits<parquet::Type::BYTE_ARRAY> {
+struct type_traits<Type::BYTE_ARRAY> {
   typedef ByteArray value_type;
-  static constexpr parquet::Type::type parquet_type = parquet::Type::BYTE_ARRAY;
 
   static constexpr size_t value_byte_size = sizeof(ByteArray);
   static constexpr const char* printf_code = "s";
 };
 
 template <>
-struct type_traits<parquet::Type::FIXED_LEN_BYTE_ARRAY> {
+struct type_traits<Type::FIXED_LEN_BYTE_ARRAY> {
   typedef FixedLenByteArray value_type;
-  static constexpr parquet::Type::type parquet_type = parquet::Type::FIXED_LEN_BYTE_ARRAY;
 
   static constexpr size_t value_byte_size = sizeof(FixedLenByteArray);
   static constexpr const char* printf_code = "s";
@@ -158,6 +249,38 @@ inline std::string format_fwf(int width) {
   return ss.str();
 }
 
+static inline std::string type_to_string(Type::type t) {
+  switch (t) {
+    case Type::BOOLEAN:
+      return "BOOLEAN";
+      break;
+    case Type::INT32:
+      return "INT32";
+      break;
+    case Type::INT64:
+      return "INT64";
+      break;
+    case Type::INT96:
+      return "INT96";
+      break;
+    case Type::FLOAT:
+      return "FLOAT";
+      break;
+    case Type::DOUBLE:
+      return "DOUBLE";
+      break;
+    case Type::BYTE_ARRAY:
+      return "BYTE_ARRAY";
+      break;
+    case Type::FIXED_LEN_BYTE_ARRAY:
+      return "FIXED_LEN_BYTE_ARRAY";
+      break;
+    default:
+      return "UNKNOWN";
+      break;
+  }
+}
+
 } // namespace parquet_cpp
 
 #endif // PARQUET_TYPES_H

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/04d75c7c/src/parquet/util/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/src/parquet/util/CMakeLists.txt b/src/parquet/util/CMakeLists.txt
index 1c86112..90a053f 100644
--- a/src/parquet/util/CMakeLists.txt
+++ b/src/parquet/util/CMakeLists.txt
@@ -24,6 +24,7 @@ install(FILES
   sse-info.h
   compiler-util.h
   logging.h
+  macros.h
   rle-encoding.h
   stopwatch.h
   input_stream.h

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/04d75c7c/src/parquet/util/macros.h
----------------------------------------------------------------------
diff --git a/src/parquet/util/macros.h b/src/parquet/util/macros.h
new file mode 100644
index 0000000..7b301d6
--- /dev/null
+++ b/src/parquet/util/macros.h
@@ -0,0 +1,47 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef PARQUET_UTIL_MACROS_H
+#define PARQUET_UTIL_MACROS_H
+
+// Useful macros from elsewhere
+
+// ----------------------------------------------------------------------
+// From googletest
+
+// When you need to test the private or protected members of a class,
+// use the FRIEND_TEST macro to declare your tests as friends of the
+// class.  For example:
+//
+// class MyClass {
+//  private:
+//   void MyMethod();
+//   FRIEND_TEST(MyClassTest, MyMethod);
+// };
+//
+// class MyClassTest : public testing::Test {
+//   // ...
+// };
+//
+// TEST_F(MyClassTest, MyMethod) {
+//   // Can call MyClass::MyMethod() here.
+// }
+
+#define FRIEND_TEST(test_case_name, test_name)\
+friend class test_case_name##_##test_name##_Test
+
+#endif // PARQUET_UTIL_MACROS_H


Mime
View raw message