parquet-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From w...@apache.org
Subject [2/3] parquet-cpp git commit: PARQUET-844: Schema, compression consolidation / flattening
Date Thu, 26 Jan 2017 17:02:51 GMT
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/13da51d3/src/parquet/schema.cc
----------------------------------------------------------------------
diff --git a/src/parquet/schema.cc b/src/parquet/schema.cc
new file mode 100644
index 0000000..13fca68
--- /dev/null
+++ b/src/parquet/schema.cc
@@ -0,0 +1,655 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "parquet/schema.h"
+#include "parquet/schema-internal.h"
+
+#include <algorithm>
+#include <memory>
+
+#include "parquet/exception.h"
+#include "parquet/thrift/parquet_types.h"
+#include "parquet/thrift/util.h"
+
+using parquet::format::SchemaElement;
+
+namespace parquet {
+
+namespace schema {
+
+// ----------------------------------------------------------------------
+// ColumnPath
+
+// Builds a ColumnPath by splitting `dotstring` on '.' characters; each piece
+// produced by std::getline becomes one path component.
+std::shared_ptr<ColumnPath> ColumnPath::FromDotString(const std::string& dotstring) {
+  std::vector<std::string> components;
+  std::stringstream stream(dotstring);
+  for (std::string token; std::getline(stream, token, '.');) {
+    components.push_back(token);
+  }
+  return std::shared_ptr<ColumnPath>(new ColumnPath(std::move(components)));
+}
+
+// Returns a new ColumnPath holding this path's components followed by
+// `node_name`. This object is not modified.
+std::shared_ptr<ColumnPath> ColumnPath::extend(const std::string& node_name) const {
+  std::vector<std::string> extended;
+  extended.reserve(path_.size() + 1);
+  extended.insert(extended.end(), path_.cbegin(), path_.cend());
+  extended.push_back(node_name);
+
+  return std::shared_ptr<ColumnPath>(new ColumnPath(std::move(extended)));
+}
+
+// Joins the path components with '.' separators; an empty path yields "".
+std::string ColumnPath::ToDotString() const {
+  std::stringstream out;
+  bool first = true;
+  for (const std::string& component : path_) {
+    if (!first) { out << "."; }
+    out << component;
+    first = false;
+  }
+  return out.str();
+}
+
+// Returns a read-only reference to the stored path components. The reference
+// aliases this object's internal vector rather than a copy.
+const std::vector<std::string>& ColumnPath::ToDotVector() const {
+  return path_;
+}
+
+// ----------------------------------------------------------------------
+// Base node
+
+// Compares the attributes shared by every node kind: node type, name,
+// repetition and logical type. The id and parent are not compared.
+bool Node::EqualsInternal(const Node* other) const {
+  if (type_ != other->type_) { return false; }
+  if (name_ != other->name_) { return false; }
+  if (repetition_ != other->repetition_) { return false; }
+  return logical_type_ == other->logical_type_;
+}
+
+// Records `parent` as this node's parent, replacing any previous value. The
+// pointer is stored as-is (non-owning).
+void Node::SetParent(const Node* parent) {
+  parent_ = parent;
+}
+
+// ----------------------------------------------------------------------
+// Primitive node
+
+// Constructs a primitive (leaf) node and validates that `logical_type` may
+// annotate the physical `type`.
+//
+// Throws ParquetException when:
+//   - the logical type is not allowed on the given physical type,
+//   - DECIMAL precision/scale are out of range (precision <= 0, scale < 0, or
+//     scale > precision),
+//   - the logical type is not handled by the switch below, or
+//   - the type is FIXED_LEN_BYTE_ARRAY and `length` is not positive.
+PrimitiveNode::PrimitiveNode(const std::string& name, Repetition::type repetition,
+    Type::type type, LogicalType::type logical_type, int length, int precision, int scale,
+    int id)
+    : Node(Node::PRIMITIVE, name, repetition, logical_type, id),
+      physical_type_(type),
+      type_length_(length) {
+  // Only used to assemble exception messages.
+  std::stringstream ss;
+
+  // PARQUET-842: In an earlier revision, decimal_metadata_.isset was being
+  // set to true, but Impala will raise an incompatible metadata in such cases
+  // Zero the whole struct so isset stays false unless the DECIMAL branch
+  // below sets it.
+  memset(&decimal_metadata_, 0, sizeof(decimal_metadata_));
+
+  // Check if the physical and logical types match
+  // Mapping referred from Apache parquet-mr as on 2016-02-22
+  switch (logical_type) {
+    case LogicalType::NONE:
+      // Logical type not set
+      break;
+    case LogicalType::UTF8:
+    case LogicalType::JSON:
+    case LogicalType::BSON:
+      if (type != Type::BYTE_ARRAY) {
+        ss << LogicalTypeToString(logical_type);
+        ss << " can only annotate BYTE_ARRAY fields";
+        throw ParquetException(ss.str());
+      }
+      break;
+    case LogicalType::DECIMAL:
+      if ((type != Type::INT32) && (type != Type::INT64) && (type != Type::BYTE_ARRAY) &&
+          (type != Type::FIXED_LEN_BYTE_ARRAY)) {
+        ss << "DECIMAL can only annotate INT32, INT64, BYTE_ARRAY, and FIXED";
+        throw ParquetException(ss.str());
+      }
+      if (precision <= 0) {
+        ss << "Invalid DECIMAL precision: " << precision;
+        throw ParquetException(ss.str());
+      }
+      if (scale < 0) {
+        ss << "Invalid DECIMAL scale: " << scale;
+        throw ParquetException(ss.str());
+      }
+      if (scale > precision) {
+        ss << "Invalid DECIMAL scale " << scale;
+        ss << " cannot be greater than precision " << precision;
+        throw ParquetException(ss.str());
+      }
+      // Precision and scale are recorded only after all checks have passed.
+      decimal_metadata_.isset = true;
+      decimal_metadata_.precision = precision;
+      decimal_metadata_.scale = scale;
+      break;
+    case LogicalType::DATE:
+    case LogicalType::TIME_MILLIS:
+    case LogicalType::UINT_8:
+    case LogicalType::UINT_16:
+    case LogicalType::UINT_32:
+    case LogicalType::INT_8:
+    case LogicalType::INT_16:
+    case LogicalType::INT_32:
+      if (type != Type::INT32) {
+        ss << LogicalTypeToString(logical_type);
+        ss << " can only annotate INT32";
+        throw ParquetException(ss.str());
+      }
+      break;
+    case LogicalType::TIME_MICROS:
+    case LogicalType::TIMESTAMP_MILLIS:
+    case LogicalType::TIMESTAMP_MICROS:
+    case LogicalType::UINT_64:
+    case LogicalType::INT_64:
+      if (type != Type::INT64) {
+        ss << LogicalTypeToString(logical_type);
+        ss << " can only annotate INT64";
+        throw ParquetException(ss.str());
+      }
+      break;
+    case LogicalType::INTERVAL:
+      if ((type != Type::FIXED_LEN_BYTE_ARRAY) || (length != 12)) {
+        ss << "INTERVAL can only annotate FIXED_LEN_BYTE_ARRAY(12)";
+        throw ParquetException(ss.str());
+      }
+      break;
+    case LogicalType::ENUM:
+      if (type != Type::BYTE_ARRAY) {
+        ss << "ENUM can only annotate BYTE_ARRAY fields";
+        throw ParquetException(ss.str());
+      }
+      break;
+    default:
+      // Any logical type not listed above is rejected for primitive nodes.
+      ss << LogicalTypeToString(logical_type);
+      ss << " can not be applied to a primitive type";
+      throw ParquetException(ss.str());
+  }
+  // The length check applies to every FIXED_LEN_BYTE_ARRAY node, whatever its
+  // logical type.
+  if (type == Type::FIXED_LEN_BYTE_ARRAY) {
+    if (length <= 0) {
+      ss << "Invalid FIXED_LEN_BYTE_ARRAY length: " << length;
+      throw ParquetException(ss.str());
+    }
+    type_length_ = length;
+  }
+}
+
+// Compares the primitive-specific attributes. Physical and logical type must
+// always match; decimal precision/scale are compared only for DECIMAL nodes
+// and the type length only for FIXED_LEN_BYTE_ARRAY nodes.
+bool PrimitiveNode::EqualsInternal(const PrimitiveNode* other) const {
+  if (physical_type_ != other->physical_type_) { return false; }
+  if (logical_type_ != other->logical_type_) { return false; }
+
+  if (logical_type_ == LogicalType::DECIMAL) {
+    const bool same_precision =
+        decimal_metadata_.precision == other->decimal_metadata_.precision;
+    const bool same_scale = decimal_metadata_.scale == other->decimal_metadata_.scale;
+    if (!same_precision || !same_scale) { return false; }
+  }
+
+  if (physical_type_ == Type::FIXED_LEN_BYTE_ARRAY &&
+      type_length_ != other->type_length_) {
+    return false;
+  }
+  return true;
+}
+
+// Full equality for primitive nodes. The base comparison includes the node
+// type, so `other` is only downcast once it is known to be a primitive node.
+bool PrimitiveNode::Equals(const Node* other) const {
+  return Node::EqualsInternal(other) &&
+         EqualsInternal(static_cast<const PrimitiveNode*>(other));
+}
+
+// Hands this node to `visitor` through Node::Visitor::Visit.
+void PrimitiveNode::Visit(Node::Visitor* visitor) {
+  visitor->Visit(this);
+}
+
+// Const counterpart of Visit: hands this node to `visitor` through
+// Node::ConstVisitor::Visit.
+void PrimitiveNode::VisitConst(Node::ConstVisitor* visitor) const {
+  visitor->Visit(this);
+}
+
+// ----------------------------------------------------------------------
+// Group node
+
+// Compares the children of two groups pairwise and in order. Two groups are
+// equal when they are the same object, or when they have the same number of
+// fields and every corresponding pair of fields is Equals().
+bool GroupNode::EqualsInternal(const GroupNode* other) const {
+  if (this == other) { return true; }
+
+  const int num_fields = this->field_count();
+  if (num_fields != other->field_count()) { return false; }
+
+  for (int idx = 0; idx < num_fields; ++idx) {
+    const NodePtr& mine = this->field(idx);
+    const NodePtr& theirs = other->field(idx);
+    if (!mine->Equals(theirs.get())) { return false; }
+  }
+  return true;
+}
+
+// Full equality for group nodes. The base comparison includes the node type,
+// so `other` is only downcast once it is known to be a group node.
+bool GroupNode::Equals(const Node* other) const {
+  return Node::EqualsInternal(other) &&
+         EqualsInternal(static_cast<const GroupNode*>(other));
+}
+
+// Hands this node to `visitor`. Children are not traversed here; descending
+// into fields is left to the visitor.
+void GroupNode::Visit(Node::Visitor* visitor) {
+  visitor->Visit(this);
+}
+
+// Const counterpart of Visit. Children are not traversed here; descending
+// into fields is left to the visitor.
+void GroupNode::VisitConst(Node::ConstVisitor* visitor) const {
+  visitor->Visit(this);
+}
+
+// ----------------------------------------------------------------------
+// Node construction from Parquet metadata
+
+// Attributes common to group and primitive nodes, extracted from a Thrift
+// SchemaElement.
+struct NodeParams {
+  explicit NodeParams(const std::string& name) : name(name) {}
+
+  // Stored by reference: the referenced string must outlive this struct.
+  const std::string& name;
+  // Not initialized by the constructor; the creator must assign both.
+  Repetition::type repetition;
+  LogicalType::type logical_type;
+};
+
+// Extracts the name, repetition and logical type from a Thrift SchemaElement.
+// The logical type is LogicalType::NONE unless converted_type is marked as
+// set on the element. The result refers to element->name by reference.
+static inline NodeParams GetNodeParams(const format::SchemaElement* element) {
+  NodeParams result(element->name);
+
+  result.repetition = FromThrift(element->repetition_type);
+  result.logical_type = LogicalType::NONE;
+  if (element->__isset.converted_type) {
+    result.logical_type = FromThrift(element->converted_type);
+  }
+  return result;
+}
+
+// Creates a GroupNode from a Thrift SchemaElement passed as an opaque pointer.
+// `fields` are the already-built children and `node_id` becomes the node id.
+std::unique_ptr<Node> GroupNode::FromParquet(
+    const void* opaque_element, int node_id, const NodeVector& fields) {
+  const auto* thrift_element = static_cast<const format::SchemaElement*>(opaque_element);
+  const NodeParams common = GetNodeParams(thrift_element);
+
+  std::unique_ptr<Node> group(new GroupNode(
+      common.name, common.repetition, fields, common.logical_type, node_id));
+  return group;
+}
+
+// Creates a PrimitiveNode from a Thrift SchemaElement passed as an opaque
+// pointer. type_length, precision and scale are read from the element as-is;
+// the PrimitiveNode constructor validates them and may throw ParquetException.
+std::unique_ptr<Node> PrimitiveNode::FromParquet(
+    const void* opaque_element, int node_id) {
+  const auto* thrift_element = static_cast<const format::SchemaElement*>(opaque_element);
+  const NodeParams common = GetNodeParams(thrift_element);
+
+  // Owned directly through the base-class pointer that is returned.
+  return std::unique_ptr<Node>(new PrimitiveNode(common.name, common.repetition,
+      FromThrift(thrift_element->type), common.logical_type,
+      thrift_element->type_length, thrift_element->precision, thrift_element->scale,
+      node_id));
+}
+
+// Writes this group's name, child count and repetition into the Thrift
+// SchemaElement behind `opaque_element`. converted_type is written only when
+// a logical type is present.
+void GroupNode::ToParquet(void* opaque_element) const {
+  auto* thrift_element = static_cast<format::SchemaElement*>(opaque_element);
+
+  thrift_element->__set_name(name_);
+  thrift_element->__set_num_children(field_count());
+  thrift_element->__set_repetition_type(ToThrift(repetition_));
+
+  const bool has_logical_type = (logical_type_ != LogicalType::NONE);
+  if (has_logical_type) {
+    thrift_element->__set_converted_type(ToThrift(logical_type_));
+  }
+}
+
+// Writes this primitive node into the Thrift SchemaElement behind
+// `opaque_element`. Optional attributes are written only when they apply:
+// converted_type when a logical type is present, type_length for
+// FIXED_LEN_BYTE_ARRAY, and precision/scale when the decimal metadata is set.
+void PrimitiveNode::ToParquet(void* opaque_element) const {
+  auto* thrift_element = static_cast<format::SchemaElement*>(opaque_element);
+
+  thrift_element->__set_name(name_);
+  thrift_element->__set_num_children(0);
+  thrift_element->__set_repetition_type(ToThrift(repetition_));
+
+  const bool has_logical_type = (logical_type_ != LogicalType::NONE);
+  if (has_logical_type) {
+    thrift_element->__set_converted_type(ToThrift(logical_type_));
+  }
+
+  thrift_element->__set_type(ToThrift(physical_type_));
+
+  const bool is_fixed_len = (physical_type_ == Type::FIXED_LEN_BYTE_ARRAY);
+  if (is_fixed_len) { thrift_element->__set_type_length(type_length_); }
+
+  if (decimal_metadata_.isset) {
+    thrift_element->__set_precision(decimal_metadata_.precision);
+    thrift_element->__set_scale(decimal_metadata_.scale);
+  }
+}
+
+// ----------------------------------------------------------------------
+// Schema converters
+
+// Converts the flat, depth-first list of SchemaElements into a node tree and
+// returns its root.
+//
+// Throws ParquetException when the element list is empty, when the root
+// element has no children, or (from NextNode) when the list ends before every
+// declared child has been read.
+std::unique_ptr<Node> FlatSchemaConverter::Convert() {
+  // Reject an empty element list up front: reading elements_[0] below would
+  // otherwise index past the end of the input.
+  if (length_ == 0) {
+    throw ParquetException("Malformed schema: no SchemaElement values");
+  }
+
+  const SchemaElement& root = elements_[0];
+
+  // Validate the root node
+  if (root.num_children == 0) {
+    throw ParquetException("Root node did not have children");
+  }
+
+  // Relaxing this restriction as some implementations don't set this
+  // if (root.repetition_type != FieldRepetitionType::REPEATED) {
+  //   throw ParquetException("Root node was not FieldRepetitionType::REPEATED");
+  // }
+
+  return NextNode();
+}
+
+// Consumes the next SchemaElement and builds the node it describes. An
+// element with num_children == 0 becomes a primitive node; otherwise its
+// children are built recursively, in order, and wrapped in a group node.
+std::unique_ptr<Node> FlatSchemaConverter::NextNode() {
+  // The element is consumed before the id is drawn; keep this order.
+  const SchemaElement& current = Next();
+  const int node_id = next_id();
+  const void* opaque_element = static_cast<const void*>(&current);
+
+  // Leaf (primitive) node
+  if (current.num_children == 0) {
+    return PrimitiveNode::FromParquet(opaque_element, node_id);
+  }
+
+  // Group: read exactly num_children child subtrees.
+  NodeVector children;
+  for (int remaining = current.num_children; remaining > 0; --remaining) {
+    std::unique_ptr<Node> child = NextNode();
+    children.push_back(NodePtr(child.release()));
+  }
+  return GroupNode::FromParquet(opaque_element, node_id, children);
+}
+
+// Returns the element at the current position and advances past it. Throws
+// ParquetException once the position has reached the end of the list.
+const format::SchemaElement& FlatSchemaConverter::Next() {
+  if (pos_ != length_) { return elements_[pos_++]; }
+  throw ParquetException("Malformed schema: not enough SchemaElement values");
+}
+
+// Builds a SchemaDescriptor from the flat list of Thrift SchemaElements found
+// in Parquet file metadata.
+//
+// Throws ParquetException when `schema` is empty or the element list is
+// malformed (see FlatSchemaConverter::Convert).
+std::shared_ptr<SchemaDescriptor> FromParquet(const std::vector<SchemaElement>& schema) {
+  // Indexing element 0 of an empty vector is undefined behaviour, so reject
+  // empty input before handing the buffer to the converter.
+  if (schema.empty()) {
+    throw ParquetException("Malformed schema: no SchemaElement values");
+  }
+
+  FlatSchemaConverter converter(schema.data(), schema.size());
+  std::unique_ptr<Node> root = converter.Convert();
+
+  std::shared_ptr<SchemaDescriptor> descr = std::make_shared<SchemaDescriptor>();
+  // Convert() only succeeds when the root element has children, which makes
+  // the root a GroupNode, so the downcast is safe.
+  descr->Init(std::shared_ptr<GroupNode>(static_cast<GroupNode*>(root.release())));
+
+  return descr;
+}
+
+// Flattens the tree rooted at `schema` into Thrift SchemaElements, appending
+// them to `out` via SchemaFlattener.
+void ToParquet(const GroupNode* schema, std::vector<format::SchemaElement>* out) {
+  SchemaFlattener flattener(schema, out);
+  flattener.Flatten();
+}
+
+// Visitor that serializes a schema tree into a flat list of SchemaElements in
+// pre-order: each node is appended before any of its children.
+class SchemaVisitor : public Node::ConstVisitor {
+ public:
+  explicit SchemaVisitor(std::vector<format::SchemaElement>* elements)
+      : elements_(elements) {}
+  virtual ~SchemaVisitor() {}
+
+  void Visit(const Node* node) override {
+    format::SchemaElement flattened;
+    node->ToParquet(&flattened);
+    elements_->push_back(flattened);
+
+    // Only groups have children to descend into.
+    if (!node->is_group()) { return; }
+
+    const GroupNode* group = static_cast<const GroupNode*>(node);
+    for (int child = 0; child < group->field_count(); ++child) {
+      group->field(child)->VisitConst(this);
+    }
+  }
+
+ private:
+  // Output list; not owned.
+  std::vector<format::SchemaElement>* elements_;
+};
+
+// Stores the schema root to flatten and the output vector to append to. Both
+// pointers are kept as given.
+SchemaFlattener::SchemaFlattener(
+    const GroupNode* schema, std::vector<format::SchemaElement>* out)
+    : root_(schema), elements_(out) {}
+
+// Walks the tree from the root with a SchemaVisitor, which appends one
+// SchemaElement per visited node to the output vector.
+void SchemaFlattener::Flatten() {
+  SchemaVisitor visitor(elements_);
+  root_->VisitConst(&visitor);
+}
+
+// ----------------------------------------------------------------------
+// Schema printing
+
+// Visitor that pretty-prints a schema tree to an output stream, indenting
+// nested groups by `indent_width` spaces per level.
+class SchemaPrinter : public Node::ConstVisitor {
+ public:
+  // `stream` is held by reference and must outlive the printer.
+  // `indent_width` is the number of spaces added per nesting level; it was
+  // previously ignored in favour of a hard-coded 2.
+  explicit SchemaPrinter(std::ostream& stream, int indent_width)
+      : stream_(stream), indent_(0), indent_width_(indent_width) {}
+
+  // Entry point: indents, then dispatches on the node kind.
+  void Visit(const Node* node) override;
+
+ private:
+  void Visit(const PrimitiveNode* node);
+  void Visit(const GroupNode* node);
+
+  // Writes `indent_` spaces to the stream.
+  void Indent();
+
+  std::ostream& stream_;
+
+  // Current indentation in spaces.
+  int indent_;
+  // Spaces added per nesting level.
+  int indent_width_;
+};
+
+// Writes the lower-case keyword for `repetition` ("required", "optional" or
+// "repeated"). Any other value writes nothing.
+static void PrintRepLevel(Repetition::type repetition, std::ostream& stream) {
+  const char* keyword = nullptr;
+  switch (repetition) {
+    case Repetition::REQUIRED:
+      keyword = "required";
+      break;
+    case Repetition::OPTIONAL:
+      keyword = "optional";
+      break;
+    case Repetition::REPEATED:
+      keyword = "repeated";
+      break;
+    default:
+      break;
+  }
+  if (keyword != nullptr) { stream << keyword; }
+}
+
+// Writes the schema keyword for the node's physical type. For
+// FIXED_LEN_BYTE_ARRAY the byte length is appended in parentheses. Any other
+// unlisted value writes nothing.
+static void PrintType(const PrimitiveNode* node, std::ostream& stream) {
+  const char* keyword = nullptr;
+  switch (node->physical_type()) {
+    case Type::BOOLEAN:
+      keyword = "boolean";
+      break;
+    case Type::INT32:
+      keyword = "int32";
+      break;
+    case Type::INT64:
+      keyword = "int64";
+      break;
+    case Type::INT96:
+      keyword = "int96";
+      break;
+    case Type::FLOAT:
+      keyword = "float";
+      break;
+    case Type::DOUBLE:
+      keyword = "double";
+      break;
+    case Type::BYTE_ARRAY:
+      keyword = "binary";
+      break;
+    case Type::FIXED_LEN_BYTE_ARRAY:
+      // Carries a parameter, so it is written directly.
+      stream << "fixed_len_byte_array(" << node->type_length() << ")";
+      return;
+    default:
+      return;
+  }
+  stream << keyword;
+}
+
+// Writes the logical type annotation as " (NAME)", or " (NAME(p,s))" for
+// DECIMAL with its precision and scale. Writes nothing for LogicalType::NONE.
+static void PrintLogicalType(const PrimitiveNode* node, std::ostream& stream) {
+  const auto logical = node->logical_type();
+  if (logical == LogicalType::NONE) { return; }
+
+  stream << " (" << LogicalTypeToString(logical);
+  if (logical == LogicalType::DECIMAL) {
+    const auto& decimal = node->decimal_metadata();
+    stream << "(" << decimal.precision << "," << decimal.scale << ")";
+  }
+  stream << ")";
+}
+
+// Prints one primitive field as
+//   <repetition> <physical type> <name>[ (<logical type>)];
+// followed by std::endl. Indentation is not written here; it is emitted by
+// Visit(const Node*) before dispatching.
+void SchemaPrinter::Visit(const PrimitiveNode* node) {
+  PrintRepLevel(node->repetition(), stream_);
+  stream_ << " ";
+  PrintType(node, stream_);
+  stream_ << " " << node->name();
+  PrintLogicalType(node, stream_);
+  stream_ << ";" << std::endl;
+}
+
+// Prints a group and, recursively, all of its fields. A group without a
+// parent is printed as "message <name> {"; any other group is printed as
+// "<repetition> group <name>[ (<logical type>)] {". The closing brace is
+// written at the group's own indentation level.
+void SchemaPrinter::Visit(const GroupNode* node) {
+  if (!node->parent()) {
+    stream_ << "message " << node->name() << " {" << std::endl;
+  } else {
+    PrintRepLevel(node->repetition(), stream_);
+    stream_ << " group " << node->name();
+    auto lt = node->logical_type();
+    if (lt != LogicalType::NONE) { stream_ << " (" << LogicalTypeToString(lt) << ")"; }
+    stream_ << " {" << std::endl;
+  }
+
+  // Children are printed one level deeper; the indentation is restored before
+  // the closing brace.
+  indent_ += indent_width_;
+  for (int i = 0; i < node->field_count(); ++i) {
+    node->field(i)->VisitConst(this);
+  }
+  indent_ -= indent_width_;
+  Indent();
+  stream_ << "}" << std::endl;
+}
+
+// Writes `indent_` spaces to the stream; writes nothing when it is zero or
+// negative.
+void SchemaPrinter::Indent() {
+  if (indent_ <= 0) { return; }
+  stream_ << std::string(indent_, ' ');
+}
+
+// Visitor entry point: writes the current indentation, then dispatches to the
+// overload matching the node kind.
+void SchemaPrinter::Visit(const Node* node) {
+  Indent();
+  if (!node->is_group()) {
+    // Primitive
+    Visit(static_cast<const PrimitiveNode*>(node));
+    return;
+  }
+  Visit(static_cast<const GroupNode*>(node));
+}
+
+// Pretty-prints the schema tree rooted at `schema` to `stream` using a
+// SchemaPrinter constructed with `indent_width`.
+void PrintSchema(const Node* schema, std::ostream& stream, int indent_width) {
+  SchemaPrinter printer(stream, indent_width);
+  printer.Visit(schema);
+}
+
+}  // namespace schema
+
+using schema::ColumnPath;
+using schema::Node;
+using schema::NodePtr;
+using schema::PrimitiveNode;
+using schema::GroupNode;
+
+// Takes ownership of `schema`, converts it to a shared NodePtr and forwards
+// to Init(const NodePtr&).
+void SchemaDescriptor::Init(std::unique_ptr<schema::Node> schema) {
+  Init(NodePtr(schema.release()));
+}
+
+// (Re)initializes the descriptor from a schema root, which must be a group
+// node. Builds one ColumnDescriptor per leaf and records, for each leaf
+// index, the top-level field it descends from.
+//
+// Throws ParquetException if `schema` is not a group node.
+void SchemaDescriptor::Init(const NodePtr& schema) {
+  schema_ = schema;
+
+  if (!schema_->is_group()) {
+    throw ParquetException("Must initialize with a schema group");
+  }
+
+  group_node_ = static_cast<const GroupNode*>(schema_.get());
+  leaves_.clear();
+  // BuildTree fills leaf_to_base_ with emplace(), which never overwrites an
+  // existing key. Without this clear, a second Init() would keep stale column
+  // roots from the previous schema.
+  leaf_to_base_.clear();
+
+  for (int i = 0; i < group_node_->field_count(); ++i) {
+    BuildTree(group_node_->field(i), 0, 0, group_node_->field(i));
+  }
+}
+
+// Two descriptors are equal when they have the same number of leaf columns
+// and every pair of columns at the same index is Equals().
+bool SchemaDescriptor::Equals(const SchemaDescriptor& other) const {
+  const auto column_count = this->num_columns();
+  if (column_count != other.num_columns()) { return false; }
+
+  for (int col = 0; col < column_count; ++col) {
+    const ColumnDescriptor* mine = this->Column(col);
+    const ColumnDescriptor* theirs = other.Column(col);
+    if (!mine->Equals(*theirs)) { return false; }
+  }
+
+  return true;
+}
+
+// Recursively walks the subtree at `node`, accumulating the maximum
+// definition and repetition levels, and appends a ColumnDescriptor for every
+// primitive leaf. `base` is the top-level field the walk started from; it is
+// recorded against each leaf index found beneath it.
+void SchemaDescriptor::BuildTree(const NodePtr& node, int16_t max_def_level,
+    int16_t max_rep_level, const NodePtr& base) {
+  if (node->is_optional()) {
+    ++max_def_level;
+  } else if (node->is_repeated()) {
+    // Repeated fields add a definition level. This is used to distinguish
+    // between an empty list and a list with an item in it.
+    ++max_rep_level;
+    ++max_def_level;
+  }
+
+  if (!node->is_group()) {
+    // Primitive node: create its ColumnDescriptor and remember its base.
+    leaves_.push_back(ColumnDescriptor(node, max_def_level, max_rep_level, this));
+    leaf_to_base_.emplace(leaves_.size() - 1, base);
+    return;
+  }
+
+  // Group node: descend into each field with the levels accumulated so far.
+  const GroupNode* group_node = static_cast<const GroupNode*>(node.get());
+  for (int child = 0; child < group_node->field_count(); ++child) {
+    BuildTree(group_node->field(child), max_def_level, max_rep_level, base);
+  }
+}
+
+// Describes one leaf column: the primitive schema node plus its maximum
+// definition and repetition levels. `schema_descr` is stored as given.
+//
+// Throws ParquetException if `node` is not a primitive node.
+ColumnDescriptor::ColumnDescriptor(const schema::NodePtr& node,
+    int16_t max_definition_level, int16_t max_repetition_level,
+    const SchemaDescriptor* schema_descr)
+    : node_(node),
+      max_definition_level_(max_definition_level),
+      max_repetition_level_(max_repetition_level),
+      schema_descr_(schema_descr) {
+  if (!node_->is_primitive()) { throw ParquetException("Must be a primitive type"); }
+  // Cache the downcast pointer; node_ keeps the node alive.
+  primitive_node_ = static_cast<const PrimitiveNode*>(node_.get());
+}
+
+// Columns are equal when their primitive nodes are Equals() and both the
+// maximum repetition and definition levels match.
+bool ColumnDescriptor::Equals(const ColumnDescriptor& other) const {
+  if (!primitive_node_->Equals(other.primitive_node_)) { return false; }
+  if (max_repetition_level() != other.max_repetition_level()) { return false; }
+  return max_definition_level() == other.max_definition_level();
+}
+
+// Returns a pointer to the i-th leaf column descriptor. The pointer refers to
+// an element of leaves_. The index is checked only by DCHECK.
+// NOTE(review): DCHECK is defined elsewhere; presumably it is active only in
+// debug builds -- confirm.
+const ColumnDescriptor* SchemaDescriptor::Column(int i) const {
+  DCHECK(i >= 0 && i < static_cast<int>(leaves_.size()));
+  return &leaves_[i];
+}
+
+// Returns the top-level schema field from which the i-th leaf column
+// descends. The lookup result is dereferenced without checking for a miss; it
+// relies on BuildTree having recorded an entry for every leaf index.
+const schema::NodePtr& SchemaDescriptor::GetColumnRoot(int i) const {
+  DCHECK(i >= 0 && i < static_cast<int>(leaves_.size()));
+  return leaf_to_base_.find(i)->second;
+}
+
+// Returns the scale stored in the primitive node's decimal metadata.
+int ColumnDescriptor::type_scale() const {
+  return primitive_node_->decimal_metadata().scale;
+}
+
+// Returns the precision stored in the primitive node's decimal metadata.
+int ColumnDescriptor::type_precision() const {
+  return primitive_node_->decimal_metadata().precision;
+}
+
+// Returns the primitive node's type length.
+int ColumnDescriptor::type_length() const {
+  return primitive_node_->type_length();
+}
+
+// Returns the column's path from the top-level field down to the leaf. The
+// root schema node (the one without a parent) is not part of the path.
+const std::shared_ptr<ColumnPath> ColumnDescriptor::path() const {
+  // Collect names leaf-first while climbing towards the root...
+  std::vector<std::string> components;
+  for (const Node* cursor = primitive_node_; cursor->parent();
+       cursor = cursor->parent()) {
+    components.push_back(cursor->name());
+  }
+
+  // ...then flip them into root-to-leaf order.
+  std::reverse(components.begin(), components.end());
+  return std::make_shared<ColumnPath>(std::move(components));
+}
+
+}  // namespace parquet

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/13da51d3/src/parquet/schema.h
----------------------------------------------------------------------
diff --git a/src/parquet/schema.h b/src/parquet/schema.h
new file mode 100644
index 0000000..30aea44
--- /dev/null
+++ b/src/parquet/schema.h
@@ -0,0 +1,405 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// This module contains the logical parquet-cpp types (independent of Thrift
+// structures), schema nodes, and related type tools
+
+#ifndef PARQUET_SCHEMA_TYPES_H
+#define PARQUET_SCHEMA_TYPES_H
+
+#include <cstdint>
+#include <memory>
+#include <ostream>
+#include <string>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+#include "parquet/types.h"
+#include "parquet/util/macros.h"
+#include "parquet/util/visibility.h"
+
+namespace parquet {
+
+class SchemaDescriptor;
+
+namespace schema {
+
+// List encodings: using the terminology from Impala to define different styles
+// of representing logical lists (a.k.a. ARRAY types) in Parquet schemas. Since
+// the converted type named in the Parquet metadata is ConvertedType::LIST we
+// use that terminology here. It also helps distinguish from the *_ARRAY
+// primitive types.
+//
+// One-level encoding: Only allows required lists with required cells
+//   repeated value_type name
+//
+// Two-level encoding: Enables optional lists with only required cells
+//   <required/optional> group list
+//     repeated value_type item
+//
+// Three-level encoding: Enables optional lists with optional cells
+//   <required/optional> group bag
+//     repeated group list
+//       <required/optional> value_type item
+//
+// 2- and 1-level encoding are respectively equivalent to 3-level encoding with
+// the non-repeated nodes set to required.
+//
+// The "official" encoding recommended in the Parquet spec is the 3-level, and
+// we use that as the default when creating list types. For semantic completeness
+// we allow the other two. Since all types of encodings will occur "in the
+// wild" we need to be able to interpret the associated definition levels in
+// the context of the actual encoding used in the file.
+//
+// NB: Some Parquet writers may not set ConvertedType::LIST on the repeated
+// SchemaElement, which could make things challenging if we are trying to infer
+// that a sequence of nodes semantically represents an array according to one
+// of these encodings (versus a struct containing an array). We should refuse
+// the temptation to guess, as they say.
+// Namespacing struct for the list encoding styles: the number of schema
+// levels (one, two or three) used to represent a logical list.
+struct ListEncoding {
+  enum type { ONE_LEVEL, TWO_LEVEL, THREE_LEVEL };
+};
+
+// Decimal parameters attached to a primitive node.
+struct DecimalMetadata {
+  // True when precision and scale hold meaningful values.
+  bool isset;
+  int32_t scale;
+  int32_t precision;
+};
+
+// An ordered list of node names identifying a column within a schema, e.g.
+// {"a", "b", "c"} for the dot string "a.b.c".
+class PARQUET_EXPORT ColumnPath {
+ public:
+  ColumnPath() : path_() {}
+  explicit ColumnPath(const std::vector<std::string>& path) : path_(path) {}
+  // A named rvalue reference is an lvalue, so std::move is required here to
+  // move the vector instead of copying it.
+  explicit ColumnPath(std::vector<std::string>&& path) : path_(std::move(path)) {}
+
+  // Splits `dotstring` on '.' into path components.
+  static std::shared_ptr<ColumnPath> FromDotString(const std::string& dotstring);
+
+  // Returns a new path equal to this one with `node_name` appended.
+  std::shared_ptr<ColumnPath> extend(const std::string& node_name) const;
+  // Joins the components with '.'.
+  std::string ToDotString() const;
+  // Returns the components as a vector reference.
+  const std::vector<std::string>& ToDotVector() const;
+
+ protected:
+  std::vector<std::string> path_;
+};
+
+class GroupNode;
+
+// Base class for logical schema types. A type has a name, repetition level,
+// and optionally a logical type (ConvertedType in Parquet metadata parlance)
+class PARQUET_EXPORT Node {
+ public:
+  // Discriminates the two concrete node kinds.
+  enum type { PRIMITIVE, GROUP };
+
+  // The parent starts out null; GroupNode sets it when it adopts the node as
+  // a field.
+  Node(Node::type type, const std::string& name, Repetition::type repetition,
+      LogicalType::type logical_type = LogicalType::NONE, int id = -1)
+      : type_(type),
+        name_(name),
+        repetition_(repetition),
+        logical_type_(logical_type),
+        id_(id),
+        parent_(nullptr) {}
+
+  virtual ~Node() {}
+
+  bool is_primitive() const { return type_ == Node::PRIMITIVE; }
+
+  bool is_group() const { return type_ == Node::GROUP; }
+
+  bool is_optional() const { return repetition_ == Repetition::OPTIONAL; }
+
+  bool is_repeated() const { return repetition_ == Repetition::REPEATED; }
+
+  bool is_required() const { return repetition_ == Repetition::REQUIRED; }
+
+  // Equality as defined by each concrete node kind.
+  virtual bool Equals(const Node* other) const = 0;
+
+  const std::string& name() const { return name_; }
+
+  Node::type node_type() const { return type_; }
+
+  Repetition::type repetition() const { return repetition_; }
+
+  LogicalType::type logical_type() const { return logical_type_; }
+
+  int id() const { return id_; }
+
+  // Null when this node has not been attached to a parent group.
+  const Node* parent() const { return parent_; }
+
+  // ToParquet returns an opaque void* to avoid exporting
+  // parquet::SchemaElement into the public API
+  virtual void ToParquet(void* opaque_element) const = 0;
+
+  // Node::Visitor abstract class for walking schemas with the visitor pattern
+  class Visitor {
+   public:
+    virtual ~Visitor() {}
+
+    virtual void Visit(Node* node) = 0;
+  };
+  // Same as Visitor, for read-only traversal of const nodes.
+  class ConstVisitor {
+   public:
+    virtual ~ConstVisitor() {}
+
+    virtual void Visit(const Node* node) = 0;
+  };
+
+  virtual void Visit(Visitor* visitor) = 0;
+  virtual void VisitConst(ConstVisitor* visitor) const = 0;
+
+ protected:
+  // GroupNode calls SetParent() on its fields.
+  friend class GroupNode;
+
+  Node::type type_;
+  std::string name_;
+  Repetition::type repetition_;
+  LogicalType::type logical_type_;
+  int id_;
+  // Nodes should not be shared, they have a single parent.
+  const Node* parent_;
+
+  // Compares the attributes common to all node kinds.
+  bool EqualsInternal(const Node* other) const;
+  void SetParent(const Node* p_parent);
+};
+
+// Save our breath all over the place with these typedefs
+typedef std::shared_ptr<Node> NodePtr;
+typedef std::vector<NodePtr> NodeVector;
+
+// A type that is one of the primitive Parquet storage types. In addition to
+// the other type metadata (name, repetition level, logical type), also has the
+// physical storage type and their type-specific metadata (byte width, decimal
+// parameters)
+class PARQUET_EXPORT PrimitiveNode : public Node {
+ public:
+  // FromParquet accepts an opaque void* to avoid exporting
+  // parquet::SchemaElement into the public API
+  static std::unique_ptr<Node> FromParquet(const void* opaque_element, int id);
+
+  // Factory for primitive nodes; the constructor is private. `length`,
+  // `precision` and `scale` default to -1. The node id is left at the
+  // constructor's default.
+  static inline NodePtr Make(const std::string& name, Repetition::type repetition,
+      Type::type type, LogicalType::type logical_type = LogicalType::NONE,
+      int length = -1, int precision = -1, int scale = -1) {
+    return NodePtr(new PrimitiveNode(
+        name, repetition, type, logical_type, length, precision, scale));
+  }
+
+  bool Equals(const Node* other) const override;
+
+  Type::type physical_type() const { return physical_type_; }
+
+  int32_t type_length() const { return type_length_; }
+
+  const DecimalMetadata& decimal_metadata() const { return decimal_metadata_; }
+
+  void ToParquet(void* opaque_element) const override;
+  void Visit(Visitor* visitor) override;
+  void VisitConst(ConstVisitor* visitor) const override;
+
+ private:
+  PrimitiveNode(const std::string& name, Repetition::type repetition, Type::type type,
+      LogicalType::type logical_type = LogicalType::NONE, int length = -1,
+      int precision = -1, int scale = -1, int id = -1);
+
+  Type::type physical_type_;
+  int32_t type_length_;
+  DecimalMetadata decimal_metadata_;
+
+  // For FIXED_LEN_BYTE_ARRAY
+  void SetTypeLength(int32_t length) { type_length_ = length; }
+
+  // For Decimal logical type: Precision and scale
+  // Note the argument order (scale first). This setter does not touch
+  // decimal_metadata_.isset.
+  void SetDecimalMetadata(int32_t scale, int32_t precision) {
+    decimal_metadata_.scale = scale;
+    decimal_metadata_.precision = precision;
+  }
+
+  // Compares the primitive-specific attributes.
+  bool EqualsInternal(const PrimitiveNode* other) const;
+
+  FRIEND_TEST(TestPrimitiveNode, Attrs);
+  FRIEND_TEST(TestPrimitiveNode, Equals);
+  FRIEND_TEST(TestPrimitiveNode, PhysicalLogicalMapping);
+  FRIEND_TEST(TestPrimitiveNode, FromParquet);
+};
+
+// A node that contains an ordered list of child fields.
+class PARQUET_EXPORT GroupNode : public Node {
+ public:
+  // Like PrimitiveNode, GroupNode::FromParquet accepts an opaque void* to avoid exporting
+  // parquet::SchemaElement into the public API
+  static std::unique_ptr<Node> FromParquet(
+      const void* opaque_element, int id, const NodeVector& fields);
+
+  // Factory for group nodes; the constructor is private. The node id is left
+  // at the constructor's default.
+  static inline NodePtr Make(const std::string& name, Repetition::type repetition,
+      const NodeVector& fields, LogicalType::type logical_type = LogicalType::NONE) {
+    return NodePtr(new GroupNode(name, repetition, fields, logical_type));
+  }
+
+  bool Equals(const Node* other) const override;
+
+  // No bounds checking is performed on `i`.
+  const NodePtr& field(int i) const { return fields_[i]; }
+
+  int field_count() const { return fields_.size(); }
+
+  void ToParquet(void* opaque_element) const override;
+  void Visit(Visitor* visitor) override;
+  void VisitConst(ConstVisitor* visitor) const override;
+
+ private:
+  // Copies the field pointers and sets each field's parent to this group.
+  GroupNode(const std::string& name, Repetition::type repetition,
+      const NodeVector& fields, LogicalType::type logical_type = LogicalType::NONE,
+      int id = -1)
+      : Node(Node::GROUP, name, repetition, logical_type, id), fields_(fields) {
+    for (NodePtr& field : fields_) {
+      field->SetParent(this);
+    }
+  }
+
+  NodeVector fields_;
+  // Compares the child fields pairwise.
+  bool EqualsInternal(const GroupNode* other) const;
+
+  FRIEND_TEST(TestGroupNode, Attrs);
+  FRIEND_TEST(TestGroupNode, Equals);
+};
+
+// ----------------------------------------------------------------------
+// Convenience primitive type factory functions
+
+// Defines a static inline function FuncName(name, repetition) that returns
+// PrimitiveNode::Make(name, repetition, Type::TYPE). Repetition defaults to
+// OPTIONAL. No factory is generated here for FIXED_LEN_BYTE_ARRAY.
+#define PRIMITIVE_FACTORY(FuncName, TYPE)                                            \
+  static inline NodePtr FuncName(                                                    \
+      const std::string& name, Repetition::type repetition = Repetition::OPTIONAL) { \
+    return PrimitiveNode::Make(name, repetition, Type::TYPE);                        \
+  }
+
+PRIMITIVE_FACTORY(Boolean, BOOLEAN);
+PRIMITIVE_FACTORY(Int32, INT32);
+PRIMITIVE_FACTORY(Int64, INT64);
+PRIMITIVE_FACTORY(Int96, INT96);
+PRIMITIVE_FACTORY(Float, FLOAT);
+PRIMITIVE_FACTORY(Double, DOUBLE);
+PRIMITIVE_FACTORY(ByteArray, BYTE_ARRAY);
+
+void PARQUET_EXPORT PrintSchema(
+    const schema::Node* schema, std::ostream& stream, int indent_width = 2);
+
+}  // namespace schema
+
+// The ColumnDescriptor encapsulates information necessary to interpret
+// primitive column data in the context of a particular schema. We have to
+// examine the node structure of a column's path to the root in the schema tree
+// to be able to reassemble the nested structure from the repetition and
+// definition levels.
+class PARQUET_EXPORT ColumnDescriptor {
+ public:
+  // `node` is the schema node of the column; `max_definition_level` and
+  // `max_repetition_level` are stored as given. `schema_descr` is an optional
+  // back-link to the owning SchemaDescriptor (see schema_descr_ below).
+  ColumnDescriptor(const schema::NodePtr& node, int16_t max_definition_level,
+      int16_t max_repetition_level, const SchemaDescriptor* schema_descr = nullptr);
+
+  bool Equals(const ColumnDescriptor& other) const;
+
+  int16_t max_definition_level() const { return max_definition_level_; }
+
+  int16_t max_repetition_level() const { return max_repetition_level_; }
+
+  // The next three accessors forward to the underlying primitive node.
+  Type::type physical_type() const { return primitive_node_->physical_type(); }
+
+  LogicalType::type logical_type() const { return primitive_node_->logical_type(); }
+
+  const std::string& name() const { return primitive_node_->name(); }
+
+  const std::shared_ptr<schema::ColumnPath> path() const;
+
+  // The node this descriptor was constructed with.
+  const schema::NodePtr& schema_node() const { return node_; }
+
+  int type_length() const;
+
+  int type_precision() const;
+
+  int type_scale() const;
+
+ private:
+  schema::NodePtr node_;
+  // Typed view used by the accessors above; presumably points at the same
+  // object as node_ (set in the out-of-line constructor).
+  const schema::PrimitiveNode* primitive_node_;
+
+  int16_t max_definition_level_;
+  int16_t max_repetition_level_;
+
+  // When this descriptor is part of a real schema (and not being used for
+  // testing purposes), maintain a link back to the parent SchemaDescriptor to
+  // enable reverse graph traversals
+  const SchemaDescriptor* schema_descr_;
+};
+
+// Container for the converted Parquet schema, together with the information
+// computed by the schema analysis that is needed for file reading
+//
+// * Column index to Node
+// * Max repetition / definition levels for each primitive node
+//
+// The ColumnDescriptor objects produced by this class can be used to assist in
+// the reconstruction of fully materialized data structures from the
+// repetition-definition level encoding of nested data
+//
+// TODO(wesm): this object can be recomputed from a Schema
+class PARQUET_EXPORT SchemaDescriptor {
+ public:
+  // Creates an empty descriptor. group_node_ is explicitly set to nullptr so
+  // that group_node() has a defined value before Init() is called; name()
+  // must still not be used until Init() has succeeded.
+  SchemaDescriptor() : group_node_(nullptr) {}
+  ~SchemaDescriptor() {}
+
+  // Analyze the schema
+  void Init(std::unique_ptr<schema::Node> schema);
+  void Init(const schema::NodePtr& schema);
+
+  // Descriptor of the i-th leaf column.
+  const ColumnDescriptor* Column(int i) const;
+
+  bool Equals(const SchemaDescriptor& other) const;
+
+  // The number of physical columns appearing in the file
+  int num_columns() const { return static_cast<int>(leaves_.size()); }
+
+  const schema::NodePtr& schema_root() const { return schema_; }
+
+  const schema::GroupNode* group_node() const { return group_node_; }
+
+  // Returns the column root of leaf column i, i.e. the direct child of the
+  // schema root that the leaf descends from
+  const schema::NodePtr& GetColumnRoot(int i) const;
+
+  // Name of the root group. Dereferences group_node_.
+  const std::string& name() const { return group_node_->name(); }
+
+ private:
+  friend class ColumnDescriptor;
+
+  schema::NodePtr schema_;
+  const schema::GroupNode* group_node_;
+
+  void BuildTree(const schema::NodePtr& node, int16_t max_def_level,
+      int16_t max_rep_level, const schema::NodePtr& base);
+
+  // Result of leaf node / tree analysis
+  std::vector<ColumnDescriptor> leaves_;
+
+  // Mapping between leaf nodes and root group of leaf (first node
+  // below the schema's root group)
+  //
+  // For example, the leaf `a.b.c.d` would have a link back to `a`
+  //
+  // -- a  <------
+  // -- -- b     |
+  // -- -- -- c  |
+  // -- -- -- -- d
+  std::unordered_map<int, const schema::NodePtr> leaf_to_base_;
+};
+
+}  // namespace parquet
+
+#endif  // PARQUET_SCHEMA_TYPES_H

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/13da51d3/src/parquet/schema/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/src/parquet/schema/CMakeLists.txt b/src/parquet/schema/CMakeLists.txt
deleted file mode 100644
index 8aa9969..0000000
--- a/src/parquet/schema/CMakeLists.txt
+++ /dev/null
@@ -1,28 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# Headers: top level
-# Install the schema headers into include/parquet/schema.
-install(FILES
-  descriptor.h
-  printer.h
-  types.h
-  DESTINATION include/parquet/schema)
-
-# Schema unit tests (ADD_PARQUET_TEST is defined elsewhere in the build).
-ADD_PARQUET_TEST(schema-converter-test)
-ADD_PARQUET_TEST(schema-descriptor-test)
-ADD_PARQUET_TEST(schema-printer-test)
-ADD_PARQUET_TEST(schema-types-test)

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/13da51d3/src/parquet/schema/converter.cc
----------------------------------------------------------------------
diff --git a/src/parquet/schema/converter.cc b/src/parquet/schema/converter.cc
deleted file mode 100644
index 3b18af3..0000000
--- a/src/parquet/schema/converter.cc
+++ /dev/null
@@ -1,124 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include "parquet/schema/converter.h"
-
-#include "parquet/exception.h"
-#include "parquet/schema/descriptor.h"
-#include "parquet/schema/types.h"
-#include "parquet/thrift/parquet_types.h"
-
-using parquet::format::SchemaElement;
-
-namespace parquet {
-
-namespace schema {
-
-// Converts the flat SchemaElement array into a Node tree and returns its root.
-//
-// Throws ParquetException if there are no elements at all or if the root
-// element reports zero children.
-std::unique_ptr<Node> FlatSchemaConverter::Convert() {
-  // The root is read directly from elements_[0] below, so an empty input must
-  // be rejected first; otherwise this would read past the end of the array.
-  if (elements_ == nullptr || length_ <= 0) {
-    throw ParquetException("Malformed schema: not enough SchemaElement values");
-  }
-
-  const SchemaElement& root = elements_[0];
-
-  // Validate the root node
-  if (root.num_children == 0) {
-    throw ParquetException("Root node did not have children");
-  }
-
-  // Relaxing this restriction as some implementations don't set this
-  // if (root.repetition_type != FieldRepetitionType::REPEATED) {
-  //   throw ParquetException("Root node was not FieldRepetitionType::REPEATED");
-  // }
-
-  return NextNode();
-}
-
-// Consumes the next SchemaElement and converts it, together with all of its
-// descendants, into a Node.
-std::unique_ptr<Node> FlatSchemaConverter::NextNode() {
-  const SchemaElement& current = Next();
-
-  // The id is taken before any child is converted, so a parent always gets a
-  // smaller id than its descendants.
-  const int id = next_id();
-
-  const void* opaque = static_cast<const void*>(&current);
-
-  // An element without children is a leaf (primitive) node.
-  if (current.num_children == 0) { return PrimitiveNode::FromParquet(opaque, id); }
-
-  // Otherwise it is a group: convert each child in order and hand ownership
-  // of it over to a NodePtr.
-  NodeVector children;
-  const int child_count = current.num_children;
-  for (int child = 0; child < child_count; ++child) {
-    std::unique_ptr<Node> converted = NextNode();
-    children.push_back(NodePtr(converted.release()));
-  }
-  return GroupNode::FromParquet(opaque, id, children);
-}
-
-// Returns the element at the current position and advances past it. Throws
-// ParquetException once the position has reached length_.
-const format::SchemaElement& FlatSchemaConverter::Next() {
-  if (pos_ != length_) {
-    const format::SchemaElement& element = elements_[pos_];
-    ++pos_;
-    return element;
-  }
-  throw ParquetException("Malformed schema: not enough SchemaElement values");
-}
-
-// Builds a SchemaDescriptor from a flat list of Thrift SchemaElements.
-//
-// Throws ParquetException if `schema` is empty or if the conversion fails.
-std::shared_ptr<SchemaDescriptor> FromParquet(const std::vector<SchemaElement>& schema) {
-  // Indexing an empty vector (&schema[0]) is undefined behaviour, so reject
-  // the empty case up front and use data() to obtain the element pointer.
-  if (schema.empty()) {
-    throw ParquetException("Malformed schema: not enough SchemaElement values");
-  }
-
-  FlatSchemaConverter converter(schema.data(), static_cast<int>(schema.size()));
-  std::unique_ptr<Node> root = converter.Convert();
-
-  // Ownership of the root moves from the unique_ptr into the shared_ptr that
-  // is handed to the descriptor.
-  std::shared_ptr<SchemaDescriptor> descr = std::make_shared<SchemaDescriptor>();
-  descr->Init(std::shared_ptr<GroupNode>(static_cast<GroupNode*>(root.release())));
-
-  return descr;
-}
-
-// Flattens the tree rooted at `schema` and appends the resulting Thrift
-// elements to `out`.
-void ToParquet(const GroupNode* schema, std::vector<format::SchemaElement>* out) {
-  SchemaFlattener(schema, out).Flatten();
-}
-
-// Visitor that appends one SchemaElement per visited node to `elements`,
-// emitting a node before any of its children.
-class SchemaVisitor : public Node::ConstVisitor {
- public:
-  explicit SchemaVisitor(std::vector<format::SchemaElement>* elements)
-      : elements_(elements) {}
-  virtual ~SchemaVisitor() {}
-
-  void Visit(const Node* node) override {
-    // Emit the node itself first ...
-    format::SchemaElement converted;
-    node->ToParquet(&converted);
-    elements_->push_back(converted);
-
-    // ... then recurse into the children, if it has any.
-    if (!node->is_group()) { return; }
-
-    const GroupNode* group = static_cast<const GroupNode*>(node);
-    const int num_fields = group->field_count();
-    for (int idx = 0; idx < num_fields; ++idx) {
-      group->field(idx)->VisitConst(this);
-    }
-  }
-
- private:
-  std::vector<format::SchemaElement>* elements_;
-};
-
-// Stores the root group to flatten and the vector that Flatten() appends to.
-// Both are kept as raw pointers; nothing is copied here.
-SchemaFlattener::SchemaFlattener(
-    const GroupNode* schema, std::vector<format::SchemaElement>* out)
-    : root_(schema), elements_(out) {}
-
-// Walks the tree from the root with a SchemaVisitor, which appends one
-// SchemaElement per node to the output vector.
-void SchemaFlattener::Flatten() {
-  SchemaVisitor collector(elements_);
-  root_->VisitConst(&collector);
-}
-
-}  // namespace schema
-
-}  // namespace parquet

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/13da51d3/src/parquet/schema/converter.h
----------------------------------------------------------------------
diff --git a/src/parquet/schema/converter.h b/src/parquet/schema/converter.h
deleted file mode 100644
index 617d985..0000000
--- a/src/parquet/schema/converter.h
+++ /dev/null
@@ -1,91 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-// Conversion routines for converting to and from flat Parquet metadata. Among
-// other things, this limits the exposure of the internals of the Thrift
-// metadata structs to the rest of the library.
-
-// NB: This file is not part of the schema public API and only used internally
-// for converting to and from Parquet Thrift metadata
-
-#ifndef PARQUET_SCHEMA_CONVERTER_H
-#define PARQUET_SCHEMA_CONVERTER_H
-
-#include <memory>
-#include <vector>
-
-namespace parquet {
-
-namespace format {
-class SchemaElement;
-}
-
-class SchemaDescriptor;
-
-namespace schema {
-
-class GroupNode;
-class Node;
-
-// ----------------------------------------------------------------------
-// Conversion from Parquet Thrift metadata
-
-std::shared_ptr<SchemaDescriptor> FromParquet(
-    const std::vector<format::SchemaElement>& schema);
-
-// Rebuilds a Node tree from a flat array of Thrift SchemaElements.
-class FlatSchemaConverter {
- public:
-  // `elements` points at `length` SchemaElements. The array is not copied;
-  // only the pointer is stored.
-  FlatSchemaConverter(const format::SchemaElement* elements, int length)
-      : elements_(elements), length_(length), pos_(0), current_id_(0) {}
-
-  // Converts the whole array and returns the root node.
-  std::unique_ptr<Node> Convert();
-
- private:
-  const format::SchemaElement* elements_;
-  int length_;
-  // Index of the next element to be consumed by Next().
-  int pos_;
-  // Next node id to hand out; starts at 0.
-  int current_id_;
-
-  // Returns the current id and then increments it.
-  int next_id() { return current_id_++; }
-
-  const format::SchemaElement& Next();
-
-  std::unique_ptr<Node> NextNode();
-};
-
-// ----------------------------------------------------------------------
-// Conversion to Parquet Thrift metadata
-
-void ToParquet(const GroupNode* schema, std::vector<format::SchemaElement>* out);
-
-// Converts a nested Parquet schema back to a flat vector of Thrift structs
-class SchemaFlattener {
- public:
-  // `schema` is the root group to flatten; `out` receives the elements.
-  SchemaFlattener(const GroupNode* schema, std::vector<format::SchemaElement>* out);
-
-  // Performs the conversion, writing into the vector given at construction.
-  void Flatten();
-
- private:
-  const GroupNode* root_;
-  std::vector<format::SchemaElement>* elements_;
-};
-
-}  // namespace schema
-
-}  // namespace parquet
-
-#endif  // PARQUET_SCHEMA_CONVERTER_H

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/13da51d3/src/parquet/schema/descriptor.cc
----------------------------------------------------------------------
diff --git a/src/parquet/schema/descriptor.cc b/src/parquet/schema/descriptor.cc
deleted file mode 100644
index 0b0d006..0000000
--- a/src/parquet/schema/descriptor.cc
+++ /dev/null
@@ -1,138 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include "parquet/schema/descriptor.h"
-
-#include "parquet/exception.h"
-#include "parquet/util/logging.h"
-
-namespace parquet {
-
-using schema::ColumnPath;
-using schema::Node;
-using schema::NodePtr;
-using schema::PrimitiveNode;
-using schema::GroupNode;
-
-// Takes ownership of `schema`, transfers it into a NodePtr and delegates to
-// the NodePtr overload.
-void SchemaDescriptor::Init(std::unique_ptr<schema::Node> schema) {
-  NodePtr shared_root(schema.release());
-  Init(shared_root);
-}
-
-// Analyzes `schema`, which must be a group node, and rebuilds the list of
-// leaf ColumnDescriptors together with the leaf -> column-root mapping.
-//
-// Throws ParquetException if `schema` is null or is not a group; in that case
-// the descriptor is left unchanged.
-void SchemaDescriptor::Init(const NodePtr& schema) {
-  // Validate before touching any member, so that a failed Init() cannot leave
-  // schema_ and group_node_ referring to different trees.
-  if (schema.get() == nullptr || !schema->is_group()) {
-    throw ParquetException("Must initialize with a schema group");
-  }
-
-  schema_ = schema;
-  group_node_ = static_cast<const GroupNode*>(schema_.get());
-
-  // Drop the results of any previous Init(). leaf_to_base_ has to be cleared
-  // too: emplace() does not overwrite an existing key, so stale entries would
-  // otherwise survive and GetColumnRoot() would return nodes of the old tree.
-  leaves_.clear();
-  leaf_to_base_.clear();
-
-  // Every direct child of the root is the column root of the leaves below it.
-  for (int i = 0; i < group_node_->field_count(); ++i) {
-    BuildTree(group_node_->field(i), 0, 0, group_node_->field(i));
-  }
-}
-
-// Two descriptors are equal when they have the same number of columns and
-// every pair of columns at the same index compares equal.
-bool SchemaDescriptor::Equals(const SchemaDescriptor& other) const {
-  const int ncols = this->num_columns();
-  if (ncols != other.num_columns()) { return false; }
-
-  for (int idx = 0; idx != ncols; ++idx) {
-    const ColumnDescriptor* mine = this->Column(idx);
-    const ColumnDescriptor* theirs = other.Column(idx);
-    if (!mine->Equals(*theirs)) { return false; }
-  }
-
-  return true;
-}
-
-// Recursively walks the subtree rooted at `node`, accumulating the maximum
-// definition / repetition levels, and appends a ColumnDescriptor for every
-// primitive node found. `base` is recorded as the column root of each leaf.
-void SchemaDescriptor::BuildTree(const NodePtr& node, int16_t max_def_level,
-    int16_t max_rep_level, const NodePtr& base) {
-  const bool optional = node->is_optional();
-  const bool repeated = !optional && node->is_repeated();
-
-  // Repeated fields add a definition level as well as a repetition level.
-  // This is used to distinguish between an empty list and a list with an
-  // item in it.
-  if (repeated) { ++max_rep_level; }
-  if (optional || repeated) { ++max_def_level; }
-
-  if (!node->is_group()) {
-    // Primitive node, append to leaves and remember its column root under the
-    // index it was just given
-    leaves_.push_back(ColumnDescriptor(node, max_def_level, max_rep_level, this));
-    leaf_to_base_.emplace(leaves_.size() - 1, base);
-    return;
-  }
-
-  // Group node: descend into every child with the levels computed so far
-  const GroupNode* group = static_cast<const GroupNode*>(node.get());
-  const int num_children = group->field_count();
-  for (int child = 0; child < num_children; ++child) {
-    BuildTree(group->field(child), max_def_level, max_rep_level, base);
-  }
-}
-
-// Stores the node, the two levels and the optional back-link, then caches a
-// typed pointer to the node. Throws ParquetException if `node` is not a
-// primitive node.
-ColumnDescriptor::ColumnDescriptor(const schema::NodePtr& node,
-    int16_t max_definition_level, int16_t max_repetition_level,
-    const SchemaDescriptor* schema_descr)
-    : node_(node),
-      max_definition_level_(max_definition_level),
-      max_repetition_level_(max_repetition_level),
-      schema_descr_(schema_descr) {
-  if (node_->is_primitive()) {
-    primitive_node_ = static_cast<const PrimitiveNode*>(node_.get());
-  } else {
-    throw ParquetException("Must be a primitive type");
-  }
-}
-
-// Columns are equal when their primitive nodes compare equal and both the
-// maximum repetition and maximum definition levels match.
-bool ColumnDescriptor::Equals(const ColumnDescriptor& other) const {
-  if (!primitive_node_->Equals(other.primitive_node_)) { return false; }
-  if (max_repetition_level() != other.max_repetition_level()) { return false; }
-  return max_definition_level() == other.max_definition_level();
-}
-
-// Returns a pointer to the i-th leaf descriptor. The index is only checked
-// through DCHECK.
-const ColumnDescriptor* SchemaDescriptor::Column(int i) const {
-  DCHECK(0 <= i && i < static_cast<int>(leaves_.size()));
-  const ColumnDescriptor& leaf = leaves_[i];
-  return &leaf;
-}
-
-// Returns the node recorded in leaf_to_base_ for leaf index i. The index is
-// only checked through DCHECK; the lookup result itself is not checked.
-const schema::NodePtr& SchemaDescriptor::GetColumnRoot(int i) const {
-  DCHECK(0 <= i && i < static_cast<int>(leaves_.size()));
-  const auto entry = leaf_to_base_.find(i);
-  return entry->second;
-}
-
-// Scale taken from the primitive node's decimal metadata.
-int ColumnDescriptor::type_scale() const {
-  const auto& decimal = primitive_node_->decimal_metadata();
-  return decimal.scale;
-}
-
-// Precision taken from the primitive node's decimal metadata.
-int ColumnDescriptor::type_precision() const {
-  const auto& decimal = primitive_node_->decimal_metadata();
-  return decimal.precision;
-}
-
-// Type length as reported by the primitive node.
-int ColumnDescriptor::type_length() const {
-  const int length = primitive_node_->type_length();
-  return length;
-}
-
-// Returns the path of node names leading to this column, outermost first.
-// The root schema node (the one without a parent) is not part of the path.
-const std::shared_ptr<ColumnPath> ColumnDescriptor::path() const {
-  // Climb from the leaf towards the root, collecting names leaf-first. The
-  // walk stops at the node that has no parent, which is therefore skipped.
-  std::vector<std::string> leaf_first;
-  for (const Node* cur = primitive_node_; cur->parent(); cur = cur->parent()) {
-    leaf_first.push_back(cur->name());
-  }
-
-  // Reverse into root-first order while building the ColumnPath
-  return std::make_shared<ColumnPath>(
-      std::vector<std::string>(leaf_first.crbegin(), leaf_first.crend()));
-}
-
-}  // namespace parquet

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/13da51d3/src/parquet/schema/descriptor.h
----------------------------------------------------------------------
diff --git a/src/parquet/schema/descriptor.h b/src/parquet/schema/descriptor.h
deleted file mode 100644
index ae7b60e..0000000
--- a/src/parquet/schema/descriptor.h
+++ /dev/null
@@ -1,142 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#ifndef PARQUET_SCHEMA_DESCRIPTOR_H
-#define PARQUET_SCHEMA_DESCRIPTOR_H
-
-#include "parquet/schema/types.h"
-#include "parquet/types.h"
-#include "parquet/util/visibility.h"
-#include <cstdint>
-#include <cstdlib>
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include <vector>
-
-namespace parquet {
-
-class SchemaDescriptor;
-
-// The ColumnDescriptor encapsulates information necessary to interpret
-// primitive column data in the context of a particular schema. We have to
-// examine the node structure of a column's path to the root in the schema tree
-// to be able to reassemble the nested structure from the repetition and
-// definition levels.
-class PARQUET_EXPORT ColumnDescriptor {
- public:
-  // `node` must be a primitive node; the constructor in descriptor.cc throws
-  // ParquetException otherwise. `schema_descr` is an optional back-link to
-  // the owning SchemaDescriptor (see schema_descr_ below).
-  ColumnDescriptor(const schema::NodePtr& node, int16_t max_definition_level,
-      int16_t max_repetition_level, const SchemaDescriptor* schema_descr = nullptr);
-
-  // True when the primitive nodes and both maximum levels are equal.
-  bool Equals(const ColumnDescriptor& other) const;
-
-  int16_t max_definition_level() const { return max_definition_level_; }
-
-  int16_t max_repetition_level() const { return max_repetition_level_; }
-
-  // The next three accessors forward to the underlying primitive node.
-  Type::type physical_type() const { return primitive_node_->physical_type(); }
-
-  LogicalType::type logical_type() const { return primitive_node_->logical_type(); }
-
-  const std::string& name() const { return primitive_node_->name(); }
-
-  // Node names from the top-level field down to this column; the schema root
-  // itself is not included.
-  const std::shared_ptr<schema::ColumnPath> path() const;
-
-  // The node this descriptor was constructed with.
-  const schema::NodePtr& schema_node() const { return node_; }
-
-  int type_length() const;
-
-  int type_precision() const;
-
-  int type_scale() const;
-
- private:
-  schema::NodePtr node_;
-  // node_.get() cast to PrimitiveNode; set by the constructor.
-  const schema::PrimitiveNode* primitive_node_;
-
-  int16_t max_definition_level_;
-  int16_t max_repetition_level_;
-
-  // When this descriptor is part of a real schema (and not being used for
-  // testing purposes), maintain a link back to the parent SchemaDescriptor to
-  // enable reverse graph traversals
-  const SchemaDescriptor* schema_descr_;
-};
-
-// Container for the converted Parquet schema, together with the information
-// computed by the schema analysis that is needed for file reading
-//
-// * Column index to Node
-// * Max repetition / definition levels for each primitive node
-//
-// The ColumnDescriptor objects produced by this class can be used to assist in
-// the reconstruction of fully materialized data structures from the
-// repetition-definition level encoding of nested data
-//
-// TODO(wesm): this object can be recomputed from a Schema
-class PARQUET_EXPORT SchemaDescriptor {
- public:
-  // Creates an empty descriptor. group_node_ is explicitly set to nullptr so
-  // that group_node() has a defined value before Init() is called; name()
-  // must still not be used until Init() has succeeded.
-  SchemaDescriptor() : group_node_(nullptr) {}
-  ~SchemaDescriptor() {}
-
-  // Analyze the schema
-  void Init(std::unique_ptr<schema::Node> schema);
-  void Init(const schema::NodePtr& schema);
-
-  // Descriptor of the i-th leaf column.
-  const ColumnDescriptor* Column(int i) const;
-
-  bool Equals(const SchemaDescriptor& other) const;
-
-  // The number of physical columns appearing in the file
-  int num_columns() const { return static_cast<int>(leaves_.size()); }
-
-  const schema::NodePtr& schema_root() const { return schema_; }
-
-  const schema::GroupNode* group_node() const { return group_node_; }
-
-  // Returns the column root of leaf column i, i.e. the direct child of the
-  // schema root that the leaf descends from
-  const schema::NodePtr& GetColumnRoot(int i) const;
-
-  // Name of the root group. Dereferences group_node_.
-  const std::string& name() const { return group_node_->name(); }
-
- private:
-  friend class ColumnDescriptor;
-
-  schema::NodePtr schema_;
-  const schema::GroupNode* group_node_;
-
-  void BuildTree(const schema::NodePtr& node, int16_t max_def_level,
-      int16_t max_rep_level, const schema::NodePtr& base);
-
-  // Result of leaf node / tree analysis
-  std::vector<ColumnDescriptor> leaves_;
-
-  // Mapping between leaf nodes and root group of leaf (first node
-  // below the schema's root group)
-  //
-  // For example, the leaf `a.b.c.d` would have a link back to `a`
-  //
-  // -- a  <------
-  // -- -- b     |
-  // -- -- -- c  |
-  // -- -- -- -- d
-  std::unordered_map<int, const schema::NodePtr> leaf_to_base_;
-};
-
-}  // namespace parquet
-
-#endif  // PARQUET_SCHEMA_DESCRIPTOR_H

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/13da51d3/src/parquet/schema/printer.cc
----------------------------------------------------------------------
diff --git a/src/parquet/schema/printer.cc b/src/parquet/schema/printer.cc
deleted file mode 100644
index ca11244..0000000
--- a/src/parquet/schema/printer.cc
+++ /dev/null
@@ -1,159 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include "parquet/schema/printer.h"
-
-#include <memory>
-#include <string>
-
-#include "parquet/schema/types.h"
-#include "parquet/types.h"
-
-namespace parquet {
-
-namespace schema {
-
-// Visitor that writes an indented, textual rendering of a schema tree to a
-// stream.
-class SchemaPrinter : public Node::ConstVisitor {
- public:
-  // `stream` receives the output; `indent_width` is the number of spaces
-  // added per nesting level. The width is now taken from the argument: it
-  // used to be hard-coded to 2, which silently ignored the caller's value.
-  explicit SchemaPrinter(std::ostream& stream, int indent_width)
-      : stream_(stream), indent_(0), indent_width_(indent_width) {}
-
-  // Entry point: indents, then dispatches on the dynamic kind of `node`.
-  void Visit(const Node* node) override;
-
- private:
-  void Visit(const PrimitiveNode* node);
-  void Visit(const GroupNode* node);
-
-  // Writes indent_ spaces to the stream.
-  void Indent();
-
-  std::ostream& stream_;
-
-  // Current indentation in spaces.
-  int indent_;
-  // Spaces added for each nesting level.
-  int indent_width_;
-};
-
-// Writes the lower-case keyword for `repetition`; writes nothing for any
-// value other than REQUIRED, OPTIONAL or REPEATED.
-static void PrintRepLevel(Repetition::type repetition, std::ostream& stream) {
-  const char* keyword = nullptr;
-  switch (repetition) {
-    case Repetition::REQUIRED:
-      keyword = "required";
-      break;
-    case Repetition::OPTIONAL:
-      keyword = "optional";
-      break;
-    case Repetition::REPEATED:
-      keyword = "repeated";
-      break;
-    default:
-      break;
-  }
-  if (keyword != nullptr) { stream << keyword; }
-}
-
-// Writes the textual name of the node's physical type. Fixed-length byte
-// arrays also print their length in parentheses; unknown types print nothing.
-static void PrintType(const PrimitiveNode* node, std::ostream& stream) {
-  const char* type_name = nullptr;
-  switch (node->physical_type()) {
-    case Type::FIXED_LEN_BYTE_ARRAY:
-      // The only type whose rendering carries an extra parameter
-      stream << "fixed_len_byte_array(" << node->type_length() << ")";
-      return;
-    case Type::BOOLEAN:
-      type_name = "boolean";
-      break;
-    case Type::INT32:
-      type_name = "int32";
-      break;
-    case Type::INT64:
-      type_name = "int64";
-      break;
-    case Type::INT96:
-      type_name = "int96";
-      break;
-    case Type::FLOAT:
-      type_name = "float";
-      break;
-    case Type::DOUBLE:
-      type_name = "double";
-      break;
-    case Type::BYTE_ARRAY:
-      type_name = "binary";
-      break;
-    default:
-      return;
-  }
-  stream << type_name;
-}
-
-// Writes " (<logical type>)" for any logical type other than NONE. DECIMAL
-// additionally carries "(precision,scale)" inside the parentheses.
-static void PrintLogicalType(const PrimitiveNode* node, std::ostream& stream) {
-  const LogicalType::type logical = node->logical_type();
-  if (logical == LogicalType::NONE) { return; }
-
-  stream << " (" << LogicalTypeToString(logical);
-  if (logical == LogicalType::DECIMAL) {
-    stream << "(" << node->decimal_metadata().precision << ","
-           << node->decimal_metadata().scale << ")";
-  }
-  stream << ")";
-}
-
-// Prints one primitive field as
-//   <repetition> <type> <name>[ (<logical type>)];
-// followed by a newline.
-void SchemaPrinter::Visit(const PrimitiveNode* node) {
-  std::ostream& out = stream_;
-
-  PrintRepLevel(node->repetition(), out);
-  out << " ";
-  PrintType(node, out);
-  out << " ";
-  out << node->name();
-  PrintLogicalType(node, out);
-  out << ";";
-  out << std::endl;
-}
-
-// Prints a group header, then every child one indentation level deeper, then
-// the closing brace at the group's own indentation. A group without a parent
-// is printed as "message <name> {".
-void SchemaPrinter::Visit(const GroupNode* node) {
-  const bool is_root = (node->parent() == nullptr);
-  if (is_root) {
-    stream_ << "message " << node->name() << " {" << std::endl;
-  } else {
-    PrintRepLevel(node->repetition(), stream_);
-    stream_ << " group " << node->name();
-    const auto logical = node->logical_type();
-    if (logical != LogicalType::NONE) {
-      stream_ << " (" << LogicalTypeToString(logical) << ")";
-    }
-    stream_ << " {" << std::endl;
-  }
-
-  // Children are printed one level deeper
-  indent_ += indent_width_;
-  const int num_fields = node->field_count();
-  for (int idx = 0; idx < num_fields; ++idx) {
-    node->field(idx)->VisitConst(this);
-  }
-  indent_ -= indent_width_;
-
-  // Closing brace lines up with the group header
-  Indent();
-  stream_ << "}" << std::endl;
-}
-
-// Writes indent_ spaces to the stream; does nothing when indent_ is not
-// positive.
-void SchemaPrinter::Indent() {
-  if (indent_ <= 0) { return; }
-  const std::string padding(indent_, ' ');
-  stream_ << padding;
-}
-
-// Indents, then forwards to the overload matching the node's kind.
-void SchemaPrinter::Visit(const Node* node) {
-  Indent();
-  if (!node->is_group()) {
-    // Primitive
-    Visit(static_cast<const PrimitiveNode*>(node));
-    return;
-  }
-  Visit(static_cast<const GroupNode*>(node));
-}
-
-// Prints the tree rooted at `schema` to `stream` using a temporary
-// SchemaPrinter constructed with `indent_width`.
-void PrintSchema(const Node* schema, std::ostream& stream, int indent_width) {
-  SchemaPrinter(stream, indent_width).Visit(schema);
-}
-
-}  // namespace schema
-
-}  // namespace parquet

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/13da51d3/src/parquet/schema/printer.h
----------------------------------------------------------------------
diff --git a/src/parquet/schema/printer.h b/src/parquet/schema/printer.h
deleted file mode 100644
index c37ef90..0000000
--- a/src/parquet/schema/printer.h
+++ /dev/null
@@ -1,40 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-// A simple Schema printer using the visitor pattern
-
-#ifndef PARQUET_SCHEMA_PRINTER_H
-#define PARQUET_SCHEMA_PRINTER_H
-
-#include <ostream>
-
-#include "parquet/util/visibility.h"
-
-namespace parquet {
-
-namespace schema {
-
-class Node;
-
-void PARQUET_EXPORT PrintSchema(
-    const Node* schema, std::ostream& stream, int indent_width = 2);
-
-}  // namespace schema
-
-}  // namespace parquet
-
-#endif  // PARQUET_SCHEMA_PRINTER_H

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/13da51d3/src/parquet/schema/schema-converter-test.cc
----------------------------------------------------------------------
diff --git a/src/parquet/schema/schema-converter-test.cc b/src/parquet/schema/schema-converter-test.cc
deleted file mode 100644
index c752919..0000000
--- a/src/parquet/schema/schema-converter-test.cc
+++ /dev/null
@@ -1,222 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include <gtest/gtest.h>
-
-#include <cstdlib>
-#include <memory>
-#include <string>
-#include <vector>
-
-#include "parquet/exception.h"
-#include "parquet/schema/converter.h"
-#include "parquet/schema/test-util.h"
-#include "parquet/schema/types.h"
-#include "parquet/thrift/parquet_types.h"
-#include "parquet/types.h"
-
-using std::string;
-using std::vector;
-
-using parquet::format::ConvertedType;
-using parquet::format::FieldRepetitionType;
-using parquet::format::SchemaElement;
-
-namespace parquet {
-
-namespace schema {
-
-// ----------------------------------------------------------------------
-// Test convert group
-
-class TestSchemaConverter : public ::testing::Test {
- public:
-  void setUp() { name_ = "parquet_schema"; }
-
-  void Convert(const parquet::format::SchemaElement* elements, int length) {
-    FlatSchemaConverter converter(elements, length);
-    node_ = converter.Convert();
-    ASSERT_TRUE(node_->is_group());
-    group_ = static_cast<const GroupNode*>(node_.get());
-  }
-
- protected:
-  std::string name_;
-  const GroupNode* group_;
-  std::unique_ptr<Node> node_;
-};
-
-bool check_for_parent_consistency(const GroupNode* node) {
-  // Each node should have the group as parent
-  for (int i = 0; i < node->field_count(); i++) {
-    const NodePtr& field = node->field(i);
-    if (field->parent() != node) { return false; }
-    if (field->is_group()) {
-      const GroupNode* group = static_cast<GroupNode*>(field.get());
-      if (!check_for_parent_consistency(group)) { return false; }
-    }
-  }
-  return true;
-}
-
-TEST_F(TestSchemaConverter, NestedExample) {
-  SchemaElement elt;
-  std::vector<SchemaElement> elements;
-  elements.push_back(NewGroup(name_, FieldRepetitionType::REPEATED, 2, 0));
-
-  // A primitive one
-  elements.push_back(
-      NewPrimitive("a", FieldRepetitionType::REQUIRED, format::Type::INT32, 1));
-
-  // A group
-  elements.push_back(NewGroup("bag", FieldRepetitionType::OPTIONAL, 1, 2));
-
-  // 3-level list encoding, by hand
-  elt = NewGroup("b", FieldRepetitionType::REPEATED, 1, 3);
-  elt.__set_converted_type(ConvertedType::LIST);
-  elements.push_back(elt);
-  elements.push_back(
-      NewPrimitive("item", FieldRepetitionType::OPTIONAL, format::Type::INT64, 4));
-
-  Convert(&elements[0], elements.size());
-
-  // Construct the expected schema
-  NodeVector fields;
-  fields.push_back(Int32("a", Repetition::REQUIRED));
-
-  // 3-level list encoding
-  NodePtr item = Int64("item");
-  NodePtr list(GroupNode::Make("b", Repetition::REPEATED, {item}, LogicalType::LIST));
-  NodePtr bag(GroupNode::Make("bag", Repetition::OPTIONAL, {list}));
-  fields.push_back(bag);
-
-  NodePtr schema = GroupNode::Make(name_, Repetition::REPEATED, fields);
-
-  ASSERT_TRUE(schema->Equals(group_));
-
-  // Check that the parent relationship in each node is consitent
-  ASSERT_EQ(group_->parent(), nullptr);
-  ASSERT_TRUE(check_for_parent_consistency(group_));
-}
-
-TEST_F(TestSchemaConverter, InvalidRoot) {
-  // According to the Parquet specification, the first element in the
-  // list<SchemaElement> is a group whose children (and their descendants)
-  // contain all of the rest of the flattened schema elements. If the first
-  // element is not a group, it is a malformed Parquet file.
-
-  SchemaElement elements[2];
-  elements[0] =
-      NewPrimitive("not-a-group", FieldRepetitionType::REQUIRED, format::Type::INT32, 0);
-  ASSERT_THROW(Convert(elements, 2), ParquetException);
-
-  // While the Parquet spec indicates that the root group should have REPEATED
-  // repetition type, some implementations may return REQUIRED or OPTIONAL
-  // groups as the first element. These tests check that this is okay as a
-  // practicality matter.
-  elements[0] = NewGroup("not-repeated", FieldRepetitionType::REQUIRED, 1, 0);
-  elements[1] = NewPrimitive("a", FieldRepetitionType::REQUIRED, format::Type::INT32, 1);
-  Convert(elements, 2);
-
-  elements[0] = NewGroup("not-repeated", FieldRepetitionType::OPTIONAL, 1, 0);
-  Convert(elements, 2);
-}
-
-TEST_F(TestSchemaConverter, NotEnoughChildren) {
-  // Throw a ParquetException, but don't core dump or anything
-  SchemaElement elt;
-  std::vector<SchemaElement> elements;
-  elements.push_back(NewGroup(name_, FieldRepetitionType::REPEATED, 2, 0));
-  ASSERT_THROW(Convert(&elements[0], 1), ParquetException);
-}
-
-// ----------------------------------------------------------------------
-// Schema tree flatten / unflatten
-
-class TestSchemaFlatten : public ::testing::Test {
- public:
-  void setUp() { name_ = "parquet_schema"; }
-
-  void Flatten(const GroupNode* schema) { ToParquet(schema, &elements_); }
-
- protected:
-  std::string name_;
-  std::vector<format::SchemaElement> elements_;
-};
-
-TEST_F(TestSchemaFlatten, DecimalMetadata) {
-  // Checks that DecimalMetadata is only set for DecimalTypes
-  NodePtr node = PrimitiveNode::Make(
-      "decimal", Repetition::REQUIRED, Type::INT64, LogicalType::DECIMAL, -1, 8, 4);
-  NodePtr group =
-      GroupNode::Make("group", Repetition::REPEATED, {node}, LogicalType::LIST);
-  Flatten(reinterpret_cast<GroupNode*>(group.get()));
-  ASSERT_EQ("decimal", elements_[1].name);
-  ASSERT_TRUE(elements_[1].__isset.precision);
-  ASSERT_TRUE(elements_[1].__isset.scale);
-
-  elements_.clear();
-  // Not for integers with no logical type
-  group =
-      GroupNode::Make("group", Repetition::REPEATED, {Int64("int64")}, LogicalType::LIST);
-  Flatten(reinterpret_cast<GroupNode*>(group.get()));
-  ASSERT_EQ("int64", elements_[1].name);
-  ASSERT_FALSE(elements_[0].__isset.precision);
-  ASSERT_FALSE(elements_[0].__isset.scale);
-}
-
-TEST_F(TestSchemaFlatten, NestedExample) {
-  SchemaElement elt;
-  std::vector<SchemaElement> elements;
-  elements.push_back(NewGroup(name_, FieldRepetitionType::REPEATED, 2, 0));
-
-  // A primitive one
-  elements.push_back(
-      NewPrimitive("a", FieldRepetitionType::REQUIRED, format::Type::INT32, 1));
-
-  // A group
-  elements.push_back(NewGroup("bag", FieldRepetitionType::OPTIONAL, 1, 2));
-
-  // 3-level list encoding, by hand
-  elt = NewGroup("b", FieldRepetitionType::REPEATED, 1, 3);
-  elt.__set_converted_type(ConvertedType::LIST);
-  elements.push_back(elt);
-  elements.push_back(
-      NewPrimitive("item", FieldRepetitionType::OPTIONAL, format::Type::INT64, 4));
-
-  // Construct the schema
-  NodeVector fields;
-  fields.push_back(Int32("a", Repetition::REQUIRED));
-
-  // 3-level list encoding
-  NodePtr item = Int64("item");
-  NodePtr list(GroupNode::Make("b", Repetition::REPEATED, {item}, LogicalType::LIST));
-  NodePtr bag(GroupNode::Make("bag", Repetition::OPTIONAL, {list}));
-  fields.push_back(bag);
-
-  NodePtr schema = GroupNode::Make(name_, Repetition::REPEATED, fields);
-
-  Flatten(static_cast<GroupNode*>(schema.get()));
-  ASSERT_EQ(elements_.size(), elements.size());
-  for (size_t i = 0; i < elements_.size(); i++) {
-    ASSERT_EQ(elements_[i], elements[i]);
-  }
-}
-
-}  // namespace schema
-
-}  // namespace parquet

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/13da51d3/src/parquet/schema/schema-descriptor-test.cc
----------------------------------------------------------------------
diff --git a/src/parquet/schema/schema-descriptor-test.cc b/src/parquet/schema/schema-descriptor-test.cc
deleted file mode 100644
index 4b7f67c..0000000
--- a/src/parquet/schema/schema-descriptor-test.cc
+++ /dev/null
@@ -1,190 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-// Schema / column descriptor correctness tests (from flat Parquet schemas)
-
-#include <cstdint>
-#include <cstdlib>
-#include <gtest/gtest.h>
-#include <string>
-#include <vector>
-
-#include "parquet/exception.h"
-#include "parquet/schema/descriptor.h"
-#include "parquet/schema/types.h"
-#include "parquet/types.h"
-
-using std::string;
-using std::vector;
-
-namespace parquet {
-
-namespace schema {
-
-TEST(TestColumnDescriptor, TestAttrs) {
-  NodePtr node = PrimitiveNode::Make(
-      "name", Repetition::OPTIONAL, Type::BYTE_ARRAY, LogicalType::UTF8);
-  ColumnDescriptor descr(node, 4, 1);
-
-  ASSERT_EQ("name", descr.name());
-  ASSERT_EQ(4, descr.max_definition_level());
-  ASSERT_EQ(1, descr.max_repetition_level());
-
-  ASSERT_EQ(Type::BYTE_ARRAY, descr.physical_type());
-
-  ASSERT_EQ(-1, descr.type_length());
-
-  // Test FIXED_LEN_BYTE_ARRAY
-  node = PrimitiveNode::Make("name", Repetition::OPTIONAL, Type::FIXED_LEN_BYTE_ARRAY,
-      LogicalType::DECIMAL, 12, 10, 4);
-  descr = ColumnDescriptor(node, 4, 1);
-
-  ASSERT_EQ(Type::FIXED_LEN_BYTE_ARRAY, descr.physical_type());
-  ASSERT_EQ(12, descr.type_length());
-}
-
-class TestSchemaDescriptor : public ::testing::Test {
- public:
-  void setUp() {}
-
- protected:
-  SchemaDescriptor descr_;
-};
-
-TEST_F(TestSchemaDescriptor, InitNonGroup) {
-  NodePtr node = PrimitiveNode::Make("field", Repetition::OPTIONAL, Type::INT32);
-
-  ASSERT_THROW(descr_.Init(node), ParquetException);
-}
-
-TEST_F(TestSchemaDescriptor, Equals) {
-  NodePtr schema;
-
-  NodePtr inta = Int32("a", Repetition::REQUIRED);
-  NodePtr intb = Int64("b", Repetition::OPTIONAL);
-  NodePtr intb2 = Int64("b2", Repetition::OPTIONAL);
-  NodePtr intc = ByteArray("c", Repetition::REPEATED);
-
-  NodePtr item1 = Int64("item1", Repetition::REQUIRED);
-  NodePtr item2 = Boolean("item2", Repetition::OPTIONAL);
-  NodePtr item3 = Int32("item3", Repetition::REPEATED);
-  NodePtr list(GroupNode::Make(
-      "records", Repetition::REPEATED, {item1, item2, item3}, LogicalType::LIST));
-
-  NodePtr bag(GroupNode::Make("bag", Repetition::OPTIONAL, {list}));
-  NodePtr bag2(GroupNode::Make("bag", Repetition::REQUIRED, {list}));
-
-  SchemaDescriptor descr1;
-  descr1.Init(GroupNode::Make("schema", Repetition::REPEATED, {inta, intb, intc, bag}));
-
-  ASSERT_TRUE(descr1.Equals(descr1));
-
-  SchemaDescriptor descr2;
-  descr2.Init(GroupNode::Make("schema", Repetition::REPEATED, {inta, intb, intc, bag2}));
-  ASSERT_FALSE(descr1.Equals(descr2));
-
-  SchemaDescriptor descr3;
-  descr3.Init(GroupNode::Make("schema", Repetition::REPEATED, {inta, intb2, intc, bag}));
-  ASSERT_FALSE(descr1.Equals(descr3));
-
-  // Robust to name of parent node
-  SchemaDescriptor descr4;
-  descr4.Init(GroupNode::Make("SCHEMA", Repetition::REPEATED, {inta, intb, intc, bag}));
-  ASSERT_TRUE(descr1.Equals(descr4));
-
-  SchemaDescriptor descr5;
-  descr5.Init(
-      GroupNode::Make("schema", Repetition::REPEATED, {inta, intb, intc, bag, intb2}));
-  ASSERT_FALSE(descr1.Equals(descr5));
-
-  // Different max repetition / definition levels
-  ColumnDescriptor col1(inta, 5, 1);
-  ColumnDescriptor col2(inta, 6, 1);
-  ColumnDescriptor col3(inta, 5, 2);
-
-  ASSERT_TRUE(col1.Equals(col1));
-  ASSERT_FALSE(col1.Equals(col2));
-  ASSERT_FALSE(col1.Equals(col3));
-}
-
-TEST_F(TestSchemaDescriptor, BuildTree) {
-  NodeVector fields;
-  NodePtr schema;
-
-  NodePtr inta = Int32("a", Repetition::REQUIRED);
-  fields.push_back(inta);
-  fields.push_back(Int64("b", Repetition::OPTIONAL));
-  fields.push_back(ByteArray("c", Repetition::REPEATED));
-
-  // 3-level list encoding
-  NodePtr item1 = Int64("item1", Repetition::REQUIRED);
-  NodePtr item2 = Boolean("item2", Repetition::OPTIONAL);
-  NodePtr item3 = Int32("item3", Repetition::REPEATED);
-  NodePtr list(GroupNode::Make(
-      "records", Repetition::REPEATED, {item1, item2, item3}, LogicalType::LIST));
-  NodePtr bag(GroupNode::Make("bag", Repetition::OPTIONAL, {list}));
-  fields.push_back(bag);
-
-  schema = GroupNode::Make("schema", Repetition::REPEATED, fields);
-
-  descr_.Init(schema);
-
-  int nleaves = 6;
-
-  // 6 leaves
-  ASSERT_EQ(nleaves, descr_.num_columns());
-
-  //                             mdef mrep
-  // required int32 a            0    0
-  // optional int64 b            1    0
-  // repeated byte_array c       1    1
-  // optional group bag          1    0
-  //   repeated group records    2    1
-  //     required int64 item1    2    1
-  //     optional boolean item2  3    1
-  //     repeated int32 item3    3    2
-  int16_t ex_max_def_levels[6] = {0, 1, 1, 2, 3, 3};
-  int16_t ex_max_rep_levels[6] = {0, 0, 1, 1, 1, 2};
-
-  for (int i = 0; i < nleaves; ++i) {
-    const ColumnDescriptor* col = descr_.Column(i);
-    EXPECT_EQ(ex_max_def_levels[i], col->max_definition_level()) << i;
-    EXPECT_EQ(ex_max_rep_levels[i], col->max_repetition_level()) << i;
-  }
-
-  ASSERT_EQ(descr_.Column(0)->path()->ToDotString(), "a");
-  ASSERT_EQ(descr_.Column(1)->path()->ToDotString(), "b");
-  ASSERT_EQ(descr_.Column(2)->path()->ToDotString(), "c");
-  ASSERT_EQ(descr_.Column(3)->path()->ToDotString(), "bag.records.item1");
-  ASSERT_EQ(descr_.Column(4)->path()->ToDotString(), "bag.records.item2");
-  ASSERT_EQ(descr_.Column(5)->path()->ToDotString(), "bag.records.item3");
-
-  ASSERT_EQ(inta.get(), descr_.GetColumnRoot(0).get());
-  ASSERT_EQ(bag.get(), descr_.GetColumnRoot(3).get());
-  ASSERT_EQ(bag.get(), descr_.GetColumnRoot(4).get());
-  ASSERT_EQ(bag.get(), descr_.GetColumnRoot(5).get());
-
-  ASSERT_EQ(schema.get(), descr_.group_node());
-
-  // Init clears the leaves
-  descr_.Init(schema);
-  ASSERT_EQ(nleaves, descr_.num_columns());
-}
-
-}  // namespace schema
-
-}  // namespace parquet

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/13da51d3/src/parquet/schema/schema-printer-test.cc
----------------------------------------------------------------------
diff --git a/src/parquet/schema/schema-printer-test.cc b/src/parquet/schema/schema-printer-test.cc
deleted file mode 100644
index 29140f0..0000000
--- a/src/parquet/schema/schema-printer-test.cc
+++ /dev/null
@@ -1,76 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include <gtest/gtest.h>
-
-#include <iosfwd>
-#include <string>
-#include <vector>
-
-#include "parquet/schema/printer.h"
-#include "parquet/schema/types.h"
-#include "parquet/types.h"
-
-using std::string;
-using std::vector;
-
-namespace parquet {
-
-namespace schema {
-
-static std::string Print(const NodePtr& node) {
-  std::stringstream ss;
-  PrintSchema(node.get(), ss);
-  return ss.str();
-}
-
-TEST(TestSchemaPrinter, Examples) {
-  // Test schema 1
-  NodeVector fields;
-  fields.push_back(Int32("a", Repetition::REQUIRED));
-
-  // 3-level list encoding
-  NodePtr item1 = Int64("item1");
-  NodePtr item2 = Boolean("item2", Repetition::REQUIRED);
-  NodePtr list(
-      GroupNode::Make("b", Repetition::REPEATED, {item1, item2}, LogicalType::LIST));
-  NodePtr bag(GroupNode::Make("bag", Repetition::OPTIONAL, {list}));
-  fields.push_back(bag);
-
-  fields.push_back(PrimitiveNode::Make(
-      "c", Repetition::REQUIRED, Type::INT32, LogicalType::DECIMAL, -1, 3, 2));
-
-  NodePtr schema = GroupNode::Make("schema", Repetition::REPEATED, fields);
-
-  std::string result = Print(schema);
-  std::string expected = R"(message schema {
-  required int32 a;
-  optional group bag {
-    repeated group b (LIST) {
-      optional int64 item1;
-      required boolean item2;
-    }
-  }
-  required int32 c (DECIMAL(3,2));
-}
-)";
-  ASSERT_EQ(expected, result);
-}
-
-}  // namespace schema
-
-}  // namespace parquet


Mime
View raw message