parquet-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From jul...@apache.org
Subject parquet-cpp git commit: PARQUET-545: Improve API to support decimal type
Date Mon, 29 Feb 2016 18:04:10 GMT
Repository: parquet-cpp
Updated Branches:
  refs/heads/master c6e069297 -> ebb45b1e7


PARQUET-545: Improve API to support decimal type

This PR also
1) Improves scanner coverage.
2) Implements PARQUET-546
3) Adds tests for Types to string

Author: Deepak Majeti <deepak.majeti@hpe.com>

Closes #65 from majetideepak/PARQUET-545 and squashes the following commits:

df28ccc [Deepak Majeti] removed little redundant code
c5f6d85 [Deepak Majeti] improved schema type coverage
6a18c98 [Deepak Majeti] removed ColumnDescriptor Repetition API
6b450ab [Deepak Majeti] modified scanner test to remove redundant code
1eed0de [Deepak Majeti] Added tests for GroupNodes
0f0e559 [Deepak Majeti] added type to string tests
cab06d8 [Deepak Majeti] Improve coverage for types test
6ce10ce [Deepak Majeti] Tests for PARQUET-545
a643bf4 [Deepak Majeti] Added PrimitiveNode Tests
656942f [Deepak Majeti] restored some necessary checks
addf7f6 [Deepak Majeti] remove unnecessary checks
8a043c0 [Deepak Majeti] modified scanner and column descriptor
a346ca5 [Deepak Majeti] PARQUET-546
bdbcdd7 [Deepak Majeti] Extend Column Descriptor to include Decimal scale and precision
938ded5 [Deepak Majeti] Improve Scanner Coverage


Project: http://git-wip-us.apache.org/repos/asf/parquet-cpp/repo
Commit: http://git-wip-us.apache.org/repos/asf/parquet-cpp/commit/ebb45b1e
Tree: http://git-wip-us.apache.org/repos/asf/parquet-cpp/tree/ebb45b1e
Diff: http://git-wip-us.apache.org/repos/asf/parquet-cpp/diff/ebb45b1e

Branch: refs/heads/master
Commit: ebb45b1e7da49e4084602b10aee90dcdf7317768
Parents: c6e0692
Author: Deepak Majeti <deepak.majeti@hpe.com>
Authored: Mon Feb 29 10:04:00 2016 -0800
Committer: Julien Le Dem <julien@dremio.com>
Committed: Mon Feb 29 10:04:00 2016 -0800

----------------------------------------------------------------------
 src/parquet/CMakeLists.txt                   |   1 +
 src/parquet/column/scanner-test.cc           |  91 +++++++++++-----
 src/parquet/column/scanner.h                 |  18 ++--
 src/parquet/encodings/encoding-test.cc       |   4 +-
 src/parquet/schema/descriptor.cc             |  11 +-
 src/parquet/schema/descriptor.h              |  19 ++--
 src/parquet/schema/schema-descriptor-test.cc |   6 +-
 src/parquet/schema/schema-types-test.cc      |  69 ++++++++++--
 src/parquet/schema/types.cc                  | 126 +++++++++++++++++++---
 src/parquet/schema/types.h                   |  23 ++--
 src/parquet/types-test.cc                    |  82 ++++++++++++++
 src/parquet/types.h                          |  67 ++++++++++++
 12 files changed, 419 insertions(+), 98 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/ebb45b1e/src/parquet/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/src/parquet/CMakeLists.txt b/src/parquet/CMakeLists.txt
index 97547ce..8a9b860 100644
--- a/src/parquet/CMakeLists.txt
+++ b/src/parquet/CMakeLists.txt
@@ -23,4 +23,5 @@ install(FILES
   DESTINATION include/parquet)
 
 ADD_PARQUET_TEST(public-api-test)
+ADD_PARQUET_TEST(types-test)
 ADD_PARQUET_TEST(reader-test)

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/ebb45b1e/src/parquet/column/scanner-test.cc
----------------------------------------------------------------------
diff --git a/src/parquet/column/scanner-test.cc b/src/parquet/column/scanner-test.cc
index 785db08..fcaf65e 100644
--- a/src/parquet/column/scanner-test.cc
+++ b/src/parquet/column/scanner-test.cc
@@ -103,7 +103,7 @@ class TestFlatScanner : public ::testing::Test {
     TypedScanner<Type::type_num>* scanner =
       reinterpret_cast<TypedScanner<Type::type_num>* >(scanner_.get());
     T val;
-    bool is_null;
+    bool is_null = false;
     int16_t def_level;
     int16_t rep_level;
     size_t j = 0;
@@ -113,15 +113,15 @@ class TestFlatScanner : public ::testing::Test {
       if (!is_null) {
         ASSERT_EQ(values_[j++], val) << i <<"V"<< j;
       }
-      if (!d->is_required()) {
+      if (d->max_definition_level() > 0) {
         ASSERT_EQ(def_levels_[i], def_level) << i <<"D"<< j;
       }
-      if (d->is_repeated()) {
+      if (d->max_repetition_level() > 0) {
         ASSERT_EQ(rep_levels_[i], rep_level) << i <<"R"<< j;
       }
     }
     ASSERT_EQ(num_values_, j);
-    ASSERT_FALSE(scanner->HasNext());
+    ASSERT_FALSE(scanner->Next(&val, &def_level, &rep_level, &is_null));
   }
 
   void Clear() {
@@ -140,21 +140,25 @@ class TestFlatScanner : public ::testing::Test {
   }
 
   void InitDescriptors(std::shared_ptr<ColumnDescriptor>& d1,
-      std::shared_ptr<ColumnDescriptor>& d2, std::shared_ptr<ColumnDescriptor>&
d3) {
+      std::shared_ptr<ColumnDescriptor>& d2, std::shared_ptr<ColumnDescriptor>&
d3,
+      int length) {
     NodePtr type;
-    type = schema::PrimitiveNode::Make("c1", Repetition::REQUIRED, Type::type_num);
+    type = schema::PrimitiveNode::Make("c1", Repetition::REQUIRED, Type::type_num,
+       LogicalType::NONE, length);
     d1.reset(new ColumnDescriptor(type, 0, 0));
-    type = schema::PrimitiveNode::Make("c2", Repetition::OPTIONAL, Type::type_num);
+    type = schema::PrimitiveNode::Make("c2", Repetition::OPTIONAL, Type::type_num,
+       LogicalType::NONE, length);
     d2.reset(new ColumnDescriptor(type, 4, 0));
-    type = schema::PrimitiveNode::Make("c3", Repetition::REPEATED, Type::type_num);
+    type = schema::PrimitiveNode::Make("c3", Repetition::REPEATED, Type::type_num,
+       LogicalType::NONE, length);
     d3.reset(new ColumnDescriptor(type, 4, 2));
   }
 
-  void ExecuteAll(int num_pages, int num_levels, int batch_size) {
+  void ExecuteAll(int num_pages, int num_levels, int batch_size, int type_length) {
     std::shared_ptr<ColumnDescriptor> d1;
     std::shared_ptr<ColumnDescriptor> d2;
     std::shared_ptr<ColumnDescriptor> d3;
-    InitDescriptors(d1, d2, d3);
+    InitDescriptors(d1, d2, d3, type_length);
     // evaluate REQUIRED pages
     Execute(num_pages, num_levels, batch_size, d1.get());
     // evaluate OPTIONAL pages
@@ -203,42 +207,71 @@ void TestFlatScanner<FLBAType>::InitValues() {
       values_.data());
 }
 
-template<>
-void TestFlatScanner<FLBAType>::InitDescriptors(
-    std::shared_ptr<ColumnDescriptor>& d1, std::shared_ptr<ColumnDescriptor>&
d2,
-    std::shared_ptr<ColumnDescriptor>& d3) {
-  NodePtr type = schema::PrimitiveNode::MakeFLBA("c1", Repetition::REQUIRED,
-      FLBA_LENGTH, LogicalType::UTF8);
-  d1.reset(new ColumnDescriptor(type, 0, 0));
-  type = schema::PrimitiveNode::MakeFLBA("c2", Repetition::OPTIONAL,
-      FLBA_LENGTH, LogicalType::UTF8);
-  d2.reset(new ColumnDescriptor(type, 4, 0));
-  type = schema::PrimitiveNode::MakeFLBA("c3", Repetition::REPEATED,
-      FLBA_LENGTH, LogicalType::UTF8);
-  d3.reset(new ColumnDescriptor(type, 4, 2));
-}
-
 typedef TestFlatScanner<FLBAType> TestFlatFLBAScanner;
 
 static int num_levels_per_page = 100;
 static int num_pages = 20;
 static int batch_size = 32;
 
-TYPED_TEST_CASE(TestFlatScanner, ParquetTypes);
+typedef ::testing::Types<BooleanType, Int32Type, Int64Type, Int96Type,
+                         FloatType, DoubleType, ByteArrayType> TestTypes;
+
+typedef TestFlatScanner<FLBAType> TestFLBAFlatScanner;
+
+TYPED_TEST_CASE(TestFlatScanner, TestTypes);
 
 TYPED_TEST(TestFlatScanner, TestScanner) {
-  this->ExecuteAll(num_pages, num_levels_per_page, batch_size);
+  this->ExecuteAll(num_pages, num_levels_per_page, batch_size, 0);
+}
+
+TEST_F(TestFLBAFlatScanner, TestScanner) {
+  this->ExecuteAll(num_pages, num_levels_per_page, batch_size, FLBA_LENGTH);
 }
 
 //PARQUET 502
 TEST_F(TestFlatFLBAScanner, TestSmallBatch) {
-  NodePtr type = schema::PrimitiveNode::MakeFLBA("c1", Repetition::REQUIRED,
-      FLBA_LENGTH, LogicalType::UTF8);
+  NodePtr type = schema::PrimitiveNode::Make("c1", Repetition::REQUIRED,
+      Type::FIXED_LEN_BYTE_ARRAY, LogicalType::DECIMAL, FLBA_LENGTH, 10, 2);
   const ColumnDescriptor d(type, 0, 0);
   MakePages(&d, 1, 100);
   InitScanner(&d);
   CheckResults(1, &d);
 }
 
+TEST_F(TestFlatFLBAScanner, TestDescriptorAPI) {
+  NodePtr type = schema::PrimitiveNode::Make("c1", Repetition::OPTIONAL,
+      Type::FIXED_LEN_BYTE_ARRAY, LogicalType::DECIMAL, FLBA_LENGTH, 10, 2);
+  const ColumnDescriptor d(type, 4, 0);
+  MakePages(&d, 1, 100);
+  InitScanner(&d);
+  TypedScanner<FLBAType::type_num>* scanner =
+    reinterpret_cast<TypedScanner<FLBAType::type_num>* >(scanner_.get());
+  ASSERT_EQ(10, scanner->descr()->type_precision());
+  ASSERT_EQ(2, scanner->descr()->type_scale());
+  ASSERT_EQ(FLBA_LENGTH, scanner->descr()->type_length());
+}
+
+TEST_F(TestFlatFLBAScanner, TestFLBAPrinterNext) {
+  NodePtr type = schema::PrimitiveNode::Make("c1", Repetition::OPTIONAL,
+      Type::FIXED_LEN_BYTE_ARRAY, LogicalType::DECIMAL, FLBA_LENGTH, 10, 2);
+  const ColumnDescriptor d(type, 4, 0);
+  MakePages(&d, 1, 100);
+  InitScanner(&d);
+  TypedScanner<FLBAType::type_num>* scanner =
+    reinterpret_cast<TypedScanner<FLBAType::type_num>* >(scanner_.get());
+  size_t j = 0;
+  scanner->SetBatchSize(batch_size);
+  std::stringstream ss_fail;
+  for (size_t i = 0; i < num_levels_; i++) {
+    std::stringstream ss;
+    scanner->PrintNext(ss, 17);
+    std::string result = ss.str();
+    ASSERT_LE(17, result.size()) << i;
+  }
+  ASSERT_THROW(scanner->PrintNext(ss_fail, 17), ParquetException);
+}
+
+//Test for GroupNode
+
 } // namespace test
 } // namespace parquet_cpp

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/ebb45b1e/src/parquet/column/scanner.h
----------------------------------------------------------------------
diff --git a/src/parquet/column/scanner.h b/src/parquet/column/scanner.h
index f3f5719..8569a94 100644
--- a/src/parquet/column/scanner.h
+++ b/src/parquet/column/scanner.h
@@ -45,8 +45,8 @@ class Scanner {
       values_buffered_(0),
       reader_(reader) {
     // TODO: don't allocate for required fields
-    def_levels_.resize(reader->descr()->is_optional() ? batch_size_ : 0);
-    rep_levels_.resize(reader->descr()->is_repeated() ? batch_size_ : 0);
+    def_levels_.resize(descr()->max_definition_level() > 0 ? batch_size_ : 0);
+    rep_levels_.resize(descr()->max_repetition_level() > 0 ? batch_size_ : 0);
   }
 
   virtual ~Scanner() {}
@@ -114,10 +114,8 @@ class TypedScanner : public Scanner {
         return false;
       }
     }
-    *def_level = descr()->is_optional() ?
-      def_levels_[level_offset_] : descr()->max_definition_level();
-    *rep_level = descr()->is_repeated() ?
-      rep_levels_[level_offset_] : descr()->max_repetition_level();
+    *def_level = descr()->max_definition_level() > 0 ? def_levels_[level_offset_] :
0;
+    *rep_level = descr()->max_repetition_level() > 0 ? rep_levels_[level_offset_] :
0;
     level_offset_++;
     return true;
   }
@@ -172,10 +170,12 @@ class TypedScanner : public Scanner {
 
   virtual void PrintNext(std::ostream& out, int width) {
     T val;
-    bool is_null;
-
+    bool is_null = false;
     char buffer[25];
-    NextValue(&val, &is_null);
+
+    if (!NextValue(&val, &is_null)) {
+      throw ParquetException("No more values buffered");
+    }
 
     if (is_null) {
       std::string null_fmt = format_fwf<Type::BYTE_ARRAY>(width);

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/ebb45b1e/src/parquet/encodings/encoding-test.cc
----------------------------------------------------------------------
diff --git a/src/parquet/encodings/encoding-test.cc b/src/parquet/encodings/encoding-test.cc
index 10310ed..f060f96 100644
--- a/src/parquet/encodings/encoding-test.cc
+++ b/src/parquet/encodings/encoding-test.cc
@@ -134,8 +134,8 @@ std::shared_ptr<ColumnDescriptor> ExampleDescr() {
 
 template <>
 std::shared_ptr<ColumnDescriptor> ExampleDescr<FLBA>() {
-  auto node = schema::PrimitiveNode::MakeFLBA("name", Repetition::OPTIONAL,
-      flba_length, LogicalType::UTF8);
+  auto node = schema::PrimitiveNode::Make("name", Repetition::OPTIONAL,
+      Type::FIXED_LEN_BYTE_ARRAY, LogicalType::DECIMAL, flba_length, 10, 2);
   return std::make_shared<ColumnDescriptor>(node, 0, 0);
 }
 

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/ebb45b1e/src/parquet/schema/descriptor.cc
----------------------------------------------------------------------
diff --git a/src/parquet/schema/descriptor.cc b/src/parquet/schema/descriptor.cc
index 02b5060..b3fefee 100644
--- a/src/parquet/schema/descriptor.cc
+++ b/src/parquet/schema/descriptor.cc
@@ -84,10 +84,15 @@ const ColumnDescriptor* SchemaDescriptor::Column(size_t i) const {
   return &leaves_[i];
 }
 
+int ColumnDescriptor::type_scale() const {
+  return primitive_node_->decimal_metadata().scale;
+}
+
+int ColumnDescriptor::type_precision() const {
+  return primitive_node_->decimal_metadata().precision;
+}
+
 int ColumnDescriptor::type_length() const {
-  if (primitive_node_->physical_type() != Type::FIXED_LEN_BYTE_ARRAY) {
-    throw ParquetException("Not a FIXED_LEN_BYTE_ARRAY");
-  }
   return primitive_node_->type_length();
 }
 

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/ebb45b1e/src/parquet/schema/descriptor.h
----------------------------------------------------------------------
diff --git a/src/parquet/schema/descriptor.h b/src/parquet/schema/descriptor.h
index 62066ef..4c6f50d 100644
--- a/src/parquet/schema/descriptor.h
+++ b/src/parquet/schema/descriptor.h
@@ -24,7 +24,6 @@
 #include <string>
 #include <unordered_map>
 #include <vector>
-
 #include "parquet/schema/types.h"
 #include "parquet/types.h"
 
@@ -54,23 +53,19 @@ class ColumnDescriptor {
     return primitive_node_->physical_type();
   }
 
-  const std::string& name() const {
-    return primitive_node_->name();
+  LogicalType::type logical_type() const {
+    return primitive_node_->logical_type();
   }
 
-  bool is_required() const {
-    return max_definition_level_ == 0;
+  const std::string& name() const {
+    return primitive_node_->name();
   }
 
-  bool is_optional() const {
-    return max_definition_level_ > 0;
-  }
+  int type_length() const;
 
-  bool is_repeated() const {
-    return max_repetition_level_ > 0;
-  }
+  int type_precision() const;
 
-  int type_length() const;
+  int type_scale() const;
 
  private:
   schema::NodePtr node_;

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/ebb45b1e/src/parquet/schema/schema-descriptor-test.cc
----------------------------------------------------------------------
diff --git a/src/parquet/schema/schema-descriptor-test.cc b/src/parquet/schema/schema-descriptor-test.cc
index 3b1734b..519d968 100644
--- a/src/parquet/schema/schema-descriptor-test.cc
+++ b/src/parquet/schema/schema-descriptor-test.cc
@@ -46,11 +46,11 @@ TEST(TestColumnDescriptor, TestAttrs) {
 
   ASSERT_EQ(Type::BYTE_ARRAY, descr.physical_type());
 
-  ASSERT_THROW(descr.type_length(), ParquetException);
+  ASSERT_EQ(-1, descr.type_length());
 
   // Test FIXED_LEN_BYTE_ARRAY
-  node = PrimitiveNode::MakeFLBA("name", Repetition::OPTIONAL,
-      12, LogicalType::UTF8);
+  node = PrimitiveNode::Make("name", Repetition::OPTIONAL,
+      Type::FIXED_LEN_BYTE_ARRAY, LogicalType::DECIMAL, 12, 10, 4);
   descr = ColumnDescriptor(node, 4, 1);
 
   ASSERT_EQ(Type::FIXED_LEN_BYTE_ARRAY, descr.physical_type());

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/ebb45b1e/src/parquet/schema/schema-types-test.cc
----------------------------------------------------------------------
diff --git a/src/parquet/schema/schema-types-test.cc b/src/parquet/schema/schema-types-test.cc
index cac7dc5..f512f0f 100644
--- a/src/parquet/schema/schema-types-test.cc
+++ b/src/parquet/schema/schema-types-test.cc
@@ -21,6 +21,7 @@
 #include <string>
 #include <vector>
 
+#include "parquet/exception.h"
 #include "parquet/schema/test-util.h"
 #include "parquet/schema/types.h"
 #include "parquet/thrift/parquet_types.h"
@@ -130,15 +131,15 @@ TEST_F(TestPrimitiveNode, FromParquet) {
       parquet::Type::FIXED_LEN_BYTE_ARRAY);
   elt.__set_converted_type(ConvertedType::DECIMAL);
   elt.__set_type_length(6);
-  elt.__set_scale(12);
-  elt.__set_precision(2);
+  elt.__set_scale(2);
+  elt.__set_precision(12);
 
   Convert(&elt);
   ASSERT_EQ(Type::FIXED_LEN_BYTE_ARRAY, prim_node_->physical_type());
   ASSERT_EQ(LogicalType::DECIMAL, prim_node_->logical_type());
   ASSERT_EQ(6, prim_node_->type_length());
-  ASSERT_EQ(12, prim_node_->decimal_metadata().scale);
-  ASSERT_EQ(2, prim_node_->decimal_metadata().precision);
+  ASSERT_EQ(2, prim_node_->decimal_metadata().scale);
+  ASSERT_EQ(12, prim_node_->decimal_metadata().precision);
 }
 
 TEST_F(TestPrimitiveNode, Equals) {
@@ -154,17 +155,66 @@ TEST_F(TestPrimitiveNode, Equals) {
   ASSERT_FALSE(node1.Equals(&node4));
   ASSERT_TRUE(node1.Equals(&node5));
 
-  PrimitiveNode flba1("foo", Repetition::REQUIRED, Type::FIXED_LEN_BYTE_ARRAY);
-  flba1.SetTypeLength(12);
+  PrimitiveNode flba1("foo", Repetition::REQUIRED, Type::FIXED_LEN_BYTE_ARRAY,
+      LogicalType::DECIMAL, 12, 4, 2);
 
-  PrimitiveNode flba2("foo", Repetition::REQUIRED, Type::FIXED_LEN_BYTE_ARRAY);
+  PrimitiveNode flba2("foo", Repetition::REQUIRED, Type::FIXED_LEN_BYTE_ARRAY,
+      LogicalType::DECIMAL, 1, 4, 2);
   flba2.SetTypeLength(12);
 
-  PrimitiveNode flba3("foo", Repetition::REQUIRED, Type::FIXED_LEN_BYTE_ARRAY);
+  PrimitiveNode flba3("foo", Repetition::REQUIRED, Type::FIXED_LEN_BYTE_ARRAY,
+      LogicalType::DECIMAL, 1, 4, 2);
   flba3.SetTypeLength(16);
 
+  PrimitiveNode flba4("foo", Repetition::REQUIRED, Type::FIXED_LEN_BYTE_ARRAY,
+      LogicalType::DECIMAL, 12, 4, 0);
+
+  PrimitiveNode flba5("foo", Repetition::REQUIRED, Type::FIXED_LEN_BYTE_ARRAY,
+      LogicalType::NONE, 12, 4, 0);
+
   ASSERT_TRUE(flba1.Equals(&flba2));
   ASSERT_FALSE(flba1.Equals(&flba3));
+  ASSERT_FALSE(flba1.Equals(&flba4));
+  ASSERT_FALSE(flba1.Equals(&flba5));
+}
+
+TEST_F(TestPrimitiveNode, PhysicalLogicalMapping) {
+  ASSERT_NO_THROW(PrimitiveNode::Make("foo", Repetition::REQUIRED,
+        Type::INT32, LogicalType::INT_32));
+  ASSERT_NO_THROW(PrimitiveNode::Make("foo", Repetition::REQUIRED,
+        Type::BYTE_ARRAY, LogicalType::JSON));
+  ASSERT_THROW(PrimitiveNode::Make("foo", Repetition::REQUIRED,
+        Type::INT32, LogicalType::JSON), ParquetException);
+  ASSERT_NO_THROW(PrimitiveNode::Make("foo", Repetition::REQUIRED,
+        Type::INT64, LogicalType::TIMESTAMP_MILLIS));
+  ASSERT_THROW(PrimitiveNode::Make("foo", Repetition::REQUIRED,
+      Type::INT32, LogicalType::INT_64), ParquetException);
+  ASSERT_THROW(PrimitiveNode::Make("foo", Repetition::REQUIRED,
+        Type::BYTE_ARRAY, LogicalType::INT_8), ParquetException);
+  ASSERT_THROW(PrimitiveNode::Make("foo", Repetition::REQUIRED,
+        Type::BYTE_ARRAY, LogicalType::INTERVAL), ParquetException);
+  ASSERT_THROW(PrimitiveNode::Make("foo", Repetition::REQUIRED,
+      Type::FIXED_LEN_BYTE_ARRAY, LogicalType::ENUM), ParquetException);
+  ASSERT_NO_THROW(PrimitiveNode::Make("foo", Repetition::REQUIRED,
+      Type::BYTE_ARRAY, LogicalType::ENUM));
+  ASSERT_THROW(PrimitiveNode::Make("foo", Repetition::REQUIRED,
+      Type::FIXED_LEN_BYTE_ARRAY, LogicalType::DECIMAL, 0, 2, 4), ParquetException);
+  ASSERT_THROW(PrimitiveNode::Make("foo", Repetition::REQUIRED,
+      Type::FLOAT, LogicalType::DECIMAL, 0, 2, 4), ParquetException);
+  ASSERT_THROW(PrimitiveNode::Make("foo", Repetition::REQUIRED,
+      Type::FIXED_LEN_BYTE_ARRAY, LogicalType::DECIMAL, 0, 4, 0), ParquetException);
+  ASSERT_THROW(PrimitiveNode::Make("foo", Repetition::REQUIRED,
+      Type::FIXED_LEN_BYTE_ARRAY, LogicalType::DECIMAL, 10, 0, 4), ParquetException);
+  ASSERT_THROW(PrimitiveNode::Make("foo", Repetition::REQUIRED,
+      Type::FIXED_LEN_BYTE_ARRAY, LogicalType::DECIMAL, 10, 4, -1), ParquetException);
+  ASSERT_THROW(PrimitiveNode::Make("foo", Repetition::REQUIRED,
+      Type::FIXED_LEN_BYTE_ARRAY, LogicalType::DECIMAL, 10, 2, 4), ParquetException);
+  ASSERT_NO_THROW(PrimitiveNode::Make("foo", Repetition::REQUIRED,
+      Type::FIXED_LEN_BYTE_ARRAY, LogicalType::DECIMAL, 10, 6, 4));
+  ASSERT_NO_THROW(PrimitiveNode::Make("foo", Repetition::REQUIRED,
+      Type::FIXED_LEN_BYTE_ARRAY, LogicalType::INTERVAL, 12));
+  ASSERT_THROW(PrimitiveNode::Make("foo", Repetition::REQUIRED,
+      Type::FIXED_LEN_BYTE_ARRAY, LogicalType::INTERVAL, 10), ParquetException);
 }
 
 // ----------------------------------------------------------------------
@@ -220,11 +270,14 @@ TEST_F(TestGroupNode, Equals) {
   // This is copied in the GroupNode ctor, so this is okay
   f2.push_back(Float("four", Repetition::OPTIONAL));
   GroupNode group4("group", Repetition::REPEATED, f2);
+  GroupNode group5("group", Repetition::REPEATED, Fields1());
 
+  ASSERT_TRUE(group1.Equals(&group1));
   ASSERT_TRUE(group1.Equals(&group2));
   ASSERT_FALSE(group1.Equals(&group3));
 
   ASSERT_FALSE(group1.Equals(&group4));
+  ASSERT_FALSE(group5.Equals(&group4));
 }
 
 } // namespace schema

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/ebb45b1e/src/parquet/schema/types.cc
----------------------------------------------------------------------
diff --git a/src/parquet/schema/types.cc b/src/parquet/schema/types.cc
index fae7c84..b57fd37 100644
--- a/src/parquet/schema/types.cc
+++ b/src/parquet/schema/types.cc
@@ -40,17 +40,117 @@ bool Node::EqualsInternal(const Node* other) const {
 // ----------------------------------------------------------------------
 // Primitive node
 
+PrimitiveNode::PrimitiveNode(const std::string& name, Repetition::type repetition,
+    Type::type type, LogicalType::type logical_type,
+    int length, int precision, int scale, int id) :
+  Node(Node::PRIMITIVE, name, repetition, logical_type, id),
+  physical_type_(type), type_length_(length) {
+  std::stringstream ss;
+  // Check if the physical and logical types match
+  // Mapping referred from Apache parquet-mr as on 2016-02-22
+  switch (logical_type) {
+    case LogicalType::NONE:
+      // Logical type not set
+      // Clients should be able to read these values
+      decimal_metadata_.precision = precision;
+      decimal_metadata_.scale = scale;
+      break;
+    case LogicalType::UTF8:
+    case LogicalType::JSON:
+    case LogicalType::BSON:
+      if (type != Type::BYTE_ARRAY) {
+        ss << logical_type_to_string(logical_type);
+        ss << " can only annotate BYTE_ARRAY fields";
+        throw ParquetException(ss.str());
+      }
+      break;
+    case LogicalType::DECIMAL:
+      if ((type != Type::INT32) &&
+            (type != Type::INT64) &&
+            (type != Type::BYTE_ARRAY) &&
+            (type != Type::FIXED_LEN_BYTE_ARRAY)) {
+        ss << "DECIMAL can only annotate INT32, INT64, BYTE_ARRAY, and FIXED";
+        throw ParquetException(ss.str());
+      }
+      if (precision <= 0) {
+        ss << "Invalid DECIMAL precision: " << precision;
+        throw ParquetException(ss.str());
+      }
+      if (scale < 0) {
+        ss << "Invalid DECIMAL scale: " << scale;
+        throw ParquetException(ss.str());
+      }
+      if (scale > precision) {
+        ss << "Invalid DECIMAL scale " << scale;
+        ss << " cannot be greater than precision " << precision;
+        throw ParquetException(ss.str());
+      }
+      decimal_metadata_.precision = precision;
+      decimal_metadata_.scale = scale;
+      break;
+    case LogicalType::DATE:
+    case LogicalType::TIME_MILLIS:
+    case LogicalType::UINT_8:
+    case LogicalType::UINT_16:
+    case LogicalType::UINT_32:
+    case LogicalType::INT_8:
+    case LogicalType::INT_16:
+    case LogicalType::INT_32:
+      if (type != Type::INT32) {
+        ss << logical_type_to_string(logical_type);
+        ss << " can only annotate INT32";
+        throw ParquetException(ss.str());
+      }
+      break;
+    case LogicalType::TIMESTAMP_MILLIS:
+    case LogicalType::UINT_64:
+    case LogicalType::INT_64:
+      if (type != Type::INT64) {
+        ss << logical_type_to_string(logical_type);
+        ss << " can only annotate INT64";
+        throw ParquetException(ss.str());
+      }
+      break;
+    case LogicalType::INTERVAL:
+      if ((type != Type::FIXED_LEN_BYTE_ARRAY) || (length != 12)) {
+        ss << "INTERVAL can only annotate FIXED_LEN_BYTE_ARRAY(12)";
+        throw ParquetException(ss.str());
+      }
+      break;
+    case LogicalType::ENUM:
+      if (type != Type::BYTE_ARRAY) {
+        ss << "ENUM can only annotate BYTE_ARRAY fields";
+        throw ParquetException(ss.str());
+      }
+      break;
+    default:
+      ss << logical_type_to_string(logical_type);
+      ss << " can not be applied to a primitive type";
+      throw ParquetException(ss.str());
+  }
+  if (type == Type::FIXED_LEN_BYTE_ARRAY) {
+    if (length <= 0) {
+      ss << "Invalid FIXED_LEN_BYTE_ARRAY length: " << length;
+      throw ParquetException(ss.str());
+    }
+    type_length_ = length;
+  }
+}
+
 bool PrimitiveNode::EqualsInternal(const PrimitiveNode* other) const {
-  if (physical_type_ != other->physical_type_) {
-    return false;
-  } else if (logical_type_ == LogicalType::DECIMAL) {
-    // TODO(wesm): metadata
-    ParquetException::NYI("comparing decimals");
+  bool is_equal = true;
+  if ((physical_type_ != other->physical_type_) ||
+      (logical_type_ != other->logical_type_)) {
     return false;
-  } else if (physical_type_ == Type::FIXED_LEN_BYTE_ARRAY) {
-    return type_length_ == other->type_length_;
   }
-  return true;
+  if (logical_type_ == LogicalType::DECIMAL) {
+    is_equal &= (decimal_metadata_.precision == other->decimal_metadata_.precision)
&&
+      (decimal_metadata_.scale == other->decimal_metadata_.scale);
+  }
+  if (physical_type_ == Type::FIXED_LEN_BYTE_ARRAY) {
+    is_equal &= (type_length_ == other->type_length_);
+  }
+  return is_equal;
 }
 
 bool PrimitiveNode::Equals(const Node* other) const {
@@ -134,14 +234,8 @@ std::unique_ptr<Node> PrimitiveNode::FromParquet(const void* opaque_element,
 
   std::unique_ptr<PrimitiveNode> result = std::unique_ptr<PrimitiveNode>(
       new PrimitiveNode(params.name, params.repetition,
-          FromThrift(element->type), params.logical_type, node_id));
-
-  if (element->type == parquet::Type::FIXED_LEN_BYTE_ARRAY) {
-    result->SetTypeLength(element->type_length);
-    if (params.logical_type == LogicalType::DECIMAL) {
-      result->SetDecimalMetadata(element->scale, element->precision);
-    }
-  }
+          FromThrift(element->type), params.logical_type,
+          element->type_length, element->precision, element->scale, node_id));
 
   // Return as unique_ptr to the base type
   return std::unique_ptr<Node>(result.release());

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/ebb45b1e/src/parquet/schema/types.h
----------------------------------------------------------------------
diff --git a/src/parquet/schema/types.h b/src/parquet/schema/types.h
index e76323f..2eaee8b 100644
--- a/src/parquet/schema/types.h
+++ b/src/parquet/schema/types.h
@@ -177,17 +177,10 @@ class PrimitiveNode : public Node {
 
   static inline NodePtr Make(const std::string& name,
       Repetition::type repetition, Type::type type,
-      LogicalType::type logical_type = LogicalType::NONE) {
-    return NodePtr(new PrimitiveNode(name, repetition, type, logical_type));
-  }
-
-  // Alternate constructor for FIXED_LEN_BYTE_ARRAY (FLBA)
-  static inline NodePtr MakeFLBA(const std::string& name,
-      Repetition::type repetition, int32_t type_length,
-      LogicalType::type logical_type = LogicalType::NONE) {
-    NodePtr result = Make(name, repetition, Type::FIXED_LEN_BYTE_ARRAY, logical_type);
-    static_cast<PrimitiveNode*>(result.get())->SetTypeLength(type_length);
-    return result;
+      LogicalType::type logical_type = LogicalType::NONE,
+      int length = -1, int precision = -1, int scale = -1) {
+    return NodePtr(new PrimitiveNode(name, repetition, type, logical_type,
+          length, precision, scale));
   }
 
   virtual bool Equals(const Node* other) const;
@@ -208,11 +201,8 @@ class PrimitiveNode : public Node {
 
  private:
   PrimitiveNode(const std::string& name, Repetition::type repetition,
-      Type::type type,
-      LogicalType::type logical_type = LogicalType::NONE,
-      int id = -1) :
-      Node(Node::PRIMITIVE, name, repetition, logical_type, id),
-      physical_type_(type) {}
+      Type::type type, LogicalType::type logical_type = LogicalType::NONE,
+      int length = -1, int precision = -1, int scale = -1, int id = -1);
 
   Type::type physical_type_;
   int32_t type_length_;
@@ -234,6 +224,7 @@ class PrimitiveNode : public Node {
 
   FRIEND_TEST(TestPrimitiveNode, Attrs);
   FRIEND_TEST(TestPrimitiveNode, Equals);
+  FRIEND_TEST(TestPrimitiveNode, PhysicalLogicalMapping);
   FRIEND_TEST(TestPrimitiveNode, FromParquet);
 };
 

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/ebb45b1e/src/parquet/types-test.cc
----------------------------------------------------------------------
diff --git a/src/parquet/types-test.cc b/src/parquet/types-test.cc
new file mode 100644
index 0000000..b1bfacd
--- /dev/null
+++ b/src/parquet/types-test.cc
@@ -0,0 +1,82 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <gtest/gtest.h>
+
+#include <string>
+
+#include "parquet/types.h"
+
+namespace parquet_cpp {
+
+TEST(TestTypeToString, PhysicalTypes) {
+  ASSERT_STREQ("BOOLEAN", type_to_string(Type::BOOLEAN).c_str());
+  ASSERT_STREQ("INT32", type_to_string(Type::INT32).c_str());
+  ASSERT_STREQ("INT64", type_to_string(Type::INT64).c_str());
+  ASSERT_STREQ("INT96", type_to_string(Type::INT96).c_str());
+  ASSERT_STREQ("FLOAT", type_to_string(Type::FLOAT).c_str());
+  ASSERT_STREQ("DOUBLE", type_to_string(Type::DOUBLE).c_str());
+  ASSERT_STREQ("BYTE_ARRAY", type_to_string(Type::BYTE_ARRAY).c_str());
+  ASSERT_STREQ("FIXED_LEN_BYTE_ARRAY",
+      type_to_string(Type::FIXED_LEN_BYTE_ARRAY).c_str());
+}
+
+TEST(TestLogicalTypeToString, LogicalTypes) {
+  ASSERT_STREQ("NONE",
+      logical_type_to_string(LogicalType::NONE).c_str());
+  ASSERT_STREQ("UTF8",
+      logical_type_to_string(LogicalType::UTF8).c_str());
+  ASSERT_STREQ("MAP_KEY_VALUE",
+      logical_type_to_string(LogicalType::MAP_KEY_VALUE).c_str());
+  ASSERT_STREQ("LIST",
+      logical_type_to_string(LogicalType::LIST).c_str());
+  ASSERT_STREQ("ENUM",
+      logical_type_to_string(LogicalType::ENUM).c_str());
+  ASSERT_STREQ("DECIMAL",
+      logical_type_to_string(LogicalType::DECIMAL).c_str());
+  ASSERT_STREQ("DATE",
+      logical_type_to_string(LogicalType::DATE).c_str());
+  ASSERT_STREQ("TIME_MILLIS",
+      logical_type_to_string(LogicalType::TIME_MILLIS).c_str());
+  ASSERT_STREQ("TIMESTAMP_MILLIS",
+      logical_type_to_string(LogicalType::TIMESTAMP_MILLIS).c_str());
+  ASSERT_STREQ("UINT_8",
+      logical_type_to_string(LogicalType::UINT_8).c_str());
+  ASSERT_STREQ("UINT_16",
+      logical_type_to_string(LogicalType::UINT_16).c_str());
+  ASSERT_STREQ("UINT_32",
+      logical_type_to_string(LogicalType::UINT_32).c_str());
+  ASSERT_STREQ("UINT_64",
+      logical_type_to_string(LogicalType::UINT_64).c_str());
+  ASSERT_STREQ("INT_8",
+      logical_type_to_string(LogicalType::INT_8).c_str());
+  ASSERT_STREQ("INT_16",
+      logical_type_to_string(LogicalType::INT_16).c_str());
+  ASSERT_STREQ("INT_32",
+      logical_type_to_string(LogicalType::INT_32).c_str());
+  ASSERT_STREQ("INT_64",
+      logical_type_to_string(LogicalType::INT_64).c_str());
+  ASSERT_STREQ("JSON",
+      logical_type_to_string(LogicalType::JSON).c_str());
+  ASSERT_STREQ("BSON",
+      logical_type_to_string(LogicalType::BSON).c_str());
+  ASSERT_STREQ("INTERVAL",
+      logical_type_to_string(LogicalType::INTERVAL).c_str());
+}
+
+
+} // namespace parquet_cpp

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/ebb45b1e/src/parquet/types.h
----------------------------------------------------------------------
diff --git a/src/parquet/types.h b/src/parquet/types.h
index f59f6a9..e29f11c 100644
--- a/src/parquet/types.h
+++ b/src/parquet/types.h
@@ -316,6 +316,73 @@ static inline std::string type_to_string(Type::type t) {
   }
 }
 
+static inline std::string logical_type_to_string(LogicalType::type t) {
+  switch (t) {
+    case LogicalType::NONE:
+      return "NONE";
+      break;
+    case LogicalType::UTF8:
+      return "UTF8";
+      break;
+    case LogicalType::MAP_KEY_VALUE:
+      return "MAP_KEY_VALUE";
+      break;
+    case LogicalType::LIST:
+      return "LIST";
+      break;
+    case LogicalType::ENUM:
+      return "ENUM";
+      break;
+    case LogicalType::DECIMAL:
+      return "DECIMAL";
+      break;
+    case LogicalType::DATE:
+      return "DATE";
+      break;
+    case LogicalType::TIME_MILLIS:
+      return "TIME_MILLIS";
+      break;
+    case LogicalType::TIMESTAMP_MILLIS:
+      return "TIMESTAMP_MILLIS";
+      break;
+    case LogicalType::UINT_8:
+      return "UINT_8";
+      break;
+    case LogicalType::UINT_16:
+      return "UINT_16";
+      break;
+    case LogicalType::UINT_32:
+      return "UINT_32";
+      break;
+    case LogicalType::UINT_64:
+      return "UINT_64";
+      break;
+    case LogicalType::INT_8:
+      return "INT_8";
+      break;
+    case LogicalType::INT_16:
+      return "INT_16";
+      break;
+    case LogicalType::INT_32:
+      return "INT_32";
+      break;
+    case LogicalType::INT_64:
+      return "INT_64";
+      break;
+    case LogicalType::JSON:
+      return "JSON";
+      break;
+    case LogicalType::BSON:
+      return "BSON";
+      break;
+    case LogicalType::INTERVAL:
+      return "INTERVAL";
+      break;
+    default:
+      return "UNKNOWN";
+      break;
+  }
+}
 } // namespace parquet_cpp
 
 #endif // PARQUET_TYPES_H


Mime
View raw message