parquet-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From u..@apache.org
Subject [parquet-cpp] branch master updated: PARQUET-1193: [CPP] Implement ColumnOrder to support min_value and max_value
Date Wed, 24 Jan 2018 22:20:52 GMT
This is an automated email from the ASF dual-hosted git repository.

uwe pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/parquet-cpp.git


The following commit(s) were added to refs/heads/master by this push:
     new 62de4b1  PARQUET-1193: [CPP] Implement ColumnOrder to support min_value and max_value
62de4b1 is described below

commit 62de4b13f42beb5fd84d673b55208fb737749d9a
Author: Deepak Majeti <deepak.majeti@hpe.com>
AuthorDate: Wed Jan 24 23:20:42 2018 +0100

    PARQUET-1193: [CPP] Implement ColumnOrder to support min_value and max_value
    
    Changes:
    
    1. Update parquet.thrift format
    2. Add ColumnOrder Implementation
    3. Make Int96 sort order UNKNOWN
    
    Author: Deepak Majeti <deepak.majeti@hpe.com>
    
    Closes #430 from majetideepak/PARQUET-1193 and squashes the following commits:
    
    d31df36 [Deepak Majeti] Fix unused function Warning
    4ed405f [Deepak Majeti] Add comments
    dec58ca [Deepak Majeti] clang-format
    2cd9f11 [Deepak Majeti] Make Int96 sort order UNKNOWN
    ff41b3c [Deepak Majeti] Add ColumnOrder Implementation
    6221cba [Deepak Majeti] Pull updated parquet.thrift format
---
 src/parquet/column_writer-test.cc |  18 +++
 src/parquet/metadata-test.cc      |   2 +-
 src/parquet/metadata.cc           |  47 +++++-
 src/parquet/parquet.thrift        | 295 +++++++++++++++++++++++++++++++++-----
 src/parquet/schema.cc             |  31 ++++
 src/parquet/schema.h              |  11 ++
 src/parquet/statistics-test.cc    |  30 +---
 src/parquet/test-specialization.h |  13 +-
 src/parquet/types.cc              |   6 +-
 src/parquet/types.h               |  15 ++
 src/parquet/util/comparison.cc    |   4 -
 11 files changed, 396 insertions(+), 76 deletions(-)

diff --git a/src/parquet/column_writer-test.cc b/src/parquet/column_writer-test.cc
index 7e5dc82..224a23d 100644
--- a/src/parquet/column_writer-test.cc
+++ b/src/parquet/column_writer-test.cc
@@ -219,6 +219,24 @@ void TestPrimitiveWriter<TestType>::ReadColumnFully(Compression::type compressio
 }
 
 template <>
+void TestPrimitiveWriter<Int96Type>::ReadAndCompare(Compression::type compression,
+                                                    int64_t num_rows) {
+  this->SetupValuesOut(num_rows);
+  this->ReadColumnFully(compression);
+  std::shared_ptr<CompareDefault<Int96Type>> compare;
+  compare = std::make_shared<CompareDefaultInt96>();
+  for (size_t i = 0; i < this->values_.size(); i++) {
+    if ((*compare)(this->values_[i], this->values_out_[i]) ||
+        (*compare)(this->values_out_[i], this->values_[i])) {
+      std::cout << "Failed at " << i << std::endl;
+    }
+    ASSERT_FALSE((*compare)(this->values_[i], this->values_out_[i]));
+    ASSERT_FALSE((*compare)(this->values_out_[i], this->values_[i]));
+  }
+  ASSERT_EQ(this->values_, this->values_out_);
+}
+
+template <>
 void TestPrimitiveWriter<FLBAType>::ReadColumnFully(Compression::type compression) {
   int64_t total_values = static_cast<int64_t>(this->values_out_.size());
   BuildReader(total_values, compression);
diff --git a/src/parquet/metadata-test.cc b/src/parquet/metadata-test.cc
index b20293b..53653bd 100644
--- a/src/parquet/metadata-test.cc
+++ b/src/parquet/metadata-test.cc
@@ -219,7 +219,7 @@ TEST(ApplicationVersion, Basics) {
 
   ASSERT_EQ(true, version.VersionLt(version1));
 
-  ASSERT_FALSE(version1.HasCorrectStatistics(Type::INT96, SortOrder::SIGNED));
+  ASSERT_FALSE(version1.HasCorrectStatistics(Type::INT96, SortOrder::UNKNOWN));
   ASSERT_TRUE(version.HasCorrectStatistics(Type::INT32, SortOrder::SIGNED));
   ASSERT_FALSE(version.HasCorrectStatistics(Type::BYTE_ARRAY, SortOrder::SIGNED));
   ASSERT_TRUE(version1.HasCorrectStatistics(Type::BYTE_ARRAY, SortOrder::SIGNED));
diff --git a/src/parquet/metadata.cc b/src/parquet/metadata.cc
index 1c7db86..91304cf 100644
--- a/src/parquet/metadata.cc
+++ b/src/parquet/metadata.cc
@@ -41,8 +41,8 @@ const ApplicationVersion ApplicationVersion::PARQUET_CPP_FIXED_STATS_VERSION =
 template <typename DType>
 static std::shared_ptr<RowGroupStatistics> MakeTypedColumnStats(
     const format::ColumnMetaData& metadata, const ColumnDescriptor* descr) {
-  // If new fields max_value/min_value are set, then return them.
-  if (metadata.statistics.__isset.max_value || metadata.statistics.__isset.min_value) {
+  // If ColumnOrder is defined, return max_value and min_value
+  if (descr->column_order().get_order() == ColumnOrder::TYPE_DEFINED_ORDER) {
     return std::make_shared<TypedRowGroupStatistics<DType>>(
         descr, metadata.statistics.min_value, metadata.statistics.max_value,
         metadata.num_values - metadata.statistics.null_count,
@@ -310,6 +310,7 @@ class FileMetaData::FileMetaDataImpl {
     }
 
     InitSchema();
+    InitColumnOrders();
     InitKeyValueMetadata();
   }
   ~FileMetaDataImpl() {}
@@ -357,6 +358,23 @@ class FileMetaData::FileMetaDataImpl {
                                           static_cast<int>(metadata_->schema.size()));
     schema_.Init(converter.Convert());
   }
+  void InitColumnOrders() {
+    // update ColumnOrder
+    std::vector<parquet::ColumnOrder> column_orders;
+    if (metadata_->__isset.column_orders) {
+      for (auto column_order : metadata_->column_orders) {
+        if (column_order.__isset.TYPE_ORDER) {
+          column_orders.push_back(ColumnOrder::type_defined_);
+        } else {
+          column_orders.push_back(ColumnOrder::undefined_);
+        }
+      }
+    } else {
+      column_orders.resize(schema_.num_columns(), ColumnOrder::undefined_);
+    }
+
+    schema_.updateColumnOrders(column_orders);
+  }
   SchemaDescriptor schema_;
   ApplicationVersion writer_version_;
 
@@ -495,10 +513,9 @@ bool ApplicationVersion::HasCorrectStatistics(Type::type col_type,
   // Parquet cpp version 1.3.0 onwards stats are computed correctly for all types
   if ((application_ != "parquet-cpp") || (VersionLt(PARQUET_CPP_FIXED_STATS_VERSION))) {
     // Only SIGNED are valid
-    if (SortOrder::SIGNED != sort_order) return false;
-
-    // None of the current tools write INT96 Statistics correctly
-    if (col_type == Type::INT96) return false;
+    if (SortOrder::SIGNED != sort_order) {
+      return false;
+    }
 
     // Statistics of other types are OK
     if (col_type != Type::FIXED_LEN_BYTE_ARRAY && col_type != Type::BYTE_ARRAY) {
@@ -511,6 +528,11 @@ bool ApplicationVersion::HasCorrectStatistics(Type::type col_type,
     return true;
   }
 
+  // Unknown sort order has incorrect stats
+  if (SortOrder::UNKNOWN == sort_order) {
+    return false;
+  }
+
   // PARQUET-251
   if (VersionLt(PARQUET_251_FIXED_VERSION)) {
     return false;
@@ -808,6 +830,19 @@ class FileMetaDataBuilder::FileMetaDataBuilderImpl {
     }
     metadata_->__set_version(file_version);
     metadata_->__set_created_by(properties_->created_by());
+
+    // Users cannot set the `ColumnOrder` since we do not have user defined sort order
+    // in the spec yet.
+    // We always default to `TYPE_DEFINED_ORDER`. We can expose it in
+    // the API once we have user defined sort orders in the Parquet format.
+    // TypeDefinedOrder implies choose SortOrder based on LogicalType/PhysicalType
+    format::TypeDefinedOrder type_defined_order;
+    format::ColumnOrder column_order;
+    column_order.__set_TYPE_ORDER(type_defined_order);
+    column_order.__isset.TYPE_ORDER = true;
+    metadata_->column_orders.resize(schema_->num_columns(), column_order);
+    metadata_->__isset.column_orders = true;
+
     parquet::schema::SchemaFlattener flattener(
         static_cast<parquet::schema::GroupNode*>(schema_->schema_root().get()),
         &metadata_->schema);
diff --git a/src/parquet/parquet.thrift b/src/parquet/parquet.thrift
index a72ef2c..cfcc1fe 100644
--- a/src/parquet/parquet.thrift
+++ b/src/parquet/parquet.thrift
@@ -30,17 +30,6 @@ namespace java org.apache.parquet.format
  * with the encodings to control the on disk storage format.
  * For example INT16 is not included as a type since a good encoding of INT32
  * would handle this.
- *
- * When a logical type is not present, the type-defined sort order of these
- * physical types are:
- * * BOOLEAN - false, true
- * * INT32 - signed comparison
- * * INT64 - signed comparison
- * * INT96 - signed comparison
- * * FLOAT - signed comparison
- * * DOUBLE - signed comparison
- * * BYTE_ARRAY - unsigned byte-wise comparison
- * * FIXED_LEN_BYTE_ARRAY - unsigned byte-wise comparison
  */
 enum Type {
   BOOLEAN = 0;
@@ -187,14 +176,6 @@ enum ConvertedType {
    * particular timezone or date.
    */
   INTERVAL = 21;
-
-  /**
-   * Annotates a column that is always null
-   * Sometimes when discovering the schema of existing data
-   * values are always null
-   * This is NULL in parquet-format
-   */
-  NA = 25;
 }
 
 /**
@@ -222,12 +203,12 @@ struct Statistics {
     * Values are encoded using PLAIN encoding, except that variable-length byte
     * arrays do not include a length prefix.
     *
-    * These fields encode min and max values determined by SIGNED comparison
+    * These fields encode min and max values determined by signed comparison
     * only. New files should use the correct order for a column's logical type
     * and store the values in the min_value and max_value fields.
     *
     * To support older readers, these may be set when the column order is
-    * SIGNED.
+    * signed.
     */
    1: optional binary max;
    2: optional binary min;
@@ -245,6 +226,115 @@ struct Statistics {
    6: optional binary min_value;
 }
 
+/** Empty structs to use as logical type annotations */
+struct StringType {}  // allowed for BINARY, must be encoded with UTF-8
+struct UUIDType {}    // allowed for FIXED[16], must encode raw UUID bytes
+struct MapType {}     // see LogicalTypes.md
+struct ListType {}    // see LogicalTypes.md
+struct EnumType {}    // allowed for BINARY, must be encoded with UTF-8
+struct DateType {}    // allowed for INT32
+
+/**
+ * Logical type to annotate a column that is always null.
+ *
+ * Sometimes when discovering the schema of existing data, values are always
+ * null and the physical type can't be determined. This annotation signals
+ * the case where the physical type was guessed from all null values.
+ */
+struct NullType {}    // allowed for any physical type, only null values stored
+
+/**
+ * Decimal logical type annotation
+ *
+ * To maintain forward-compatibility in v1, implementations using this logical
+ * type must also set scale and precision on the annotated SchemaElement.
+ *
+ * Allowed for physical types: INT32, INT64, FIXED, and BINARY
+ */
+struct DecimalType {
+  1: required i32 scale
+  2: required i32 precision
+}
+
+/** Time units for logical types */
+struct MilliSeconds {}
+struct MicroSeconds {}
+union TimeUnit {
+  1: MilliSeconds MILLIS
+  2: MicroSeconds MICROS
+}
+
+/**
+ * Timestamp logical type annotation
+ *
+ * Allowed for physical types: INT64
+ */
+struct TimestampType {
+  1: required bool isAdjustedToUTC
+  2: required TimeUnit unit
+}
+
+/**
+ * Time logical type annotation
+ *
+ * Allowed for physical types: INT32 (millis), INT64 (micros)
+ */
+struct TimeType {
+  1: required bool isAdjustedToUTC
+  2: required TimeUnit unit
+}
+
+/**
+ * Integer logical type annotation
+ *
+ * bitWidth must be 8, 16, 32, or 64.
+ *
+ * Allowed for physical types: INT32, INT64
+ */
+struct IntType {
+  1: required byte bitWidth
+  2: required bool isSigned
+}
+
+/**
+ * Embedded JSON logical type annotation
+ *
+ * Allowed for physical types: BINARY
+ */
+struct JsonType {
+}
+
+/**
+ * Embedded BSON logical type annotation
+ *
+ * Allowed for physical types: BINARY
+ */
+struct BsonType {
+}
+
+/**
+ * LogicalType annotations to replace ConvertedType.
+ *
+ * To maintain compatibility, implementations using LogicalType for a
+ * SchemaElement must also set the corresponding ConvertedType from the
+ * following table.
+ */
+union LogicalType {
+  1:  StringType STRING       // use ConvertedType UTF8 if encoding is UTF-8
+  2:  MapType MAP             // use ConvertedType MAP
+  3:  ListType LIST           // use ConvertedType LIST
+  4:  EnumType ENUM           // use ConvertedType ENUM
+  5:  DecimalType DECIMAL     // use ConvertedType DECIMAL
+  6:  DateType DATE           // use ConvertedType DATE
+  7:  TimeType TIME           // use ConvertedType TIME_MICROS or TIME_MILLIS
+  8:  TimestampType TIMESTAMP // use ConvertedType TIMESTAMP_MICROS or TIMESTAMP_MILLIS
+  // 9: reserved for INTERVAL
+  10: IntType INTEGER         // use ConvertedType INT_* or UINT_*
+  11: NullType UNKNOWN        // no compatible ConvertedType
+  12: JsonType JSON           // use ConvertedType JSON
+  13: BsonType BSON           // use ConvertedType BSON
+}
+
 /**
  * Represents a element inside a schema definition.
  *  - if it is a group (inner node) then type is undefined and num_children is defined
@@ -292,6 +382,13 @@ struct SchemaElement {
    */
   9: optional i32 field_id;
 
+  /**
+   * The logical type of this SchemaElement; only valid for primitives.
+   *
+   * LogicalType replaces ConvertedType, but ConvertedType is still required
+   * for some logical types to ensure forward-compatibility in format v1.
+   */
+  10: optional LogicalType logicalType
 }
 
 /**
@@ -324,7 +421,7 @@ enum Encoding {
    */
   PLAIN_DICTIONARY = 2;
 
-  /** Group packed run length encoding. Usable for definition/reptition levels
+  /** Group packed run length encoding. Usable for definition/repetition levels
    * encoding and Booleans (on one bit: 0 is false; 1 is true.)
    */
   RLE = 3;
@@ -356,15 +453,20 @@ enum Encoding {
 
 /**
  * Supported compression algorithms.
+ *
+ * Codecs added in 2.3.2 can be read by readers based on 2.3.2 and later.
+ * Codec support may vary between readers based on the format version and
+ * libraries available at runtime. Gzip, Snappy, and LZ4 codecs are
+ * widely available, while Zstd and Brotli require additional libraries.
  */
 enum CompressionCodec {
   UNCOMPRESSED = 0;
   SNAPPY = 1;
   GZIP = 2;
   LZO = 3;
-  BROTLI = 4;
-  LZ4 = 5;
-  ZSTD = 6;
+  BROTLI = 4; // Added in 2.3.2
+  LZ4 = 5;    // Added in 2.3.2
+  ZSTD = 6;   // Added in 2.3.2
 }
 
 enum PageType {
@@ -374,6 +476,16 @@ enum PageType {
   DATA_PAGE_V2 = 3;
 }
 
+/**
+ * Enum to annotate whether lists of min/max elements inside ColumnIndex
+ * are ordered and if so, in which direction.
+ */
+enum BoundaryOrder {
+  UNORDERED = 0;
+  ASCENDING = 1;
+  DESCENDING = 2;
+}
+
 /** Data page header */
 struct DataPageHeader {
   /** Number of values, including NULLs, in this data page. **/
@@ -408,7 +520,7 @@ struct DictionaryPageHeader {
 }
 
 /**
- * New page format alowing reading levels without decompressing the data
+ * New page format allowing reading levels without decompressing the data
  * Repetition and definition levels are uncompressed
  * The remaining section containing the data is compressed if is_compressed is true
  **/
@@ -425,9 +537,9 @@ struct DataPageHeaderV2 {
 
   // repetition levels and definition levels are always using RLE (without size in it)
 
-  /** length of the repetition levels */
-  5: required i32 definition_levels_byte_length;
   /** length of the definition levels */
+  5: required i32 definition_levels_byte_length;
+  /** length of the repetition levels */
   6: required i32 repetition_levels_byte_length;
 
   /**  whether the values are compressed.
@@ -563,6 +675,18 @@ struct ColumnChunk {
    * metadata.
    **/
   3: optional ColumnMetaData meta_data
+
+  /** File offset of ColumnChunk's OffsetIndex **/
+  4: optional i64 offset_index_offset
+
+  /** Size of ColumnChunk's OffsetIndex, in bytes **/
+  5: optional i32 offset_index_length
+
+  /** File offset of ColumnChunk's ColumnIndex **/
+  6: optional i64 column_index_offset
+
+  /** Size of ColumnChunk's ColumnIndex, in bytes **/
+  7: optional i32 column_index_length
 }
 
 struct RowGroup {
@@ -587,7 +711,9 @@ struct RowGroup {
 struct TypeDefinedOrder {}
 
 /**
- * Union to specify the order used for min, max, and sorting values in a column.
+ * Union to specify the order used for the min_value and max_value fields for a
+ * column. This union takes the role of an enhanced enum that allows rich
+ * elements (which will be needed for a collation-based ordering in the future).
  *
  * Possible values are:
  * * TypeDefinedOrder - the column uses the order defined by its logical or
@@ -597,9 +723,107 @@ struct TypeDefinedOrder {}
  * for this column should be ignored.
  */
 union ColumnOrder {
+
+  /**
+   * The sort orders for logical types are:
+   *   UTF8 - unsigned byte-wise comparison
+   *   INT8 - signed comparison
+   *   INT16 - signed comparison
+   *   INT32 - signed comparison
+   *   INT64 - signed comparison
+   *   UINT8 - unsigned comparison
+   *   UINT16 - unsigned comparison
+   *   UINT32 - unsigned comparison
+   *   UINT64 - unsigned comparison
+   *   DECIMAL - signed comparison of the represented value
+   *   DATE - signed comparison
+   *   TIME_MILLIS - signed comparison
+   *   TIME_MICROS - signed comparison
+   *   TIMESTAMP_MILLIS - signed comparison
+   *   TIMESTAMP_MICROS - signed comparison
+   *   INTERVAL - unsigned comparison
+   *   JSON - unsigned byte-wise comparison
+   *   BSON - unsigned byte-wise comparison
+   *   ENUM - unsigned byte-wise comparison
+   *   LIST - undefined
+   *   MAP - undefined
+   *
+   * In the absence of logical types, the sort order is determined by the physical type:
+   *   BOOLEAN - false, true
+   *   INT32 - signed comparison
+   *   INT64 - signed comparison
+   *   INT96 (only used for legacy timestamps) - unsigned comparison
+   *   FLOAT - signed comparison of the represented value
+   *   DOUBLE - signed comparison of the represented value
+   *   BYTE_ARRAY - unsigned byte-wise comparison
+   *   FIXED_LEN_BYTE_ARRAY - unsigned byte-wise comparison
+   */
   1: TypeDefinedOrder TYPE_ORDER;
 }
 
+struct PageLocation {
+  /** Offset of the page in the file **/
+  1: required i64 offset
+
+  /**
+   * Size of the page, including header. Sum of compressed_page_size and header
+   * length
+   */
+  2: required i32 compressed_page_size
+
+  /**
+   * Index within the RowGroup of the first row of the page; this means pages
+   * change on record boundaries (r = 0).
+   */
+  3: required i64 first_row_index
+}
+
+struct OffsetIndex {
+  /**
+   * PageLocations, ordered by increasing PageLocation.offset. It is required
+   * that page_locations[i].first_row_index < page_locations[i+1].first_row_index.
+   */
+  1: required list<PageLocation> page_locations
+}
+
+/**
+ * Description for ColumnIndex.
+ * Each <array-field>[i] refers to the page at OffsetIndex.page_locations[i]
+ */
+struct ColumnIndex {
+  /**
+   * A list of Boolean values to determine the validity of the corresponding
+   * min and max values. If true, a page contains only null values, and writers
+   * have to set the corresponding entries in min_values and max_values to
+   * byte[0], so that all lists have the same length. If false, the
+   * corresponding entries in min_values and max_values must be valid.
+   */
+  1: required list<bool> null_pages
+
+  /**
+   * Two lists containing lower and upper bounds for the values of each page.
+   * These may be the actual minimum and maximum values found on a page, but
+   * can also be (more compact) values that do not exist on a page. For
+   * example, instead of storing "Blart Versenwald III", a writer may set
+   * min_values[i]="B", max_values[i]="C". Such more compact values must still
+   * be valid values within the column's logical type. Readers must make sure
+   * that list entries are populated before using them by inspecting null_pages.
+   */
+  2: required list<binary> min_values
+  3: required list<binary> max_values
+
+  /**
+   * Stores whether both min_values and max_values are ordered and if so, in
+   * which direction. This allows readers to perform binary searches in both
+   * lists. Readers cannot assume that max_values[i] <= min_values[i+1], even
+   * if the lists are ordered.
+   */
+  4: required BoundaryOrder boundary_order
+
+  /** A list containing the number of null values for each page **/
+  5: optional list<i64> null_counts
+}
+
 /**
  * Description for file metadata
  */
@@ -631,11 +855,16 @@ struct FileMetaData {
   6: optional string created_by
 
   /**
-   * Sort order used for each column in this file.
+   * Sort order used for the min_value and max_value fields of each column in
+   * this file. Each sort order corresponds to one column, determined by its
+   * position in the list, matching the position of the column in the schema.
+   *
+   * Without column_orders, the meaning of the min_value and max_value fields is
+   * undefined. To ensure well-defined behaviour, if min_value and max_value are
+   * written to a Parquet file, column_orders must be written as well.
    *
-   * If this list is not present, then the order for each column is assumed to
-   * be Signed. In addition, min and max values for INTERVAL or DECIMAL stored
-   * as fixed or bytes should be ignored.
+   * The obsolete min and max fields are always sorted by signed comparison
+   * regardless of column_orders.
    */
   7: optional list<ColumnOrder> column_orders;
 }
diff --git a/src/parquet/schema.cc b/src/parquet/schema.cc
index 6075ab6..826ef76 100644
--- a/src/parquet/schema.cc
+++ b/src/parquet/schema.cc
@@ -602,6 +602,37 @@ void SchemaDescriptor::Init(std::unique_ptr<schema::Node> schema) {
   Init(NodePtr(schema.release()));
 }
 
+class SchemaUpdater : public Node::Visitor {
+ public:
+  explicit SchemaUpdater(const std::vector<ColumnOrder>& column_orders)
+      : column_orders_(column_orders), leaf_count_(0) {}
+  virtual ~SchemaUpdater() {}
+
+  void Visit(Node* node) override {
+    if (node->is_group()) {
+      GroupNode* group_node = static_cast<GroupNode*>(node);
+      for (int i = 0; i < group_node->field_count(); ++i) {
+        group_node->field(i)->Visit(this);
+      }
+    } else {  // leaf node
+      PrimitiveNode* leaf_node = static_cast<PrimitiveNode*>(node);
+      leaf_node->SetColumnOrder(column_orders_[leaf_count_++]);
+    }
+  }
+
+ private:
+  const std::vector<ColumnOrder>& column_orders_;
+  int leaf_count_;
+};
+
+void SchemaDescriptor::updateColumnOrders(const std::vector<ColumnOrder>& column_orders) {
+  if (static_cast<int>(column_orders.size()) != num_columns()) {
+    throw ParquetException("Malformed schema: not enough ColumnOrder values");
+  }
+  SchemaUpdater visitor(column_orders);
+  const_cast<GroupNode*>(group_node_)->Visit(&visitor);
+}
+
 void SchemaDescriptor::Init(const NodePtr& schema) {
   schema_ = schema;
 
diff --git a/src/parquet/schema.h b/src/parquet/schema.h
index f93f0db..7b6793b 100644
--- a/src/parquet/schema.h
+++ b/src/parquet/schema.h
@@ -209,6 +209,10 @@ class PARQUET_EXPORT PrimitiveNode : public Node {
 
   Type::type physical_type() const { return physical_type_; }
 
+  ColumnOrder column_order() const { return column_order_; }
+
+  void SetColumnOrder(ColumnOrder column_order) { column_order_ = column_order; }
+
   int32_t type_length() const { return type_length_; }
 
   const DecimalMetadata& decimal_metadata() const { return decimal_metadata_; }
@@ -225,6 +229,7 @@ class PARQUET_EXPORT PrimitiveNode : public Node {
   Type::type physical_type_;
   int32_t type_length_;
   DecimalMetadata decimal_metadata_;
+  ColumnOrder column_order_;
 
   // For FIXED_LEN_BYTE_ARRAY
   void SetTypeLength(int32_t length) { type_length_ = length; }
@@ -335,6 +340,8 @@ class PARQUET_EXPORT ColumnDescriptor {
 
   LogicalType::type logical_type() const { return primitive_node_->logical_type(); }
 
+  ColumnOrder column_order() const { return primitive_node_->column_order(); }
+
   SortOrder::type sort_order() const {
     return GetSortOrder(logical_type(), physical_type());
   }
@@ -407,10 +414,14 @@ class PARQUET_EXPORT SchemaDescriptor {
 
   std::string ToString() const;
 
+  void updateColumnOrders(const std::vector<ColumnOrder>& column_orders);
+
  private:
   friend class ColumnDescriptor;
 
+  // Root Node
   schema::NodePtr schema_;
+  // Root Node
   const schema::GroupNode* group_node_;
 
   void BuildTree(const schema::NodePtr& node, int16_t max_def_level,
diff --git a/src/parquet/statistics-test.cc b/src/parquet/statistics-test.cc
index bc6eac2..ec8f90a 100644
--- a/src/parquet/statistics-test.cc
+++ b/src/parquet/statistics-test.cc
@@ -277,7 +277,7 @@ void TestRowGroupStatistics<ByteArrayType>::TestMinMaxEncode() {
   ASSERT_EQ(statistics1.max(), statistics2.max());
 }
 
-using TestTypes = ::testing::Types<Int32Type, Int64Type, Int96Type, FloatType, DoubleType,
+using TestTypes = ::testing::Types<Int32Type, Int64Type, FloatType, DoubleType,
                                    ByteArrayType, FLBAType, BooleanType>;
 
 TYPED_TEST_CASE(TestRowGroupStatistics, TestTypes);
@@ -397,7 +397,7 @@ TEST(CorrectStatistics, Basics) {
   ASSERT_TRUE(column_chunk4->is_stats_set());
   auto column_chunk5 = ColumnChunkMetaData::Make(
       reinterpret_cast<const uint8_t*>(&col_chunk), schema.Column(4), &version);
-  ASSERT_TRUE(column_chunk5->is_stats_set());
+  ASSERT_FALSE(column_chunk5->is_stats_set());
   auto column_chunk6 = ColumnChunkMetaData::Make(
       reinterpret_cast<const uint8_t*>(&col_chunk), schema.Column(5), &version);
   ASSERT_TRUE(column_chunk6->is_stats_set());
@@ -478,8 +478,8 @@ class TestStatistics : public ::testing::Test {
   std::vector<EncodedStatistics> stats_;
 };
 
-using CompareTestTypes = ::testing::Types<Int32Type, Int64Type, Int96Type, FloatType,
-                                          DoubleType, ByteArrayType, FLBAType>;
+using CompareTestTypes = ::testing::Types<Int32Type, Int64Type, FloatType, DoubleType,
+                                          ByteArrayType, FLBAType>;
 
 // TYPE::INT32
 template <>
@@ -537,28 +537,6 @@ void TestStatistics<Int64Type>::SetValues() {
       .set_max(std::string(reinterpret_cast<const char*>(&values_[9]), sizeof(T)));
 }
 
-// TYPE::INT96
-template <>
-void TestStatistics<Int96Type>::AddNodes(std::string name) {
-  // INT96 physical type has only Unsigned Statistics
-  fields_.push_back(schema::PrimitiveNode::Make(name, Repetition::REQUIRED, Type::INT96,
-                                                LogicalType::NONE));
-}
-
-template <>
-void TestStatistics<Int96Type>::SetValues() {
-  for (int i = 0; i < NUM_VALUES; i++) {
-    values_[i].value[0] = i - 5;  // {-5, -4, -3, -2, -1, 0, 1, 2, 3, 4};
-    values_[i].value[1] = i - 5;  // {-5, -4, -3, -2, -1, 0, 1, 2, 3, 4};
-    values_[i].value[2] = i - 5;  // {-5, -4, -3, -2, -1, 0, 1, 2, 3, 4};
-  }
-
-  // Write Int96 min/max values
-  stats_[0]
-      .set_min(std::string(reinterpret_cast<const char*>(&values_[5]), sizeof(T)))
-      .set_max(std::string(reinterpret_cast<const char*>(&values_[4]), sizeof(T)));
-}
-
 // TYPE::FLOAT
 template <>
 void TestStatistics<FloatType>::SetValues() {
diff --git a/src/parquet/test-specialization.h b/src/parquet/test-specialization.h
index 08160a6..a6112a2 100644
--- a/src/parquet/test-specialization.h
+++ b/src/parquet/test-specialization.h
@@ -35,13 +35,14 @@ namespace parquet {
 namespace test {
 
 template <>
-void InitValues<bool>(int num_values, vector<bool>& values, vector<uint8_t>& buffer) {
+void inline InitValues<bool>(int num_values, vector<bool>& values,
+                             vector<uint8_t>& buffer) {
   values = flip_coins(num_values, 0);
 }
 
 template <>
-void InitValues<ByteArray>(int num_values, vector<ByteArray>& values,
-                           vector<uint8_t>& buffer) {
+void inline InitValues<ByteArray>(int num_values, vector<ByteArray>& values,
+                                  vector<uint8_t>& buffer) {
   int max_byte_array_len = 12;
   int num_bytes = static_cast<int>(max_byte_array_len + sizeof(uint32_t));
   size_t nbytes = num_values * num_bytes;
@@ -50,14 +51,16 @@ void InitValues<ByteArray>(int num_values, vector<ByteArray>& values,
 }
 
 template <>
-void InitValues<FLBA>(int num_values, vector<FLBA>& values, vector<uint8_t>& buffer) {
+void inline InitValues<FLBA>(int num_values, vector<FLBA>& values,
+                             vector<uint8_t>& buffer) {
   size_t nbytes = num_values * FLBA_LENGTH;
   buffer.resize(nbytes);
   random_fixed_byte_array(num_values, 0, buffer.data(), FLBA_LENGTH, values.data());
 }
 
 template <>
-void InitValues<Int96>(int num_values, vector<Int96>& values, vector<uint8_t>& buffer) {
+void inline InitValues<Int96>(int num_values, vector<Int96>& values,
+                              vector<uint8_t>& buffer) {
   random_Int96_numbers(num_values, 0, std::numeric_limits<int32_t>::min(),
                        std::numeric_limits<int32_t>::max(), values.data());
 }
diff --git a/src/parquet/types.cc b/src/parquet/types.cc
index 4e6770f..a4929d5 100644
--- a/src/parquet/types.cc
+++ b/src/parquet/types.cc
@@ -213,8 +213,9 @@ SortOrder::type DefaultSortOrder(Type::type primitive) {
       return SortOrder::SIGNED;
     case Type::BYTE_ARRAY:
     case Type::FIXED_LEN_BYTE_ARRAY:
-    case Type::INT96:  // only used for timestamp, which uses unsigned values
       return SortOrder::UNSIGNED;
+    case Type::INT96:
+      return SortOrder::UNKNOWN;
   }
   return SortOrder::UNKNOWN;
 }
@@ -254,4 +255,7 @@ SortOrder::type GetSortOrder(LogicalType::type converted, Type::type primitive)
   return SortOrder::UNKNOWN;
 }
 
+ColumnOrder ColumnOrder::undefined_ = ColumnOrder(ColumnOrder::UNDEFINED);
+ColumnOrder ColumnOrder::type_defined_ = ColumnOrder(ColumnOrder::TYPE_DEFINED_ORDER);
+
 }  // namespace parquet
diff --git a/src/parquet/types.h b/src/parquet/types.h
index c1e9598..2179d50 100644
--- a/src/parquet/types.h
+++ b/src/parquet/types.h
@@ -130,6 +130,21 @@ struct SortOrder {
   enum type { SIGNED, UNSIGNED, UNKNOWN };
 };
 
+class ColumnOrder {
+ public:
+  enum type { UNDEFINED, TYPE_DEFINED_ORDER };
+  explicit ColumnOrder(ColumnOrder::type column_order) : column_order_(column_order) {}
+  // Default to Type Defined Order
+  ColumnOrder() : column_order_(type::TYPE_DEFINED_ORDER) {}
+  ColumnOrder::type get_order() { return column_order_; }
+
+  static ColumnOrder undefined_;
+  static ColumnOrder type_defined_;
+
+ private:
+  ColumnOrder::type column_order_;
+};
+
 // ----------------------------------------------------------------------
 
 struct ByteArray {
diff --git a/src/parquet/util/comparison.cc b/src/parquet/util/comparison.cc
index 1d7bb9d..a0768b3 100644
--- a/src/parquet/util/comparison.cc
+++ b/src/parquet/util/comparison.cc
@@ -33,8 +33,6 @@ std::shared_ptr<Comparator> Comparator::Make(const ColumnDescriptor* descr) {
         return std::make_shared<CompareDefaultInt32>();
       case Type::INT64:
         return std::make_shared<CompareDefaultInt64>();
-      case Type::INT96:
-        return std::make_shared<CompareDefaultInt96>();
       case Type::FLOAT:
         return std::make_shared<CompareDefaultFloat>();
       case Type::DOUBLE:
@@ -52,8 +50,6 @@ std::shared_ptr<Comparator> Comparator::Make(const ColumnDescriptor* descr) {
         return std::make_shared<CompareUnsignedInt32>();
       case Type::INT64:
         return std::make_shared<CompareUnsignedInt64>();
-      case Type::INT96:
-        return std::make_shared<CompareUnsignedInt96>();
       case Type::BYTE_ARRAY:
         return std::make_shared<CompareUnsignedByteArray>();
       case Type::FIXED_LEN_BYTE_ARRAY:

-- 
To stop receiving notification emails like this one, please contact
uwe@apache.org.

Mime
View raw message