parquet-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From n...@apache.org
Subject [1/7] parquet-cpp git commit: PARQUET-416: C++11 compilation, code reorg, libparquet and installation targets
Date Fri, 08 Jan 2016 23:51:52 GMT
Repository: parquet-cpp
Updated Branches:
  refs/heads/master ea30decd9 -> 337cf584e


http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/337cf584/src/parquet/thrift/parquet_types.h
----------------------------------------------------------------------
diff --git a/src/parquet/thrift/parquet_types.h b/src/parquet/thrift/parquet_types.h
new file mode 100644
index 0000000..4360d02
--- /dev/null
+++ b/src/parquet/thrift/parquet_types.h
@@ -0,0 +1,1123 @@
+/**
+ * Autogenerated by Thrift Compiler (0.9.0)
+ *
+ * DO NOT EDIT UNLESS YOU ARE SURE THAT YOU KNOW WHAT YOU ARE DOING
+ *  @generated
+ */
+#ifndef parquet_TYPES_H
+#define parquet_TYPES_H
+
+#include <thrift/Thrift.h>
+#include <thrift/TApplicationException.h>
+#include <thrift/protocol/TProtocol.h>
+#include <thrift/transport/TTransport.h>
+
+
+
+namespace parquet {
+
+struct Type {  // wrapper struct: the enumerators are reached as Type::BOOLEAN etc., the enum type as Type::type
+  enum type {
+    BOOLEAN = 0,
+    INT32 = 1,
+    INT64 = 2,
+    INT96 = 3,
+    FLOAT = 4,
+    DOUBLE = 5,
+    BYTE_ARRAY = 6,
+    FIXED_LEN_BYTE_ARRAY = 7
+  };
+};
+
+extern const std::map<int, const char*> _Type_VALUES_TO_NAMES;  // declared only; going by its name, maps Type values to names (definition not in this header)
+
+struct ConvertedType {  // wrapper struct: enumerators are reached as ConvertedType::UTF8 etc.
+  enum type {
+    UTF8 = 0,
+    MAP = 1,
+    MAP_KEY_VALUE = 2,
+    LIST = 3,
+    ENUM = 4,
+    DECIMAL = 5
+  };
+};
+
+extern const std::map<int, const char*> _ConvertedType_VALUES_TO_NAMES;  // declared only; going by its name, maps ConvertedType values to names
+
+struct FieldRepetitionType {  // wrapper struct: enumerators are reached as FieldRepetitionType::REQUIRED etc.
+  enum type {
+    REQUIRED = 0,
+    OPTIONAL = 1,
+    REPEATED = 2
+  };
+};
+
+extern const std::map<int, const char*> _FieldRepetitionType_VALUES_TO_NAMES;  // declared only; going by its name, maps FieldRepetitionType values to names
+
+struct Encoding {  // wrapper struct: enumerators are reached as Encoding::PLAIN etc.
+  enum type {
+    PLAIN = 0,
+    PLAIN_DICTIONARY = 2,  // note: no enumerator has the value 1
+    RLE = 3,
+    BIT_PACKED = 4,
+    DELTA_BINARY_PACKED = 5,
+    DELTA_LENGTH_BYTE_ARRAY = 6,
+    DELTA_BYTE_ARRAY = 7,
+    RLE_DICTIONARY = 8
+  };
+};
+
+extern const std::map<int, const char*> _Encoding_VALUES_TO_NAMES;  // declared only; going by its name, maps Encoding values to names
+
+struct CompressionCodec {  // wrapper struct: enumerators are reached as CompressionCodec::SNAPPY etc.
+  enum type {
+    UNCOMPRESSED = 0,
+    SNAPPY = 1,
+    GZIP = 2,
+    LZO = 3
+  };
+};
+
+extern const std::map<int, const char*> _CompressionCodec_VALUES_TO_NAMES;  // declared only; going by its name, maps CompressionCodec values to names
+
+struct PageType {  // wrapper struct: enumerators are reached as PageType::DATA_PAGE etc.
+  enum type {
+    DATA_PAGE = 0,
+    INDEX_PAGE = 1,
+    DICTIONARY_PAGE = 2,
+    DATA_PAGE_V2 = 3
+  };
+};
+
+extern const std::map<int, const char*> _PageType_VALUES_TO_NAMES;  // declared only; going by its name, maps PageType values to names
+
+typedef struct _Statistics__isset {  // one presence flag per field of Statistics
+  _Statistics__isset() : max(false), min(false), null_count(false), distinct_count(false) {}  // every flag starts out false
+  bool max;
+  bool min;
+  bool null_count;
+  bool distinct_count;
+} _Statistics__isset;
+
+class Statistics {  // record with four fields (max, min, null_count, distinct_count), each paired with an __isset presence flag
+ public:
+
+  static const char* ascii_fingerprint; // = "CE004821871820DD79A8FD98BB101F6D";
+  static const uint8_t binary_fingerprint[16]; // = {0xCE,0x00,0x48,0x21,0x87,0x18,0x20,0xDD,0x79,0xA8,0xFD,0x98,0xBB,0x10,0x1F,0x6D};
+
+  Statistics() : max(), min(), null_count(0), distinct_count(0) {  // empty strings, zero counters
+  }
+
+  virtual ~Statistics() throw() {}
+
+  std::string max;
+  std::string min;
+  int64_t null_count;
+  int64_t distinct_count;
+
+  _Statistics__isset __isset;  // presence flags; all false after default construction
+
+  void __set_max(const std::string& val) {  // each __set_* below stores val and raises the matching __isset flag
+    max = val;
+    __isset.max = true;
+  }
+
+  void __set_min(const std::string& val) {
+    min = val;
+    __isset.min = true;
+  }
+
+  void __set_null_count(const int64_t val) {
+    null_count = val;
+    __isset.null_count = true;
+  }
+
+  void __set_distinct_count(const int64_t val) {
+    distinct_count = val;
+    __isset.distinct_count = true;
+  }
+
+  bool operator == (const Statistics & rhs) const  // per field: flags must match, and values are compared only when the flag is set
+  {
+    if (__isset.max != rhs.__isset.max)
+      return false;
+    else if (__isset.max && !(max == rhs.max))
+      return false;
+    if (__isset.min != rhs.__isset.min)
+      return false;
+    else if (__isset.min && !(min == rhs.min))
+      return false;
+    if (__isset.null_count != rhs.__isset.null_count)
+      return false;
+    else if (__isset.null_count && !(null_count == rhs.null_count))
+      return false;
+    if (__isset.distinct_count != rhs.__isset.distinct_count)
+      return false;
+    else if (__isset.distinct_count && !(distinct_count == rhs.distinct_count))
+      return false;
+    return true;
+  }
+  bool operator != (const Statistics &rhs) const {  // exact negation of operator==
+    return !(*this == rhs);
+  }
+
+  bool operator < (const Statistics & ) const;  // declared only; no definition in this header
+
+  uint32_t read(::apache::thrift::protocol::TProtocol* iprot);  // declared only; defined outside this header
+  uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;  // declared only; defined outside this header
+
+};
+
+void swap(Statistics &a, Statistics &b);  // declared only; defined outside this header
+
+typedef struct _SchemaElement__isset {  // presence flags for SchemaElement fields; there is no flag for `name`
+  _SchemaElement__isset() : type(false), type_length(false), repetition_type(false), num_children(false), converted_type(false), scale(false), precision(false) {}  // every flag starts out false
+  bool type;
+  bool type_length;
+  bool repetition_type;
+  bool num_children;
+  bool converted_type;
+  bool scale;
+  bool precision;
+} _SchemaElement__isset;
+
+class SchemaElement {  // record with eight fields; all except `name` are paired with an __isset presence flag
+ public:
+
+  static const char* ascii_fingerprint; // = "388A784401753800444CFEAC8BC1B1A1";
+  static const uint8_t binary_fingerprint[16]; // = {0x38,0x8A,0x78,0x44,0x01,0x75,0x38,0x00,0x44,0x4C,0xFE,0xAC,0x8B,0xC1,0xB1,0xA1};
+
+  SchemaElement() : type((Type::type)0), type_length(0), repetition_type((FieldRepetitionType::type)0), name(), num_children(0), converted_type((ConvertedType::type)0), scale(0), precision(0) {  // enums start at value 0, integers at 0, name empty
+  }
+
+  virtual ~SchemaElement() throw() {}
+
+  Type::type type;
+  int32_t type_length;
+  FieldRepetitionType::type repetition_type;
+  std::string name;
+  int32_t num_children;
+  ConvertedType::type converted_type;
+  int32_t scale;
+  int32_t precision;
+
+  _SchemaElement__isset __isset;  // presence flags; all false after default construction
+
+  void __set_type(const Type::type val) {  // stores val and raises __isset.type
+    type = val;
+    __isset.type = true;
+  }
+
+  void __set_type_length(const int32_t val) {
+    type_length = val;
+    __isset.type_length = true;
+  }
+
+  void __set_repetition_type(const FieldRepetitionType::type val) {
+    repetition_type = val;
+    __isset.repetition_type = true;
+  }
+
+  void __set_name(const std::string& val) {  // plain store: `name` has no presence flag
+    name = val;
+  }
+
+  void __set_num_children(const int32_t val) {
+    num_children = val;
+    __isset.num_children = true;
+  }
+
+  void __set_converted_type(const ConvertedType::type val) {
+    converted_type = val;
+    __isset.converted_type = true;
+  }
+
+  void __set_scale(const int32_t val) {
+    scale = val;
+    __isset.scale = true;
+  }
+
+  void __set_precision(const int32_t val) {
+    precision = val;
+    __isset.precision = true;
+  }
+
+  bool operator == (const SchemaElement & rhs) const  // flagged fields: flags must match and values compare only when set; `name` is always compared
+  {
+    if (__isset.type != rhs.__isset.type)
+      return false;
+    else if (__isset.type && !(type == rhs.type))
+      return false;
+    if (__isset.type_length != rhs.__isset.type_length)
+      return false;
+    else if (__isset.type_length && !(type_length == rhs.type_length))
+      return false;
+    if (__isset.repetition_type != rhs.__isset.repetition_type)
+      return false;
+    else if (__isset.repetition_type && !(repetition_type == rhs.repetition_type))
+      return false;
+    if (!(name == rhs.name))
+      return false;
+    if (__isset.num_children != rhs.__isset.num_children)
+      return false;
+    else if (__isset.num_children && !(num_children == rhs.num_children))
+      return false;
+    if (__isset.converted_type != rhs.__isset.converted_type)
+      return false;
+    else if (__isset.converted_type && !(converted_type == rhs.converted_type))
+      return false;
+    if (__isset.scale != rhs.__isset.scale)
+      return false;
+    else if (__isset.scale && !(scale == rhs.scale))
+      return false;
+    if (__isset.precision != rhs.__isset.precision)
+      return false;
+    else if (__isset.precision && !(precision == rhs.precision))
+      return false;
+    return true;
+  }
+  bool operator != (const SchemaElement &rhs) const {  // exact negation of operator==
+    return !(*this == rhs);
+  }
+
+  bool operator < (const SchemaElement & ) const;  // declared only; no definition in this header
+
+  uint32_t read(::apache::thrift::protocol::TProtocol* iprot);  // declared only; defined outside this header
+  uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;  // declared only; defined outside this header
+
+};
+
+void swap(SchemaElement &a, SchemaElement &b);  // declared only; defined outside this header
+
+typedef struct _DataPageHeader__isset {  // presence flag for DataPageHeader::statistics, the only flagged field
+  _DataPageHeader__isset() : statistics(false) {}  // flag starts out false
+  bool statistics;
+} _DataPageHeader__isset;
+
+class DataPageHeader {  // record with four unflagged fields plus `statistics`, which carries an __isset presence flag
+ public:
+
+  static const char* ascii_fingerprint; // = "5FC1792B0483E9C984475384165040B1";
+  static const uint8_t binary_fingerprint[16]; // = {0x5F,0xC1,0x79,0x2B,0x04,0x83,0xE9,0xC9,0x84,0x47,0x53,0x84,0x16,0x50,0x40,0xB1};
+
+  DataPageHeader() : num_values(0), encoding((Encoding::type)0), definition_level_encoding((Encoding::type)0), repetition_level_encoding((Encoding::type)0) {  // statistics is default-constructed
+  }
+
+  virtual ~DataPageHeader() throw() {}
+
+  int32_t num_values;
+  Encoding::type encoding;
+  Encoding::type definition_level_encoding;
+  Encoding::type repetition_level_encoding;
+  Statistics statistics;
+
+  _DataPageHeader__isset __isset;  // presence flag for statistics; false after default construction
+
+  void __set_num_values(const int32_t val) {  // plain store: no presence flag for this field
+    num_values = val;
+  }
+
+  void __set_encoding(const Encoding::type val) {
+    encoding = val;
+  }
+
+  void __set_definition_level_encoding(const Encoding::type val) {
+    definition_level_encoding = val;
+  }
+
+  void __set_repetition_level_encoding(const Encoding::type val) {
+    repetition_level_encoding = val;
+  }
+
+  void __set_statistics(const Statistics& val) {  // copies val and raises __isset.statistics
+    statistics = val;
+    __isset.statistics = true;
+  }
+
+  bool operator == (const DataPageHeader & rhs) const  // unflagged fields always compared; statistics compared only when set on both sides
+  {
+    if (!(num_values == rhs.num_values))
+      return false;
+    if (!(encoding == rhs.encoding))
+      return false;
+    if (!(definition_level_encoding == rhs.definition_level_encoding))
+      return false;
+    if (!(repetition_level_encoding == rhs.repetition_level_encoding))
+      return false;
+    if (__isset.statistics != rhs.__isset.statistics)
+      return false;
+    else if (__isset.statistics && !(statistics == rhs.statistics))
+      return false;
+    return true;
+  }
+  bool operator != (const DataPageHeader &rhs) const {  // exact negation of operator==
+    return !(*this == rhs);
+  }
+
+  bool operator < (const DataPageHeader & ) const;  // declared only; no definition in this header
+
+  uint32_t read(::apache::thrift::protocol::TProtocol* iprot);  // declared only; defined outside this header
+  uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;  // declared only; defined outside this header
+
+};
+
+void swap(DataPageHeader &a, DataPageHeader &b);  // declared only; defined outside this header
+
+
+class IndexPageHeader {  // record with no data members at all
+ public:
+
+  static const char* ascii_fingerprint; // = "99914B932BD37A50B983C5E7C90AE93B";
+  static const uint8_t binary_fingerprint[16]; // = {0x99,0x91,0x4B,0x93,0x2B,0xD3,0x7A,0x50,0xB9,0x83,0xC5,0xE7,0xC9,0x0A,0xE9,0x3B};
+
+  IndexPageHeader() {
+  }
+
+  virtual ~IndexPageHeader() throw() {}
+
+
+  bool operator == (const IndexPageHeader & /* rhs */) const  // nothing to compare, so any two instances are equal
+  {
+    return true;
+  }
+  bool operator != (const IndexPageHeader &rhs) const {  // exact negation of operator==, hence always false
+    return !(*this == rhs);
+  }
+
+  bool operator < (const IndexPageHeader & ) const;  // declared only; no definition in this header
+
+  uint32_t read(::apache::thrift::protocol::TProtocol* iprot);  // declared only; defined outside this header
+  uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;  // declared only; defined outside this header
+
+};
+
+void swap(IndexPageHeader &a, IndexPageHeader &b);  // declared only; defined outside this header
+
+typedef struct _DictionaryPageHeader__isset {  // presence flag for DictionaryPageHeader::is_sorted, the only flagged field
+  _DictionaryPageHeader__isset() : is_sorted(false) {}  // flag starts out false
+  bool is_sorted;
+} _DictionaryPageHeader__isset;
+
+class DictionaryPageHeader {  // record with unflagged num_values/encoding plus is_sorted, which carries an __isset presence flag
+ public:
+
+  static const char* ascii_fingerprint; // = "B149E4528254D495610C22AE4BD539C5";
+  static const uint8_t binary_fingerprint[16]; // = {0xB1,0x49,0xE4,0x52,0x82,0x54,0xD4,0x95,0x61,0x0C,0x22,0xAE,0x4B,0xD5,0x39,0xC5};
+
+  DictionaryPageHeader() : num_values(0), encoding((Encoding::type)0), is_sorted(0) {  // is_sorted(0) initializes the bool to false
+  }
+
+  virtual ~DictionaryPageHeader() throw() {}
+
+  int32_t num_values;
+  Encoding::type encoding;
+  bool is_sorted;
+
+  _DictionaryPageHeader__isset __isset;  // presence flag for is_sorted; false after default construction
+
+  void __set_num_values(const int32_t val) {  // plain store: no presence flag for this field
+    num_values = val;
+  }
+
+  void __set_encoding(const Encoding::type val) {
+    encoding = val;
+  }
+
+  void __set_is_sorted(const bool val) {  // stores val and raises __isset.is_sorted
+    is_sorted = val;
+    __isset.is_sorted = true;
+  }
+
+  bool operator == (const DictionaryPageHeader & rhs) const  // unflagged fields always compared; is_sorted compared only when set on both sides
+  {
+    if (!(num_values == rhs.num_values))
+      return false;
+    if (!(encoding == rhs.encoding))
+      return false;
+    if (__isset.is_sorted != rhs.__isset.is_sorted)
+      return false;
+    else if (__isset.is_sorted && !(is_sorted == rhs.is_sorted))
+      return false;
+    return true;
+  }
+  bool operator != (const DictionaryPageHeader &rhs) const {  // exact negation of operator==
+    return !(*this == rhs);
+  }
+
+  bool operator < (const DictionaryPageHeader & ) const;  // declared only; no definition in this header
+
+  uint32_t read(::apache::thrift::protocol::TProtocol* iprot);  // declared only; defined outside this header
+  uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;  // declared only; defined outside this header
+
+};
+
+void swap(DictionaryPageHeader &a, DictionaryPageHeader &b);  // declared only; defined outside this header
+
+typedef struct _DataPageHeaderV2__isset {  // presence flags for DataPageHeaderV2::is_compressed and ::statistics
+  _DataPageHeaderV2__isset() : is_compressed(true), statistics(false) {}  // note: is_compressed starts out true, unlike every other flag in this header
+  bool is_compressed;
+  bool statistics;
+} _DataPageHeaderV2__isset;
+
+class DataPageHeaderV2 {  // record with six unflagged fields plus is_compressed and statistics, which carry __isset presence flags
+ public:
+
+  static const char* ascii_fingerprint; // = "69FF2F6BD1A443440D5E46ABA5A3A919";
+  static const uint8_t binary_fingerprint[16]; // = {0x69,0xFF,0x2F,0x6B,0xD1,0xA4,0x43,0x44,0x0D,0x5E,0x46,0xAB,0xA5,0xA3,0xA9,0x19};
+
+  DataPageHeaderV2() : num_values(0), num_nulls(0), num_rows(0), encoding((Encoding::type)0), definition_levels_byte_length(0), repetition_levels_byte_length(0), is_compressed(true) {  // is_compressed defaults to true; statistics is default-constructed
+  }
+
+  virtual ~DataPageHeaderV2() throw() {}
+
+  int32_t num_values;
+  int32_t num_nulls;
+  int32_t num_rows;
+  Encoding::type encoding;
+  int32_t definition_levels_byte_length;
+  int32_t repetition_levels_byte_length;
+  bool is_compressed;
+  Statistics statistics;
+
+  _DataPageHeaderV2__isset __isset;  // after default construction: is_compressed flag true, statistics flag false
+
+  void __set_num_values(const int32_t val) {  // plain store: no presence flag for this field
+    num_values = val;
+  }
+
+  void __set_num_nulls(const int32_t val) {
+    num_nulls = val;
+  }
+
+  void __set_num_rows(const int32_t val) {
+    num_rows = val;
+  }
+
+  void __set_encoding(const Encoding::type val) {
+    encoding = val;
+  }
+
+  void __set_definition_levels_byte_length(const int32_t val) {
+    definition_levels_byte_length = val;
+  }
+
+  void __set_repetition_levels_byte_length(const int32_t val) {
+    repetition_levels_byte_length = val;
+  }
+
+  void __set_is_compressed(const bool val) {  // stores val and raises __isset.is_compressed
+    is_compressed = val;
+    __isset.is_compressed = true;
+  }
+
+  void __set_statistics(const Statistics& val) {  // copies val and raises __isset.statistics
+    statistics = val;
+    __isset.statistics = true;
+  }
+
+  bool operator == (const DataPageHeaderV2 & rhs) const  // unflagged fields always compared; flagged fields compared only when set on both sides
+  {
+    if (!(num_values == rhs.num_values))
+      return false;
+    if (!(num_nulls == rhs.num_nulls))
+      return false;
+    if (!(num_rows == rhs.num_rows))
+      return false;
+    if (!(encoding == rhs.encoding))
+      return false;
+    if (!(definition_levels_byte_length == rhs.definition_levels_byte_length))
+      return false;
+    if (!(repetition_levels_byte_length == rhs.repetition_levels_byte_length))
+      return false;
+    if (__isset.is_compressed != rhs.__isset.is_compressed)
+      return false;
+    else if (__isset.is_compressed && !(is_compressed == rhs.is_compressed))
+      return false;
+    if (__isset.statistics != rhs.__isset.statistics)
+      return false;
+    else if (__isset.statistics && !(statistics == rhs.statistics))
+      return false;
+    return true;
+  }
+  bool operator != (const DataPageHeaderV2 &rhs) const {  // exact negation of operator==
+    return !(*this == rhs);
+  }
+
+  bool operator < (const DataPageHeaderV2 & ) const;  // declared only; no definition in this header
+
+  uint32_t read(::apache::thrift::protocol::TProtocol* iprot);  // declared only; defined outside this header
+  uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;  // declared only; defined outside this header
+
+};
+
+void swap(DataPageHeaderV2 &a, DataPageHeaderV2 &b);  // declared only; defined outside this header
+
+typedef struct _PageHeader__isset {  // presence flags for PageHeader::crc and the four nested header members
+  _PageHeader__isset() : crc(false), data_page_header(false), index_page_header(false), dictionary_page_header(false), data_page_header_v2(false) {}  // every flag starts out false
+  bool crc;
+  bool data_page_header;
+  bool index_page_header;
+  bool dictionary_page_header;
+  bool data_page_header_v2;
+} _PageHeader__isset;
+
+class PageHeader {  // record with unflagged type and page sizes, plus crc and four nested headers that carry __isset presence flags
+ public:
+
+  static const char* ascii_fingerprint; // = "B5BD2BDF3756C883A58B30B9C9F204A0";
+  static const uint8_t binary_fingerprint[16]; // = {0xB5,0xBD,0x2B,0xDF,0x37,0x56,0xC8,0x83,0xA5,0x8B,0x30,0xB9,0xC9,0xF2,0x04,0xA0};
+
+  PageHeader() : type((PageType::type)0), uncompressed_page_size(0), compressed_page_size(0), crc(0) {  // nested headers are default-constructed
+  }
+
+  virtual ~PageHeader() throw() {}
+
+  PageType::type type;
+  int32_t uncompressed_page_size;
+  int32_t compressed_page_size;
+  int32_t crc;
+  DataPageHeader data_page_header;
+  IndexPageHeader index_page_header;
+  DictionaryPageHeader dictionary_page_header;
+  DataPageHeaderV2 data_page_header_v2;
+
+  _PageHeader__isset __isset;  // presence flags; all false after default construction
+
+  void __set_type(const PageType::type val) {  // plain store: no presence flag for this field
+    type = val;
+  }
+
+  void __set_uncompressed_page_size(const int32_t val) {
+    uncompressed_page_size = val;
+  }
+
+  void __set_compressed_page_size(const int32_t val) {
+    compressed_page_size = val;
+  }
+
+  void __set_crc(const int32_t val) {  // stores val and raises __isset.crc
+    crc = val;
+    __isset.crc = true;
+  }
+
+  void __set_data_page_header(const DataPageHeader& val) {  // copies val and raises the matching flag; other flags are left untouched
+    data_page_header = val;
+    __isset.data_page_header = true;
+  }
+
+  void __set_index_page_header(const IndexPageHeader& val) {
+    index_page_header = val;
+    __isset.index_page_header = true;
+  }
+
+  void __set_dictionary_page_header(const DictionaryPageHeader& val) {
+    dictionary_page_header = val;
+    __isset.dictionary_page_header = true;
+  }
+
+  void __set_data_page_header_v2(const DataPageHeaderV2& val) {
+    data_page_header_v2 = val;
+    __isset.data_page_header_v2 = true;
+  }
+
+  bool operator == (const PageHeader & rhs) const  // unflagged fields always compared; flagged fields compared only when set on both sides
+  {
+    if (!(type == rhs.type))
+      return false;
+    if (!(uncompressed_page_size == rhs.uncompressed_page_size))
+      return false;
+    if (!(compressed_page_size == rhs.compressed_page_size))
+      return false;
+    if (__isset.crc != rhs.__isset.crc)
+      return false;
+    else if (__isset.crc && !(crc == rhs.crc))
+      return false;
+    if (__isset.data_page_header != rhs.__isset.data_page_header)
+      return false;
+    else if (__isset.data_page_header && !(data_page_header == rhs.data_page_header))
+      return false;
+    if (__isset.index_page_header != rhs.__isset.index_page_header)
+      return false;
+    else if (__isset.index_page_header && !(index_page_header == rhs.index_page_header))
+      return false;
+    if (__isset.dictionary_page_header != rhs.__isset.dictionary_page_header)
+      return false;
+    else if (__isset.dictionary_page_header && !(dictionary_page_header == rhs.dictionary_page_header))
+      return false;
+    if (__isset.data_page_header_v2 != rhs.__isset.data_page_header_v2)
+      return false;
+    else if (__isset.data_page_header_v2 && !(data_page_header_v2 == rhs.data_page_header_v2))
+      return false;
+    return true;
+  }
+  bool operator != (const PageHeader &rhs) const {  // exact negation of operator==
+    return !(*this == rhs);
+  }
+
+  bool operator < (const PageHeader & ) const;  // declared only; no definition in this header
+
+  uint32_t read(::apache::thrift::protocol::TProtocol* iprot);  // declared only; defined outside this header
+  uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;  // declared only; defined outside this header
+
+};
+
+void swap(PageHeader &a, PageHeader &b);  // declared only; defined outside this header
+
+typedef struct _KeyValue__isset {  // presence flag for KeyValue::value; `key` has no flag
+  _KeyValue__isset() : value(false) {}  // flag starts out false
+  bool value;
+} _KeyValue__isset;
+
+class KeyValue {  // string key/value pair; only `value` carries an __isset presence flag
+ public:
+
+  static const char* ascii_fingerprint; // = "5B708A954C550ECA9C1A49D3C5CAFAB9";
+  static const uint8_t binary_fingerprint[16]; // = {0x5B,0x70,0x8A,0x95,0x4C,0x55,0x0E,0xCA,0x9C,0x1A,0x49,0xD3,0xC5,0xCA,0xFA,0xB9};
+
+  KeyValue() : key(), value() {  // both strings start empty
+  }
+
+  virtual ~KeyValue() throw() {}
+
+  std::string key;
+  std::string value;
+
+  _KeyValue__isset __isset;  // presence flag for value; false after default construction
+
+  void __set_key(const std::string& val) {  // plain store: `key` has no presence flag
+    key = val;
+  }
+
+  void __set_value(const std::string& val) {  // stores val and raises __isset.value
+    value = val;
+    __isset.value = true;
+  }
+
+  bool operator == (const KeyValue & rhs) const  // key always compared; value compared only when set on both sides
+  {
+    if (!(key == rhs.key))
+      return false;
+    if (__isset.value != rhs.__isset.value)
+      return false;
+    else if (__isset.value && !(value == rhs.value))
+      return false;
+    return true;
+  }
+  bool operator != (const KeyValue &rhs) const {  // exact negation of operator==
+    return !(*this == rhs);
+  }
+
+  bool operator < (const KeyValue & ) const;  // declared only; no definition in this header
+
+  uint32_t read(::apache::thrift::protocol::TProtocol* iprot);  // declared only; defined outside this header
+  uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;  // declared only; defined outside this header
+
+};
+
+void swap(KeyValue &a, KeyValue &b);  // declared only; defined outside this header
+
+
+class SortingColumn {  // record with three fields and no __isset member: none of them has a presence flag
+ public:
+
+  static const char* ascii_fingerprint; // = "F079C2D58A783AD90F9BE05D10DBBC6F";
+  static const uint8_t binary_fingerprint[16]; // = {0xF0,0x79,0xC2,0xD5,0x8A,0x78,0x3A,0xD9,0x0F,0x9B,0xE0,0x5D,0x10,0xDB,0xBC,0x6F};
+
+  SortingColumn() : column_idx(0), descending(0), nulls_first(0) {  // index 0, both bools false
+  }
+
+  virtual ~SortingColumn() throw() {}
+
+  int32_t column_idx;
+  bool descending;
+  bool nulls_first;
+
+  void __set_column_idx(const int32_t val) {  // all three setters are plain stores
+    column_idx = val;
+  }
+
+  void __set_descending(const bool val) {
+    descending = val;
+  }
+
+  void __set_nulls_first(const bool val) {
+    nulls_first = val;
+  }
+
+  bool operator == (const SortingColumn & rhs) const  // member-wise comparison of all three fields
+  {
+    if (!(column_idx == rhs.column_idx))
+      return false;
+    if (!(descending == rhs.descending))
+      return false;
+    if (!(nulls_first == rhs.nulls_first))
+      return false;
+    return true;
+  }
+  bool operator != (const SortingColumn &rhs) const {  // exact negation of operator==
+    return !(*this == rhs);
+  }
+
+  bool operator < (const SortingColumn & ) const;  // declared only; no definition in this header
+
+  uint32_t read(::apache::thrift::protocol::TProtocol* iprot);  // declared only; defined outside this header
+  uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;  // declared only; defined outside this header
+
+};
+
+void swap(SortingColumn &a, SortingColumn &b);  // declared only; defined outside this header
+
+typedef struct _ColumnMetaData__isset {  // presence flags for the four flagged ColumnMetaData fields
+  _ColumnMetaData__isset() : key_value_metadata(false), index_page_offset(false), dictionary_page_offset(false), statistics(false) {}  // every flag starts out false
+  bool key_value_metadata;
+  bool index_page_offset;
+  bool dictionary_page_offset;
+  bool statistics;
+} _ColumnMetaData__isset;
+
+class ColumnMetaData {  // record with twelve fields; key_value_metadata, index_page_offset, dictionary_page_offset and statistics carry __isset flags
+ public:
+
+  static const char* ascii_fingerprint; // = "1AF797732BCB4465C6314FB29B86638D";
+  static const uint8_t binary_fingerprint[16]; // = {0x1A,0xF7,0x97,0x73,0x2B,0xCB,0x44,0x65,0xC6,0x31,0x4F,0xB2,0x9B,0x86,0x63,0x8D};
+
+  ColumnMetaData() : type((Type::type)0), codec((CompressionCodec::type)0), num_values(0), total_uncompressed_size(0), total_compressed_size(0), data_page_offset(0), index_page_offset(0), dictionary_page_offset(0) {  // vectors and statistics are default-constructed
+  }
+
+  virtual ~ColumnMetaData() throw() {}
+
+  Type::type type;
+  std::vector<Encoding::type>  encodings;
+  std::vector<std::string>  path_in_schema;
+  CompressionCodec::type codec;
+  int64_t num_values;
+  int64_t total_uncompressed_size;
+  int64_t total_compressed_size;
+  std::vector<KeyValue>  key_value_metadata;
+  int64_t data_page_offset;
+  int64_t index_page_offset;
+  int64_t dictionary_page_offset;
+  Statistics statistics;
+
+  _ColumnMetaData__isset __isset;  // presence flags; all false after default construction
+
+  void __set_type(const Type::type val) {  // plain store: no presence flag for this field
+    type = val;
+  }
+
+  void __set_encodings(const std::vector<Encoding::type> & val) {  // copies the whole vector
+    encodings = val;
+  }
+
+  void __set_path_in_schema(const std::vector<std::string> & val) {
+    path_in_schema = val;
+  }
+
+  void __set_codec(const CompressionCodec::type val) {
+    codec = val;
+  }
+
+  void __set_num_values(const int64_t val) {
+    num_values = val;
+  }
+
+  void __set_total_uncompressed_size(const int64_t val) {
+    total_uncompressed_size = val;
+  }
+
+  void __set_total_compressed_size(const int64_t val) {
+    total_compressed_size = val;
+  }
+
+  void __set_key_value_metadata(const std::vector<KeyValue> & val) {  // copies val and raises __isset.key_value_metadata
+    key_value_metadata = val;
+    __isset.key_value_metadata = true;
+  }
+
+  void __set_data_page_offset(const int64_t val) {  // plain store: no presence flag for this field
+    data_page_offset = val;
+  }
+
+  void __set_index_page_offset(const int64_t val) {  // stores val and raises __isset.index_page_offset
+    index_page_offset = val;
+    __isset.index_page_offset = true;
+  }
+
+  void __set_dictionary_page_offset(const int64_t val) {
+    dictionary_page_offset = val;
+    __isset.dictionary_page_offset = true;
+  }
+
+  void __set_statistics(const Statistics& val) {
+    statistics = val;
+    __isset.statistics = true;
+  }
+
+  bool operator == (const ColumnMetaData & rhs) const  // unflagged fields always compared; flagged fields compared only when set on both sides
+  {
+    if (!(type == rhs.type))
+      return false;
+    if (!(encodings == rhs.encodings))
+      return false;
+    if (!(path_in_schema == rhs.path_in_schema))
+      return false;
+    if (!(codec == rhs.codec))
+      return false;
+    if (!(num_values == rhs.num_values))
+      return false;
+    if (!(total_uncompressed_size == rhs.total_uncompressed_size))
+      return false;
+    if (!(total_compressed_size == rhs.total_compressed_size))
+      return false;
+    if (__isset.key_value_metadata != rhs.__isset.key_value_metadata)
+      return false;
+    else if (__isset.key_value_metadata && !(key_value_metadata == rhs.key_value_metadata))
+      return false;
+    if (!(data_page_offset == rhs.data_page_offset))
+      return false;
+    if (__isset.index_page_offset != rhs.__isset.index_page_offset)
+      return false;
+    else if (__isset.index_page_offset && !(index_page_offset == rhs.index_page_offset))
+      return false;
+    if (__isset.dictionary_page_offset != rhs.__isset.dictionary_page_offset)
+      return false;
+    else if (__isset.dictionary_page_offset && !(dictionary_page_offset == rhs.dictionary_page_offset))
+      return false;
+    if (__isset.statistics != rhs.__isset.statistics)
+      return false;
+    else if (__isset.statistics && !(statistics == rhs.statistics))
+      return false;
+    return true;
+  }
+  bool operator != (const ColumnMetaData &rhs) const {  // exact negation of operator==
+    return !(*this == rhs);
+  }
+
+  bool operator < (const ColumnMetaData & ) const;  // declared only; no definition in this header
+
+  uint32_t read(::apache::thrift::protocol::TProtocol* iprot);  // declared only; defined outside this header
+  uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;  // declared only; defined outside this header
+
+};
+
+void swap(ColumnMetaData &a, ColumnMetaData &b);  // declared only; defined outside this header
+
+typedef struct _ColumnChunk__isset {  // presence flags for ColumnChunk::file_path and ::meta_data; file_offset has no flag
+  _ColumnChunk__isset() : file_path(false), meta_data(false) {}  // every flag starts out false
+  bool file_path;
+  bool meta_data;
+} _ColumnChunk__isset;
+
+class ColumnChunk {  // record with unflagged file_offset plus file_path and meta_data, which carry __isset presence flags
+ public:
+
+  static const char* ascii_fingerprint; // = "169FC47057EF3D82E2FACDDEC2641AE8";
+  static const uint8_t binary_fingerprint[16]; // = {0x16,0x9F,0xC4,0x70,0x57,0xEF,0x3D,0x82,0xE2,0xFA,0xCD,0xDE,0xC2,0x64,0x1A,0xE8};
+
+  ColumnChunk() : file_path(), file_offset(0) {  // meta_data is default-constructed
+  }
+
+  virtual ~ColumnChunk() throw() {}
+
+  std::string file_path;
+  int64_t file_offset;
+  ColumnMetaData meta_data;
+
+  _ColumnChunk__isset __isset;  // presence flags; all false after default construction
+
+  void __set_file_path(const std::string& val) {  // stores val and raises __isset.file_path
+    file_path = val;
+    __isset.file_path = true;
+  }
+
+  void __set_file_offset(const int64_t val) {  // plain store: no presence flag for this field
+    file_offset = val;
+  }
+
+  void __set_meta_data(const ColumnMetaData& val) {  // copies val and raises __isset.meta_data
+    meta_data = val;
+    __isset.meta_data = true;
+  }
+
+  bool operator == (const ColumnChunk & rhs) const  // file_offset always compared; flagged fields compared only when set on both sides
+  {
+    if (__isset.file_path != rhs.__isset.file_path)
+      return false;
+    else if (__isset.file_path && !(file_path == rhs.file_path))
+      return false;
+    if (!(file_offset == rhs.file_offset))
+      return false;
+    if (__isset.meta_data != rhs.__isset.meta_data)
+      return false;
+    else if (__isset.meta_data && !(meta_data == rhs.meta_data))
+      return false;
+    return true;
+  }
+  bool operator != (const ColumnChunk &rhs) const {  // exact negation of operator==
+    return !(*this == rhs);
+  }
+
+  bool operator < (const ColumnChunk & ) const;  // declared only; no definition in this header
+
+  uint32_t read(::apache::thrift::protocol::TProtocol* iprot);  // declared only; defined outside this header
+  uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;  // declared only; defined outside this header
+
+};
+
+void swap(ColumnChunk &a, ColumnChunk &b);  // declared only; defined outside this header
+
+typedef struct _RowGroup__isset {  // presence flag for RowGroup::sorting_columns, the only flagged field
+  _RowGroup__isset() : sorting_columns(false) {}  // flag starts out false
+  bool sorting_columns;
+} _RowGroup__isset;
+
+class RowGroup {  // record with unflagged columns/total_byte_size/num_rows plus sorting_columns, which carries an __isset flag
+ public:
+
+  static const char* ascii_fingerprint; // = "DC7968627FA826DDC4C6C9BE773586C9";
+  static const uint8_t binary_fingerprint[16]; // = {0xDC,0x79,0x68,0x62,0x7F,0xA8,0x26,0xDD,0xC4,0xC6,0xC9,0xBE,0x77,0x35,0x86,0xC9};
+
+  RowGroup() : total_byte_size(0), num_rows(0) {  // both vectors are default-constructed (empty)
+  }
+
+  virtual ~RowGroup() throw() {}
+
+  std::vector<ColumnChunk>  columns;
+  int64_t total_byte_size;
+  int64_t num_rows;
+  std::vector<SortingColumn>  sorting_columns;
+
+  _RowGroup__isset __isset;  // presence flag for sorting_columns; false after default construction
+
+  void __set_columns(const std::vector<ColumnChunk> & val) {  // plain store (copies the whole vector); no presence flag
+    columns = val;
+  }
+
+  void __set_total_byte_size(const int64_t val) {
+    total_byte_size = val;
+  }
+
+  void __set_num_rows(const int64_t val) {
+    num_rows = val;
+  }
+
+  void __set_sorting_columns(const std::vector<SortingColumn> & val) {  // copies val and raises __isset.sorting_columns
+    sorting_columns = val;
+    __isset.sorting_columns = true;
+  }
+
+  bool operator == (const RowGroup & rhs) const  // unflagged fields always compared; sorting_columns compared only when set on both sides
+  {
+    if (!(columns == rhs.columns))
+      return false;
+    if (!(total_byte_size == rhs.total_byte_size))
+      return false;
+    if (!(num_rows == rhs.num_rows))
+      return false;
+    if (__isset.sorting_columns != rhs.__isset.sorting_columns)
+      return false;
+    else if (__isset.sorting_columns && !(sorting_columns == rhs.sorting_columns))
+      return false;
+    return true;
+  }
+  bool operator != (const RowGroup &rhs) const {  // exact negation of operator==
+    return !(*this == rhs);
+  }
+
+  bool operator < (const RowGroup & ) const;  // declared only; no definition in this header
+
+  uint32_t read(::apache::thrift::protocol::TProtocol* iprot);  // declared only; defined outside this header
+  uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;  // declared only; defined outside this header
+
+};
+
+void swap(RowGroup &a, RowGroup &b);  // declared only; defined outside this header
+
+typedef struct _FileMetaData__isset {  // presence flags for FileMetaData::key_value_metadata and ::created_by
+  _FileMetaData__isset() : key_value_metadata(false), created_by(false) {}  // every flag starts out false
+  bool key_value_metadata;
+  bool created_by;
+} _FileMetaData__isset;
+
+class FileMetaData {  // record with four unflagged fields plus key_value_metadata and created_by, which carry __isset presence flags
+ public:
+
+  static const char* ascii_fingerprint; // = "44DC7D83A66D54A7B7892A985C4125C9";
+  static const uint8_t binary_fingerprint[16]; // = {0x44,0xDC,0x7D,0x83,0xA6,0x6D,0x54,0xA7,0xB7,0x89,0x2A,0x98,0x5C,0x41,0x25,0xC9};
+
+  FileMetaData() : version(0), num_rows(0), created_by() {  // the three vectors are default-constructed (empty)
+  }
+
+  virtual ~FileMetaData() throw() {}
+
+  int32_t version;
+  std::vector<SchemaElement>  schema;
+  int64_t num_rows;
+  std::vector<RowGroup>  row_groups;
+  std::vector<KeyValue>  key_value_metadata;
+  std::string created_by;
+
+  _FileMetaData__isset __isset;  // presence flags; all false after default construction
+
+  void __set_version(const int32_t val) {  // plain store: no presence flag for this field
+    version = val;
+  }
+
+  void __set_schema(const std::vector<SchemaElement> & val) {  // copies the whole vector
+    schema = val;
+  }
+
+  void __set_num_rows(const int64_t val) {
+    num_rows = val;
+  }
+
+  void __set_row_groups(const std::vector<RowGroup> & val) {
+    row_groups = val;
+  }
+
+  void __set_key_value_metadata(const std::vector<KeyValue> & val) {  // copies val and raises __isset.key_value_metadata
+    key_value_metadata = val;
+    __isset.key_value_metadata = true;
+  }
+
+  void __set_created_by(const std::string& val) {  // stores val and raises __isset.created_by
+    created_by = val;
+    __isset.created_by = true;
+  }
+
+  bool operator == (const FileMetaData & rhs) const  // unflagged fields always compared; flagged fields compared only when set on both sides
+  {
+    if (!(version == rhs.version))
+      return false;
+    if (!(schema == rhs.schema))
+      return false;
+    if (!(num_rows == rhs.num_rows))
+      return false;
+    if (!(row_groups == rhs.row_groups))
+      return false;
+    if (__isset.key_value_metadata != rhs.__isset.key_value_metadata)
+      return false;
+    else if (__isset.key_value_metadata && !(key_value_metadata == rhs.key_value_metadata))
+      return false;
+    if (__isset.created_by != rhs.__isset.created_by)
+      return false;
+    else if (__isset.created_by && !(created_by == rhs.created_by))
+      return false;
+    return true;
+  }
+  bool operator != (const FileMetaData &rhs) const {  // exact negation of operator==
+    return !(*this == rhs);
+  }
+
+  bool operator < (const FileMetaData & ) const;  // declared only; no definition in this header
+
+  uint32_t read(::apache::thrift::protocol::TProtocol* iprot);  // declared only; defined outside this header
+  uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;  // declared only; defined outside this header
+
+};
+
+void swap(FileMetaData &a, FileMetaData &b);  // declared only; defined outside this header
+
+} // namespace
+
+#endif

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/337cf584/src/parquet/util/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/src/parquet/util/CMakeLists.txt b/src/parquet/util/CMakeLists.txt
new file mode 100644
index 0000000..1a5de97
--- /dev/null
+++ b/src/parquet/util/CMakeLists.txt
@@ -0,0 +1,24 @@
+# Copyright 2015 Cloudera Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Headers: util
+install(FILES
+  bit-stream-utils.h
+  bit-stream-utils.inline.h
+  bit-util.h
+  compiler-util.h
+  logging.h
+  rle-encoding.h
+  stopwatch.h
+  DESTINATION include/parquet/util)

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/337cf584/src/parquet/util/bit-stream-utils.h
----------------------------------------------------------------------
diff --git a/src/parquet/util/bit-stream-utils.h b/src/parquet/util/bit-stream-utils.h
new file mode 100644
index 0000000..7fba30a
--- /dev/null
+++ b/src/parquet/util/bit-stream-utils.h
@@ -0,0 +1,147 @@
+// Copyright 2012 Cloudera Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+
+#ifndef PARQUET_UTIL_BIT_STREAM_UTILS_H
+#define PARQUET_UTIL_BIT_STREAM_UTILS_H
+
+#include <string.h>
+#include <algorithm>
+#include <cstdint>
+
+#include "parquet/util/compiler-util.h"
+#include "parquet/util/bit-util.h"
+#include "parquet/util/logging.h"
+
+namespace parquet_cpp {
+
+// Utility class to write bit/byte streams.  This class can write data to either be
+// bit packed or byte aligned (and a single stream that has a mix of both).
+// This class does not allocate memory.
+class BitWriter {
+ public:
+  // buffer: buffer to write bits to.  Buffer should be preallocated with
+  // 'buffer_len' bytes.
+  BitWriter(uint8_t* buffer, int buffer_len) :
+      buffer_(buffer),
+      max_bytes_(buffer_len) {
+    Clear();
+  }
+
+  void Clear() {
+    buffered_values_ = 0;
+    byte_offset_ = 0;
+    bit_offset_ = 0;
+  }
+
+  // The number of current bytes written, including the current byte (i.e. may include a
+  // fraction of a byte). Includes buffered values.
+  int bytes_written() const { return byte_offset_ + BitUtil::Ceil(bit_offset_, 8); }
+  uint8_t* buffer() const { return buffer_; }
+  int buffer_len() const { return max_bytes_; }
+
+  // Writes a value to buffered_values_, flushing to buffer_ if necessary.  This is bit
+  // packed.  Returns false if there was not enough space. num_bits must be <= 32.
+  bool PutValue(uint64_t v, int num_bits);
+
+  // Writes v to the next aligned byte using num_bytes. If T is larger than num_bytes, the
+  // extra high-order bytes will be ignored. Returns false if there was not enough space.
+  template<typename T>
+  bool PutAligned(T v, int num_bytes);
+
+  // Write a Vlq encoded int to the buffer.  Returns false if there was not enough
+  // room.  The value is written byte aligned.
+  // For more details on vlq:
+  // en.wikipedia.org/wiki/Variable-length_quantity
+  bool PutVlqInt(uint32_t v);
+  bool PutZigZagVlqInt(int32_t v);
+
+  // Get a pointer to the next aligned byte and advance the underlying buffer
+  // by num_bytes.
+  // Returns NULL if there was not enough space.
+  uint8_t* GetNextBytePtr(int num_bytes = 1);
+
+  // Flushes all buffered values to the buffer. Call this when done writing to the buffer.
+  // If 'align' is true, buffered_values_ is reset and any future writes will be written
+  // to the next byte boundary.
+  void Flush(bool align = false);
+
+ private:
+  uint8_t* buffer_;
+  int max_bytes_;
+
+  // Bit-packed values are initially written to this variable before being memcpy'd to
+  // buffer_. This is faster than writing values byte by byte directly to buffer_.
+  uint64_t buffered_values_;
+
+  int byte_offset_;       // Offset in buffer_
+  int bit_offset_;        // Offset in buffered_values_
+};
+
+// Utility class to read bit/byte stream.  This class can read bits or bytes
+// that are either byte aligned or not.  It also has utilities to read multiple
+// bytes in one read (e.g. encoded int).
+class BitReader {
+ public:
+  // 'buffer' is the buffer to read from.  The buffer's length is 'buffer_len'.
+  BitReader(const uint8_t* buffer, int buffer_len) :
+      buffer_(buffer),
+      max_bytes_(buffer_len),
+      byte_offset_(0),
+      bit_offset_(0) {
+    int num_bytes = std::min(8, max_bytes_ - byte_offset_);
+    memcpy(&buffered_values_, buffer_ + byte_offset_, num_bytes);
+  }
+
+  BitReader() : buffer_(NULL), max_bytes_(0) {}
+
+  // Gets the next value from the buffer.  Returns true if 'v' could be read or false if
+  // there are not enough bytes left. num_bits must be <= 32.
+  template<typename T>
+  bool GetValue(int num_bits, T* v);
+
+  // Reads a 'num_bytes'-sized value from the buffer and stores it in 'v'. T needs to be a
+  // little-endian native type and big enough to store 'num_bytes'. The value is assumed
+  // to be byte-aligned so the stream will be advanced to the start of the next byte
+  // before 'v' is read. Returns false if there are not enough bytes left.
+  template<typename T>
+  bool GetAligned(int num_bytes, T* v);
+
+  // Reads a vlq encoded int from the stream.  The encoded int must start at the
+  // beginning of a byte. Return false if there were not enough bytes in the buffer.
+  bool GetVlqInt(uint64_t* v);
+  bool GetZigZagVlqInt(int64_t* v);
+
+  // Returns the number of bytes left in the stream, not including the current byte (i.e.,
+  // there may be an additional fraction of a byte).
+  int bytes_left() { return max_bytes_ - (byte_offset_ + BitUtil::Ceil(bit_offset_, 8)); }
+
+  // Maximum byte length of a vlq encoded int
+  static const int MAX_VLQ_BYTE_LEN = 5;
+
+ private:
+  const uint8_t* buffer_;
+  int max_bytes_;
+
+  // Bytes are memcpy'd from buffer_ and values are read from this variable. This is
+  // faster than reading values byte by byte directly from buffer_.
+  uint64_t buffered_values_;
+
+  int byte_offset_;       // Offset in buffer_
+  int bit_offset_;        // Offset in buffered_values_
+};
+
+} // namespace parquet_cpp
+
+#endif

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/337cf584/src/parquet/util/bit-stream-utils.inline.h
----------------------------------------------------------------------
diff --git a/src/parquet/util/bit-stream-utils.inline.h b/src/parquet/util/bit-stream-utils.inline.h
new file mode 100644
index 0000000..8678e50
--- /dev/null
+++ b/src/parquet/util/bit-stream-utils.inline.h
@@ -0,0 +1,164 @@
+// Copyright 2012 Cloudera Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+
+#ifndef PARQUET_UTIL_BIT_STREAM_UTILS_INLINE_H
+#define PARQUET_UTIL_BIT_STREAM_UTILS_INLINE_H
+
+#include "parquet/util/bit-stream-utils.h"
+
+namespace parquet_cpp {
+
+inline bool BitWriter::PutValue(uint64_t v, int num_bits) {
+  // TODO: revisit this limit if necessary (can be raised to 64 by fixing some edge cases)
+  DCHECK_LE(num_bits, 32);
+  DCHECK_EQ(v >> num_bits, 0) << "v = " << v << ", num_bits = " << num_bits;
+
+  if (UNLIKELY(byte_offset_ * 8 + bit_offset_ + num_bits > max_bytes_ * 8)) return false;
+
+  buffered_values_ |= v << bit_offset_;
+  bit_offset_ += num_bits;
+
+  if (UNLIKELY(bit_offset_ >= 64)) {
+    // Flush buffered_values_ and write out bits of v that did not fit
+    memcpy(buffer_ + byte_offset_, &buffered_values_, 8);
+    buffered_values_ = 0;
+    byte_offset_ += 8;
+    bit_offset_ -= 64;
+    buffered_values_ = v >> (num_bits - bit_offset_);
+  }
+  DCHECK_LT(bit_offset_, 64);
+  return true;
+}
+
+inline void BitWriter::Flush(bool align) {
+  int num_bytes = BitUtil::Ceil(bit_offset_, 8);
+  DCHECK_LE(byte_offset_ + num_bytes, max_bytes_);
+  memcpy(buffer_ + byte_offset_, &buffered_values_, num_bytes);
+
+  if (align) {
+    buffered_values_ = 0;
+    byte_offset_ += num_bytes;
+    bit_offset_ = 0;
+  }
+}
+
+inline uint8_t* BitWriter::GetNextBytePtr(int num_bytes) {
+  Flush(/* align */ true);
+  DCHECK_LE(byte_offset_, max_bytes_);
+  if (byte_offset_ + num_bytes > max_bytes_) return NULL;
+  uint8_t* ptr = buffer_ + byte_offset_;
+  byte_offset_ += num_bytes;
+  return ptr;
+}
+
+template<typename T>
+inline bool BitWriter::PutAligned(T val, int num_bytes) {
+  uint8_t* ptr = GetNextBytePtr(num_bytes);
+  if (ptr == NULL) return false;
+  memcpy(ptr, &val, num_bytes);
+  return true;
+}
+
+inline bool BitWriter::PutVlqInt(uint32_t v) {
+  bool result = true;
+  while ((v & 0xFFFFFF80) != 0L) {
+    result &= PutAligned<uint8_t>((v & 0x7F) | 0x80, 1);
+    v >>= 7;
+  }
+  result &= PutAligned<uint8_t>(v & 0x7F, 1);
+  return result;
+}
+
+inline bool BitWriter::PutZigZagVlqInt(int32_t v) {
+  // Shift in the unsigned domain: left-shifting a negative int is undefined behavior.
+  return PutVlqInt((static_cast<uint32_t>(v) << 1) ^ static_cast<uint32_t>(v >> 31));
+}
+
+template<typename T>
+inline bool BitReader::GetValue(int num_bits, T* v) {
+  // TODO: revisit this limit if necessary
+  DCHECK_LE(num_bits, 32);
+  DCHECK_LE(num_bits, sizeof(T) * 8);
+
+  if (UNLIKELY(byte_offset_ * 8 + bit_offset_ + num_bits > max_bytes_ * 8)) return false;
+
+  *v = BitUtil::TrailingBits(buffered_values_, bit_offset_ + num_bits) >> bit_offset_;
+
+  bit_offset_ += num_bits;
+  if (bit_offset_ >= 64) {
+    byte_offset_ += 8;
+    bit_offset_ -= 64;
+
+    int bytes_remaining = max_bytes_ - byte_offset_;
+    if (LIKELY(bytes_remaining >= 8)) {
+      memcpy(&buffered_values_, buffer_ + byte_offset_, 8);
+    } else {
+      memcpy(&buffered_values_, buffer_ + byte_offset_, bytes_remaining);
+    }
+
+    // Read bits of v that crossed into new buffered_values_
+    *v |= BitUtil::TrailingBits(buffered_values_, bit_offset_)
+          << (num_bits - bit_offset_);
+  }
+  DCHECK_LE(bit_offset_, 64);
+  return true;
+}
+
+template<typename T>
+inline bool BitReader::GetAligned(int num_bytes, T* v) {
+  DCHECK_LE(num_bytes, sizeof(T));
+  int bytes_read = BitUtil::Ceil(bit_offset_, 8);
+  if (UNLIKELY(byte_offset_ + bytes_read + num_bytes > max_bytes_)) return false;
+
+  // Advance byte_offset to next unread byte and read num_bytes
+  byte_offset_ += bytes_read;
+  memcpy(v, buffer_ + byte_offset_, num_bytes);
+  byte_offset_ += num_bytes;
+
+  // Reset buffered_values_
+  bit_offset_ = 0;
+  int bytes_remaining = max_bytes_ - byte_offset_;
+  if (LIKELY(bytes_remaining >= 8)) {
+    memcpy(&buffered_values_, buffer_ + byte_offset_, 8);
+  } else {
+    memcpy(&buffered_values_, buffer_ + byte_offset_, bytes_remaining);
+  }
+  return true;
+}
+
+inline bool BitReader::GetVlqInt(uint64_t* v) {
+  *v = 0;
+  int shift = 0;
+  int num_bytes = 0;
+  uint8_t byte = 0;
+  do {
+    if (shift >= 64 || !GetAligned<uint8_t>(1, &byte)) return false;  // over-long / EOF
+    *v |= static_cast<uint64_t>(byte & 0x7F) << shift;  // widen before shifting
+    shift += 7;
+    DCHECK_LE(++num_bytes, MAX_VLQ_BYTE_LEN);
+  } while ((byte & 0x80) != 0);
+  return true;
+}
+
+inline bool BitReader::GetZigZagVlqInt(int64_t* v) {
+  uint64_t u;
+  if (!GetVlqInt(&u)) return false;
+  *reinterpret_cast<uint64_t*>(v) = (u >> 1) ^ -(u & 1);
+  return true;
+}
+
+} // namespace parquet_cpp
+
+#endif // PARQUET_UTIL_BIT_STREAM_UTILS_INLINE_H

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/337cf584/src/parquet/util/bit-util.h
----------------------------------------------------------------------
diff --git a/src/parquet/util/bit-util.h b/src/parquet/util/bit-util.h
new file mode 100644
index 0000000..3fbdbbe
--- /dev/null
+++ b/src/parquet/util/bit-util.h
@@ -0,0 +1,174 @@
+// Copyright 2012 Cloudera Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+
+#ifndef PARQUET_UTIL_BIT_UTIL_H
+#define PARQUET_UTIL_BIT_UTIL_H
+
+#if defined(__APPLE__)
+  #include <machine/endian.h>
+#else
+  #include <endian.h>
+#endif
+
+#include "parquet/util/compiler-util.h"
+#include "parquet/util/logging.h"
+
+namespace parquet_cpp {
+
+// Utility class to do standard bit tricks
+// TODO: is this in boost or something else like that?
+class BitUtil {
+ public:
+  // Returns the ceil of value/divisor
+  static inline int Ceil(int value, int divisor) {
+    return value / divisor + (value % divisor != 0);
+  }
+
+  // Returns 'value' rounded up to the nearest multiple of 'factor'
+  static inline int RoundUp(int value, int factor) {
+    return (value + (factor - 1)) / factor * factor;
+  }
+
+  // Returns 'value' rounded down to the nearest multiple of 'factor'
+  static inline int RoundDown(int value, int factor) {
+    return (value / factor) * factor;
+  }
+
+  // Returns the number of set bits in x
+  static inline int Popcount(uint64_t x) {
+    int count = 0;
+    for (; x != 0; ++count) x &= x-1;
+    return count;
+  }
+
+  // Returns the 'num_bits' least-significant bits of 'v'.
+  static inline uint64_t TrailingBits(uint64_t v, int num_bits) {
+    if (UNLIKELY(num_bits == 0)) return 0;
+    if (UNLIKELY(num_bits >= 64)) return v;
+    int n = 64 - num_bits;
+    return (v << n) >> n;
+  }
+
+  // Returns ceil(log2(x)) for x > 1. Special cases: Log2(0) == 0 and Log2(1) == 1.
+  // TODO: this could be faster if we use __builtin_clz.  Fix this if this ever shows up
+  // in a hot path.
+  static inline int Log2(uint64_t x) {
+    if (x == 0) return 0;
+    // Compute result = ceil(log2(x))
+    //                = floor(log2(x - 1)) + 1, for x > 1
+    // by finding the position of the most significant bit (1-indexed) of x - 1
+    // (floor(log2(n)) = MSB(n) (0-indexed))
+    --x;
+    int result = 1;
+    while (x >>= 1) ++result;
+    return result;
+  }
+
+  // Returns the minimum number of bits needed to represent 'x' (0 when x == 0).
+  static inline int NumRequiredBits(uint64_t x) {
+    for (int i = 63; i >= 0; --i) {
+      if (x & (static_cast<uint64_t>(1) << i)) return i + 1;  // unsigned 64-bit mask
+    }
+    return 0;
+  }
+
+  // Swaps the byte order (i.e. endianness)
+  static inline int64_t ByteSwap(int64_t value) {
+    return __builtin_bswap64(value);
+  }
+  static inline uint64_t ByteSwap(uint64_t value) {
+    return static_cast<uint64_t>(__builtin_bswap64(value));
+  }
+  static inline int32_t ByteSwap(int32_t value) {
+    return __builtin_bswap32(value);
+  }
+  static inline uint32_t ByteSwap(uint32_t value) {
+    return static_cast<uint32_t>(__builtin_bswap32(value));
+  }
+  static inline int16_t ByteSwap(int16_t value) {
+    return (((value >> 8) & 0xff) | ((value & 0xff) << 8));
+  }
+  static inline uint16_t ByteSwap(uint16_t value) {
+    return static_cast<uint16_t>(ByteSwap(static_cast<int16_t>(value)));
+  }
+
+  // Write the swapped bytes into dst. 'src' and 'dst' cannot overlap.
+  static inline void ByteSwap(void* dst, const void* src, int len) {
+    switch (len) {
+      case 1:
+        *reinterpret_cast<int8_t*>(dst) = *reinterpret_cast<const int8_t*>(src);
+        return;
+      case 2:
+        *reinterpret_cast<int16_t*>(dst) =
+            ByteSwap(*reinterpret_cast<const int16_t*>(src));
+        return;
+      case 4:
+        *reinterpret_cast<int32_t*>(dst) =
+            ByteSwap(*reinterpret_cast<const int32_t*>(src));
+        return;
+      case 8:
+        *reinterpret_cast<int64_t*>(dst) =
+            ByteSwap(*reinterpret_cast<const int64_t*>(src));
+        return;
+      default: break;
+    }
+
+    uint8_t* d = reinterpret_cast<uint8_t*>(dst);
+    const uint8_t* s = reinterpret_cast<const uint8_t*>(src);
+    for (int i = 0; i < len; ++i) {
+      d[i] = s[len - i - 1];
+    }
+  }
+
+  // Converts to big endian format (if not already in big endian) from the
+  // machine's native endian format.
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+  static inline int64_t  ToBigEndian(int64_t value)  { return ByteSwap(value); }
+  static inline uint64_t ToBigEndian(uint64_t value) { return ByteSwap(value); }
+  static inline int32_t  ToBigEndian(int32_t value)  { return ByteSwap(value); }
+  static inline uint32_t ToBigEndian(uint32_t value) { return ByteSwap(value); }
+  static inline int16_t  ToBigEndian(int16_t value)  { return ByteSwap(value); }
+  static inline uint16_t ToBigEndian(uint16_t value) { return ByteSwap(value); }
+#else
+  static inline int64_t  ToBigEndian(int64_t val)  { return val; }
+  static inline uint64_t ToBigEndian(uint64_t val) { return val; }
+  static inline int32_t  ToBigEndian(int32_t val)  { return val; }
+  static inline uint32_t ToBigEndian(uint32_t val) { return val; }
+  static inline int16_t  ToBigEndian(int16_t val)  { return val; }
+  static inline uint16_t ToBigEndian(uint16_t val) { return val; }
+#endif
+
+  // Converts from big endian format to the machine's native endian format.
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+  static inline int64_t  FromBigEndian(int64_t value)  { return ByteSwap(value); }
+  static inline uint64_t FromBigEndian(uint64_t value) { return ByteSwap(value); }
+  static inline int32_t  FromBigEndian(int32_t value)  { return ByteSwap(value); }
+  static inline uint32_t FromBigEndian(uint32_t value) { return ByteSwap(value); }
+  static inline int16_t  FromBigEndian(int16_t value)  { return ByteSwap(value); }
+  static inline uint16_t FromBigEndian(uint16_t value) { return ByteSwap(value); }
+#else
+  static inline int64_t  FromBigEndian(int64_t val)  { return val; }
+  static inline uint64_t FromBigEndian(uint64_t val) { return val; }
+  static inline int32_t  FromBigEndian(int32_t val)  { return val; }
+  static inline uint32_t FromBigEndian(uint32_t val) { return val; }
+  static inline int16_t  FromBigEndian(int16_t val)  { return val; }
+  static inline uint16_t FromBigEndian(uint16_t val) { return val; }
+#endif
+
+};
+
+} // namespace parquet_cpp
+
+#endif // PARQUET_UTIL_BIT_UTIL_H

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/337cf584/src/parquet/util/compiler-util.h
----------------------------------------------------------------------
diff --git a/src/parquet/util/compiler-util.h b/src/parquet/util/compiler-util.h
new file mode 100644
index 0000000..6425247
--- /dev/null
+++ b/src/parquet/util/compiler-util.h
@@ -0,0 +1,37 @@
+// Copyright 2012 Cloudera Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+
+#ifndef PARQUET_UTIL_COMPILER_UTIL_H
+#define PARQUET_UTIL_COMPILER_UTIL_H
+
+// Compiler hint that this branch is likely or unlikely to
+// be taken. Taken from the "What Every Programmer Should Know
+// About Memory" paper.
+// example: if (LIKELY(size > 0)) { ... }
+// example: if (UNLIKELY(!status.ok())) { ... }
+#ifdef LIKELY
+#undef LIKELY
+#endif
+
+#ifdef UNLIKELY
+#undef UNLIKELY
+#endif
+
+#define LIKELY(expr) __builtin_expect(!!(expr), 1)
+#define UNLIKELY(expr) __builtin_expect(!!(expr), 0)
+
+#define PREFETCH(addr) __builtin_prefetch(addr)
+
+#endif // PARQUET_UTIL_COMPILER_UTIL_H

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/337cf584/src/parquet/util/logging.h
----------------------------------------------------------------------
diff --git a/src/parquet/util/logging.h b/src/parquet/util/logging.h
new file mode 100644
index 0000000..c6e6303
--- /dev/null
+++ b/src/parquet/util/logging.h
@@ -0,0 +1,31 @@
+// Copyright 2012 Cloudera Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+
+#ifndef PARQUET_UTIL_LOGGING_H
+#define PARQUET_UTIL_LOGGING_H
+
+#include <iostream>
+
+#define DCHECK(condition) while (false) std::cout
+#define DCHECK_EQ(a, b) while (false) std::cout
+#define DCHECK_NE(a, b) while (false) std::cout
+#define DCHECK_GT(a, b) while (false) std::cout
+#define DCHECK_LT(a, b) while (false) std::cout
+#define DCHECK_GE(a, b) while (false) std::cout
+#define DCHECK_LE(a, b) while (false) std::cout
+// Similar to how glog defines DCHECK for release.
+#define LOG(level) while (false) std::cout
+
+#endif

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/337cf584/src/parquet/util/rle-encoding.h
----------------------------------------------------------------------
diff --git a/src/parquet/util/rle-encoding.h b/src/parquet/util/rle-encoding.h
new file mode 100644
index 0000000..b074d6d
--- /dev/null
+++ b/src/parquet/util/rle-encoding.h
@@ -0,0 +1,419 @@
+// Copyright 2012 Cloudera Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef PARQUET_UTIL_RLE_ENCODING_H
+#define PARQUET_UTIL_RLE_ENCODING_H
+
+#include <math.h>
+#include <algorithm>
+
+#include "parquet/util/compiler-util.h"
+#include "parquet/util/bit-stream-utils.inline.h"
+#include "parquet/util/bit-util.h"
+#include "parquet/util/logging.h"
+
+namespace parquet_cpp {
+
+// Utility classes to do run length encoding (RLE) for fixed bit width values.  If runs
+// are sufficiently long, RLE is used, otherwise, the values are just bit-packed
+// (literal encoding).
+// For both types of runs, there is a byte-aligned indicator which encodes the length
+// of the run and the type of the run.
+// This encoding has the benefit that when there aren't any long enough runs, values
+// are always decoded at fixed (can be precomputed) bit offsets OR both the value and
+// the run length are byte aligned. This allows for very efficient decoding
+// implementations.
+// The encoding is:
+//    encoded-block := run*
+//    run := literal-run | repeated-run
+//    literal-run := literal-indicator < literal bytes >
+//    repeated-run := repeated-indicator < repeated value. padded to byte boundary >
+//    literal-indicator := varint_encode( number_of_groups << 1 | 1)
+//    repeated-indicator := varint_encode( number_of_repetitions << 1 )
+//
+// Each run is preceded by a varint. The varint's least significant bit is
+// used to indicate whether the run is a literal run or a repeated run. The rest
+// of the varint is used to determine the length of the run (eg how many times the
+// value repeats).
+//
+// In the case of literal runs, the run length is always a multiple of 8 (i.e. encode
+// in groups of 8), so that no matter the bit-width of the value, the sequence will end
+// on a byte boundary without padding.
+// Given that we know it is a multiple of 8, we store the number of 8-groups rather than
+// the actual number of encoded ints. (This means that the total number of encoded values
+// can not be determined from the encoded data, since the number of values in the last
+// group may not be a multiple of 8). For the last group of literal runs, we pad
+// the group to 8 with zeros. This allows for 8 at a time decoding on the read side
+// without the need for additional checks.
+//
+// There is a break-even point when it is more storage efficient to do run length
+// encoding.  For 1 bit-width values, that point is 8 values.  They require 2 bytes
+// for both the repeated encoding or the literal encoding.  This value can always
+// be computed based on the bit-width.
+// TODO: think about how to use this for strings.  The bit packing isn't quite the same.
+//
+// Examples with bit-width 1 (eg encoding booleans):
+// ----------------------------------------
+// 100 1s followed by 100 0s:
+// <varint(100 << 1)> <1, padded to 1 byte>  <varint(100 << 1)> <0, padded to 1 byte>
+//  - (total 4 bytes)
+//
+// alternating 1s and 0s (200 total):
+// 200 ints = 25 groups of 8
+// <varint((25 << 1) | 1)> <25 bytes of values, bitpacked>
+// (total 26 bytes, 1 byte overhead)
+//
+
+// Decoder class for RLE encoded data.
+class RleDecoder {
+ public:
+  // Create a decoder object. buffer/buffer_len is the encoded data.
+  // bit_width is the width of each value (before encoding).
+  RleDecoder(const uint8_t* buffer, int buffer_len, int bit_width)
+    : bit_reader_(buffer, buffer_len),
+      bit_width_(bit_width),
+      current_value_(0),
+      repeat_count_(0),
+      literal_count_(0) {
+    DCHECK_GE(bit_width_, 0);
+    DCHECK_LE(bit_width_, 64);
+  }
+
+  RleDecoder() {}
+
+  // Gets the next value.  Returns false if there are no more.
+  template<typename T>
+  bool Get(T* val);
+
+ private:
+  BitReader bit_reader_;
+  int bit_width_;
+  uint64_t current_value_;
+  uint32_t repeat_count_;
+  uint32_t literal_count_;
+};
+
+// Class to incrementally build the rle data.   This class does not allocate any memory.
+// The encoding has two modes: encoding repeated runs and literal runs.
+// If the run is sufficiently short, it is more efficient to encode as a literal run.
+// This class does so by buffering 8 values at a time.  If they are not all the same
+// they are added to the literal run.  If they are the same, they are added to the
+// repeated run.  When we switch modes, the previous run is flushed out.
+class RleEncoder {
+ public:
+  // buffer/buffer_len: preallocated output buffer.
+  // bit_width: max number of bits for value.
+  // TODO: consider adding a min_repeated_run_length so the caller can control
+  // when values should be encoded as repeated runs.  Currently this is derived
+  // based on the bit_width, which can determine a storage optimal choice.
+  // TODO: allow 0 bit_width (and have dict encoder use it)
+  RleEncoder(uint8_t* buffer, int buffer_len, int bit_width)
+    : bit_width_(bit_width),
+      bit_writer_(buffer, buffer_len) {
+    DCHECK_GE(bit_width_, 1);
+    DCHECK_LE(bit_width_, 64);
+    max_run_byte_size_ = MinBufferSize(bit_width);
+    DCHECK_GE(buffer_len, max_run_byte_size_) << "Input buffer not big enough.";
+    Clear();
+  }
+
+  // Returns the minimum buffer size needed to use the encoder for 'bit_width'
+  // This is the maximum length of a single run for 'bit_width'.
+  // It is not valid to pass a buffer less than this length.
+  static int MinBufferSize(int bit_width) {
+    // 1 indicator byte and MAX_VALUES_PER_LITERAL_RUN 'bit_width' values.
+    int max_literal_run_size = 1 +
+        BitUtil::Ceil(MAX_VALUES_PER_LITERAL_RUN * bit_width, 8);
+    // Up to MAX_VLQ_BYTE_LEN indicator and a single 'bit_width' value.
+    int max_repeated_run_size = BitReader::MAX_VLQ_BYTE_LEN + BitUtil::Ceil(bit_width, 8);
+    return std::max(max_literal_run_size, max_repeated_run_size);
+  }
+
+  // Returns the maximum byte size it could take to encode 'num_values'.
+  static int MaxBufferSize(int bit_width, int num_values) {
+    int bytes_per_run = BitUtil::Ceil(bit_width * MAX_VALUES_PER_LITERAL_RUN, 8);
+    int num_runs = BitUtil::Ceil(num_values, MAX_VALUES_PER_LITERAL_RUN);
+    int literal_max_size = num_runs + num_runs * bytes_per_run;
+    int min_run_size = MinBufferSize(bit_width);
+    return std::max(min_run_size, literal_max_size) + min_run_size;
+  }
+
+  // Encode value.  Returns true if the value fits in buffer, false otherwise.
+  // This value must be representable with bit_width_ bits.
+  bool Put(uint64_t value);
+
+  // Flushes any pending values to the underlying buffer.
+  // Returns the total number of bytes written
+  int Flush();
+
+  // Resets all the state in the encoder.
+  void Clear();
+
+  // Returns pointer to underlying buffer
+  uint8_t* buffer() { return bit_writer_.buffer(); }
+  int32_t len() { return bit_writer_.bytes_written(); }
+
+ private:
+  // Flushes any buffered values.  If this is part of a repeated run, this is largely
+  // a no-op.
+  // If it is part of a literal run, this will call FlushLiteralRun, which writes
+  // out the buffered literal values.
+  // If 'done' is true, the current run would be written even if it would normally
+  // have been buffered more.  This should only be called at the end, when the
+  // encoder has received all values even if it would normally continue to be
+  // buffered.
+  void FlushBufferedValues(bool done);
+
+  // Flushes literal values to the underlying buffer.  If update_indicator_byte,
+  // then the current literal run is complete and the indicator byte is updated.
+  void FlushLiteralRun(bool update_indicator_byte);
+
+  // Flushes a repeated run to the underlying buffer.
+  void FlushRepeatedRun();
+
+  // Checks and sets buffer_full_. This must be called after flushing a run to
+  // make sure there are enough bytes remaining to encode the next run.
+  void CheckBufferFull();
+
+  // The maximum number of values in a single literal run
+  // (number of groups encodable by a 1-byte indicator * 8)
+  static const int MAX_VALUES_PER_LITERAL_RUN = (1 << 6) * 8;
+
+  // Number of bits needed to encode the value.
+  const int bit_width_;
+
+  // Underlying buffer.
+  BitWriter bit_writer_;
+
+  // If true, the buffer is full and subsequent Put()'s will fail.
+  bool buffer_full_;
+
+  // The maximum byte size a single run can take.
+  int max_run_byte_size_;
+
+  // We need to buffer at most 8 values for literals.  This happens when the
+  // bit_width is 1 (so 8 values fit in one byte).
+  // TODO: generalize this to other bit widths
+  int64_t buffered_values_[8];
+
+  // Number of values in buffered_values_
+  int num_buffered_values_;
+
+  // The current (also last) value that was written and the count of how
+  // many times in a row that value has been seen.  This is maintained even
+  // if we are in a literal run.  If the repeat_count_ gets high enough, we switch
+  // to encoding repeated runs.
+  int64_t current_value_;
+  int repeat_count_;
+
+  // Number of literals in the current run.  This does not include the literals
+  // that might be in buffered_values_.  Only after we've got a group big enough
+  // can we decide if they should be part of the literal_count_ or repeat_count_
+  int literal_count_;
+
+  // Pointer to a byte in the underlying buffer that stores the indicator byte.
+  // This is reserved as soon as we need a literal run but the value is written
+  // when the literal run is complete.
+  uint8_t* literal_indicator_byte_;
+};
+
+template<typename T>
+inline bool RleDecoder::Get(T* val) {  // Decodes the next value into *val.
+  if (UNLIKELY(literal_count_ == 0 && repeat_count_ == 0)) {
+    // Both counters are zero, so the current run is exhausted.  Read the next
+    // run's indicator, a vlq-encoded int; return false if GetVlqInt fails.
+    uint64_t indicator_value = 0;
+    bool result = bit_reader_.GetVlqInt(&indicator_value);
+    if (!result) return false;
+
+    // lsb indicates if it is a literal run (1) or repeated run (0)
+    bool is_literal = indicator_value & 1;
+    if (is_literal) {
+      literal_count_ = (indicator_value >> 1) * 8;  // upper bits = groups of 8
+    } else {
+      repeat_count_ = indicator_value >> 1;  // upper bits = run length
+      bool result = bit_reader_.GetAligned<T>(  // shadows the outer 'result'
+          BitUtil::Ceil(bit_width_, 8), reinterpret_cast<T*>(&current_value_));
+      DCHECK(result);
+    }
+  }
+
+  if (LIKELY(repeat_count_ > 0)) {
+    *val = current_value_;  // repeated run: hand out the stored value again
+    --repeat_count_;
+  } else {
+    DCHECK(literal_count_ > 0);
+    bool result = bit_reader_.GetValue(bit_width_, val);  // literal: read next value
+    DCHECK(result);  // NOTE(review): only DCHECKed; true is returned regardless
+    --literal_count_;
+  }
+
+  return true;
+}
+
+// Buffers input values 8 at a time; after 8 values FlushBufferedValues() decides
+// literal vs. repeated run.  Returns false (value dropped) once buffer_full_ is set.
+inline bool RleEncoder::Put(uint64_t value) {
+  DCHECK(bit_width_ == 64 || value < (1LL << bit_width_));  // must fit bit_width_
+  if (UNLIKELY(buffer_full_)) return false;
+
+  if (LIKELY(current_value_ == value)) {
+    ++repeat_count_;
+    if (repeat_count_ > 8) {
+      // This is just a continuation of the current run, no need to buffer the
+      // values.
+      // Note that this is the fast path for long repeated runs.
+      return true;
+    }
+  } else {
+    if (repeat_count_ >= 8) {
+      // We had a run that was long enough but it has ended.  Flush the
+      // current repeated run.
+      DCHECK_EQ(literal_count_, 0);
+      FlushRepeatedRun();
+    }
+    repeat_count_ = 1;  // 'value' starts a new candidate run
+    current_value_ = value;
+  }
+
+  buffered_values_[num_buffered_values_] = value;
+  if (++num_buffered_values_ == 8) {  // a full group of 8 is buffered
+    DCHECK_EQ(literal_count_ % 8, 0);
+    FlushBufferedValues(false);
+  }
+  return true;
+}
+
+inline void RleEncoder::FlushLiteralRun(bool update_indicator_byte) {
+  if (literal_indicator_byte_ == NULL) {
+    // The literal indicator byte has not been reserved yet, get one now.
+    literal_indicator_byte_ = bit_writer_.GetNextBytePtr();
+    DCHECK(literal_indicator_byte_ != NULL);
+  }
+
+  // Write all the buffered values as bit packed literals
+  for (int i = 0; i < num_buffered_values_; ++i) {
+    bool success = bit_writer_.PutValue(buffered_values_[i], bit_width_);
+    DCHECK(success) << "There is a bug in using CheckBufferFull()";
+  }
+  num_buffered_values_ = 0;  // the buffer is now empty
+
+  if (update_indicator_byte) {
+    // At this point we need to write the indicator byte for the literal run.
+    // We only reserve one byte, to allow for streaming writes of literal values.
+    // The logic makes sure we flush literal runs often enough to not overrun
+    // the 1 byte.
+    DCHECK_EQ(literal_count_ % 8, 0);
+    int num_groups = literal_count_ / 8;
+    int32_t indicator_value = (num_groups << 1) | 1;  // lsb of 1 marks a literal run
+    DCHECK_EQ(indicator_value & 0xFFFFFF00, 0);  // must fit in the reserved byte
+    *literal_indicator_byte_ = indicator_value;
+    literal_indicator_byte_ = NULL;  // the next literal run reserves a fresh byte
+    literal_count_ = 0;
+    CheckBufferFull();
+  }
+}
+
+inline void RleEncoder::FlushRepeatedRun() {
+  DCHECK_GT(repeat_count_, 0);
+  bool result = true;
+  // Run header is vlq(repeat_count_ << 1); the lsb of 0 indicates a repeated run
+  int32_t indicator_value = repeat_count_ << 1 | 0;
+  result &= bit_writer_.PutVlqInt(indicator_value);
+  result &= bit_writer_.PutAligned(current_value_, BitUtil::Ceil(bit_width_, 8));
+  DCHECK(result);  // NOTE(review): write failures are only DCHECKed here
+  num_buffered_values_ = 0;  // buffered copies are covered by the run just written
+  repeat_count_ = 0;
+  CheckBufferFull();
+}
+
+// Flushes the buffered values, deciding whether to switch run types or continue
+// the current one.  'done' forces a pending literal run's indicator byte out.
+inline void RleEncoder::FlushBufferedValues(bool done) {
+  if (repeat_count_ >= 8) {
+    // Clear the buffered values.  They are part of the repeated run now and we
+    // don't want to flush them out as literals.
+    num_buffered_values_ = 0;
+    if (literal_count_ != 0) {
+      // There was a current literal run.  All the values in it have been flushed
+      // but we still need to update the indicator byte.
+      DCHECK_EQ(literal_count_ % 8, 0);
+      DCHECK_EQ(repeat_count_, 8);
+      FlushLiteralRun(true);
+    }
+    DCHECK_EQ(literal_count_, 0);
+    return;  // the repeated run itself stays pending; nothing is written for it here
+  }
+
+  literal_count_ += num_buffered_values_;  // buffered values join the literal run
+  DCHECK_EQ(literal_count_ % 8, 0);
+  int num_groups = literal_count_ / 8;
+  if (num_groups + 1 >= (1 << 6)) {
+    // We need to start a new literal run because the indicator byte we've reserved
+    // cannot store more values.
+    DCHECK(literal_indicator_byte_ != NULL);
+    FlushLiteralRun(true);
+  } else {
+    FlushLiteralRun(done);
+  }
+  repeat_count_ = 0;  // restart repeat counting after a literal flush
+}
+
+inline int RleEncoder::Flush() {  // Writes out pending state; returns bytes written.
+  if (literal_count_ > 0 || repeat_count_ > 0 || num_buffered_values_ > 0) {
+    bool all_repeat = literal_count_ == 0 &&
+        (repeat_count_ == num_buffered_values_ || num_buffered_values_ == 0);
+    // There is something pending, figure out if it's a repeated or literal run
+    if (repeat_count_ > 0 && all_repeat) {
+      FlushRepeatedRun();
+    } else  {
+      DCHECK_EQ(literal_count_ % 8, 0);
+      // Buffer the last group of literals to 8 by padding with 0s.
+      for (; num_buffered_values_ != 0 && num_buffered_values_ < 8;
+          ++num_buffered_values_) {
+        buffered_values_[num_buffered_values_] = 0;
+      }
+      literal_count_ += num_buffered_values_;  // count includes the zero padding
+      FlushLiteralRun(true);
+      repeat_count_ = 0;
+    }
+  }
+  bit_writer_.Flush();
+  DCHECK_EQ(num_buffered_values_, 0);  // all pending encoder state must be drained
+  DCHECK_EQ(literal_count_, 0);
+  DCHECK_EQ(repeat_count_, 0);
+
+  return bit_writer_.bytes_written();
+}
+
+inline void RleEncoder::CheckBufferFull() {  // Sets buffer_full_; never clears it.
+  int bytes_written = bit_writer_.bytes_written();
+  if (bytes_written + max_run_byte_size_ > bit_writer_.buffer_len()) {
+    buffer_full_ = true;  // no room left for another maximum-size run
+  }
+}
+
+inline void RleEncoder::Clear() {  // Resets run state and calls bit_writer_.Clear().
+  buffer_full_ = false;
+  current_value_ = 0;
+  repeat_count_ = 0;
+  num_buffered_values_ = 0;  // contents of buffered_values_ are left as-is
+  literal_count_ = 0;
+  literal_indicator_byte_ = NULL;  // no indicator byte is reserved any more
+  bit_writer_.Clear();
+}
+
+} // namespace parquet_cpp
+
+#endif // PARQUET_UTIL_RLE_ENCODING_H

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/337cf584/src/parquet/util/stopwatch.h
----------------------------------------------------------------------
diff --git a/src/parquet/util/stopwatch.h b/src/parquet/util/stopwatch.h
new file mode 100644
index 0000000..10ed9e9
--- /dev/null
+++ b/src/parquet/util/stopwatch.h
@@ -0,0 +1,49 @@
+// Copyright 2012 Cloudera Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef PARQUET_UTIL_STOPWATCH_H
+#define PARQUET_UTIL_STOPWATCH_H
+
+#include <iostream>
+#include <stdio.h>
+#include <ctime>
+#include <sys/time.h>
+
+namespace parquet_cpp {
+
+class StopWatch {
+ public:
+  StopWatch() : start_time() {  // zero-init so Stop() before Start() is well-defined
+  }
+
+  void Start() {  // Records the current wall-clock time as the start point.
+    gettimeofday(&start_time, 0);
+  }
+
+  // Returns the wall-clock time elapsed since Start(), in nanoseconds.
+  uint64_t Stop() {
+    struct timeval t_time;
+    gettimeofday(&t_time, 0);
+    // tv_usec is in microseconds: scale it to ns.  LL avoids 32-bit long overflow.
+    return (1000LL * 1000LL * 1000LL * (t_time.tv_sec - start_time.tv_sec)
+                   + 1000LL * (t_time.tv_usec - start_time.tv_usec));
+  }
+
+ private:
+  struct timeval  start_time;
+};
+
+} // namespace parquet_cpp
+
+#endif

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/337cf584/src/util/stopwatch.h
----------------------------------------------------------------------
diff --git a/src/util/stopwatch.h b/src/util/stopwatch.h
deleted file mode 100644
index 145f130..0000000
--- a/src/util/stopwatch.h
+++ /dev/null
@@ -1,49 +0,0 @@
-// Copyright 2012 Cloudera Inc.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef PARQUET_UTIL_STOPWATCH_H
-#define PARQUET_UTIL_STOPWATCH_H
-
-#include <iostream>
-#include <stdio.h>
-#include <ctime>
-#include <sys/time.h>
-
-namespace parquet_cpp {
-
-class StopWatch {
- public:
-  StopWatch() {
-  }
-
-  void Start() {
-    gettimeofday(&start_time, 0);
-  }
-
-  // Returns time in nanoseconds.
-  uint64_t Stop() {
-    struct timeval t_time;
-    gettimeofday(&t_time, 0);
-
-    return (1000L * 1000L * 1000L * (t_time.tv_sec - start_time.tv_sec)
-                   + (t_time.tv_usec - start_time.tv_usec));
-  }
-
- private:
-  struct timeval  start_time;
-};
-
-}
-
-#endif


Mime
View raw message