arrow-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From w...@apache.org
Subject arrow git commit: ARROW-20: Add null_count_ member to array containers, remove nullable_ member
Date Thu, 03 Mar 2016 22:56:05 GMT
Repository: arrow
Updated Branches:
  refs/heads/master e41802085 -> b88b69e20


ARROW-20: Add null_count_ member to array containers, remove nullable_ member

Based off of ARROW-19.

After some contemplation / discussion, I believe it would be better to track nullability at the schema metadata level (if at all!) rather than making it a property of the data structures. This allows the data containers to be "plain ol' data" and thus both nullable data with `null_count == 0` and non-nullable data (implicitly `null_count == 0`) can be treated as semantically equivalent in algorithms code.

If it is deemed useful we can validate (cheaply) that physical data meets the metadata requirements (e.g. non-nullable type metadata cannot be associated with data containers having nulls).

Author: Wes McKinney <wesm@apache.org>

Closes #9 from wesm/ARROW-20 and squashes the following commits:

98be016 [Wes McKinney] ARROW-20: Add null_count_ member to Array containers, remove nullable member


Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/b88b69e2
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/b88b69e2
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/b88b69e2

Branch: refs/heads/master
Commit: b88b69e204b59fa8f19cd20dcb6c091fe9bde3a9
Parents: e418020
Author: Wes McKinney <wesm@apache.org>
Authored: Thu Mar 3 14:56:31 2016 -0800
Committer: Wes McKinney <wesm@apache.org>
Committed: Thu Mar 3 14:56:31 2016 -0800

----------------------------------------------------------------------
 cpp/CMakeLists.txt                    |  2 +-
 cpp/src/arrow/array-test.cc           | 57 +++++++++---------
 cpp/src/arrow/array.cc                | 11 ++--
 cpp/src/arrow/array.h                 | 37 ++++++++----
 cpp/src/arrow/builder.cc              | 35 +++++------
 cpp/src/arrow/builder.h               | 29 ++++-----
 cpp/src/arrow/test-util.h             | 10 ++++
 cpp/src/arrow/type.h                  | 12 ++--
 cpp/src/arrow/types/collection.h      |  2 +-
 cpp/src/arrow/types/datetime.h        | 12 ++--
 cpp/src/arrow/types/json.h            |  4 +-
 cpp/src/arrow/types/list-test.cc      | 12 +---
 cpp/src/arrow/types/list.h            | 46 ++++++++-------
 cpp/src/arrow/types/primitive-test.cc | 34 ++++++-----
 cpp/src/arrow/types/primitive.cc      | 11 ++--
 cpp/src/arrow/types/primitive.h       | 95 +++++++++++++++++-------------
 cpp/src/arrow/types/string-test.cc    | 31 ++++------
 cpp/src/arrow/types/string.cc         |  2 +-
 cpp/src/arrow/types/string.h          | 43 +++++++-------
 cpp/src/arrow/types/struct-test.cc    |  6 +-
 cpp/src/arrow/types/struct.h          |  5 +-
 cpp/src/arrow/types/test-common.h     |  4 +-
 cpp/src/arrow/types/union.h           | 10 ++--
 cpp/src/arrow/util/bit-util.cc        |  4 +-
 cpp/src/arrow/util/bit-util.h         |  4 +-
 25 files changed, 265 insertions(+), 253 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/arrow/blob/b88b69e2/cpp/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index d2c840a..f0eb73d 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -92,7 +92,7 @@ endif()
 # For CMAKE_BUILD_TYPE=Release
 #   -O3: Enable all compiler optimizations
 #   -g: Enable symbols for profiler tools (TODO: remove for shipping)
-set(CXX_FLAGS_DEBUG "-ggdb")
+set(CXX_FLAGS_DEBUG "-ggdb -O0")
 set(CXX_FLAGS_FASTDEBUG "-ggdb -O1")
 set(CXX_FLAGS_RELEASE "-O3 -g -DNDEBUG")
 

http://git-wip-us.apache.org/repos/asf/arrow/blob/b88b69e2/cpp/src/arrow/array-test.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/array-test.cc b/cpp/src/arrow/array-test.cc
index 16afb9b..df827aa 100644
--- a/cpp/src/arrow/array-test.cc
+++ b/cpp/src/arrow/array-test.cc
@@ -20,7 +20,6 @@
 #include <cstdint>
 #include <cstdlib>
 #include <memory>
-#include <string>
 #include <vector>
 
 #include "arrow/array.h"
@@ -32,60 +31,60 @@
 #include "arrow/util/memory-pool.h"
 #include "arrow/util/status.h"
 
-using std::string;
-using std::vector;
-
 namespace arrow {
 
 static TypePtr int32 = TypePtr(new Int32Type());
-static TypePtr int32_nn = TypePtr(new Int32Type(false));
-
 
 class TestArray : public ::testing::Test {
  public:
   void SetUp() {
     pool_ = GetDefaultMemoryPool();
-
-    auto data = std::make_shared<PoolBuffer>(pool_);
-    auto nulls = std::make_shared<PoolBuffer>(pool_);
-
-    ASSERT_OK(data->Resize(400));
-    ASSERT_OK(nulls->Resize(128));
-
-    arr_.reset(new Int32Array(100, data, nulls));
   }
 
  protected:
   MemoryPool* pool_;
-  std::unique_ptr<Int32Array> arr_;
 };
 
 
-TEST_F(TestArray, TestNullable) {
-  std::shared_ptr<Buffer> tmp = arr_->data();
-  std::unique_ptr<Int32Array> arr_nn(new Int32Array(100, tmp));
+TEST_F(TestArray, TestNullCount) {
+  auto data = std::make_shared<PoolBuffer>(pool_);
+  auto nulls = std::make_shared<PoolBuffer>(pool_);
 
-  ASSERT_TRUE(arr_->nullable());
-  ASSERT_FALSE(arr_nn->nullable());
+  std::unique_ptr<Int32Array> arr(new Int32Array(100, data, 10, nulls));
+  ASSERT_EQ(10, arr->null_count());
+
+  std::unique_ptr<Int32Array> arr_no_nulls(new Int32Array(100, data));
+  ASSERT_EQ(0, arr_no_nulls->null_count());
 }
 
 
 TEST_F(TestArray, TestLength) {
-  ASSERT_EQ(arr_->length(), 100);
+  auto data = std::make_shared<PoolBuffer>(pool_);
+  std::unique_ptr<Int32Array> arr(new Int32Array(100, data));
+  ASSERT_EQ(arr->length(), 100);
 }
 
 TEST_F(TestArray, TestIsNull) {
-  vector<uint8_t> nulls = {1, 0, 1, 1, 0, 1, 0, 0,
-                           1, 0, 1, 1, 0, 1, 0, 0,
-                           1, 0, 1, 1, 0, 1, 0, 0,
-                           1, 0, 1, 1, 0, 1, 0, 0,
-                           1, 0, 0, 1};
+  std::vector<uint8_t> nulls = {1, 0, 1, 1, 0, 1, 0, 0,
+                                1, 0, 1, 1, 0, 1, 0, 0,
+                                1, 0, 1, 1, 0, 1, 0, 0,
+                                1, 0, 1, 1, 0, 1, 0, 0,
+                                1, 0, 0, 1};
+  int32_t null_count = 0;
+  for (uint8_t x : nulls) {
+    if (x > 0) ++null_count;
+  }
 
-  std::shared_ptr<Buffer> null_buf = bytes_to_null_buffer(nulls.data(), nulls.size());
+  std::shared_ptr<Buffer> null_buf = bytes_to_null_buffer(nulls.data(),
+      nulls.size());
   std::unique_ptr<Array> arr;
-  arr.reset(new Array(int32, nulls.size(), null_buf));
+  arr.reset(new Array(int32, nulls.size(), null_count, null_buf));
+
+  ASSERT_EQ(null_count, arr->null_count());
+  ASSERT_EQ(5, null_buf->size());
+
+  ASSERT_TRUE(arr->nulls()->Equals(*null_buf.get()));
 
-  ASSERT_EQ(null_buf->size(), 5);
   for (size_t i = 0; i < nulls.size(); ++i) {
     ASSERT_EQ(static_cast<bool>(nulls[i]), arr->IsNull(i));
   }

http://git-wip-us.apache.org/repos/asf/arrow/blob/b88b69e2/cpp/src/arrow/array.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/array.cc b/cpp/src/arrow/array.cc
index 1726a2f..ee4ef66 100644
--- a/cpp/src/arrow/array.cc
+++ b/cpp/src/arrow/array.cc
@@ -17,6 +17,8 @@
 
 #include "arrow/array.h"
 
+#include <cstdint>
+
 #include "arrow/util/buffer.h"
 
 namespace arrow {
@@ -24,18 +26,17 @@ namespace arrow {
 // ----------------------------------------------------------------------
 // Base array class
 
-Array::Array(const TypePtr& type, int64_t length,
+Array::Array(const TypePtr& type, int32_t length, int32_t null_count,
     const std::shared_ptr<Buffer>& nulls) {
-  Init(type, length, nulls);
+  Init(type, length, null_count, nulls);
 }
 
-void Array::Init(const TypePtr& type, int64_t length,
+void Array::Init(const TypePtr& type, int32_t length, int32_t null_count,
     const std::shared_ptr<Buffer>& nulls) {
   type_ = type;
   length_ = length;
+  null_count_ = null_count;
   nulls_ = nulls;
-
-  nullable_ = type->nullable;
   if (nulls_) {
     null_bits_ = nulls_->data();
   }

http://git-wip-us.apache.org/repos/asf/arrow/blob/b88b69e2/cpp/src/arrow/array.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h
index 0eaa28d..3d748c1 100644
--- a/cpp/src/arrow/array.h
+++ b/cpp/src/arrow/array.h
@@ -30,38 +30,49 @@ namespace arrow {
 class Buffer;
 
 // Immutable data array with some logical type and some length. Any memory is
-// owned by the respective Buffer instance (or its parents). May or may not be
-// nullable.
+// owned by the respective Buffer instance (or its parents).
 //
-// The base class only has a null array (if the data type is nullable)
+// The base class is only required to have a nulls buffer if the null count is
+// greater than 0
 //
 // Any buffers used to initialize the array have their references "stolen". If
 // you wish to use the buffer beyond the lifetime of the array, you need to
 // explicitly increment its reference count
 class Array {
  public:
-  Array() : length_(0), nulls_(nullptr), null_bits_(nullptr) {}
-  Array(const TypePtr& type, int64_t length,
+  Array() :
+      null_count_(0),
+      length_(0),
+      nulls_(nullptr),
+      null_bits_(nullptr) {}
+
+  Array(const TypePtr& type, int32_t length, int32_t null_count = 0,
       const std::shared_ptr<Buffer>& nulls = nullptr);
 
   virtual ~Array() {}
 
-  void Init(const TypePtr& type, int64_t length, const std::shared_ptr<Buffer>& nulls);
+  void Init(const TypePtr& type, int32_t length, int32_t null_count,
+      const std::shared_ptr<Buffer>& nulls);
 
-  // Determine if a slot if null. For inner loops. Does *not* boundscheck
-  bool IsNull(int64_t i) const {
-    return nullable_ && util::get_bit(null_bits_, i);
+  // Determine if a slot is null. For inner loops. Does *not* boundscheck
+  bool IsNull(int i) const {
+    return null_count_ > 0 && util::get_bit(null_bits_, i);
   }
 
-  int64_t length() const { return length_;}
-  bool nullable() const { return nullable_;}
+  int32_t length() const { return length_;}
+  int32_t null_count() const { return null_count_;}
+
   const TypePtr& type() const { return type_;}
   TypeEnum type_enum() const { return type_->type;}
 
+  const std::shared_ptr<Buffer>& nulls() const {
+    return nulls_;
+  }
+
  protected:
   TypePtr type_;
-  bool nullable_;
-  int64_t length_;
+  int32_t null_count_;
+  int32_t length_;
 
   std::shared_ptr<Buffer> nulls_;
   const uint8_t* null_bits_;

http://git-wip-us.apache.org/repos/asf/arrow/blob/b88b69e2/cpp/src/arrow/builder.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc
index cb85067..ba70add 100644
--- a/cpp/src/arrow/builder.cc
+++ b/cpp/src/arrow/builder.cc
@@ -25,34 +25,29 @@
 
 namespace arrow {
 
-Status ArrayBuilder::Init(int64_t capacity) {
+Status ArrayBuilder::Init(int32_t capacity) {
   capacity_ = capacity;
-
-  if (nullable_) {
-    int64_t to_alloc = util::ceil_byte(capacity) / 8;
-    nulls_ = std::make_shared<PoolBuffer>(pool_);
-    RETURN_NOT_OK(nulls_->Resize(to_alloc));
-    null_bits_ = nulls_->mutable_data();
-    memset(null_bits_, 0, to_alloc);
-  }
+  int32_t to_alloc = util::ceil_byte(capacity) / 8;
+  nulls_ = std::make_shared<PoolBuffer>(pool_);
+  RETURN_NOT_OK(nulls_->Resize(to_alloc));
+  null_bits_ = nulls_->mutable_data();
+  memset(null_bits_, 0, to_alloc);
   return Status::OK();
 }
 
-Status ArrayBuilder::Resize(int64_t new_bits) {
-  if (nullable_) {
-    int64_t new_bytes = util::ceil_byte(new_bits) / 8;
-    int64_t old_bytes = nulls_->size();
-    RETURN_NOT_OK(nulls_->Resize(new_bytes));
-    null_bits_ = nulls_->mutable_data();
-    if (old_bytes < new_bytes) {
-      memset(null_bits_ + old_bytes, 0, new_bytes - old_bytes);
-    }
+Status ArrayBuilder::Resize(int32_t new_bits) {
+  int32_t new_bytes = util::ceil_byte(new_bits) / 8;
+  int32_t old_bytes = nulls_->size();
+  RETURN_NOT_OK(nulls_->Resize(new_bytes));
+  null_bits_ = nulls_->mutable_data();
+  if (old_bytes < new_bytes) {
+    memset(null_bits_ + old_bytes, 0, new_bytes - old_bytes);
   }
   return Status::OK();
 }
 
-Status ArrayBuilder::Advance(int64_t elements) {
-  if (nullable_ && length_ + elements > capacity_) {
+Status ArrayBuilder::Advance(int32_t elements) {
+  if (length_ + elements > capacity_) {
     return Status::Invalid("Builder must be expanded");
   }
   length_ += elements;

http://git-wip-us.apache.org/repos/asf/arrow/blob/b88b69e2/cpp/src/arrow/builder.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/builder.h b/cpp/src/arrow/builder.h
index 456bb04..491b913 100644
--- a/cpp/src/arrow/builder.h
+++ b/cpp/src/arrow/builder.h
@@ -32,7 +32,7 @@ class Array;
 class MemoryPool;
 class PoolBuffer;
 
-static constexpr int64_t MIN_BUILDER_CAPACITY = 1 << 8;
+static constexpr int32_t MIN_BUILDER_CAPACITY = 1 << 8;
 
 // Base class for all data array builders
 class ArrayBuilder {
@@ -40,8 +40,9 @@ class ArrayBuilder {
   explicit ArrayBuilder(MemoryPool* pool, const TypePtr& type) :
       pool_(pool),
       type_(type),
-      nullable_(type_->nullable),
-      nulls_(nullptr), null_bits_(nullptr),
+      nulls_(nullptr),
+      null_count_(0),
+      null_bits_(nullptr),
       length_(0),
       capacity_(0) {}
 
@@ -57,21 +58,21 @@ class ArrayBuilder {
     return children_.size();
   }
 
-  int64_t length() const { return length_;}
-  int64_t capacity() const { return capacity_;}
-  bool nullable() const { return nullable_;}
+  int32_t length() const { return length_;}
+  int32_t null_count() const { return null_count_;}
+  int32_t capacity() const { return capacity_;}
 
   // Allocates requires memory at this level, but children need to be
   // initialized independently
-  Status Init(int64_t capacity);
+  Status Init(int32_t capacity);
 
-  // Resizes the nulls array (if nullable)
-  Status Resize(int64_t new_bits);
+  // Resizes the nulls array
+  Status Resize(int32_t new_bits);
 
   // For cases where raw data was memcpy'd into the internal buffers, allows us
   // to advance the length of the builder. It is your responsibility to use
   // this function responsibly.
-  Status Advance(int64_t elements);
+  Status Advance(int32_t elements);
 
   const std::shared_ptr<PoolBuffer>& nulls() const { return nulls_;}
 
@@ -83,15 +84,15 @@ class ArrayBuilder {
   MemoryPool* pool_;
 
   TypePtr type_;
-  bool nullable_;
 
-  // If the type is not nullable, then null_ is nullptr after initialization
+  // When nulls are first appended to the builder, the null bitmap is allocated
   std::shared_ptr<PoolBuffer> nulls_;
+  int32_t null_count_;
   uint8_t* null_bits_;
 
   // Array length, so far. Also, the index of the next element to be added
-  int64_t length_;
-  int64_t capacity_;
+  int32_t length_;
+  int32_t capacity_;
 
   // Child value array builders. These are owned by this class
   std::vector<std::unique_ptr<ArrayBuilder> > children_;

http://git-wip-us.apache.org/repos/asf/arrow/blob/b88b69e2/cpp/src/arrow/test-util.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/test-util.h b/cpp/src/arrow/test-util.h
index 2233a4f..0898c8e 100644
--- a/cpp/src/arrow/test-util.h
+++ b/cpp/src/arrow/test-util.h
@@ -84,6 +84,16 @@ void random_nulls(int64_t n, double pct_null, std::vector<bool>* nulls) {
   }
 }
 
+static inline int null_count(const std::vector<uint8_t>& nulls) {
+  int result = 0;
+  for (size_t i = 0; i < nulls.size(); ++i) {
+    if (nulls[i] > 0) {
+      ++result;
+    }
+  }
+  return result;
+}
+
 std::shared_ptr<Buffer> bytes_to_null_buffer(uint8_t* bytes, int length) {
   std::shared_ptr<Buffer> out;
 

http://git-wip-us.apache.org/repos/asf/arrow/blob/b88b69e2/cpp/src/arrow/type.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h
index 220f99f..12f1960 100644
--- a/cpp/src/arrow/type.h
+++ b/cpp/src/arrow/type.h
@@ -57,11 +57,9 @@ struct LayoutType {
 // either a primitive physical type (bytes or bits of some fixed size), a
 // nested type consisting of other data types, or another data type (e.g. a
 // timestamp encoded as an int64)
-//
-// Any data type can be nullable
 
 enum class TypeEnum: char {
-  // A degerate NULL type represented as 0 bytes/bits
+  // A degenerate NULL type represented as 0 bytes/bits
   NA = 0,
 
   // Little-endian integer types
@@ -138,14 +136,12 @@ enum class TypeEnum: char {
 
 struct DataType {
   TypeEnum type;
-  bool nullable;
 
-  explicit DataType(TypeEnum type, bool nullable = true)
-      : type(type), nullable(nullable) {}
+  explicit DataType(TypeEnum type)
+      : type(type) {}
 
   virtual bool Equals(const DataType* other) {
-    return (this == other) || (this->type == other->type &&
-        this->nullable == other->nullable);
+    return this == other || this->type == other->type;
   }
 
   virtual std::string ToString() const = 0;

http://git-wip-us.apache.org/repos/asf/arrow/blob/b88b69e2/cpp/src/arrow/types/collection.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/types/collection.h b/cpp/src/arrow/types/collection.h
index 59ba614..094b63f 100644
--- a/cpp/src/arrow/types/collection.h
+++ b/cpp/src/arrow/types/collection.h
@@ -29,7 +29,7 @@ template <TypeEnum T>
 struct CollectionType : public DataType {
   std::vector<TypePtr> child_types_;
 
-  explicit CollectionType(bool nullable = true) : DataType(T, nullable) {}
+  CollectionType() : DataType(T) {}
 
   const TypePtr& child(int i) const {
     return child_types_[i];

http://git-wip-us.apache.org/repos/asf/arrow/blob/b88b69e2/cpp/src/arrow/types/datetime.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/types/datetime.h b/cpp/src/arrow/types/datetime.h
index b4d6252..d90883c 100644
--- a/cpp/src/arrow/types/datetime.h
+++ b/cpp/src/arrow/types/datetime.h
@@ -31,12 +31,12 @@ struct DateType : public DataType {
 
   Unit unit;
 
-  explicit DateType(Unit unit = Unit::DAY, bool nullable = true)
-      : DataType(TypeEnum::DATE, nullable),
+  explicit DateType(Unit unit = Unit::DAY)
+      : DataType(TypeEnum::DATE),
         unit(unit) {}
 
   DateType(const DateType& other)
-      : DateType(other.unit, other.nullable) {}
+      : DateType(other.unit) {}
 
   static char const *name() {
     return "date";
@@ -58,12 +58,12 @@ struct TimestampType : public DataType {
 
   Unit unit;
 
-  explicit TimestampType(Unit unit = Unit::MILLI, bool nullable = true)
-      : DataType(TypeEnum::TIMESTAMP, nullable),
+  explicit TimestampType(Unit unit = Unit::MILLI)
+      : DataType(TypeEnum::TIMESTAMP),
         unit(unit) {}
 
   TimestampType(const TimestampType& other)
-      : TimestampType(other.unit, other.nullable) {}
+      : TimestampType(other.unit) {}
 
   static char const *name() {
     return "timestamp";

http://git-wip-us.apache.org/repos/asf/arrow/blob/b88b69e2/cpp/src/arrow/types/json.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/types/json.h b/cpp/src/arrow/types/json.h
index 91fd132..6c2b097 100644
--- a/cpp/src/arrow/types/json.h
+++ b/cpp/src/arrow/types/json.h
@@ -28,8 +28,8 @@ struct JSONScalar : public DataType {
   static TypePtr dense_type;
   static TypePtr sparse_type;
 
-  explicit JSONScalar(bool dense = true, bool nullable = true)
-      : DataType(TypeEnum::JSON_SCALAR, nullable),
+  explicit JSONScalar(bool dense = true)
+      : DataType(TypeEnum::JSON_SCALAR),
         dense(dense) {}
 };
 

http://git-wip-us.apache.org/repos/asf/arrow/blob/b88b69e2/cpp/src/arrow/types/list-test.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/types/list-test.cc b/cpp/src/arrow/types/list-test.cc
index abfc8a3..1d9ddbe 100644
--- a/cpp/src/arrow/types/list-test.cc
+++ b/cpp/src/arrow/types/list-test.cc
@@ -44,11 +44,7 @@ TEST(TypesTest, TestListType) {
   std::shared_ptr<DataType> vt = std::make_shared<UInt8Type>();
 
   ListType list_type(vt);
-  ListType list_type_nn(vt, false);
-
   ASSERT_EQ(list_type.type, TypeEnum::LIST);
-  ASSERT_TRUE(list_type.nullable);
-  ASSERT_FALSE(list_type_nn.nullable);
 
   ASSERT_EQ(list_type.name(), string("list"));
   ASSERT_EQ(list_type.ToString(), string("list<uint8>"));
@@ -132,8 +128,8 @@ TEST_F(TestListBuilder, TestBasics) {
 
   Done();
 
-  ASSERT_TRUE(result_->nullable());
-  ASSERT_TRUE(result_->values()->nullable());
+  ASSERT_EQ(1, result_->null_count());
+  ASSERT_EQ(0, result_->values()->null_count());
 
   ASSERT_EQ(3, result_->length());
   vector<int32_t> ex_offsets = {0, 3, 3, 7};
@@ -153,10 +149,6 @@ TEST_F(TestListBuilder, TestBasics) {
   }
 }
 
-TEST_F(TestListBuilder, TestBasicsNonNullable) {
-}
-
-
 TEST_F(TestListBuilder, TestZeroLength) {
   // All buffers are null
   Done();

http://git-wip-us.apache.org/repos/asf/arrow/blob/b88b69e2/cpp/src/arrow/types/list.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/types/list.h b/cpp/src/arrow/types/list.h
index 4ca0f13..4190b53 100644
--- a/cpp/src/arrow/types/list.h
+++ b/cpp/src/arrow/types/list.h
@@ -40,8 +40,8 @@ struct ListType : public DataType {
   // List can contain any other logical value type
   TypePtr value_type;
 
-  explicit ListType(const TypePtr& value_type, bool nullable = true)
-      : DataType(TypeEnum::LIST, nullable),
+  explicit ListType(const TypePtr& value_type)
+      : DataType(TypeEnum::LIST),
         value_type(value_type) {}
 
   static char const *name() {
@@ -56,21 +56,25 @@ class ListArray : public Array {
  public:
   ListArray() : Array(), offset_buf_(nullptr), offsets_(nullptr) {}
 
-  ListArray(const TypePtr& type, int64_t length, std::shared_ptr<Buffer> offsets,
-      const ArrayPtr& values, std::shared_ptr<Buffer> nulls = nullptr) {
-    Init(type, length, offsets, values, nulls);
+  ListArray(const TypePtr& type, int32_t length, std::shared_ptr<Buffer> offsets,
+      const ArrayPtr& values,
+      int32_t null_count = 0,
+      std::shared_ptr<Buffer> nulls = nullptr) {
+    Init(type, length, offsets, values, null_count, nulls);
   }
 
   virtual ~ListArray() {}
 
-  void Init(const TypePtr& type, int64_t length, std::shared_ptr<Buffer> offsets,
-      const ArrayPtr& values, std::shared_ptr<Buffer> nulls = nullptr) {
+  void Init(const TypePtr& type, int32_t length, std::shared_ptr<Buffer> offsets,
+      const ArrayPtr& values,
+      int32_t null_count = 0,
+      std::shared_ptr<Buffer> nulls = nullptr) {
     offset_buf_ = offsets;
     offsets_ = offsets == nullptr? nullptr :
       reinterpret_cast<const int32_t*>(offset_buf_->data());
 
     values_ = values;
-    Array::Init(type, length, nulls);
+    Array::Init(type, length, null_count, nulls);
   }
 
   // Return a shared pointer in case the requestor desires to share ownership
@@ -108,7 +112,7 @@ class ListBuilder : public Int32Builder {
     value_builder_.reset(value_builder);
   }
 
-  Status Init(int64_t elements) {
+  Status Init(int32_t elements) {
     // One more than requested.
     //
     // XXX: This is slightly imprecise, because we might trigger null mask
@@ -116,7 +120,7 @@ class ListBuilder : public Int32Builder {
     return Int32Builder::Init(elements + 1);
   }
 
-  Status Resize(int64_t capacity) {
+  Status Resize(int32_t capacity) {
     // Need space for the end offset
     RETURN_NOT_OK(Int32Builder::Resize(capacity + 1));
 
@@ -129,18 +133,15 @@ class ListBuilder : public Int32Builder {
   //
   // If passed, null_bytes is of equal length to values, and any nonzero byte
   // will be considered as a null for that slot
-  Status Append(T* values, int64_t length, uint8_t* null_bytes = nullptr) {
+  Status Append(T* values, int32_t length, uint8_t* null_bytes = nullptr) {
     if (length_ + length > capacity_) {
-      int64_t new_capacity = util::next_power2(length_ + length);
+      int32_t new_capacity = util::next_power2(length_ + length);
       RETURN_NOT_OK(Resize(new_capacity));
     }
     memcpy(raw_buffer() + length_, values, length * elsize_);
 
-    if (nullable_ && null_bytes != nullptr) {
-      // If null_bytes is all not null, then none of the values are null
-      for (int i = 0; i < length; ++i) {
-        util::set_bit(null_bits_, length_ + i, static_cast<bool>(null_bytes[i]));
-      }
+    if (null_bytes != nullptr) {
+      AppendNulls(null_bytes, length);
     }
 
     length_ += length;
@@ -159,9 +160,10 @@ class ListBuilder : public Int32Builder {
       raw_buffer()[length_] = child_values->length();
     }
 
-    out->Init(type_, length_, values_, ArrayPtr(child_values), nulls_);
+    out->Init(type_, length_, values_, ArrayPtr(child_values),
+        null_count_, nulls_);
     values_ = nulls_ = nullptr;
-    capacity_ = length_ = 0;
+    capacity_ = length_ = null_count_ = 0;
     return Status::OK();
   }
 
@@ -181,10 +183,10 @@ class ListBuilder : public Int32Builder {
       // If the capacity was not already a multiple of 2, do so here
       RETURN_NOT_OK(Resize(util::next_power2(capacity_ + 1)));
     }
-    if (nullable_) {
-      util::set_bit(null_bits_, length_, is_null);
+    if (is_null) {
+      ++null_count_;
+      util::set_bit(null_bits_, length_);
     }
-
     raw_buffer()[length_++] = value_builder_->length();
     return Status::OK();
   }

http://git-wip-us.apache.org/repos/asf/arrow/blob/b88b69e2/cpp/src/arrow/types/primitive-test.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/types/primitive-test.cc b/cpp/src/arrow/types/primitive-test.cc
index 3484294..9363443 100644
--- a/cpp/src/arrow/types/primitive-test.cc
+++ b/cpp/src/arrow/types/primitive-test.cc
@@ -53,15 +53,12 @@ TEST(TypesTest, TestBytesType) {
 #define PRIMITIVE_TEST(KLASS, ENUM, NAME)       \
   TEST(TypesTest, TestPrimitive_##ENUM) {       \
     KLASS tp;                                   \
-    KLASS tp_nn(false);                         \
                                                 \
     ASSERT_EQ(tp.type, TypeEnum::ENUM);         \
     ASSERT_EQ(tp.name(), string(NAME));         \
-    ASSERT_TRUE(tp.nullable);                   \
-    ASSERT_FALSE(tp_nn.nullable);               \
                                                 \
-    KLASS tp_copy = tp_nn;                      \
-    ASSERT_FALSE(tp_copy.nullable);             \
+    KLASS tp_copy = tp;                         \
+    ASSERT_EQ(tp_copy.type, TypeEnum::ENUM);    \
   }
 
 PRIMITIVE_TEST(Int8Type, INT8, "int8");
@@ -100,17 +97,16 @@ class TestPrimitiveBuilder : public TestBuilder {
     TestBuilder::SetUp();
 
     type_ = Attrs::type();
-    type_nn_ = Attrs::type(false);
 
     ArrayBuilder* tmp;
     ASSERT_OK(make_builder(pool_, type_, &tmp));
     builder_.reset(static_cast<BuilderType*>(tmp));
 
-    ASSERT_OK(make_builder(pool_, type_nn_, &tmp));
+    ASSERT_OK(make_builder(pool_, type_, &tmp));
     builder_nn_.reset(static_cast<BuilderType*>(tmp));
   }
 
-  void RandomData(int64_t N, double pct_null = 0.1) {
+  void RandomData(int N, double pct_null = 0.1) {
     Attrs::draw(N, &draws_);
     random_nulls(N, pct_null, &nulls_);
   }
@@ -118,28 +114,33 @@ class TestPrimitiveBuilder : public TestBuilder {
   void CheckNullable() {
     ArrayType result;
     ArrayType expected;
-    int64_t size = builder_->length();
+    int size = builder_->length();
 
-    auto ex_data = std::make_shared<Buffer>(reinterpret_cast<uint8_t*>(draws_.data()),
+    auto ex_data = std::make_shared<Buffer>(
+        reinterpret_cast<uint8_t*>(draws_.data()),
         size * sizeof(T));
 
     auto ex_nulls = bytes_to_null_buffer(nulls_.data(), size);
 
-    expected.Init(size, ex_data, ex_nulls);
+    int32_t ex_null_count = null_count(nulls_);
+
+    expected.Init(size, ex_data, ex_null_count, ex_nulls);
     ASSERT_OK(builder_->Transfer(&result));
 
     // Builder is now reset
     ASSERT_EQ(0, builder_->length());
     ASSERT_EQ(0, builder_->capacity());
+    ASSERT_EQ(0, builder_->null_count());
     ASSERT_EQ(nullptr, builder_->buffer());
 
     ASSERT_TRUE(result.Equals(expected));
+    ASSERT_EQ(ex_null_count, result.null_count());
   }
 
   void CheckNonNullable() {
     ArrayType result;
     ArrayType expected;
-    int64_t size = builder_nn_->length();
+    int size = builder_nn_->length();
 
     auto ex_data = std::make_shared<Buffer>(reinterpret_cast<uint8_t*>(draws_.data()),
         size * sizeof(T));
@@ -153,6 +154,7 @@ class TestPrimitiveBuilder : public TestBuilder {
     ASSERT_EQ(nullptr, builder_nn_->buffer());
 
     ASSERT_TRUE(result.Equals(expected));
+    ASSERT_EQ(0, result.null_count());
   }
 
  protected:
@@ -171,14 +173,14 @@ class TestPrimitiveBuilder : public TestBuilder {
   typedef CapType##Type Type;                   \
   typedef c_type T;                             \
                                                 \
-  static TypePtr type(bool nullable = true) {   \
-    return TypePtr(new Type(nullable));         \
+  static TypePtr type() {                       \
+    return TypePtr(new Type());                 \
   }
 
 #define PINT_DECL(CapType, c_type, LOWER, UPPER)    \
   struct P##CapType {                               \
     PTYPE_DECL(CapType, c_type);                    \
-    static void draw(int64_t N, vector<T>* draws) {  \
+    static void draw(int N, vector<T>* draws) {  \
       randint<T>(N, LOWER, UPPER, draws);           \
     }                                               \
   }
@@ -208,7 +210,7 @@ TYPED_TEST_CASE(TestPrimitiveBuilder, Primitives);
 TYPED_TEST(TestPrimitiveBuilder, TestInit) {
   DECL_T();
 
-  int64_t n = 1000;
+  int n = 1000;
   ASSERT_OK(this->builder_->Init(n));
   ASSERT_EQ(n, this->builder_->capacity());
   ASSERT_EQ(n * sizeof(T), this->builder_->buffer()->size());

http://git-wip-us.apache.org/repos/asf/arrow/blob/b88b69e2/cpp/src/arrow/types/primitive.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/types/primitive.cc b/cpp/src/arrow/types/primitive.cc
index 2612e8c..c86260b 100644
--- a/cpp/src/arrow/types/primitive.cc
+++ b/cpp/src/arrow/types/primitive.cc
@@ -26,20 +26,23 @@ namespace arrow {
 // ----------------------------------------------------------------------
 // Primitive array base
 
-void PrimitiveArray::Init(const TypePtr& type, int64_t length,
+void PrimitiveArray::Init(const TypePtr& type, int32_t length,
     const std::shared_ptr<Buffer>& data,
+    int32_t null_count,
     const std::shared_ptr<Buffer>& nulls) {
-  Array::Init(type, length, nulls);
+  Array::Init(type, length, null_count, nulls);
   data_ = data;
   raw_data_ = data == nullptr? nullptr : data_->data();
 }
 
 bool PrimitiveArray::Equals(const PrimitiveArray& other) const {
   if (this == &other) return true;
-  if (type_->nullable != other.type_->nullable) return false;
+  if (null_count_ != other.null_count_) {
+    return false;
+  }
 
   bool equal_data = data_->Equals(*other.data_, length_);
-  if (type_->nullable) {
+  if (null_count_ > 0) {
     return equal_data &&
       nulls_->Equals(*other.nulls_, util::ceil_byte(length_) / 8);
   } else {

http://git-wip-us.apache.org/repos/asf/arrow/blob/b88b69e2/cpp/src/arrow/types/primitive.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/types/primitive.h b/cpp/src/arrow/types/primitive.h
index c5ae0f7..aa8f351 100644
--- a/cpp/src/arrow/types/primitive.h
+++ b/cpp/src/arrow/types/primitive.h
@@ -36,24 +36,24 @@ class MemoryPool;
 
 template <typename Derived>
 struct PrimitiveType : public DataType {
-  explicit PrimitiveType(bool nullable = true)
-      : DataType(Derived::type_enum, nullable) {}
+  PrimitiveType()
+      : DataType(Derived::type_enum) {}
 
   virtual std::string ToString() const {
     return std::string(static_cast<const Derived*>(this)->name());
   }
 };
 
-#define PRIMITIVE_DECL(TYPENAME, C_TYPE, ENUM, SIZE, NAME)          \
-  typedef C_TYPE c_type;                                            \
-  static constexpr TypeEnum type_enum = TypeEnum::ENUM;             \
-  static constexpr int size = SIZE;                              \
-                                                                    \
-  explicit TYPENAME(bool nullable = true)                           \
-      : PrimitiveType<TYPENAME>(nullable) {}                        \
-                                                                    \
-  static const char* name() {                                       \
-    return NAME;                                                    \
+#define PRIMITIVE_DECL(TYPENAME, C_TYPE, ENUM, SIZE, NAME)  \
+  typedef C_TYPE c_type;                                    \
+  static constexpr TypeEnum type_enum = TypeEnum::ENUM;     \
+  static constexpr int size = SIZE;                         \
+                                                            \
+  TYPENAME()                                                \
+      : PrimitiveType<TYPENAME>() {}                        \
+                                                            \
+  static const char* name() {                               \
+    return NAME;                                            \
   }
 
 
@@ -64,7 +64,9 @@ class PrimitiveArray : public Array {
 
   virtual ~PrimitiveArray() {}
 
-  void Init(const TypePtr& type, int64_t length, const std::shared_ptr<Buffer>& data,
+  void Init(const TypePtr& type, int32_t length,
+      const std::shared_ptr<Buffer>& data,
+      int32_t null_count = 0,
       const std::shared_ptr<Buffer>& nulls = nullptr);
 
   const std::shared_ptr<Buffer>& data() const { return data_;}
@@ -84,15 +86,17 @@ class PrimitiveArrayImpl : public PrimitiveArray {
 
   PrimitiveArrayImpl() : PrimitiveArray() {}
 
-  PrimitiveArrayImpl(int64_t length, const std::shared_ptr<Buffer>& data,
+  PrimitiveArrayImpl(int32_t length, const std::shared_ptr<Buffer>& data,
+      int32_t null_count = 0,
       const std::shared_ptr<Buffer>& nulls = nullptr) {
-    Init(length, data, nulls);
+    Init(length, data, null_count, nulls);
   }
 
-  void Init(int64_t length, const std::shared_ptr<Buffer>& data,
+  void Init(int32_t length, const std::shared_ptr<Buffer>& data,
+      int32_t null_count = 0,
       const std::shared_ptr<Buffer>& nulls = nullptr) {
-    TypePtr type(new TypeClass(nulls != nullptr));
-    PrimitiveArray::Init(type, length, data, nulls);
+    TypePtr type(new TypeClass());
+    PrimitiveArray::Init(type, length, data, null_count, nulls);
   }
 
   bool Equals(const PrimitiveArrayImpl& other) const {
@@ -101,7 +105,7 @@ class PrimitiveArrayImpl : public PrimitiveArray {
 
   const T* raw_data() const { return reinterpret_cast<const T*>(raw_data_);}
 
-  T Value(int64_t i) const {
+  T Value(int i) const {
     return raw_data()[i];
   }
 
@@ -124,7 +128,7 @@ class PrimitiveBuilder : public ArrayBuilder {
 
   virtual ~PrimitiveBuilder() {}
 
-  Status Resize(int64_t capacity) {
+  Status Resize(int32_t capacity) {
     // XXX: Set floor size for now
     if (capacity < MIN_BUILDER_CAPACITY) {
       capacity = MIN_BUILDER_CAPACITY;
@@ -135,27 +139,26 @@ class PrimitiveBuilder : public ArrayBuilder {
     } else {
       RETURN_NOT_OK(ArrayBuilder::Resize(capacity));
       RETURN_NOT_OK(values_->Resize(capacity * elsize_));
-      capacity_ = capacity;
     }
+    capacity_ = capacity;
     return Status::OK();
   }
 
-  Status Init(int64_t capacity) {
+  Status Init(int32_t capacity) {
     RETURN_NOT_OK(ArrayBuilder::Init(capacity));
-
     values_ = std::make_shared<PoolBuffer>(pool_);
     return values_->Resize(capacity * elsize_);
   }
 
-  Status Reserve(int64_t elements) {
+  Status Reserve(int32_t elements) {
     if (length_ + elements > capacity_) {
-      int64_t new_capacity = util::next_power2(length_ + elements);
+      int32_t new_capacity = util::next_power2(length_ + elements);
       return Resize(new_capacity);
     }
     return Status::OK();
   }
 
-  Status Advance(int64_t elements) {
+  Status Advance(int32_t elements) {
     return ArrayBuilder::Advance(elements);
   }
 
@@ -165,8 +168,9 @@ class PrimitiveBuilder : public ArrayBuilder {
       // If the capacity was not already a multiple of 2, do so here
       RETURN_NOT_OK(Resize(util::next_power2(capacity_ + 1)));
     }
-    if (nullable_) {
-      util::set_bit(null_bits_, length_, is_null);
+    if (is_null) {
+      ++null_count_;
+      util::set_bit(null_bits_, length_);
     }
     raw_buffer()[length_++] = val;
     return Status::OK();
@@ -176,42 +180,49 @@ class PrimitiveBuilder : public ArrayBuilder {
   //
   // If passed, null_bytes is of equal length to values, and any nonzero byte
   // will be considered as a null for that slot
-  Status Append(const T* values, int64_t length, uint8_t* null_bytes = nullptr) {
+  Status Append(const T* values, int32_t length,
+      const uint8_t* null_bytes = nullptr) {
     if (length_ + length > capacity_) {
-      int64_t new_capacity = util::next_power2(length_ + length);
+      int32_t new_capacity = util::next_power2(length_ + length);
       RETURN_NOT_OK(Resize(new_capacity));
     }
     memcpy(raw_buffer() + length_, values, length * elsize_);
 
-    if (nullable_ && null_bytes != nullptr) {
-      // If null_bytes is all not null, then none of the values are null
-      for (int64_t i = 0; i < length; ++i) {
-        util::set_bit(null_bits_, length_ + i, static_cast<bool>(null_bytes[i]));
-      }
+    if (null_bytes != nullptr) {
+      AppendNulls(null_bytes, length);
     }
 
     length_ += length;
     return Status::OK();
   }
 
-  Status AppendNull() {
-    if (!nullable_) {
-      return Status::Invalid("not nullable");
+  // Write nulls as uint8_t* into pre-allocated memory
+  void AppendNulls(const uint8_t* null_bytes, int32_t length) {
+    // If null_bytes is all not null, then none of the values are null
+    for (int i = 0; i < length; ++i) {
+      if (static_cast<bool>(null_bytes[i])) {
+        ++null_count_;
+        util::set_bit(null_bits_, length_ + i);
+      }
     }
+  }
+
+  Status AppendNull() {
     if (length_ == capacity_) {
       // If the capacity was not already a multiple of 2, do so here
       RETURN_NOT_OK(Resize(util::next_power2(capacity_ + 1)));
     }
-    util::set_bit(null_bits_, length_++, true);
+    ++null_count_;
+    util::set_bit(null_bits_, length_++);
     return Status::OK();
   }
 
   // Initialize an array type instance with the results of this builder
   // Transfers ownership of all buffers
   Status Transfer(PrimitiveArray* out) {
-    out->Init(type_, length_, values_, nulls_);
+    out->Init(type_, length_, values_, null_count_, nulls_);
     values_ = nulls_ = nullptr;
-    capacity_ = length_ = 0;
+    capacity_ = length_ = null_count_ = 0;
     return Status::OK();
   }
 
@@ -236,7 +247,7 @@ class PrimitiveBuilder : public ArrayBuilder {
 
  protected:
   std::shared_ptr<PoolBuffer> values_;
-  int64_t elsize_;
+  int elsize_;
 };
 
 } // namespace arrow

http://git-wip-us.apache.org/repos/asf/arrow/blob/b88b69e2/cpp/src/arrow/types/string-test.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/types/string-test.cc b/cpp/src/arrow/types/string-test.cc
index a2d87ea..e1dcebe 100644
--- a/cpp/src/arrow/types/string-test.cc
+++ b/cpp/src/arrow/types/string-test.cc
@@ -39,7 +39,6 @@ TEST(TypesTest, TestCharType) {
   CharType t1(5);
 
   ASSERT_EQ(t1.type, TypeEnum::CHAR);
-  ASSERT_TRUE(t1.nullable);
   ASSERT_EQ(t1.size, 5);
 
   ASSERT_EQ(t1.ToString(), std::string("char(5)"));
@@ -47,7 +46,6 @@ TEST(TypesTest, TestCharType) {
   // Test copy constructor
   CharType t2 = t1;
   ASSERT_EQ(t2.type, TypeEnum::CHAR);
-  ASSERT_TRUE(t2.nullable);
   ASSERT_EQ(t2.size, 5);
 }
 
@@ -56,7 +54,6 @@ TEST(TypesTest, TestVarcharType) {
   VarcharType t1(5);
 
   ASSERT_EQ(t1.type, TypeEnum::VARCHAR);
-  ASSERT_TRUE(t1.nullable);
   ASSERT_EQ(t1.size, 5);
   ASSERT_EQ(t1.physical_type.size, 6);
 
@@ -65,19 +62,14 @@ TEST(TypesTest, TestVarcharType) {
   // Test copy constructor
   VarcharType t2 = t1;
   ASSERT_EQ(t2.type, TypeEnum::VARCHAR);
-  ASSERT_TRUE(t2.nullable);
   ASSERT_EQ(t2.size, 5);
   ASSERT_EQ(t2.physical_type.size, 6);
 }
 
 TEST(TypesTest, TestStringType) {
   StringType str;
-  StringType str_nn(false);
-
   ASSERT_EQ(str.type, TypeEnum::STRING);
   ASSERT_EQ(str.name(), std::string("string"));
-  ASSERT_TRUE(str.nullable);
-  ASSERT_FALSE(str_nn.nullable);
 }
 
 // ----------------------------------------------------------------------
@@ -96,7 +88,7 @@ class TestStringContainer : public ::testing::Test  {
 
   void MakeArray() {
     length_ = offsets_.size() - 1;
-    int64_t nchars = chars_.size();
+    int nchars = chars_.size();
 
     value_buf_ = to_buffer(chars_);
     values_ = ArrayPtr(new UInt8Array(nchars, value_buf_));
@@ -104,7 +96,9 @@ class TestStringContainer : public ::testing::Test  {
     offsets_buf_ = to_buffer(offsets_);
 
     nulls_buf_ = bytes_to_null_buffer(nulls_.data(), nulls_.size());
-    strings_.Init(length_, offsets_buf_, values_, nulls_buf_);
+    null_count_ = null_count(nulls_);
+
+    strings_.Init(length_, offsets_buf_, values_, null_count_, nulls_buf_);
   }
 
  protected:
@@ -118,7 +112,8 @@ class TestStringContainer : public ::testing::Test  {
   std::shared_ptr<Buffer> offsets_buf_;
   std::shared_ptr<Buffer> nulls_buf_;
 
-  int64_t length_;
+  int null_count_;
+  int length_;
 
   ArrayPtr values_;
   StringArray strings_;
@@ -127,7 +122,7 @@ class TestStringContainer : public ::testing::Test  {
 
 TEST_F(TestStringContainer, TestArrayBasics) {
   ASSERT_EQ(length_, strings_.length());
-  ASSERT_TRUE(strings_.nullable());
+  ASSERT_EQ(1, strings_.null_count());
 }
 
 TEST_F(TestStringContainer, TestType) {
@@ -149,7 +144,8 @@ TEST_F(TestStringContainer, TestListFunctions) {
 
 
 TEST_F(TestStringContainer, TestDestructor) {
-  auto arr = std::make_shared<StringArray>(length_, offsets_buf_, values_, nulls_buf_);
+  auto arr = std::make_shared<StringArray>(length_, offsets_buf_, values_,
+      null_count_, nulls_buf_);
 }
 
 TEST_F(TestStringContainer, TestGetString) {
@@ -189,10 +185,6 @@ class TestStringBuilder : public TestBuilder {
   std::unique_ptr<StringArray> result_;
 };
 
-TEST_F(TestStringBuilder, TestAttrs) {
-  ASSERT_FALSE(builder_->value_builder()->nullable());
-}
-
 TEST_F(TestStringBuilder, TestScalarAppend) {
   std::vector<std::string> strings = {"a", "bb", "", "", "ccc"};
   std::vector<uint8_t> is_null = {0, 0, 0, 1, 0};
@@ -212,10 +204,11 @@ TEST_F(TestStringBuilder, TestScalarAppend) {
   Done();
 
   ASSERT_EQ(reps * N, result_->length());
+  ASSERT_EQ(reps * null_count(is_null), result_->null_count());
   ASSERT_EQ(reps * 6, result_->values()->length());
 
-  int64_t length;
-  int64_t pos = 0;
+  int32_t length;
+  int32_t pos = 0;
   for (int i = 0; i < N * reps; ++i) {
     if (is_null[i % N]) {
       ASSERT_TRUE(result_->IsNull(i));

http://git-wip-us.apache.org/repos/asf/arrow/blob/b88b69e2/cpp/src/arrow/types/string.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/types/string.cc b/cpp/src/arrow/types/string.cc
index f3dfbdc..dea42e1 100644
--- a/cpp/src/arrow/types/string.cc
+++ b/cpp/src/arrow/types/string.cc
@@ -35,6 +35,6 @@ std::string VarcharType::ToString() const {
   return s.str();
 }
 
-TypePtr StringBuilder::value_type_ = TypePtr(new UInt8Type(false));
+TypePtr StringBuilder::value_type_ = TypePtr(new UInt8Type());
 
 } // namespace arrow

http://git-wip-us.apache.org/repos/asf/arrow/blob/b88b69e2/cpp/src/arrow/types/string.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/types/string.h b/cpp/src/arrow/types/string.h
index d0690d9..0845625 100644
--- a/cpp/src/arrow/types/string.h
+++ b/cpp/src/arrow/types/string.h
@@ -40,13 +40,13 @@ struct CharType : public DataType {
 
   BytesType physical_type;
 
-  explicit CharType(int size, bool nullable = true)
-      : DataType(TypeEnum::CHAR, nullable),
+  explicit CharType(int size)
+      : DataType(TypeEnum::CHAR),
         size(size),
         physical_type(BytesType(size)) {}
 
   CharType(const CharType& other)
-      : CharType(other.size, other.nullable) {}
+      : CharType(other.size) {}
 
   virtual std::string ToString() const;
 };
@@ -58,12 +58,12 @@ struct VarcharType : public DataType {
 
   BytesType physical_type;
 
-  explicit VarcharType(int size, bool nullable = true)
-      : DataType(TypeEnum::VARCHAR, nullable),
+  explicit VarcharType(int size)
+      : DataType(TypeEnum::VARCHAR),
         size(size),
         physical_type(BytesType(size + 1)) {}
   VarcharType(const VarcharType& other)
-      : VarcharType(other.size, other.nullable) {}
+      : VarcharType(other.size) {}
 
   virtual std::string ToString() const;
 };
@@ -73,11 +73,11 @@ static const LayoutPtr physical_string = LayoutPtr(new ListLayoutType(byte1));
 
 // String is a logical type consisting of a physical list of 1-byte values
 struct StringType : public DataType {
-  explicit StringType(bool nullable = true)
-      : DataType(TypeEnum::STRING, nullable) {}
+  StringType()
+      : DataType(TypeEnum::STRING) {}
 
   StringType(const StringType& other)
-      : StringType(other.nullable) {}
+      : StringType() {}
 
   const LayoutPtr& physical_type() {
     return physical_string;
@@ -98,17 +98,19 @@ class StringArray : public ListArray {
  public:
   StringArray() : ListArray(), bytes_(nullptr), raw_bytes_(nullptr) {}
 
-  StringArray(int64_t length, const std::shared_ptr<Buffer>& offsets,
+  StringArray(int32_t length, const std::shared_ptr<Buffer>& offsets,
       const ArrayPtr& values,
+      int32_t null_count = 0,
       const std::shared_ptr<Buffer>& nulls = nullptr) {
-    Init(length, offsets, values, nulls);
+    Init(length, offsets, values, null_count, nulls);
   }
 
-  void Init(const TypePtr& type, int64_t length,
+  void Init(const TypePtr& type, int32_t length,
       const std::shared_ptr<Buffer>& offsets,
       const ArrayPtr& values,
+      int32_t null_count = 0,
       const std::shared_ptr<Buffer>& nulls = nullptr) {
-    ListArray::Init(type, length, offsets, values, nulls);
+    ListArray::Init(type, length, offsets, values, null_count, nulls);
 
     // TODO: type validation for values array
 
@@ -117,23 +119,24 @@ class StringArray : public ListArray {
     raw_bytes_ = bytes_->raw_data();
   }
 
-  void Init(int64_t length, const std::shared_ptr<Buffer>& offsets,
+  void Init(int32_t length, const std::shared_ptr<Buffer>& offsets,
       const ArrayPtr& values,
+      int32_t null_count = 0,
       const std::shared_ptr<Buffer>& nulls = nullptr) {
-    TypePtr type(new StringType(nulls != nullptr));
-    Init(type, length, offsets, values, nulls);
+    TypePtr type(new StringType());
+    Init(type, length, offsets, values, null_count, nulls);
   }
 
   // Compute the pointer t
-  const uint8_t* GetValue(int64_t i, int64_t* out_length) const {
+  const uint8_t* GetValue(int i, int32_t* out_length) const {
     int32_t pos = offsets_[i];
     *out_length = offsets_[i + 1] - pos;
     return raw_bytes_ + pos;
   }
 
   // Construct a std::string
-  std::string GetString(int64_t i) const {
-    int64_t nchars;
+  std::string GetString(int i) const {
+    int32_t nchars;
     const uint8_t* str = GetValue(i, &nchars);
     return std::string(reinterpret_cast<const char*>(str), nchars);
   }
@@ -161,7 +164,7 @@ class StringBuilder : public ListBuilder {
         value.size());
   }
 
-  Status Append(const uint8_t* value, int64_t length);
+  Status Append(const uint8_t* value, int32_t length);
   Status Append(const std::vector<std::string>& values,
                 uint8_t* null_bytes);
 

http://git-wip-us.apache.org/repos/asf/arrow/blob/b88b69e2/cpp/src/arrow/types/struct-test.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/types/struct-test.cc b/cpp/src/arrow/types/struct-test.cc
index 644b545..1a9fc6b 100644
--- a/cpp/src/arrow/types/struct-test.cc
+++ b/cpp/src/arrow/types/struct-test.cc
@@ -43,11 +43,7 @@ TEST(TestStructType, Basics) {
 
   vector<Field> fields = {f0, f1, f2};
 
-  StructType struct_type(fields, true);
-  StructType struct_type_nn(fields, false);
-
-  ASSERT_TRUE(struct_type.nullable);
-  ASSERT_FALSE(struct_type_nn.nullable);
+  StructType struct_type(fields);
 
   ASSERT_TRUE(struct_type.field(0).Equals(f0));
   ASSERT_TRUE(struct_type.field(1).Equals(f1));

http://git-wip-us.apache.org/repos/asf/arrow/blob/b88b69e2/cpp/src/arrow/types/struct.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/types/struct.h b/cpp/src/arrow/types/struct.h
index 7d8885b..afba19a 100644
--- a/cpp/src/arrow/types/struct.h
+++ b/cpp/src/arrow/types/struct.h
@@ -29,9 +29,8 @@ namespace arrow {
 struct StructType : public DataType {
   std::vector<Field> fields_;
 
-  StructType(const std::vector<Field>& fields,
-      bool nullable = true)
-      : DataType(TypeEnum::STRUCT, nullable) {
+  explicit StructType(const std::vector<Field>& fields)
+      : DataType(TypeEnum::STRUCT) {
     fields_ = fields;
   }
 

http://git-wip-us.apache.org/repos/asf/arrow/blob/b88b69e2/cpp/src/arrow/types/test-common.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/types/test-common.h b/cpp/src/arrow/types/test-common.h
index 3ecb0de..1744efc 100644
--- a/cpp/src/arrow/types/test-common.h
+++ b/cpp/src/arrow/types/test-common.h
@@ -36,15 +36,13 @@ class TestBuilder : public ::testing::Test {
   void SetUp() {
     pool_ = GetDefaultMemoryPool();
     type_ = TypePtr(new UInt8Type());
-    type_nn_ = TypePtr(new UInt8Type(false));
     builder_.reset(new UInt8Builder(pool_, type_));
-    builder_nn_.reset(new UInt8Builder(pool_, type_nn_));
+    builder_nn_.reset(new UInt8Builder(pool_, type_));
   }
  protected:
   MemoryPool* pool_;
 
   TypePtr type_;
-  TypePtr type_nn_;
   unique_ptr<ArrayBuilder> builder_;
   unique_ptr<ArrayBuilder> builder_nn_;
 };

http://git-wip-us.apache.org/repos/asf/arrow/blob/b88b69e2/cpp/src/arrow/types/union.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/types/union.h b/cpp/src/arrow/types/union.h
index 7b66c3b..62a3d1c 100644
--- a/cpp/src/arrow/types/union.h
+++ b/cpp/src/arrow/types/union.h
@@ -33,9 +33,8 @@ class Buffer;
 struct DenseUnionType : public CollectionType<TypeEnum::DENSE_UNION> {
   typedef CollectionType<TypeEnum::DENSE_UNION> Base;
 
-  DenseUnionType(const std::vector<TypePtr>& child_types,
-      bool nullable = true)
-      : Base(nullable) {
+  explicit DenseUnionType(const std::vector<TypePtr>& child_types) :
+      Base() {
     child_types_ = child_types;
   }
 
@@ -46,9 +45,8 @@ struct DenseUnionType : public CollectionType<TypeEnum::DENSE_UNION> {
 struct SparseUnionType : public CollectionType<TypeEnum::SPARSE_UNION> {
   typedef CollectionType<TypeEnum::SPARSE_UNION> Base;
 
-  SparseUnionType(const std::vector<TypePtr>& child_types,
-      bool nullable = true)
-      : Base(nullable) {
+  explicit SparseUnionType(const std::vector<TypePtr>& child_types) :
+      Base() {
     child_types_ = child_types;
   }
 

http://git-wip-us.apache.org/repos/asf/arrow/blob/b88b69e2/cpp/src/arrow/util/bit-util.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/util/bit-util.cc b/cpp/src/arrow/util/bit-util.cc
index dbac0a4..292cb33 100644
--- a/cpp/src/arrow/util/bit-util.cc
+++ b/cpp/src/arrow/util/bit-util.cc
@@ -25,7 +25,9 @@ namespace arrow {
 
 void util::bytes_to_bits(uint8_t* bytes, int length, uint8_t* bits) {
   for (int i = 0; i < length; ++i) {
-    set_bit(bits, i, static_cast<bool>(bytes[i]));
+    if (static_cast<bool>(bytes[i])) {
+      set_bit(bits, i);
+    }
   }
 }
 

http://git-wip-us.apache.org/repos/asf/arrow/blob/b88b69e2/cpp/src/arrow/util/bit-util.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/util/bit-util.h b/cpp/src/arrow/util/bit-util.h
index 9ae6127..841f617 100644
--- a/cpp/src/arrow/util/bit-util.h
+++ b/cpp/src/arrow/util/bit-util.h
@@ -41,8 +41,8 @@ static inline bool get_bit(const uint8_t* bits, int i) {
   return bits[i / 8] & (1 << (i % 8));
 }
 
-static inline void set_bit(uint8_t* bits, int i, bool is_set) {
-  bits[i / 8] |= (1 << (i % 8)) * is_set;
+static inline void set_bit(uint8_t* bits, int i) {
+  bits[i / 8] |= 1 << (i % 8);
 }
 
 static inline int64_t next_power2(int64_t n) {


Mime
View raw message