arrow-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From u..@apache.org
Subject [1/2] arrow git commit: ARROW-493: [C++] Permit large (length > INT32_MAX) arrays in memory
Date Mon, 27 Feb 2017 07:14:15 GMT
Repository: arrow
Updated Branches:
  refs/heads/master dc103feaf -> 01a67f3ff


http://git-wip-us.apache.org/repos/asf/arrow/blob/01a67f3f/cpp/src/arrow/ipc/adapter.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/ipc/adapter.cc b/cpp/src/arrow/ipc/adapter.cc
index 2be87a3..f11c88a 100644
--- a/cpp/src/arrow/ipc/adapter.cc
+++ b/cpp/src/arrow/ipc/adapter.cc
@@ -20,6 +20,7 @@
 #include <algorithm>
 #include <cstdint>
 #include <cstring>
+#include <limits>
 #include <sstream>
 #include <vector>
 
@@ -65,8 +66,14 @@ class RecordBatchWriter : public ArrayVisitor {
     if (max_recursion_depth_ <= 0) {
       return Status::Invalid("Max recursion depth reached");
     }
+
+    if (arr.length() > std::numeric_limits<int32_t>::max()) {
+      return Status::Invalid("Cannot write arrays larger than 2^31 - 1 in length");
+    }
+
     // push back all common elements
-    field_nodes_.push_back(flatbuf::FieldNode(arr.length(), arr.null_count()));
+    field_nodes_.push_back(flatbuf::FieldNode(
+        static_cast<int32_t>(arr.length()), static_cast<int32_t>(arr.null_count())));
     if (arr.null_count() > 0) {
       std::shared_ptr<Buffer> bitmap = arr.null_bitmap();
 
@@ -152,13 +159,14 @@ class RecordBatchWriter : public ArrayVisitor {
     int64_t start_offset;
     RETURN_NOT_OK(dst->Tell(&start_offset));
 
-    int64_t padded_metadata_length = metadata_fb->size() + 4;
-    const int remainder = (padded_metadata_length + start_offset) % 8;
+    int32_t padded_metadata_length = static_cast<int32_t>(metadata_fb->size()) + 4;
+    const int32_t remainder =
+        (padded_metadata_length + static_cast<int32_t>(start_offset)) % 8;
     if (remainder != 0) { padded_metadata_length += 8 - remainder; }
 
     // The returned metadata size includes the length prefix, the flatbuffer,
     // plus padding
-    *metadata_length = static_cast<int32_t>(padded_metadata_length);
+    *metadata_length = padded_metadata_length;
 
     // Write the flatbuffer size prefix including padding
     int32_t flatbuffer_size = padded_metadata_length - 4;
@@ -169,7 +177,8 @@ class RecordBatchWriter : public ArrayVisitor {
     RETURN_NOT_OK(dst->Write(metadata_fb->data(), metadata_fb->size()));
 
     // Write any padding
-    int64_t padding = padded_metadata_length - metadata_fb->size() - 4;
+    int32_t padding =
+        padded_metadata_length - static_cast<int32_t>(metadata_fb->size()) - 4;
     if (padding > 0) { RETURN_NOT_OK(dst->Write(kPaddingBytes, padding)); }
 
     return Status::OK();
@@ -184,7 +193,8 @@ class RecordBatchWriter : public ArrayVisitor {
     RETURN_NOT_OK(dst->Tell(&start_position));
 #endif
 
-    RETURN_NOT_OK(WriteMetadata(batch.num_rows(), *body_length, dst, metadata_length));
+    RETURN_NOT_OK(WriteMetadata(
+        static_cast<int32_t>(batch.num_rows()), *body_length, dst, metadata_length));
 
 #ifndef NDEBUG
     RETURN_NOT_OK(dst->Tell(&current_position));
@@ -430,7 +440,7 @@ class RecordBatchWriter : public ArrayVisitor {
         int32_t* shifted_offsets =
             reinterpret_cast<int32_t*>(shifted_offsets_buffer->mutable_data());
 
-        for (int32_t i = 0; i < array.length(); ++i) {
+        for (int64_t i = 0; i < array.length(); ++i) {
           const uint8_t code = type_ids[i];
           int32_t shift = child_offsets[code];
           if (shift == -1) { child_offsets[code] = shift = unshifted_offsets[i]; }

http://git-wip-us.apache.org/repos/asf/arrow/blob/01a67f3f/cpp/src/arrow/ipc/ipc-json-test.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/ipc/ipc-json-test.cc b/cpp/src/arrow/ipc/ipc-json-test.cc
index 3e759cc..4c18a49 100644
--- a/cpp/src/arrow/ipc/ipc-json-test.cc
+++ b/cpp/src/arrow/ipc/ipc-json-test.cc
@@ -240,7 +240,7 @@ TEST(TestJsonFileReadWrite, BasicRoundTrip) {
   const int nbatches = 3;
   std::vector<std::shared_ptr<RecordBatch>> batches;
   for (int i = 0; i < nbatches; ++i) {
-    int32_t num_rows = 5 + i * 5;
+    int num_rows = 5 + i * 5;
     std::vector<std::shared_ptr<Array>> arrays;
 
     MakeBatchArrays(schema, num_rows, &arrays);

http://git-wip-us.apache.org/repos/asf/arrow/blob/01a67f3f/cpp/src/arrow/ipc/json-internal.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/ipc/json-internal.cc b/cpp/src/arrow/ipc/json-internal.cc
index 6253cd6..0458b85 100644
--- a/cpp/src/arrow/ipc/json-internal.cc
+++ b/cpp/src/arrow/ipc/json-internal.cc
@@ -355,7 +355,7 @@ class JsonArrayWriter : public ArrayVisitor {
     writer_->String(name);
 
     writer_->Key("count");
-    writer_->Int(arr.length());
+    writer_->Int(static_cast<int32_t>(arr.length()));
 
     RETURN_NOT_OK(arr.Accept(this));
 
@@ -394,7 +394,7 @@ class JsonArrayWriter : public ArrayVisitor {
   template <typename T>
   typename std::enable_if<std::is_base_of<BinaryArray, T>::value, void>::type
   WriteDataValues(const T& arr) {
-    for (int i = 0; i < arr.length(); ++i) {
+    for (int64_t i = 0; i < arr.length(); ++i) {
       int32_t length;
       const char* buf = reinterpret_cast<const char*>(arr.GetValue(i, &length));
 
@@ -430,7 +430,7 @@ class JsonArrayWriter : public ArrayVisitor {
   }
 
   template <typename T>
-  void WriteIntegerField(const char* name, const T* values, int32_t length) {
+  void WriteIntegerField(const char* name, const T* values, int64_t length) {
     writer_->Key(name);
     writer_->StartArray();
     for (int i = 0; i < length; ++i) {
@@ -573,7 +573,7 @@ class JsonSchemaReader {
     const auto& values = obj.GetArray();
 
     fields->resize(values.Size());
-    for (size_t i = 0; i < fields->size(); ++i) {
+    for (rj::SizeType i = 0; i < fields->size(); ++i) {
       RETURN_NOT_OK(GetField(values[i], &(*fields)[i]));
     }
     return Status::OK();
@@ -712,7 +712,7 @@ class JsonSchemaReader {
     const auto& id_array = json_type_codes->value.GetArray();
     for (const rj::Value& val : id_array) {
       DCHECK(val.IsUint());
-      type_codes.push_back(val.GetUint());
+      type_codes.push_back(static_cast<uint8_t>(val.GetUint()));
     }
 
     *type = union_(children, type_codes, mode);
@@ -770,10 +770,38 @@ static inline Status ParseHexValue(const char* data, uint8_t* out) {
   // Error checking
   if (*pos1 != c1 || *pos2 != c2) { return Status::Invalid("Encountered non-hex digit"); }
 
-  *out = (pos1 - kAsciiTable) << 4 | (pos2 - kAsciiTable);
+  *out = static_cast<uint8_t>((pos1 - kAsciiTable) << 4 | (pos2 - kAsciiTable));
   return Status::OK();
 }
 
+template <typename T>
+inline typename std::enable_if<IsSignedInt<T>::value, typename T::c_type>::type
+UnboxValue(const rj::Value& val) {
+  DCHECK(val.IsInt());
+  return static_cast<typename T::c_type>(val.GetInt64());
+}
+
+template <typename T>
+inline typename std::enable_if<IsUnsignedInt<T>::value, typename T::c_type>::type
+UnboxValue(const rj::Value& val) {
+  DCHECK(val.IsUint());
+  return static_cast<typename T::c_type>(val.GetUint64());
+}
+
+template <typename T>
+inline typename std::enable_if<IsFloatingPoint<T>::value, typename T::c_type>::type
+UnboxValue(const rj::Value& val) {
+  DCHECK(val.IsFloat());
+  return static_cast<typename T::c_type>(val.GetDouble());
+}
+
+template <typename T>
+inline typename std::enable_if<std::is_base_of<BooleanType, T>::value, bool>::type
+UnboxValue(const rj::Value& val) {
+  DCHECK(val.IsBool());
+  return val.GetBool();
+}
+
 class JsonArrayReader {
  public:
   explicit JsonArrayReader(MemoryPool* pool) : pool_(pool) {}
@@ -820,22 +848,7 @@ class JsonArrayReader {
       }
 
       const rj::Value& val = json_data_arr[i];
-      if (IsSignedInt<T>::value) {
-        DCHECK(val.IsInt());
-        builder.Append(val.GetInt64());
-      } else if (IsUnsignedInt<T>::value) {
-        DCHECK(val.IsUint());
-        builder.Append(val.GetUint64());
-      } else if (IsFloatingPoint<T>::value) {
-        DCHECK(val.IsFloat());
-        builder.Append(val.GetDouble());
-      } else if (std::is_base_of<BooleanType, T>::value) {
-        DCHECK(val.IsBool());
-        builder.Append(val.GetBool());
-      } else {
-        // We are in the wrong function
-        return Status::Invalid(type->ToString());
-      }
+      builder.Append(UnboxValue<T>(val));
     }
 
     return builder.Finish(array);
@@ -869,13 +882,13 @@ class JsonArrayReader {
         std::string hex_string = val.GetString();
 
         DCHECK(hex_string.size() % 2 == 0) << "Expected base16 hex string";
-        int64_t length = static_cast<int>(hex_string.size()) / 2;
+        int32_t length = static_cast<int>(hex_string.size()) / 2;
 
         if (byte_buffer->size() < length) { RETURN_NOT_OK(byte_buffer->Resize(length)); }
 
         const char* hex_data = hex_string.c_str();
         uint8_t* byte_buffer_data = byte_buffer->mutable_data();
-        for (int64_t j = 0; j < length; ++j) {
+        for (int32_t j = 0; j < length; ++j) {
           RETURN_NOT_OK(ParseHexValue(hex_data + j * 2, &byte_buffer_data[j]));
         }
         RETURN_NOT_OK(builder.Append(byte_buffer_data, length));

http://git-wip-us.apache.org/repos/asf/arrow/blob/01a67f3f/cpp/src/arrow/ipc/json.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/ipc/json.cc b/cpp/src/arrow/ipc/json.cc
index 773fb74..a01be19 100644
--- a/cpp/src/arrow/ipc/json.cc
+++ b/cpp/src/arrow/ipc/json.cc
@@ -69,7 +69,7 @@ class JsonWriter::JsonWriterImpl {
 
     writer_->StartObject();
     writer_->Key("count");
-    writer_->Int(batch.num_rows());
+    writer_->Int(static_cast<int32_t>(batch.num_rows()));
 
     writer_->Key("columns");
     writer_->StartArray();
@@ -158,7 +158,7 @@ class JsonReader::JsonReaderImpl {
     const auto& json_columns = it->value.GetArray();
 
     std::vector<std::shared_ptr<Array>> columns(json_columns.Size());
-    for (size_t i = 0; i < columns.size(); ++i) {
+    for (int i = 0; i < static_cast<int>(columns.size()); ++i) {
       const std::shared_ptr<DataType>& type = schema_->field(i)->type;
       RETURN_NOT_OK(ReadJsonArray(pool_, json_columns[i], type, &columns[i]));
     }

http://git-wip-us.apache.org/repos/asf/arrow/blob/01a67f3f/cpp/src/arrow/ipc/metadata-internal.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/ipc/metadata-internal.cc b/cpp/src/arrow/ipc/metadata-internal.cc
index 7c8ddb9..1cc4a23 100644
--- a/cpp/src/arrow/ipc/metadata-internal.cc
+++ b/cpp/src/arrow/ipc/metadata-internal.cc
@@ -214,7 +214,8 @@ static Status TypeToFlatbuffer(FBB& fbb, const std::shared_ptr<DataType>& type,
         vector_type = flatbuf::VectorType_DATA;
         break;
     }
-    auto offset = flatbuf::CreateVectorLayout(fbb, descr.bit_width(), vector_type);
+    auto offset = flatbuf::CreateVectorLayout(
+        fbb, static_cast<int16_t>(descr.bit_width()), vector_type);
     layout->push_back(offset);
   }
 
@@ -328,7 +329,7 @@ Status FieldFromFlatbufferDictionary(
   std::shared_ptr<DataType> type;
   auto children = field->children();
   std::vector<std::shared_ptr<Field>> child_fields(children->size());
-  for (size_t i = 0; i < children->size(); ++i) {
+  for (int i = 0; i < static_cast<int>(children->size()); ++i) {
     RETURN_NOT_OK(FieldFromFlatbuffer(children->Get(i), dummy_memo, &child_fields[i]));
   }
 
@@ -350,7 +351,7 @@ Status FieldFromFlatbuffer(const flatbuf::Field* field,
     // children to fully reconstruct the data type
     auto children = field->children();
     std::vector<std::shared_ptr<Field>> child_fields(children->size());
-    for (size_t i = 0; i < children->size(); ++i) {
+    for (int i = 0; i < static_cast<int>(children->size()); ++i) {
       RETURN_NOT_OK(
           FieldFromFlatbuffer(children->Get(i), dictionary_memo, &child_fields[i]));
     }

http://git-wip-us.apache.org/repos/asf/arrow/blob/01a67f3f/cpp/src/arrow/ipc/reader.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/ipc/reader.cc b/cpp/src/arrow/ipc/reader.cc
index 1a9af7d..9734166 100644
--- a/cpp/src/arrow/ipc/reader.cc
+++ b/cpp/src/arrow/ipc/reader.cc
@@ -203,7 +203,7 @@ class FileReader::FileReaderImpl {
     }
 
     std::shared_ptr<Buffer> buffer;
-    int file_end_size = magic_size + sizeof(int32_t);
+    int file_end_size = static_cast<int>(magic_size + sizeof(int32_t));
     RETURN_NOT_OK(file_->ReadAt(footer_offset_ - file_end_size, file_end_size, &buffer));
 
     if (memcmp(buffer->data() + sizeof(int32_t), kArrowMagicBytes, magic_size)) {

http://git-wip-us.apache.org/repos/asf/arrow/blob/01a67f3f/cpp/src/arrow/ipc/test-common.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/ipc/test-common.h b/cpp/src/arrow/ipc/test-common.h
index 07f786c..dc82366 100644
--- a/cpp/src/arrow/ipc/test-common.h
+++ b/cpp/src/arrow/ipc/test-common.h
@@ -51,7 +51,7 @@ const auto kListInt32 = list(int32());
 const auto kListListInt32 = list(kListInt32);
 
 Status MakeRandomInt32Array(
-    int32_t length, bool include_nulls, MemoryPool* pool, std::shared_ptr<Array>* out) {
+    int64_t length, bool include_nulls, MemoryPool* pool, std::shared_ptr<Array>* out) {
   std::shared_ptr<PoolBuffer> data;
   test::MakeRandomInt32PoolBuffer(length, pool, &data);
   Int32Builder builder(pool, int32());
@@ -79,7 +79,7 @@ Status MakeRandomListArray(const std::shared_ptr<Array>& child_array, int num_li
   std::vector<int32_t> list_sizes(num_lists, 0);
   std::vector<int32_t> offsets(
       num_lists + 1, 0);  // +1 so we can shift for nulls. See partial sum below.
-  const int seed = child_array->length();
+  const uint32_t seed = static_cast<uint32_t>(child_array->length());
   if (num_lists > 0) {
     test::rand_uniform_int(num_lists, seed, 0, max_list_size, list_sizes.data());
     // make sure sizes are consistent with null
@@ -89,7 +89,7 @@ Status MakeRandomListArray(const std::shared_ptr<Array>& child_array, int num_li
     std::partial_sum(list_sizes.begin(), list_sizes.end(), ++offsets.begin());
 
     // Force invariants
-    const int child_length = child_array->length();
+    const int64_t child_length = child_array->length();
     offsets[0] = 0;
     std::replace_if(offsets.begin(), offsets.end(),
         [child_length](int32_t offset) { return offset > child_length; }, child_length);
@@ -121,26 +121,26 @@ Status MakeIntRecordBatch(std::shared_ptr<RecordBatch>* out) {
 
 template <class Builder, class RawType>
 Status MakeRandomBinaryArray(
-    int32_t length, MemoryPool* pool, std::shared_ptr<Array>* out) {
+    int64_t length, MemoryPool* pool, std::shared_ptr<Array>* out) {
   const std::vector<std::string> values = {
       "", "", "abc", "123", "efg", "456!@#!@#", "12312"};
   Builder builder(pool);
-  const auto values_len = values.size();
-  for (int32_t i = 0; i < length; ++i) {
-    int values_index = i % values_len;
+  const size_t values_len = values.size();
+  for (int64_t i = 0; i < length; ++i) {
+    int64_t values_index = i % values_len;
     if (values_index == 0) {
       RETURN_NOT_OK(builder.AppendNull());
     } else {
       const std::string& value = values[values_index];
-      RETURN_NOT_OK(
-          builder.Append(reinterpret_cast<const RawType*>(value.data()), value.size()));
+      RETURN_NOT_OK(builder.Append(reinterpret_cast<const RawType*>(value.data()),
+          static_cast<int32_t>(value.size())));
     }
   }
   return builder.Finish(out);
 }
 
 Status MakeStringTypesRecordBatch(std::shared_ptr<RecordBatch>* out) {
-  const int32_t length = 500;
+  const int64_t length = 500;
   auto string_type = utf8();
   auto binary_type = binary();
   auto f0 = field("f0", string_type);
@@ -302,7 +302,7 @@ Status MakeUnion(std::shared_ptr<RecordBatch>* out) {
   std::vector<std::shared_ptr<Array>> sparse_children(2);
   std::vector<std::shared_ptr<Array>> dense_children(2);
 
-  const int32_t length = 7;
+  const int64_t length = 7;
 
   std::shared_ptr<Buffer> type_ids_buffer;
   std::vector<uint8_t> type_ids = {5, 10, 5, 5, 10, 10, 5};
@@ -346,7 +346,7 @@ Status MakeUnion(std::shared_ptr<RecordBatch>* out) {
 }
 
 Status MakeDictionary(std::shared_ptr<RecordBatch>* out) {
-  const int32_t length = 6;
+  const int64_t length = 6;
 
   std::vector<bool> is_valid = {true, true, false, true, true, true};
   std::shared_ptr<Array> dict1, dict2;

http://git-wip-us.apache.org/repos/asf/arrow/blob/01a67f3f/cpp/src/arrow/ipc/writer.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/ipc/writer.cc b/cpp/src/arrow/ipc/writer.cc
index 975b0d1..58402b5 100644
--- a/cpp/src/arrow/ipc/writer.cc
+++ b/cpp/src/arrow/ipc/writer.cc
@@ -61,7 +61,7 @@ class StreamWriter::StreamWriterImpl {
     std::shared_ptr<Buffer> schema_fb;
     RETURN_NOT_OK(WriteSchemaMessage(*schema_, dictionary_memo_.get(), &schema_fb));
 
-    int32_t flatbuffer_size = schema_fb->size();
+    int32_t flatbuffer_size = static_cast<int32_t>(schema_fb->size());
     RETURN_NOT_OK(
         Write(reinterpret_cast<const uint8_t*>(&flatbuffer_size), sizeof(int32_t)));
 
@@ -252,7 +252,7 @@ class FileWriter::FileWriterImpl : public StreamWriter::StreamWriterImpl {
     RETURN_NOT_OK(UpdatePosition());
 
     // Write footer length
-    int32_t footer_length = position_ - initial_position;
+    int32_t footer_length = static_cast<int32_t>(position_ - initial_position);
 
     if (footer_length <= 0) { return Status::Invalid("Invalid file footer"); }
 

http://git-wip-us.apache.org/repos/asf/arrow/blob/01a67f3f/cpp/src/arrow/pretty_print.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/pretty_print.cc b/cpp/src/arrow/pretty_print.cc
index 23c0580..7e69e42 100644
--- a/cpp/src/arrow/pretty_print.cc
+++ b/cpp/src/arrow/pretty_print.cc
@@ -196,7 +196,7 @@ class ArrayPrinter : public ArrayVisitor {
   }
 
   Status PrintChildren(
-      const std::vector<std::shared_ptr<Array>>& fields, int32_t offset, int32_t length) {
+      const std::vector<std::shared_ptr<Array>>& fields, int64_t offset, int64_t length) {
     for (size_t i = 0; i < fields.size(); ++i) {
       Newline();
       std::stringstream ss;

http://git-wip-us.apache.org/repos/asf/arrow/blob/01a67f3f/cpp/src/arrow/schema.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/schema.cc b/cpp/src/arrow/schema.cc
index cd8256e..aa38fd3 100644
--- a/cpp/src/arrow/schema.cc
+++ b/cpp/src/arrow/schema.cc
@@ -45,7 +45,7 @@ bool Schema::Equals(const std::shared_ptr<Schema>& other) const {
 std::shared_ptr<Field> Schema::GetFieldByName(const std::string& name) {
   if (fields_.size() > 0 && name_to_index_.size() == 0) {
     for (size_t i = 0; i < fields_.size(); ++i) {
-      name_to_index_[fields_[i]->name] = i;
+      name_to_index_[fields_[i]->name] = static_cast<int>(i);
     }
   }
 

http://git-wip-us.apache.org/repos/asf/arrow/blob/01a67f3f/cpp/src/arrow/schema.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/schema.h b/cpp/src/arrow/schema.h
index 0e1ab5c..37cdbf7 100644
--- a/cpp/src/arrow/schema.h
+++ b/cpp/src/arrow/schema.h
@@ -47,7 +47,7 @@ class ARROW_EXPORT Schema {
   // Render a string representation of the schema suitable for debugging
   std::string ToString() const;
 
-  int num_fields() const { return fields_.size(); }
+  int num_fields() const { return static_cast<int>(fields_.size()); }
 
  private:
   std::vector<std::shared_ptr<Field>> fields_;

http://git-wip-us.apache.org/repos/asf/arrow/blob/01a67f3f/cpp/src/arrow/status.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/status.cc b/cpp/src/arrow/status.cc
index e1a2427..3a39c84 100644
--- a/cpp/src/arrow/status.cc
+++ b/cpp/src/arrow/status.cc
@@ -18,7 +18,7 @@ namespace arrow {
 
 Status::Status(StatusCode code, const std::string& msg, int16_t posix_code) {
   assert(code != StatusCode::OK);
-  const uint32_t size = msg.size();
+  const uint32_t size = static_cast<uint32_t>(msg.size());
   char* result = new char[size + 7];
   memcpy(result, &size, sizeof(size));
   result[4] = static_cast<char>(code);

http://git-wip-us.apache.org/repos/asf/arrow/blob/01a67f3f/cpp/src/arrow/table-test.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/table-test.cc b/cpp/src/arrow/table-test.cc
index 25f12c4..3637473 100644
--- a/cpp/src/arrow/table-test.cc
+++ b/cpp/src/arrow/table-test.cc
@@ -150,7 +150,7 @@ TEST_F(TestTable, Equals) {
 }
 
 TEST_F(TestTable, FromRecordBatches) {
-  const int32_t length = 10;
+  const int64_t length = 10;
   MakeExample1(length);
 
   auto batch1 = std::make_shared<RecordBatch>(schema_, length, arrays_);
@@ -184,7 +184,7 @@ TEST_F(TestTable, FromRecordBatches) {
 }
 
 TEST_F(TestTable, ConcatenateTables) {
-  const int32_t length = 10;
+  const int64_t length = 10;
 
   MakeExample1(length);
   auto batch1 = std::make_shared<RecordBatch>(schema_, length, arrays_);

http://git-wip-us.apache.org/repos/asf/arrow/blob/01a67f3f/cpp/src/arrow/table.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/table.cc b/cpp/src/arrow/table.cc
index 8ac06b8..6b957c0 100644
--- a/cpp/src/arrow/table.cc
+++ b/cpp/src/arrow/table.cc
@@ -29,7 +29,7 @@
 
 namespace arrow {
 
-RecordBatch::RecordBatch(const std::shared_ptr<Schema>& schema, int num_rows,
+RecordBatch::RecordBatch(const std::shared_ptr<Schema>& schema, int64_t num_rows,
     const std::vector<std::shared_ptr<Array>>& columns)
     : schema_(schema), num_rows_(num_rows), columns_(columns) {}
 
@@ -61,18 +61,18 @@ bool RecordBatch::ApproxEquals(const RecordBatch& other) const {
   return true;
 }
 
-std::shared_ptr<RecordBatch> RecordBatch::Slice(int32_t offset) {
+std::shared_ptr<RecordBatch> RecordBatch::Slice(int64_t offset) {
   return Slice(offset, this->num_rows() - offset);
 }
 
-std::shared_ptr<RecordBatch> RecordBatch::Slice(int32_t offset, int32_t length) {
+std::shared_ptr<RecordBatch> RecordBatch::Slice(int64_t offset, int64_t length) {
   std::vector<std::shared_ptr<Array>> arrays;
   arrays.reserve(num_columns());
   for (const auto& field : columns_) {
     arrays.emplace_back(field->Slice(offset, length));
   }
 
-  int32_t num_rows = std::min(num_rows_ - offset, length);
+  int64_t num_rows = std::min(num_rows_ - offset, length);
   return std::make_shared<RecordBatch>(schema_, num_rows, arrays);
 }
 
@@ -169,7 +169,7 @@ bool Table::Equals(const Table& other) const {
   if (!schema_->Equals(other.schema())) { return false; }
   if (static_cast<int64_t>(columns_.size()) != other.num_columns()) { return false; }
 
-  for (size_t i = 0; i < columns_.size(); i++) {
+  for (int i = 0; i < static_cast<int>(columns_.size()); i++) {
     if (!columns_[i]->Equals(other.column(i))) { return false; }
   }
   return true;

http://git-wip-us.apache.org/repos/asf/arrow/blob/01a67f3f/cpp/src/arrow/table.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/table.h b/cpp/src/arrow/table.h
index fa56824..68f664b 100644
--- a/cpp/src/arrow/table.h
+++ b/cpp/src/arrow/table.h
@@ -40,7 +40,7 @@ class ARROW_EXPORT RecordBatch {
   // num_rows is a parameter to allow for record batches of a particular size not
   // having any materialized columns. Each array should have the same length as
   // num_rows
-  RecordBatch(const std::shared_ptr<Schema>& schema, int32_t num_rows,
+  RecordBatch(const std::shared_ptr<Schema>& schema, int64_t num_rows,
       const std::vector<std::shared_ptr<Array>>& columns);
 
   bool Equals(const RecordBatch& other) const;
@@ -59,18 +59,18 @@ class ARROW_EXPORT RecordBatch {
   const std::string& column_name(int i) const;
 
   // @returns: the number of columns in the table
-  int num_columns() const { return columns_.size(); }
+  int num_columns() const { return static_cast<int>(columns_.size()); }
 
   // @returns: the number of rows (the corresponding length of each column)
-  int32_t num_rows() const { return num_rows_; }
+  int64_t num_rows() const { return num_rows_; }
 
   /// Slice each of the arrays in the record batch and construct a new RecordBatch object
-  std::shared_ptr<RecordBatch> Slice(int32_t offset);
-  std::shared_ptr<RecordBatch> Slice(int32_t offset, int32_t length);
+  std::shared_ptr<RecordBatch> Slice(int64_t offset);
+  std::shared_ptr<RecordBatch> Slice(int64_t offset, int64_t length);
 
  private:
   std::shared_ptr<Schema> schema_;
-  int32_t num_rows_;
+  int64_t num_rows_;
   std::vector<std::shared_ptr<Array>> columns_;
 };
 
@@ -105,7 +105,7 @@ class ARROW_EXPORT Table {
   std::shared_ptr<Column> column(int i) const { return columns_[i]; }
 
   // @returns: the number of columns in the table
-  int num_columns() const { return columns_.size(); }
+  int num_columns() const { return static_cast<int>(columns_.size()); }
 
   // @returns: the number of rows (the corresponding length of each column)
   int64_t num_rows() const { return num_rows_; }

http://git-wip-us.apache.org/repos/asf/arrow/blob/01a67f3f/cpp/src/arrow/test-util.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/test-util.h b/cpp/src/arrow/test-util.h
index ffc7806..5c7d04d 100644
--- a/cpp/src/arrow/test-util.h
+++ b/cpp/src/arrow/test-util.h
@@ -73,16 +73,17 @@ void randint(int64_t N, T lower, T upper, std::vector<T>* out) {
   T val;
   for (int64_t i = 0; i < N; ++i) {
     draw = rng.Uniform64(span);
-    val = lower + static_cast<T>(draw);
+    val = static_cast<T>(draw + lower);
     out->push_back(val);
   }
 }
 
 template <typename T>
-void random_real(int n, uint32_t seed, T min_value, T max_value, std::vector<T>* out) {
+void random_real(
+    int64_t n, uint32_t seed, T min_value, T max_value, std::vector<T>* out) {
   std::mt19937 gen(seed);
   std::uniform_real_distribution<T> d(min_value, max_value);
-  for (int i = 0; i < n; ++i) {
+  for (int64_t i = 0; i < n; ++i) {
     out->push_back(d(gen));
   }
 }
@@ -108,13 +109,13 @@ inline Status CopyBufferFromVector(
 
 static inline Status GetBitmapFromBoolVector(
     const std::vector<bool>& is_valid, std::shared_ptr<Buffer>* result) {
-  int length = static_cast<int>(is_valid.size());
+  int64_t length = static_cast<int64_t>(is_valid.size());
 
   std::shared_ptr<MutableBuffer> buffer;
   RETURN_NOT_OK(GetEmptyBitmap(default_memory_pool(), length, &buffer));
 
   uint8_t* bitmap = buffer->mutable_data();
-  for (int i = 0; i < length; ++i) {
+  for (int64_t i = 0; i < length; ++i) {
     if (is_valid[i]) { BitUtil::SetBit(bitmap, i); }
   }
 
@@ -126,7 +127,7 @@ static inline Status GetBitmapFromBoolVector(
 // and the rest to non-zero (true) values.
 static inline void random_null_bytes(int64_t n, double pct_null, uint8_t* null_bytes) {
   Random rng(random_seed());
-  for (int i = 0; i < n; ++i) {
+  for (int64_t i = 0; i < n; ++i) {
     null_bytes[i] = rng.NextDoubleFraction() > pct_null;
   }
 }
@@ -134,41 +135,41 @@ static inline void random_null_bytes(int64_t n, double pct_null, uint8_t* null_b
 static inline void random_is_valid(
     int64_t n, double pct_null, std::vector<bool>* is_valid) {
   Random rng(random_seed());
-  for (int i = 0; i < n; ++i) {
+  for (int64_t i = 0; i < n; ++i) {
     is_valid->push_back(rng.NextDoubleFraction() > pct_null);
   }
 }
 
-static inline void random_bytes(int n, uint32_t seed, uint8_t* out) {
+static inline void random_bytes(int64_t n, uint32_t seed, uint8_t* out) {
   std::mt19937 gen(seed);
   std::uniform_int_distribution<int> d(0, 255);
 
-  for (int i = 0; i < n; ++i) {
-    out[i] = d(gen) & 0xFF;
+  for (int64_t i = 0; i < n; ++i) {
+    out[i] = static_cast<uint8_t>(d(gen) & 0xFF);
   }
 }
 
-static inline void random_ascii(int n, uint32_t seed, uint8_t* out) {
+static inline void random_ascii(int64_t n, uint32_t seed, uint8_t* out) {
   std::mt19937 gen(seed);
   std::uniform_int_distribution<int> d(65, 122);
 
-  for (int i = 0; i < n; ++i) {
-    out[i] = d(gen) & 0xFF;
+  for (int64_t i = 0; i < n; ++i) {
+    out[i] = static_cast<uint8_t>(d(gen) & 0xFF);
   }
 }
 
 template <typename T>
-void rand_uniform_int(int n, uint32_t seed, T min_value, T max_value, T* out) {
+void rand_uniform_int(int64_t n, uint32_t seed, T min_value, T max_value, T* out) {
   DCHECK(out || (n == 0));
   std::mt19937 gen(seed);
   std::uniform_int_distribution<T> d(min_value, max_value);
-  for (int i = 0; i < n; ++i) {
-    out[i] = d(gen);
+  for (int64_t i = 0; i < n; ++i) {
+    out[i] = static_cast<T>(d(gen));
   }
 }
 
-static inline int null_count(const std::vector<uint8_t>& valid_bytes) {
-  int result = 0;
+static inline int64_t null_count(const std::vector<uint8_t>& valid_bytes) {
+  int64_t result = 0;
   for (size_t i = 0; i < valid_bytes.size(); ++i) {
     if (valid_bytes[i] == 0) { ++result; }
   }
@@ -183,7 +184,7 @@ std::shared_ptr<Buffer> bytes_to_null_buffer(const std::vector<uint8_t>& bytes)
   return out;
 }
 
-Status MakeRandomInt32PoolBuffer(int32_t length, MemoryPool* pool,
+Status MakeRandomInt32PoolBuffer(int64_t length, MemoryPool* pool,
     std::shared_ptr<PoolBuffer>* pool_buffer, uint32_t seed = 0) {
   DCHECK(pool);
   auto data = std::make_shared<PoolBuffer>(pool);
@@ -194,7 +195,7 @@ Status MakeRandomInt32PoolBuffer(int32_t length, MemoryPool* pool,
   return Status::OK();
 }
 
-Status MakeRandomBytePoolBuffer(int32_t length, MemoryPool* pool,
+Status MakeRandomBytePoolBuffer(int64_t length, MemoryPool* pool,
     std::shared_ptr<PoolBuffer>* pool_buffer, uint32_t seed = 0) {
   auto bytes = std::make_shared<PoolBuffer>(pool);
   RETURN_NOT_OK(bytes->Resize(length));
@@ -213,7 +214,7 @@ class TestBase : public ::testing::Test {
   }
 
   template <typename ArrayType>
-  std::shared_ptr<Array> MakePrimitive(int32_t length, int32_t null_count = 0) {
+  std::shared_ptr<Array> MakePrimitive(int64_t length, int64_t null_count = 0) {
     auto data = std::make_shared<PoolBuffer>(pool_);
     const int64_t data_nbytes = length * sizeof(typename ArrayType::value_type);
     EXPECT_OK(data->Resize(data_nbytes));
@@ -275,9 +276,9 @@ class TestBuilder : public ::testing::Test {
 
 template <class T, class Builder>
 Status MakeArray(const std::vector<uint8_t>& valid_bytes, const std::vector<T>& values,
-    int size, Builder* builder, std::shared_ptr<Array>* out) {
+    int64_t size, Builder* builder, std::shared_ptr<Array>* out) {
   // Append the first 1000
-  for (int i = 0; i < size; ++i) {
+  for (int64_t i = 0; i < size; ++i) {
     if (valid_bytes[i] > 0) {
       RETURN_NOT_OK(builder->Append(values[i]));
     } else {

http://git-wip-us.apache.org/repos/asf/arrow/blob/01a67f3f/cpp/src/arrow/type.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h
index 9a97fc3..9b1ab32 100644
--- a/cpp/src/arrow/type.h
+++ b/cpp/src/arrow/type.h
@@ -162,7 +162,7 @@ struct ARROW_EXPORT DataType {
 
   const std::vector<std::shared_ptr<Field>>& children() const { return children_; }
 
-  int num_children() const { return children_.size(); }
+  int num_children() const { return static_cast<int>(children_.size()); }
 
   virtual Status Accept(TypeVisitor* visitor) const = 0;
 
@@ -226,7 +226,7 @@ struct ARROW_EXPORT CTypeImpl : public PrimitiveCType {
 
   CTypeImpl() : PrimitiveCType(TYPE_ID) {}
 
-  int bit_width() const override { return sizeof(C_TYPE) * 8; }
+  int bit_width() const override { return static_cast<int>(sizeof(C_TYPE) * 8); }
 
   Status Accept(TypeVisitor* visitor) const override {
     return visitor->Visit(*static_cast<const DERIVED*>(this));
@@ -432,7 +432,7 @@ struct ARROW_EXPORT DateType : public FixedWidthType {
 
   DateType() : FixedWidthType(Type::DATE) {}
 
-  int bit_width() const override { return sizeof(c_type) * 8; }
+  int bit_width() const override { return static_cast<int>(sizeof(c_type) * 8); }
 
   Status Accept(TypeVisitor* visitor) const override;
   std::string ToString() const override;
@@ -448,7 +448,7 @@ struct ARROW_EXPORT TimeType : public FixedWidthType {
 
   TimeUnit unit;
 
-  int bit_width() const override { return sizeof(c_type) * 8; }
+  int bit_width() const override { return static_cast<int>(sizeof(c_type) * 8); }
 
   explicit TimeType(TimeUnit unit = TimeUnit::MILLI)
       : FixedWidthType(Type::TIME), unit(unit) {}
@@ -465,7 +465,7 @@ struct ARROW_EXPORT TimestampType : public FixedWidthType {
   typedef int64_t c_type;
   static constexpr Type::type type_id = Type::TIMESTAMP;
 
-  int bit_width() const override { return sizeof(int64_t) * 8; }
+  int bit_width() const override { return static_cast<int>(sizeof(int64_t) * 8); }
 
   TimeUnit unit;
 
@@ -485,7 +485,7 @@ struct ARROW_EXPORT IntervalType : public FixedWidthType {
   using c_type = int64_t;
   static constexpr Type::type type_id = Type::INTERVAL;
 
-  int bit_width() const override { return sizeof(int64_t) * 8; }
+  int bit_width() const override { return static_cast<int>(sizeof(int64_t) * 8); }
 
   Unit unit;
 

http://git-wip-us.apache.org/repos/asf/arrow/blob/01a67f3f/cpp/src/arrow/type_traits.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/type_traits.h b/cpp/src/arrow/type_traits.h
index c4898b1..d6687c1 100644
--- a/cpp/src/arrow/type_traits.h
+++ b/cpp/src/arrow/type_traits.h
@@ -32,7 +32,7 @@ template <>
 struct TypeTraits<UInt8Type> {
   using ArrayType = UInt8Array;
   using BuilderType = UInt8Builder;
-  static inline int bytes_required(int elements) { return elements; }
+  static inline int64_t bytes_required(int64_t elements) { return elements; }
   constexpr static bool is_parameter_free = true;
   static inline std::shared_ptr<DataType> type_singleton() { return uint8(); }
 };
@@ -41,7 +41,7 @@ template <>
 struct TypeTraits<Int8Type> {
   using ArrayType = Int8Array;
   using BuilderType = Int8Builder;
-  static inline int bytes_required(int elements) { return elements; }
+  static inline int64_t bytes_required(int64_t elements) { return elements; }
   constexpr static bool is_parameter_free = true;
   static inline std::shared_ptr<DataType> type_singleton() { return int8(); }
 };
@@ -51,7 +51,9 @@ struct TypeTraits<UInt16Type> {
   using ArrayType = UInt16Array;
   using BuilderType = UInt16Builder;
 
-  static inline int bytes_required(int elements) { return elements * sizeof(uint16_t); }
+  static inline int64_t bytes_required(int64_t elements) {
+    return elements * sizeof(uint16_t);
+  }
   constexpr static bool is_parameter_free = true;
   static inline std::shared_ptr<DataType> type_singleton() { return uint16(); }
 };
@@ -61,7 +63,9 @@ struct TypeTraits<Int16Type> {
   using ArrayType = Int16Array;
   using BuilderType = Int16Builder;
 
-  static inline int bytes_required(int elements) { return elements * sizeof(int16_t); }
+  static inline int64_t bytes_required(int64_t elements) {
+    return elements * sizeof(int16_t);
+  }
   constexpr static bool is_parameter_free = true;
   static inline std::shared_ptr<DataType> type_singleton() { return int16(); }
 };
@@ -71,7 +75,9 @@ struct TypeTraits<UInt32Type> {
   using ArrayType = UInt32Array;
   using BuilderType = UInt32Builder;
 
-  static inline int bytes_required(int elements) { return elements * sizeof(uint32_t); }
+  static inline int64_t bytes_required(int64_t elements) {
+    return elements * sizeof(uint32_t);
+  }
   constexpr static bool is_parameter_free = true;
   static inline std::shared_ptr<DataType> type_singleton() { return uint32(); }
 };
@@ -81,7 +87,9 @@ struct TypeTraits<Int32Type> {
   using ArrayType = Int32Array;
   using BuilderType = Int32Builder;
 
-  static inline int bytes_required(int elements) { return elements * sizeof(int32_t); }
+  static inline int64_t bytes_required(int64_t elements) {
+    return elements * sizeof(int32_t);
+  }
   constexpr static bool is_parameter_free = true;
   static inline std::shared_ptr<DataType> type_singleton() { return int32(); }
 };
@@ -91,7 +99,9 @@ struct TypeTraits<UInt64Type> {
   using ArrayType = UInt64Array;
   using BuilderType = UInt64Builder;
 
-  static inline int bytes_required(int elements) { return elements * sizeof(uint64_t); }
+  static inline int64_t bytes_required(int64_t elements) {
+    return elements * sizeof(uint64_t);
+  }
   constexpr static bool is_parameter_free = true;
   static inline std::shared_ptr<DataType> type_singleton() { return uint64(); }
 };
@@ -101,7 +111,9 @@ struct TypeTraits<Int64Type> {
   using ArrayType = Int64Array;
   using BuilderType = Int64Builder;
 
-  static inline int bytes_required(int elements) { return elements * sizeof(int64_t); }
+  static inline int64_t bytes_required(int64_t elements) {
+    return elements * sizeof(int64_t);
+  }
   constexpr static bool is_parameter_free = true;
   static inline std::shared_ptr<DataType> type_singleton() { return int64(); }
 };
@@ -111,7 +123,9 @@ struct TypeTraits<DateType> {
   using ArrayType = DateArray;
   // using BuilderType = DateBuilder;
 
-  static inline int bytes_required(int elements) { return elements * sizeof(int64_t); }
+  static inline int64_t bytes_required(int64_t elements) {
+    return elements * sizeof(int64_t);
+  }
   constexpr static bool is_parameter_free = true;
   static inline std::shared_ptr<DataType> type_singleton() { return date(); }
 };
@@ -121,7 +135,9 @@ struct TypeTraits<TimestampType> {
   using ArrayType = TimestampArray;
   // using BuilderType = TimestampBuilder;
 
-  static inline int bytes_required(int elements) { return elements * sizeof(int64_t); }
+  static inline int64_t bytes_required(int64_t elements) {
+    return elements * sizeof(int64_t);
+  }
   constexpr static bool is_parameter_free = false;
 };
 
@@ -130,7 +146,9 @@ struct TypeTraits<TimeType> {
   using ArrayType = TimeArray;
   // using BuilderType = TimestampBuilder;
 
-  static inline int bytes_required(int elements) { return elements * sizeof(int64_t); }
+  static inline int64_t bytes_required(int64_t elements) {
+    return elements * sizeof(int64_t);
+  }
   constexpr static bool is_parameter_free = false;
 };
 
@@ -139,7 +157,9 @@ struct TypeTraits<HalfFloatType> {
   using ArrayType = HalfFloatArray;
   using BuilderType = HalfFloatBuilder;
 
-  static inline int bytes_required(int elements) { return elements * sizeof(uint16_t); }
+  static inline int64_t bytes_required(int64_t elements) {
+    return elements * sizeof(uint16_t);
+  }
   constexpr static bool is_parameter_free = true;
   static inline std::shared_ptr<DataType> type_singleton() { return float16(); }
 };
@@ -149,7 +169,9 @@ struct TypeTraits<FloatType> {
   using ArrayType = FloatArray;
   using BuilderType = FloatBuilder;
 
-  static inline int bytes_required(int elements) { return elements * sizeof(float); }
+  static inline int64_t bytes_required(int64_t elements) {
+    return static_cast<int64_t>(elements * sizeof(float));
+  }
   constexpr static bool is_parameter_free = true;
   static inline std::shared_ptr<DataType> type_singleton() { return float32(); }
 };
@@ -159,7 +181,9 @@ struct TypeTraits<DoubleType> {
   using ArrayType = DoubleArray;
   using BuilderType = DoubleBuilder;
 
-  static inline int bytes_required(int elements) { return elements * sizeof(double); }
+  static inline int64_t bytes_required(int64_t elements) {
+    return static_cast<int64_t>(elements * sizeof(double));
+  }
   constexpr static bool is_parameter_free = true;
   static inline std::shared_ptr<DataType> type_singleton() { return float64(); }
 };
@@ -169,7 +193,7 @@ struct TypeTraits<BooleanType> {
   using ArrayType = BooleanArray;
   using BuilderType = BooleanBuilder;
 
-  static inline int bytes_required(int elements) {
+  static inline int64_t bytes_required(int64_t elements) {
     return BitUtil::BytesForBits(elements);
   }
   constexpr static bool is_parameter_free = true;

http://git-wip-us.apache.org/repos/asf/arrow/blob/01a67f3f/cpp/src/arrow/util/bit-util.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/util/bit-util.cc b/cpp/src/arrow/util/bit-util.cc
index f3fbb41..1bbd238 100644
--- a/cpp/src/arrow/util/bit-util.cc
+++ b/cpp/src/arrow/util/bit-util.cc
@@ -42,7 +42,7 @@ void BitUtil::BytesToBits(const std::vector<uint8_t>& bytes, uint8_t* bits) {
 
 Status BitUtil::BytesToBits(
     const std::vector<uint8_t>& bytes, std::shared_ptr<Buffer>* out) {
-  int bit_length = BitUtil::BytesForBits(bytes.size());
+  int64_t bit_length = BitUtil::BytesForBits(bytes.size());
 
   std::shared_ptr<MutableBuffer> buffer;
   RETURN_NOT_OK(AllocateBuffer(default_memory_pool(), bit_length, &buffer));
@@ -98,7 +98,7 @@ Status GetEmptyBitmap(
   return Status::OK();
 }
 
-Status CopyBitmap(MemoryPool* pool, const uint8_t* data, int32_t offset, int32_t length,
+Status CopyBitmap(MemoryPool* pool, const uint8_t* data, int64_t offset, int64_t length,
     std::shared_ptr<Buffer>* out) {
   std::shared_ptr<MutableBuffer> buffer;
   RETURN_NOT_OK(GetEmptyBitmap(pool, length, &buffer));

http://git-wip-us.apache.org/repos/asf/arrow/blob/01a67f3f/cpp/src/arrow/util/bit-util.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/util/bit-util.h b/cpp/src/arrow/util/bit-util.h
index a0fbdd2..6e3e8ae 100644
--- a/cpp/src/arrow/util/bit-util.h
+++ b/cpp/src/arrow/util/bit-util.h
@@ -34,6 +34,11 @@ class Status;
 
 namespace BitUtil {
 
+static constexpr uint8_t kBitmask[] = {1, 2, 4, 8, 16, 32, 64, 128};
+
+// the bitwise complement (~) of each entry in kBitmask
+static constexpr uint8_t kFlippedBitmask[] = {254, 253, 251, 247, 239, 223, 191, 127};
+
 static inline int64_t CeilByte(int64_t size) {
   return (size + 7) & ~7;
 }
@@ -46,28 +51,26 @@ static inline int64_t Ceil2Bytes(int64_t size) {
   return (size + 15) & ~15;
 }
 
-static constexpr uint8_t kBitmask[] = {1, 2, 4, 8, 16, 32, 64, 128};
-
-static inline bool GetBit(const uint8_t* bits, int i) {
+static inline bool GetBit(const uint8_t* bits, int64_t i) {
   return static_cast<bool>(bits[i / 8] & kBitmask[i % 8]);
 }
 
-static inline bool BitNotSet(const uint8_t* bits, int i) {
+static inline bool BitNotSet(const uint8_t* bits, int64_t i) {
   return (bits[i / 8] & kBitmask[i % 8]) == 0;
 }
 
-static inline void ClearBit(uint8_t* bits, int i) {
-  bits[i / 8] &= ~kBitmask[i % 8];
+static inline void ClearBit(uint8_t* bits, int64_t i) {
+  bits[i / 8] &= kFlippedBitmask[i % 8];
 }
 
-static inline void SetBit(uint8_t* bits, int i) {
+static inline void SetBit(uint8_t* bits, int64_t i) {
   bits[i / 8] |= kBitmask[i % 8];
 }
 
-static inline void SetBitTo(uint8_t* bits, int i, bool bit_is_set) {
+static inline void SetBitTo(uint8_t* bits, int64_t i, bool bit_is_set) {
   // See https://graphics.stanford.edu/~seander/bithacks.html
   // "Conditionally set or clear bits without branching"
-  bits[i / 8] ^= (-bit_is_set ^ bits[i / 8]) & kBitmask[i % 8];
+  bits[i / 8] ^= static_cast<uint8_t>(-bit_is_set ^ bits[i / 8]) & kBitmask[i % 8];
 }
 
 static inline int64_t NextPower2(int64_t n) {
@@ -127,8 +130,8 @@ Status ARROW_EXPORT GetEmptyBitmap(
 /// \param[out] out the resulting copy
 ///
 /// \return Status message
-Status ARROW_EXPORT CopyBitmap(MemoryPool* pool, const uint8_t* bitmap, int32_t offset,
-    int32_t length, std::shared_ptr<Buffer>* out);
+Status ARROW_EXPORT CopyBitmap(MemoryPool* pool, const uint8_t* bitmap, int64_t offset,
+    int64_t length, std::shared_ptr<Buffer>* out);
 
 /// Compute the number of 1's in the given data array
 ///

http://git-wip-us.apache.org/repos/asf/arrow/blob/01a67f3f/python/pyarrow/array.pxd
----------------------------------------------------------------------
diff --git a/python/pyarrow/array.pxd b/python/pyarrow/array.pxd
index 9e4d469..56bb53d 100644
--- a/python/pyarrow/array.pxd
+++ b/python/pyarrow/array.pxd
@@ -15,7 +15,7 @@
 # specific language governing permissions and limitations
 # under the License.
 
-from pyarrow.includes.common cimport shared_ptr
+from pyarrow.includes.common cimport shared_ptr, int64_t
 from pyarrow.includes.libarrow cimport CArray
 
 from pyarrow.scalar import NA
@@ -36,7 +36,7 @@ cdef class Array:
         DataType type
 
     cdef init(self, const shared_ptr[CArray]& sp_array)
-    cdef getitem(self, int i)
+    cdef getitem(self, int64_t i)
 
 cdef object box_array(const shared_ptr[CArray]& sp_array)
 

http://git-wip-us.apache.org/repos/asf/arrow/blob/01a67f3f/python/pyarrow/array.pyx
----------------------------------------------------------------------
diff --git a/python/pyarrow/array.pyx b/python/pyarrow/array.pyx
index 11abf03..7787e95 100644
--- a/python/pyarrow/array.pyx
+++ b/python/pyarrow/array.pyx
@@ -210,7 +210,7 @@ cdef class Array:
 
         return self.getitem(key)
 
-    cdef getitem(self, int i):
+    cdef getitem(self, int64_t i):
         return scalar.box_scalar(self.type, self.sp_array, i)
 
     def slice(self, offset=0, length=None):

http://git-wip-us.apache.org/repos/asf/arrow/blob/01a67f3f/python/pyarrow/includes/libarrow.pxd
----------------------------------------------------------------------
diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd
index 702acfb..253cabb 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -64,15 +64,15 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
     cdef cppclass CArray" arrow::Array":
         shared_ptr[CDataType] type()
 
-        int32_t length()
-        int32_t null_count()
+        int64_t length()
+        int64_t null_count()
         Type type_enum()
 
         c_bool Equals(const shared_ptr[CArray]& arr)
         c_bool IsNull(int i)
 
-        shared_ptr[CArray] Slice(int32_t offset)
-        shared_ptr[CArray] Slice(int32_t offset, int32_t length)
+        shared_ptr[CArray] Slice(int64_t offset)
+        shared_ptr[CArray] Slice(int64_t offset, int64_t length)
 
     cdef cppclass CFixedWidthType" arrow::FixedWidthType"(CDataType):
         int bit_width()
@@ -217,7 +217,7 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
         shared_ptr[CChunkedArray] data()
 
     cdef cppclass CRecordBatch" arrow::RecordBatch":
-        CRecordBatch(const shared_ptr[CSchema]& schema, int32_t num_rows,
+        CRecordBatch(const shared_ptr[CSchema]& schema, int64_t num_rows,
                      const vector[shared_ptr[CArray]]& columns)
 
         c_bool Equals(const CRecordBatch& other)
@@ -229,10 +229,10 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
         const vector[shared_ptr[CArray]]& columns()
 
         int num_columns()
-        int32_t num_rows()
+        int64_t num_rows()
 
-        shared_ptr[CRecordBatch] Slice(int32_t offset)
-        shared_ptr[CRecordBatch] Slice(int32_t offset, int32_t length)
+        shared_ptr[CRecordBatch] Slice(int64_t offset)
+        shared_ptr[CRecordBatch] Slice(int64_t offset, int64_t length)
 
     cdef cppclass CTable" arrow::Table":
         CTable(const c_string& name, const shared_ptr[CSchema]& schema,

http://git-wip-us.apache.org/repos/asf/arrow/blob/01a67f3f/python/pyarrow/scalar.pxd
----------------------------------------------------------------------
diff --git a/python/pyarrow/scalar.pxd b/python/pyarrow/scalar.pxd
index 2d55757..551aeb9 100644
--- a/python/pyarrow/scalar.pxd
+++ b/python/pyarrow/scalar.pxd
@@ -32,10 +32,10 @@ cdef class NAType(Scalar):
 cdef class ArrayValue(Scalar):
     cdef:
         shared_ptr[CArray] sp_array
-        int index
+        int64_t index
 
     cdef void init(self, DataType type,
-                   const shared_ptr[CArray]& sp_array, int index)
+                   const shared_ptr[CArray]& sp_array, int64_t index)
 
     cdef void _set_array(self, const shared_ptr[CArray]& sp_array)
 
@@ -55,7 +55,7 @@ cdef class ListValue(ArrayValue):
     cdef:
         CListArray* ap
 
-    cdef getitem(self, int i)
+    cdef getitem(self, int64_t i)
 
 
 cdef class StringValue(ArrayValue):
@@ -63,4 +63,4 @@ cdef class StringValue(ArrayValue):
 
 cdef object box_scalar(DataType type,
                        const shared_ptr[CArray]& sp_array,
-                       int index)
+                       int64_t index)

http://git-wip-us.apache.org/repos/asf/arrow/blob/01a67f3f/python/pyarrow/scalar.pyx
----------------------------------------------------------------------
diff --git a/python/pyarrow/scalar.pyx b/python/pyarrow/scalar.pyx
index 57a15ad..1337b2b 100644
--- a/python/pyarrow/scalar.pyx
+++ b/python/pyarrow/scalar.pyx
@@ -46,7 +46,7 @@ NA = NAType()
 cdef class ArrayValue(Scalar):
 
     cdef void init(self, DataType type, const shared_ptr[CArray]& sp_array,
-                   int index):
+                   int64_t index):
         self.type = type
         self.index = index
         self._set_array(sp_array)
@@ -201,13 +201,13 @@ cdef class ListValue(ArrayValue):
         self.ap = <CListArray*> sp_array.get()
         self.value_type = box_data_type(self.ap.value_type())
 
-    cdef getitem(self, int i):
-        cdef int j = self.ap.value_offset(self.index) + i
+    cdef getitem(self, int64_t i):
+        cdef int64_t j = self.ap.value_offset(self.index) + i
         return box_scalar(self.value_type, self.ap.values(), j)
 
     def as_py(self):
         cdef:
-            int j
+            int64_t j
             list result = []
 
         for j in range(len(self)):
@@ -236,7 +236,7 @@ cdef dict _scalar_classes = {
 }
 
 cdef object box_scalar(DataType type, const shared_ptr[CArray]& sp_array,
-                       int index):
+                       int64_t index):
     cdef ArrayValue val
     if type.type.type == Type_NA:
         return NA

http://git-wip-us.apache.org/repos/asf/arrow/blob/01a67f3f/python/pyarrow/table.pyx
----------------------------------------------------------------------
diff --git a/python/pyarrow/table.pyx b/python/pyarrow/table.pyx
index 7d73362..93bc6dd 100644
--- a/python/pyarrow/table.pyx
+++ b/python/pyarrow/table.pyx
@@ -497,7 +497,7 @@ cdef class RecordBatch:
             shared_ptr[CSchema] schema
             shared_ptr[CRecordBatch] batch
             vector[shared_ptr[CArray]] c_arrays
-            int32_t num_rows
+            int64_t num_rows
 
         if len(arrays) == 0:
             raise ValueError('Record batch cannot contain no arrays (for now)')

http://git-wip-us.apache.org/repos/asf/arrow/blob/01a67f3f/python/src/pyarrow/adapters/builtin.cc
----------------------------------------------------------------------
diff --git a/python/src/pyarrow/adapters/builtin.cc b/python/src/pyarrow/adapters/builtin.cc
index 5fd8eef..c125cc0 100644
--- a/python/src/pyarrow/adapters/builtin.cc
+++ b/python/src/pyarrow/adapters/builtin.cc
@@ -375,7 +375,7 @@ class BytesConverter : public TypedConverter<arrow::BinaryBuilder> {
     PyObject* bytes_obj;
     OwnedRef tmp;
     const char* bytes;
-    int32_t length;
+    int64_t length;
     Py_ssize_t size = PySequence_Size(seq);
     for (int64_t i = 0; i < size; ++i) {
       item = PySequence_GetItem(seq, i);
@@ -409,7 +409,7 @@ class UTF8Converter : public TypedConverter<arrow::StringBuilder> {
     PyObject* bytes_obj;
     OwnedRef tmp;
     const char* bytes;
-    int32_t length;
+    int64_t length;
     Py_ssize_t size = PySequence_Size(seq);
     for (int64_t i = 0; i < size; ++i) {
       item = PySequence_GetItem(seq, i);

http://git-wip-us.apache.org/repos/asf/arrow/blob/01a67f3f/python/src/pyarrow/adapters/pandas.cc
----------------------------------------------------------------------
diff --git a/python/src/pyarrow/adapters/pandas.cc b/python/src/pyarrow/adapters/pandas.cc
index bdc2cb7..cadb53e 100644
--- a/python/src/pyarrow/adapters/pandas.cc
+++ b/python/src/pyarrow/adapters/pandas.cc
@@ -224,13 +224,13 @@ Status AppendObjectStrings(arrow::StringBuilder& string_builder, PyObject** obje
         PyErr_Clear();
         return Status::TypeError("failed converting unicode to UTF8");
       }
-      const int32_t length = PyBytes_GET_SIZE(obj);
+      const int64_t length = PyBytes_GET_SIZE(obj);
       Status s = string_builder.Append(PyBytes_AS_STRING(obj), length);
       Py_DECREF(obj);
       if (!s.ok()) { return s; }
     } else if (PyBytes_Check(obj)) {
       *have_bytes = true;
-      const int32_t length = PyBytes_GET_SIZE(obj);
+      const int64_t length = PyBytes_GET_SIZE(obj);
       RETURN_NOT_OK(string_builder.Append(PyBytes_AS_STRING(obj), length));
     } else {
       string_builder.AppendNull();
@@ -413,7 +413,7 @@ inline void ConvertIntegerNoNullsCast(const ChunkedArray& data, OutType* out_val
     const std::shared_ptr<Array> arr = data.chunk(c);
     auto prim_arr = static_cast<arrow::PrimitiveArray*>(arr.get());
     auto in_values = reinterpret_cast<const InType*>(prim_arr->data()->data());
-    for (int32_t i = 0; i < arr->length(); ++i) {
+    for (int64_t i = 0; i < arr->length(); ++i) {
       *out_values = in_values[i];
     }
   }
@@ -507,7 +507,6 @@ inline Status ConvertListsLike(
     auto arr = std::static_pointer_cast<arrow::ListArray>(data.chunk(c));
 
     const uint8_t* data_ptr;
-    int32_t length;
     const bool has_nulls = data.null_count() > 0;
     for (int64_t i = 0; i < arr->length(); ++i) {
       if (has_nulls && arr->IsNull(i)) {
@@ -1520,7 +1519,7 @@ inline Status ArrowSerializer<TYPE>::Convert(std::shared_ptr<Array>* out) {
   }
 
   // For readability
-  constexpr int32_t kOffset = 0;
+  constexpr int64_t kOffset = 0;
 
   RETURN_NOT_OK(ConvertData());
   std::shared_ptr<DataType> type;
@@ -1636,7 +1635,7 @@ inline Status ArrowSerializer<TYPE>::ConvertTypedLists(
       // TODO(uwe): Support more complex numpy array structures
       RETURN_NOT_OK(CheckFlatNumpyArray(numpy_array, ITEM_TYPE));
 
-      int32_t size = PyArray_DIM(numpy_array, 0);
+      int64_t size = PyArray_DIM(numpy_array, 0);
       auto data = reinterpret_cast<const T*>(PyArray_DATA(numpy_array));
       if (traits::supports_nulls) {
         null_bitmap_->Resize(size, false);
@@ -1678,7 +1677,7 @@ ArrowSerializer<NPY_OBJECT>::ConvertTypedLists<NPY_OBJECT, ::arrow::StringType>(
       // TODO(uwe): Support more complex numpy array structures
       RETURN_NOT_OK(CheckFlatNumpyArray(numpy_array, NPY_OBJECT));
 
-      int32_t size = PyArray_DIM(numpy_array, 0);
+      int64_t size = PyArray_DIM(numpy_array, 0);
       auto data = reinterpret_cast<PyObject**>(PyArray_DATA(numpy_array));
       RETURN_NOT_OK(AppendObjectStrings(*value_builder.get(), data, size, &have_bytes));
     } else if (PyList_Check(objects[i])) {


Mime
View raw message