arrow-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From w...@apache.org
Subject arrow git commit: ARROW-210: Cleanup of the string related types in C++ code base
Date Thu, 16 Jun 2016 17:58:24 GMT
Repository: arrow
Updated Branches:
  refs/heads/master 790d5412d -> 27edd25eb


ARROW-210:  Cleanup of the string related types in C++ code base

One thing that is worth discussing is if char types should also be removed (if they aren't
i'll add the missing unit tests).

I also moved CharType to type.h which seems more consistent with existing code.  I can clean
it up either way in a follow-up review if we decide with want to push types into their corresponding
Array headers.

Author: Micah Kornfield <emkornfield@gmail.com>

Closes #85 from emkornfield/emk_string_types_wip and squashes the following commits:

4414816 [Micah Kornfield] remove CHAR from parquet
6f0634c [Micah Kornfield] remove char type and add dcheck
58bfcc9 [Micah Kornfield] fix style of char_type_
1e0152d [Micah Kornfield] wip


Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/27edd25e
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/27edd25e
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/27edd25e

Branch: refs/heads/master
Commit: 27edd25eb4f714ff1cc2770ed5a1fbc695eb8a08
Parents: 790d541
Author: Micah Kornfield <emkornfield@gmail.com>
Authored: Thu Jun 16 10:58:18 2016 -0700
Committer: Wes McKinney <wesm@apache.org>
Committed: Thu Jun 16 10:58:18 2016 -0700

----------------------------------------------------------------------
 cpp/src/arrow/parquet/schema.cc    |   5 -
 cpp/src/arrow/type.cc              |  17 ++-
 cpp/src/arrow/type.h               |  55 +++++++---
 cpp/src/arrow/types/construct.cc   |   2 -
 cpp/src/arrow/types/decimal.h      |   1 -
 cpp/src/arrow/types/list.h         |   8 +-
 cpp/src/arrow/types/string-test.cc | 188 +++++++++++++++++++++++++++-----
 cpp/src/arrow/types/string.cc      |  40 +++++--
 cpp/src/arrow/types/string.h       | 104 ++++++++++--------
 cpp/src/arrow/util/macros.h        |   2 +-
 10 files changed, 307 insertions(+), 115 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/arrow/blob/27edd25e/cpp/src/arrow/parquet/schema.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/parquet/schema.cc b/cpp/src/arrow/parquet/schema.cc
index fd75894..c7979db 100644
--- a/cpp/src/arrow/parquet/schema.cc
+++ b/cpp/src/arrow/parquet/schema.cc
@@ -250,11 +250,6 @@ Status FieldToNode(const std::shared_ptr<Field>& field, NodePtr*
out) {
     case Type::DOUBLE:
       type = ParquetType::DOUBLE;
       break;
-    case Type::CHAR:
-      type = ParquetType::FIXED_LEN_BYTE_ARRAY;
-      logical_type = LogicalType::UTF8;
-      length = static_cast<CharType*>(field->type.get())->size;
-      break;
     case Type::STRING:
       type = ParquetType::BYTE_ARRAY;
       logical_type = LogicalType::UTF8;

http://git-wip-us.apache.org/repos/asf/arrow/blob/27edd25e/cpp/src/arrow/type.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc
index 4e686d9..4fd50b7 100644
--- a/cpp/src/arrow/type.cc
+++ b/cpp/src/arrow/type.cc
@@ -31,7 +31,18 @@ std::string Field::ToString() const {
 
 DataType::~DataType() {}
 
-StringType::StringType() : DataType(Type::STRING) {}
+bool DataType::Equals(const DataType* other) const {
+  bool equals = other && ((this == other) ||
+                             ((this->type == other->type) &&
+                                 ((this->num_children() == other->num_children()))));
+  if (equals) {
+    for (int i = 0; i < num_children(); ++i) {
+      // TODO(emkornfield) limit recursion
+      if (!children_[i]->Equals(other->children_[i])) { return false; }
+    }
+  }
+  return equals;
+}
 
 std::string StringType::ToString() const {
   std::string result(name());
@@ -44,6 +55,10 @@ std::string ListType::ToString() const {
   return s.str();
 }
 
+std::string BinaryType::ToString() const {
+  return std::string(name());
+}
+
 std::string StructType::ToString() const {
   std::stringstream s;
   s << "struct<";

http://git-wip-us.apache.org/repos/asf/arrow/blob/27edd25e/cpp/src/arrow/type.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h
index f366645..8fb4121 100644
--- a/cpp/src/arrow/type.h
+++ b/cpp/src/arrow/type.h
@@ -23,6 +23,8 @@
 #include <string>
 #include <vector>
 
+#include "arrow/util/macros.h"
+
 namespace arrow {
 
 // Data types in this library are all *logical*. They can be expressed as
@@ -53,15 +55,9 @@ struct Type {
     // 8-byte floating point value
     DOUBLE = 11,
 
-    // CHAR(N): fixed-length UTF8 string with length N
-    CHAR = 12,
-
     // UTF8 variable-length string as List<Char>
     STRING = 13,
 
-    // VARCHAR(N): Null-terminated string type embedded in a CHAR(N + 1)
-    VARCHAR = 14,
-
     // Variable-length bytes (no guarantee of UTF8-ness)
     BINARY = 15,
 
@@ -114,12 +110,15 @@ struct DataType {
 
   virtual ~DataType();
 
-  bool Equals(const DataType* other) {
-    // Call with a pointer so more friendly to subclasses
-    return other && ((this == other) || (this->type == other->type));
-  }
+  // Return whether the types are equal
+  //
+  // Types that are logically convertable from one to another e.g. List<UInt8>
+  // and Binary are NOT equal).
+  virtual bool Equals(const DataType* other) const;
 
-  bool Equals(const std::shared_ptr<DataType>& other) { return Equals(other.get());
}
+  bool Equals(const std::shared_ptr<DataType>& other) const {
+    return Equals(other.get());
+  }
 
   const std::shared_ptr<Field>& child(int i) const { return children_[i]; }
 
@@ -236,9 +235,8 @@ struct DoubleType : public PrimitiveType<DoubleType> {
 
 struct ListType : public DataType {
   // List can contain any other logical value type
-  explicit ListType(const std::shared_ptr<DataType>& value_type) : DataType(Type::LIST)
{
-    children_ = {std::make_shared<Field>("item", value_type)};
-  }
+  explicit ListType(const std::shared_ptr<DataType>& value_type)
+      : ListType(value_type, Type::LIST) {}
 
   explicit ListType(const std::shared_ptr<Field>& value_field) : DataType(Type::LIST)
{
     children_ = {value_field};
@@ -251,15 +249,38 @@ struct ListType : public DataType {
   static char const* name() { return "list"; }
 
   std::string ToString() const override;
+
+ protected:
+  // Constructor for classes that are implemented as List Arrays.
+  ListType(const std::shared_ptr<DataType>& value_type, Type::type logical_type)
+      : DataType(logical_type) {
+    // TODO ARROW-187 this can technically fail, make a constructor method ?
+    children_ = {std::make_shared<Field>("item", value_type)};
+  }
 };
 
-// String is a logical type consisting of a physical list of 1-byte values
-struct StringType : public DataType {
-  StringType();
+// BinaryType type is reprsents lists of 1-byte values.
+struct BinaryType : public ListType {
+  BinaryType() : BinaryType(Type::BINARY) {}
+  static char const* name() { return "binary"; }
+  std::string ToString() const override;
+
+ protected:
+  // Allow subclasses to change the logical type.
+  explicit BinaryType(Type::type logical_type)
+      : ListType(std::shared_ptr<DataType>(new UInt8Type()), logical_type) {}
+};
+
+// UTF encoded strings
+struct StringType : public BinaryType {
+  StringType() : BinaryType(Type::STRING) {}
 
   static char const* name() { return "string"; }
 
   std::string ToString() const override;
+
+ protected:
+  explicit StringType(Type::type logical_type) : BinaryType(logical_type) {}
 };
 
 struct StructType : public DataType {

http://git-wip-us.apache.org/repos/asf/arrow/blob/27edd25e/cpp/src/arrow/types/construct.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/types/construct.cc b/cpp/src/arrow/types/construct.cc
index bcb0ec4..2d913a7 100644
--- a/cpp/src/arrow/types/construct.cc
+++ b/cpp/src/arrow/types/construct.cc
@@ -127,10 +127,8 @@ Status MakeListArray(const TypePtr& type, int32_t length,
     case Type::LIST:
       out->reset(new ListArray(type, length, offsets, values, null_count, null_bitmap));
       break;
-    case Type::CHAR:
     case Type::DECIMAL_TEXT:
     case Type::STRING:
-    case Type::VARCHAR:
       out->reset(new StringArray(type, length, offsets, values, null_count, null_bitmap));
       break;
     default:

http://git-wip-us.apache.org/repos/asf/arrow/blob/27edd25e/cpp/src/arrow/types/decimal.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/types/decimal.h b/cpp/src/arrow/types/decimal.h
index 1be489d..598df3e 100644
--- a/cpp/src/arrow/types/decimal.h
+++ b/cpp/src/arrow/types/decimal.h
@@ -29,7 +29,6 @@ struct DecimalType : public DataType {
       : DataType(Type::DECIMAL), precision(precision_), scale(scale_) {}
   int precision;
   int scale;
-
   static char const* name() { return "decimal"; }
 
   std::string ToString() const override;

http://git-wip-us.apache.org/repos/asf/arrow/blob/27edd25e/cpp/src/arrow/types/list.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/types/list.h b/cpp/src/arrow/types/list.h
index 0a39416..2f6f85d 100644
--- a/cpp/src/arrow/types/list.h
+++ b/cpp/src/arrow/types/list.h
@@ -66,8 +66,8 @@ class ListArray : public Array {
   int32_t offset(int i) const { return offsets_[i]; }
 
   // Neither of these functions will perform boundschecking
-  int32_t value_offset(int i) { return offsets_[i]; }
-  int32_t value_length(int i) { return offsets_[i + 1] - offsets_[i]; }
+  int32_t value_offset(int i) const { return offsets_[i]; }
+  int32_t value_length(int i) const { return offsets_[i + 1] - offsets_[i]; }
 
   bool EqualsExact(const ListArray& other) const;
   bool Equals(const std::shared_ptr<Array>& arr) const override;
@@ -92,9 +92,9 @@ class ListArray : public Array {
 // a sequence of offests and null values.
 //
 // A note on types.  Per arrow/type.h all types in the c++ implementation are
-// logical so even though this class always builds an Array of lists, this can
+// logical so even though this class always builds list array, this can
 // represent multiple different logical types.  If no logical type is provided
-// at construction time, the class defaults to List<T> where t is take from the
+// at construction time, the class defaults to List<T> where t is taken from the
 // value_builder/values that the object is constructed with.
 class ListBuilder : public ArrayBuilder {
  public:

http://git-wip-us.apache.org/repos/asf/arrow/blob/27edd25e/cpp/src/arrow/types/string-test.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/types/string-test.cc b/cpp/src/arrow/types/string-test.cc
index ee4307c..a141fc1 100644
--- a/cpp/src/arrow/types/string-test.cc
+++ b/cpp/src/arrow/types/string-test.cc
@@ -34,32 +34,14 @@ namespace arrow {
 
 class Buffer;
 
-TEST(TypesTest, TestCharType) {
-  CharType t1(5);
-
-  ASSERT_EQ(t1.type, Type::CHAR);
-  ASSERT_EQ(t1.size, 5);
-
-  ASSERT_EQ(t1.ToString(), std::string("char(5)"));
-
-  // Test copy constructor
-  CharType t2 = t1;
-  ASSERT_EQ(t2.type, Type::CHAR);
-  ASSERT_EQ(t2.size, 5);
-}
-
-TEST(TypesTest, TestVarcharType) {
-  VarcharType t1(5);
-
-  ASSERT_EQ(t1.type, Type::VARCHAR);
-  ASSERT_EQ(t1.size, 5);
-
-  ASSERT_EQ(t1.ToString(), std::string("varchar(5)"));
-
-  // Test copy constructor
-  VarcharType t2 = t1;
-  ASSERT_EQ(t2.type, Type::VARCHAR);
-  ASSERT_EQ(t2.size, 5);
+TEST(TypesTest, BinaryType) {
+  BinaryType t1;
+  BinaryType e1;
+  StringType t2;
+  EXPECT_TRUE(t1.Equals(&e1));
+  EXPECT_FALSE(t1.Equals(&t2));
+  ASSERT_EQ(t1.type, Type::BINARY);
+  ASSERT_EQ(t1.ToString(), std::string("binary"));
 }
 
 TEST(TypesTest, TestStringType) {
@@ -119,6 +101,7 @@ class TestStringContainer : public ::testing::Test {
 TEST_F(TestStringContainer, TestArrayBasics) {
   ASSERT_EQ(length_, strings_->length());
   ASSERT_EQ(1, strings_->null_count());
+  ASSERT_OK(strings_->Validate());
 }
 
 TEST_F(TestStringContainer, TestType) {
@@ -163,7 +146,10 @@ class TestStringBuilder : public TestBuilder {
     builder_.reset(new StringBuilder(pool_, type_));
   }
 
-  void Done() { result_ = std::dynamic_pointer_cast<StringArray>(builder_->Finish());
}
+  void Done() {
+    result_ = std::dynamic_pointer_cast<StringArray>(builder_->Finish());
+    result_->Validate();
+  }
 
  protected:
   TypePtr type_;
@@ -216,4 +202,152 @@ TEST_F(TestStringBuilder, TestZeroLength) {
   Done();
 }
 
+// Binary container type
+// TODO(emkornfield) there should be some way to refactor these to avoid code duplicating
+// with String
+class TestBinaryContainer : public ::testing::Test {
+ public:
+  void SetUp() {
+    chars_ = {'a', 'b', 'b', 'c', 'c', 'c'};
+    offsets_ = {0, 1, 1, 1, 3, 6};
+    valid_bytes_ = {1, 1, 0, 1, 1};
+    expected_ = {"a", "", "", "bb", "ccc"};
+
+    MakeArray();
+  }
+
+  void MakeArray() {
+    length_ = offsets_.size() - 1;
+    int nchars = chars_.size();
+
+    value_buf_ = test::to_buffer(chars_);
+    values_ = ArrayPtr(new UInt8Array(nchars, value_buf_));
+
+    offsets_buf_ = test::to_buffer(offsets_);
+
+    null_bitmap_ = test::bytes_to_null_buffer(valid_bytes_);
+    null_count_ = test::null_count(valid_bytes_);
+
+    strings_ = std::make_shared<BinaryArray>(
+        length_, offsets_buf_, values_, null_count_, null_bitmap_);
+  }
+
+ protected:
+  std::vector<int32_t> offsets_;
+  std::vector<char> chars_;
+  std::vector<uint8_t> valid_bytes_;
+
+  std::vector<std::string> expected_;
+
+  std::shared_ptr<Buffer> value_buf_;
+  std::shared_ptr<Buffer> offsets_buf_;
+  std::shared_ptr<Buffer> null_bitmap_;
+
+  int null_count_;
+  int length_;
+
+  ArrayPtr values_;
+  std::shared_ptr<BinaryArray> strings_;
+};
+
+TEST_F(TestBinaryContainer, TestArrayBasics) {
+  ASSERT_EQ(length_, strings_->length());
+  ASSERT_EQ(1, strings_->null_count());
+  ASSERT_OK(strings_->Validate());
+}
+
+TEST_F(TestBinaryContainer, TestType) {
+  TypePtr type = strings_->type();
+
+  ASSERT_EQ(Type::BINARY, type->type);
+  ASSERT_EQ(Type::BINARY, strings_->type_enum());
+}
+
+TEST_F(TestBinaryContainer, TestListFunctions) {
+  int pos = 0;
+  for (size_t i = 0; i < expected_.size(); ++i) {
+    ASSERT_EQ(pos, strings_->value_offset(i));
+    ASSERT_EQ(expected_[i].size(), strings_->value_length(i));
+    pos += expected_[i].size();
+  }
+}
+
+TEST_F(TestBinaryContainer, TestDestructor) {
+  auto arr = std::make_shared<BinaryArray>(
+      length_, offsets_buf_, values_, null_count_, null_bitmap_);
+}
+
+TEST_F(TestBinaryContainer, TestGetValue) {
+  for (size_t i = 0; i < expected_.size(); ++i) {
+    if (valid_bytes_[i] == 0) {
+      ASSERT_TRUE(strings_->IsNull(i));
+    } else {
+      int32_t len = -1;
+      const uint8_t* bytes = strings_->GetValue(i, &len);
+      ASSERT_EQ(0, std::memcmp(expected_[i].data(), bytes, len));
+    }
+  }
+}
+
+class TestBinaryBuilder : public TestBuilder {
+ public:
+  void SetUp() {
+    TestBuilder::SetUp();
+    type_ = TypePtr(new BinaryType());
+    builder_.reset(new BinaryBuilder(pool_, type_));
+  }
+
+  void Done() {
+    result_ = std::dynamic_pointer_cast<BinaryArray>(builder_->Finish());
+    result_->Validate();
+  }
+
+ protected:
+  TypePtr type_;
+
+  std::unique_ptr<BinaryBuilder> builder_;
+  std::shared_ptr<BinaryArray> result_;
+};
+
+TEST_F(TestBinaryBuilder, TestScalarAppend) {
+  std::vector<std::string> strings = {"", "bb", "a", "", "ccc"};
+  std::vector<uint8_t> is_null = {0, 0, 0, 1, 0};
+
+  int N = strings.size();
+  int reps = 1000;
+
+  for (int j = 0; j < reps; ++j) {
+    for (int i = 0; i < N; ++i) {
+      if (is_null[i]) {
+        builder_->AppendNull();
+      } else {
+        builder_->Append(
+            reinterpret_cast<const uint8_t*>(strings[i].data()), strings[i].size());
+      }
+    }
+  }
+  Done();
+  ASSERT_OK(result_->Validate());
+  ASSERT_EQ(reps * N, result_->length());
+  ASSERT_EQ(reps, result_->null_count());
+  ASSERT_EQ(reps * 6, result_->values()->length());
+
+  int32_t length;
+  for (int i = 0; i < N * reps; ++i) {
+    if (is_null[i % N]) {
+      ASSERT_TRUE(result_->IsNull(i));
+    } else {
+      ASSERT_FALSE(result_->IsNull(i));
+      const uint8_t* vals = result_->GetValue(i, &length);
+      ASSERT_EQ(strings[i % N].size(), length);
+      ASSERT_EQ(0, std::memcmp(vals, strings[i % N].data(), length));
+    }
+  }
+}
+
+TEST_F(TestBinaryBuilder, TestZeroLength) {
+  // All buffers are null
+  Done();
+}
+
 }  // namespace arrow

http://git-wip-us.apache.org/repos/asf/arrow/blob/27edd25e/cpp/src/arrow/types/string.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/types/string.cc b/cpp/src/arrow/types/string.cc
index 29d97d0..da02c7d 100644
--- a/cpp/src/arrow/types/string.cc
+++ b/cpp/src/arrow/types/string.cc
@@ -24,25 +24,43 @@
 
 namespace arrow {
 
+const std::shared_ptr<DataType> BINARY(new BinaryType());
 const std::shared_ptr<DataType> STRING(new StringType());
 
-StringArray::StringArray(int32_t length, const std::shared_ptr<Buffer>& offsets,
+BinaryArray::BinaryArray(int32_t length, const std::shared_ptr<Buffer>& offsets,
     const ArrayPtr& values, int32_t null_count,
     const std::shared_ptr<Buffer>& null_bitmap)
-    : StringArray(STRING, length, offsets, values, null_count, null_bitmap) {}
+    : BinaryArray(BINARY, length, offsets, values, null_count, null_bitmap) {}
+
+BinaryArray::BinaryArray(const TypePtr& type, int32_t length,
+    const std::shared_ptr<Buffer>& offsets, const ArrayPtr& values, int32_t
null_count,
+    const std::shared_ptr<Buffer>& null_bitmap)
+    : ListArray(type, length, offsets, values, null_count, null_bitmap),
+      bytes_(std::dynamic_pointer_cast<UInt8Array>(values).get()),
+      raw_bytes_(bytes_->raw_data()) {
+  // Check in case the dynamic cast fails.
+  DCHECK(bytes_);
+}
 
-std::string CharType::ToString() const {
-  std::stringstream s;
-  s << "char(" << size << ")";
-  return s.str();
+Status BinaryArray::Validate() const {
+  if (values()->null_count() > 0) {
+    std::stringstream ss;
+    ss << type()->ToString() << " can have null values in the value array";
+    Status::Invalid(ss.str());
+  }
+  return ListArray::Validate();
 }
 
-std::string VarcharType::ToString() const {
-  std::stringstream s;
-  s << "varchar(" << size << ")";
-  return s.str();
+StringArray::StringArray(int32_t length, const std::shared_ptr<Buffer>& offsets,
+    const ArrayPtr& values, int32_t null_count,
+    const std::shared_ptr<Buffer>& null_bitmap)
+    : StringArray(STRING, length, offsets, values, null_count, null_bitmap) {}
+
+Status StringArray::Validate() const {
+  // TODO(emkornfield) Validate proper UTF8 code points?
+  return BinaryArray::Validate();
 }
 
-TypePtr StringBuilder::value_type_ = TypePtr(new UInt8Type());
+TypePtr BinaryBuilder::value_type_ = TypePtr(new UInt8Type());
 
 }  // namespace arrow

http://git-wip-us.apache.org/repos/asf/arrow/blob/27edd25e/cpp/src/arrow/types/string.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/types/string.h b/cpp/src/arrow/types/string.h
index d2d3c5b..b3c00d2 100644
--- a/cpp/src/arrow/types/string.h
+++ b/cpp/src/arrow/types/string.h
@@ -34,87 +34,99 @@ namespace arrow {
 class Buffer;
 class MemoryPool;
 
-struct CharType : public DataType {
-  int size;
-
-  explicit CharType(int size) : DataType(Type::CHAR), size(size) {}
-
-  CharType(const CharType& other) : CharType(other.size) {}
-
-  virtual std::string ToString() const;
-};
-
-// Variable-length, null-terminated strings, up to a certain length
-struct VarcharType : public DataType {
-  int size;
-
-  explicit VarcharType(int size) : DataType(Type::VARCHAR), size(size) {}
-  VarcharType(const VarcharType& other) : VarcharType(other.size) {}
-
-  virtual std::string ToString() const;
-};
-
-// TODO(wesm): add a BinaryArray layer in between
-class StringArray : public ListArray {
+class BinaryArray : public ListArray {
  public:
-  StringArray(const TypePtr& type, int32_t length, const std::shared_ptr<Buffer>&
offsets,
+  BinaryArray(int32_t length, const std::shared_ptr<Buffer>& offsets,
       const ArrayPtr& values, int32_t null_count = 0,
-      const std::shared_ptr<Buffer>& null_bitmap = nullptr)
-      : ListArray(type, length, offsets, values, null_count, null_bitmap) {
-    // For convenience
-    bytes_ = static_cast<UInt8Array*>(values.get());
-    raw_bytes_ = bytes_->raw_data();
-  }
-
-  StringArray(int32_t length, const std::shared_ptr<Buffer>& offsets,
+      const std::shared_ptr<Buffer>& null_bitmap = nullptr);
+  // Constructor that allows sub-classes/builders to propagate there logical type up the
+  // class hierarchy.
+  BinaryArray(const TypePtr& type, int32_t length, const std::shared_ptr<Buffer>&
offsets,
       const ArrayPtr& values, int32_t null_count = 0,
       const std::shared_ptr<Buffer>& null_bitmap = nullptr);
 
-  // Compute the pointer t
+  // Return the pointer to the given elements bytes
+  // TODO(emkornfield) introduce a StringPiece or something similar to capture zero-copy
+  // pointer + offset
   const uint8_t* GetValue(int i, int32_t* out_length) const {
-    int32_t pos = offsets_[i];
+    DCHECK(out_length);
+    const int32_t pos = offsets_[i];
     *out_length = offsets_[i + 1] - pos;
     return raw_bytes_ + pos;
   }
 
+  Status Validate() const override;
+
+ private:
+  UInt8Array* bytes_;
+  const uint8_t* raw_bytes_;
+};
+
+class StringArray : public BinaryArray {
+ public:
+  StringArray(int32_t length, const std::shared_ptr<Buffer>& offsets,
+      const ArrayPtr& values, int32_t null_count = 0,
+      const std::shared_ptr<Buffer>& null_bitmap = nullptr);
+  // Constructor that allows overriding the logical type, so subclasses can propagate
+  // there
+  // up the class hierarchy.
+  StringArray(const TypePtr& type, int32_t length, const std::shared_ptr<Buffer>&
offsets,
+      const ArrayPtr& values, int32_t null_count = 0,
+      const std::shared_ptr<Buffer>& null_bitmap = nullptr)
+      : BinaryArray(type, length, offsets, values, null_count, null_bitmap) {}
+
   // Construct a std::string
+  // TODO: std::bad_alloc possibility
   std::string GetString(int i) const {
     int32_t nchars;
     const uint8_t* str = GetValue(i, &nchars);
     return std::string(reinterpret_cast<const char*>(str), nchars);
   }
 
- private:
-  UInt8Array* bytes_;
-  const uint8_t* raw_bytes_;
+  Status Validate() const override;
 };
 
-// String builder
-class StringBuilder : public ListBuilder {
+// BinaryBuilder : public ListBuilder
+class BinaryBuilder : public ListBuilder {
  public:
-  explicit StringBuilder(MemoryPool* pool, const TypePtr& type)
+  explicit BinaryBuilder(MemoryPool* pool, const TypePtr& type)
       : ListBuilder(pool, std::make_shared<UInt8Builder>(pool, value_type_), type)
{
     byte_builder_ = static_cast<UInt8Builder*>(value_builder_.get());
   }
 
-  Status Append(const std::string& value) { return Append(value.c_str(), value.size());
}
-
-  Status Append(const char* value, int32_t length) {
+  Status Append(const uint8_t* value, int32_t length) {
     RETURN_NOT_OK(ListBuilder::Append());
-    return byte_builder_->Append(reinterpret_cast<const uint8_t*>(value), length);
+    return byte_builder_->Append(value, length);
   }
-  Status Append(const std::vector<std::string>& values, uint8_t* null_bytes);
 
   std::shared_ptr<Array> Finish() override {
-    return ListBuilder::Transfer<StringArray>();
+    return ListBuilder::Transfer<BinaryArray>();
   }
 
  protected:
   UInt8Builder* byte_builder_;
-
   static TypePtr value_type_;
 };
 
+// String builder
+class StringBuilder : public BinaryBuilder {
+ public:
+  explicit StringBuilder(MemoryPool* pool, const TypePtr& type)
+      : BinaryBuilder(pool, type) {}
+
+  Status Append(const std::string& value) { return Append(value.c_str(), value.size());
}
+
+  Status Append(const char* value, int32_t length) {
+    return BinaryBuilder::Append(reinterpret_cast<const uint8_t*>(value), length);
+  }
+
+  Status Append(const std::vector<std::string>& values, uint8_t* null_bytes);
+
+  std::shared_ptr<Array> Finish() override {
+    return ListBuilder::Transfer<StringArray>();
+  }
+};
+
 }  // namespace arrow
 
 #endif  // ARROW_TYPES_STRING_H

http://git-wip-us.apache.org/repos/asf/arrow/blob/27edd25e/cpp/src/arrow/util/macros.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/util/macros.h b/cpp/src/arrow/util/macros.h
index 51e605e..69ecda1 100644
--- a/cpp/src/arrow/util/macros.h
+++ b/cpp/src/arrow/util/macros.h
@@ -21,6 +21,6 @@
 // From Google gutil
 #define DISALLOW_COPY_AND_ASSIGN(TypeName) \
   TypeName(const TypeName&) = delete;      \
-  void operator=(const TypeName&) = delete
+  TypeName& operator=(const TypeName&) = delete
 
 #endif  // ARROW_UTIL_MACROS_H


Mime
View raw message