impala-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From tarmstr...@apache.org
Subject incubator-impala git commit: IMPALA-3732: handle string length overflow in avro files
Date Wed, 15 Jun 2016 09:33:21 GMT
Repository: incubator-impala
Updated Branches:
  refs/heads/master d8d3e2391 -> 2dad444c8


IMPALA-3732: handle string length overflow in avro files

Avro string lengths are encoded as 64-bit integers. Impala can only
handle up to 32-bit integers, so we need to be careful about handling
out-of-range integers. Negative integers were already handled by a
previous patch, but if a positive 64-bit integer is truncated to a
32-bit integer, the result can be a negative length.

This patch fixes CHAR/VARCHAR behaviour, where we can just truncate
the string, and STRING, where we can't truncate the string, so must
return an error.

Testing:
Added unit tests for STRING, CHAR, and VARCHAR that exercise the string
overflow handling.

Change-Id: If6541e7c68255bf599b26386a55057c93e62af51
Reviewed-on: http://gerrit.cloudera.org:8080/3383
Reviewed-by: Tim Armstrong <tarmstrong@cloudera.com>
Tested-by: Internal Jenkins


Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/2dad444c
Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/2dad444c
Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/2dad444c

Branch: refs/heads/master
Commit: 2dad444c8c1db09598d40ae34a81b4bc4276fbb4
Parents: d8d3e23
Author: Tim Armstrong <tarmstrong@cloudera.com>
Authored: Tue Jun 14 14:57:19 2016 -0700
Committer: Tim Armstrong <tarmstrong@cloudera.com>
Committed: Wed Jun 15 02:33:17 2016 -0700

----------------------------------------------------------------------
 be/src/exec/hdfs-avro-scanner-ir.cc   | 18 +++++-
 be/src/exec/hdfs-avro-scanner-test.cc | 92 +++++++++++++++++++++++++-----
 be/src/exec/hdfs-avro-scanner.cc      | 10 ++++
 be/src/exec/hdfs-avro-scanner.h       |  1 +
 common/thrift/generate_error_codes.py |  3 +
 5 files changed, 108 insertions(+), 16 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/2dad444c/be/src/exec/hdfs-avro-scanner-ir.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/hdfs-avro-scanner-ir.cc b/be/src/exec/hdfs-avro-scanner-ir.cc
index 6839fe1..eeaae9d 100644
--- a/be/src/exec/hdfs-avro-scanner-ir.cc
+++ b/be/src/exec/hdfs-avro-scanner-ir.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include <algorithm>
+#include <limits>
 
 #include "exec/hdfs-avro-scanner.h"
 #include "exec/read-write-util.h"
@@ -21,6 +22,7 @@
 
 using namespace impala;
 using namespace strings;
+using std::numeric_limits;
 
 // Functions in this file are cross-compiled to IR with clang.
 
@@ -183,8 +185,10 @@ bool HdfsAvroScanner::ReadAvroVarchar(PrimitiveType type, int max_len,
uint8_t**
   if (write_slot) {
     DCHECK(type == TYPE_VARCHAR);
     StringValue* sv = reinterpret_cast<StringValue*>(slot);
-    int str_len = std::min(static_cast<int>(len.val), max_len);
-    DCHECK(str_len >= 0);
+    // 'max_len' is an int, so the result of min() should always be in [0, INT_MAX].
+    // We need to be careful not to truncate the length before evaluating min().
+    int str_len = static_cast<int>(std::min<int64_t>(len.val, max_len));
+    DCHECK_GE(str_len, 0);
     sv->len = str_len;
     sv->ptr = reinterpret_cast<char*>(*data);
   }
@@ -199,7 +203,10 @@ bool HdfsAvroScanner::ReadAvroChar(PrimitiveType type, int max_len, uint8_t**
da
   if (write_slot) {
     DCHECK(type == TYPE_CHAR);
     ColumnType ctype = ColumnType::CreateCharType(max_len);
-    int str_len = std::min(static_cast<int>(len.val), max_len);
+    // 'max_len' is an int, so the result of min() should always be in [0, INT_MAX].
+    // We need to be careful not to truncate the length before evaluating min().
+    int str_len = static_cast<int>(std::min<int64_t>(len.val, max_len));
+    DCHECK_GE(str_len, 0);
     if (ctype.IsVarLenStringType()) {
       StringValue* sv = reinterpret_cast<StringValue*>(slot);
       sv->ptr = reinterpret_cast<char*>(pool->TryAllocate(max_len));
@@ -227,6 +234,11 @@ bool HdfsAvroScanner::ReadAvroString(PrimitiveType type, uint8_t** data,
   if (UNLIKELY(!len.ok)) return false;
   if (write_slot) {
     DCHECK(type == TYPE_STRING);
+    if (UNLIKELY(len.val > numeric_limits<int>::max())) {
+      SetStatusValueOverflow(TErrorCode::SCANNER_STRING_LENGTH_OVERFLOW, len.val,
+          numeric_limits<int>::max());
+      return false;
+    }
     StringValue* sv = reinterpret_cast<StringValue*>(slot);
     sv->len = len.val;
     sv->ptr = reinterpret_cast<char*>(*data);

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/2dad444c/be/src/exec/hdfs-avro-scanner-test.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/hdfs-avro-scanner-test.cc b/be/src/exec/hdfs-avro-scanner-test.cc
index e5f77f3..1a8b759 100644
--- a/be/src/exec/hdfs-avro-scanner-test.cc
+++ b/be/src/exec/hdfs-avro-scanner-test.cc
@@ -23,7 +23,8 @@
 #include "runtime/runtime-state.h"
 #include "runtime/string-value.inline.h"
 
-// TODO: CHAR, VARCHAR (IMPALA-3658)
+// TODO: IMPALA-3658: complete CHAR unit tests.
+// TODO: IMPALA-3658: complete VARCHAR unit tests.
 
 namespace impala {
 
@@ -51,6 +52,21 @@ class HdfsAvroScannerTest : public testing::Test {
     }
   }
 
+  template<typename T>
+  void CheckReadResult(T expected_val, int expected_encoded_len,
+      TErrorCode::type expected_error, T val, bool success, int actual_encoded_len) {
+    bool expect_success = expected_error == TErrorCode::OK;
+    EXPECT_EQ(success, expect_success);
+
+    if (success) {
+      EXPECT_TRUE(scanner_.parse_status_.ok());
+      EXPECT_EQ(val, expected_val);
+      EXPECT_EQ(expected_encoded_len, actual_encoded_len);
+    } else {
+      EXPECT_EQ(scanner_.parse_status_.code(), expected_error);
+    }
+  }
+
   // Templated function for calling different ReadAvro* functions.
   //
   // PrimitiveType is a template parameter so we can pass in int 'slot_byte_size' to
@@ -62,22 +78,12 @@ class HdfsAvroScannerTest : public testing::Test {
       TErrorCode::type expected_error = TErrorCode::OK) {
     // Reset parse_status_
     scanner_.parse_status_ = Status::OK();
-
     uint8_t* new_data = data;
     T slot;
-    bool expect_success = expected_error == TErrorCode::OK;
-
     bool success = (scanner_.*read_fn)(type, &new_data, data + data_len, true, &slot,
         NULL);
-    EXPECT_EQ(success, expect_success);
-
-    if (success) {
-      EXPECT_TRUE(scanner_.parse_status_.ok());
-      EXPECT_EQ(slot, expected_val);
-      EXPECT_EQ(new_data, data + expected_encoded_len);
-    } else {
-      EXPECT_EQ(scanner_.parse_status_.code(), expected_error);
-    }
+    CheckReadResult(expected_val, expected_encoded_len, expected_error, slot, success,
+        new_data - data);
   }
 
   void TestReadAvroBoolean(uint8_t* data, int64_t data_len, bool expected_val,
@@ -131,12 +137,48 @@ class HdfsAvroScannerTest : public testing::Test {
         expected_val, 8, expected_error);
   }
 
+  void TestReadAvroChar(int max_len, uint8_t* data, int64_t data_len,
+      StringValue expected_val, int expected_encoded_len,
+      TErrorCode::type expected_error = TErrorCode::OK) {
+    // Reset parse_status_
+    scanner_.parse_status_ = Status::OK();
+    uint8_t* new_data = data;
+    char slot[max<int>(sizeof(StringValue), max_len)];
+    bool success = scanner_.ReadAvroChar(TYPE_CHAR, max_len, &new_data, data + data_len,
+        true, slot, NULL);
+    StringValue value;
+    if (max_len <= ColumnType::MAX_CHAR_INLINE_LENGTH) {
+      // Convert to non-inlined string value for comparison.
+      value.len = max_len;
+      value.ptr = slot;
+    } else {
+      value = *reinterpret_cast<StringValue*>(slot);
+    }
+    CheckReadResult(expected_val, expected_encoded_len, expected_error, value, success,
+        new_data - data);
+  }
+
+  void TestReadAvroVarchar(int max_len, uint8_t* data, int64_t data_len,
+      StringValue expected_val, int expected_encoded_len,
+      TErrorCode::type expected_error = TErrorCode::OK) {
+    // Reset parse_status_
+    scanner_.parse_status_ = Status::OK();
+
+    uint8_t* new_data = data;
+    StringValue slot;
+    bool success = scanner_.ReadAvroVarchar(TYPE_VARCHAR, max_len, &new_data,
+        data + data_len, true, &slot, NULL);
+    CheckReadResult(expected_val, expected_encoded_len, expected_error, slot, success,
+        new_data - data);
+  }
+
   void TestReadAvroString(uint8_t* data, int64_t data_len, StringValue expected_val,
       int expected_encoded_len, TErrorCode::type expected_error = TErrorCode::OK) {
     TestReadAvroType(&HdfsAvroScanner::ReadAvroString, TYPE_STRING, data, data_len,
         expected_val, expected_encoded_len, expected_error);
   }
 
+
   template<typename T>
   void TestReadAvroDecimal(uint8_t* data, int64_t data_len, DecimalValue<T> expected_val,
       int expected_encoded_len, TErrorCode::type expected_error = TErrorCode::OK) {
@@ -282,6 +324,30 @@ TEST_F(HdfsAvroScannerTest, DoubleTest) {
   TestReadAvroDouble(data, 7, d, TErrorCode::AVRO_TRUNCATED_BLOCK);
 }
 
+TEST_F(HdfsAvroScannerTest, StringLengthOverflowTest) {
+  const char* s = "hello world";
+  StringValue sv(s);
+  // Test handling of strings that would overflow the StringValue::len 32-bit integer.
+  // Test cases with 0 and 1 in the upper bit of the 32-bit integer.
+  vector<int64_t> large_string_lens({0x1FFFFFFFF, 0x0FFFFFFFF, 0x100000000});
+  for (int64_t large_string_len: large_string_lens) {
+    LOG(INFO) << "Testing large string of length " << large_string_len;
+    vector<uint8_t> large_buffer(large_string_len + ReadWriteUtil::MAX_ZLONG_LEN);
+    int64_t zlong_len = ReadWriteUtil::PutZLong(large_string_len, large_buffer.data());
+    memcpy(large_buffer.data() + zlong_len, s, strlen(s));
+
+    // CHAR and VARCHAR truncation should still work in this case.
+    TestReadAvroChar(sv.len, large_buffer.data(), large_buffer.size(), sv,
+        zlong_len + large_string_len);
+    TestReadAvroVarchar(sv.len, large_buffer.data(), large_buffer.size(), sv,
+        zlong_len + large_string_len);
+
+    // Interpreting as a string would overflow length field.
+    TestReadAvroString(large_buffer.data(), large_buffer.size(), sv, -1,
+        TErrorCode::SCANNER_STRING_LENGTH_OVERFLOW);
+  }
+}
+
 TEST_F(HdfsAvroScannerTest, StringTest) {
   uint8_t data[100];
   const char* s = "hello";

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/2dad444c/be/src/exec/hdfs-avro-scanner.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/hdfs-avro-scanner.cc b/be/src/exec/hdfs-avro-scanner.cc
index 70dff83..fdb0a96 100644
--- a/be/src/exec/hdfs-avro-scanner.cc
+++ b/be/src/exec/hdfs-avro-scanner.cc
@@ -655,6 +655,16 @@ void HdfsAvroScanner::SetStatusInvalidValue(TErrorCode::type error_code,
int64_t
   }
 }
 
+void HdfsAvroScanner::SetStatusValueOverflow(TErrorCode::type error_code, int64_t len,
+    int64_t limit) {
+  DCHECK(parse_status_.ok());
+  if (TestInfo::is_test()) {
+    parse_status_ = Status(error_code, "test file", len, limit, 123);
+  } else {
+    parse_status_ = Status(error_code, stream_->filename(), len, limit, stream_->file_offset());
+  }
+}
+
 // This function produces a codegen'd function equivalent to MaterializeTuple() but
 // optimized for the table schema. Via helper functions CodegenReadRecord() and
 // CodegenReadScalar(), it eliminates the conditionals necessary when interpreting the

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/2dad444c/be/src/exec/hdfs-avro-scanner.h
----------------------------------------------------------------------
diff --git a/be/src/exec/hdfs-avro-scanner.h b/be/src/exec/hdfs-avro-scanner.h
index 2069a9b..961b2da 100644
--- a/be/src/exec/hdfs-avro-scanner.h
+++ b/be/src/exec/hdfs-avro-scanner.h
@@ -284,6 +284,7 @@ class HdfsAvroScanner : public BaseSequenceScanner {
   /// contain exception handling code.
   void SetStatusCorruptData(TErrorCode::type error_code);
   void SetStatusInvalidValue(TErrorCode::type error_code, int64_t len);
+  void SetStatusValueOverflow(TErrorCode::type error_code, int64_t len, int64_t limit);
 
   /// Unit test constructor
   HdfsAvroScanner();

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/2dad444c/common/thrift/generate_error_codes.py
----------------------------------------------------------------------
diff --git a/common/thrift/generate_error_codes.py b/common/thrift/generate_error_codes.py
index 6365281..0b6f5cf 100755
--- a/common/thrift/generate_error_codes.py
+++ b/common/thrift/generate_error_codes.py
@@ -265,6 +265,9 @@ error_codes = (
 
   ("AVRO_INVALID_METADATA_COUNT", 86, "File '$0' is corrupt: invalid metadata count $1 "
    "at offset $2"),
+
+  ("SCANNER_STRING_LENGTH_OVERFLOW", 87, "File '$0' could not be read: string $1 was "
+    "longer than supported limit of $2 bytes at offset $3"),
 )
 
 import sys


Mime
View raw message