impala-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From tarmstr...@apache.org
Subject [1/2] incubator-impala git commit: IMPALA-4387: validate decimal type in Avro file schema
Date Sun, 30 Oct 2016 07:31:58 GMT
Repository: incubator-impala
Updated Branches:
  refs/heads/master 3b8a0891d -> 6d1a130d8


IMPALA-4387: validate decimal type in Avro file schema

This patch prevents an invalid decimal type in an Avro file schema from
crashing Impala. Most invalid Avro schemas are caught by the frontend,
but file schemas still need to be validated by the backend.

After this patch files with bad schemas are skipped.

Testing:
This was hit very rarely by the scanner fuzzing. Added a regression test that
scans a file with a bad schema.

Change-Id: I25a326ee2220bc14d3b5f887dc288b4adf859cfc
Reviewed-on: http://gerrit.cloudera.org:8080/4876
Reviewed-by: Tim Armstrong <tarmstrong@cloudera.com>
Tested-by: Internal Jenkins


Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/6587c08f
Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/6587c08f
Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/6587c08f

Branch: refs/heads/master
Commit: 6587c08f701ea4a72d03a36e159f9074a84d5f8f
Parents: 3b8a089
Author: Tim Armstrong <tarmstrong@cloudera.com>
Authored: Thu Oct 27 13:13:02 2016 -0700
Committer: Internal Jenkins <cloudera-hudson@gerrit.cloudera.org>
Committed: Sun Oct 30 00:12:58 2016 +0000

----------------------------------------------------------------------
 be/src/exec/hdfs-avro-scanner.cc                |  15 ++++---
 be/src/runtime/decimal-test.cc                  |  18 ++++++++
 be/src/runtime/types.h                          |  10 +++--
 be/src/util/avro-util.cc                        |  42 +++++++++++++------
 be/src/util/avro-util.h                         |   7 +++-
 common/thrift/generate_error_codes.py           |   7 +++-
 testdata/bad_avro_snap/README                   |  13 +++++-
 .../bad_avro_snap/invalid_decimal_schema.avro   | Bin 0 -> 328 bytes
 .../functional/functional_schema_template.sql   |  10 +++++
 .../datasets/functional/schema_constraints.csv  |   1 +
 .../queries/DataErrorsTest/avro-errors.test     |   8 ++++
 11 files changed, 104 insertions(+), 27 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/6587c08f/be/src/exec/hdfs-avro-scanner.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/hdfs-avro-scanner.cc b/be/src/exec/hdfs-avro-scanner.cc
index 91a9d03..5b17fc1 100644
--- a/be/src/exec/hdfs-avro-scanner.cc
+++ b/be/src/exec/hdfs-avro-scanner.cc
@@ -385,12 +385,15 @@ Status HdfsAvroScanner::VerifyTypesMatch(const AvroSchemaElement&
table_schema,
     return Status::OK();
   }
 
-  ColumnType reader_type = AvroSchemaToColumnType(table_schema.schema);
-  ColumnType writer_type = AvroSchemaToColumnType(file_schema.schema);
+  ColumnType reader_type;
+  RETURN_IF_ERROR(AvroSchemaToColumnType(table_schema.schema, field_name, &reader_type));
+  ColumnType writer_type;
+  RETURN_IF_ERROR(AvroSchemaToColumnType(file_schema.schema, field_name, &writer_type));
   bool match = VerifyTypesMatch(reader_type, writer_type);
   if (match) return Status::OK();
   return Status(TErrorCode::AVRO_SCHEMA_RESOLUTION_ERROR, field_name,
-      avro_type_name(table_schema.schema->type), avro_type_name(file_schema.schema->type));
+      avro_type_name(table_schema.schema->type),
+      avro_type_name(file_schema.schema->type));
 }
 
 Status HdfsAvroScanner::VerifyTypesMatch(SlotDescriptor* slot_desc, avro_obj_t* schema) {
@@ -405,10 +408,12 @@ Status HdfsAvroScanner::VerifyTypesMatch(SlotDescriptor* slot_desc,
avro_obj_t*
   // TODO: update if/when we have TYPE_STRUCT primitive type
   if (schema->type == AVRO_RECORD) {
     return Status(TErrorCode::AVRO_SCHEMA_METADATA_MISMATCH, col_name,
-      slot_desc->type().DebugString(), avro_type_name(schema->type));
+        slot_desc->type().DebugString(), avro_type_name(schema->type));
   }
 
-  bool match = VerifyTypesMatch(slot_desc->type(), AvroSchemaToColumnType(schema));
+  ColumnType file_type;
+  RETURN_IF_ERROR(AvroSchemaToColumnType(schema, col_name, &file_type));
+  bool match = VerifyTypesMatch(slot_desc->type(), file_type);
   if (match) return Status::OK();
   return Status(TErrorCode::AVRO_SCHEMA_METADATA_MISMATCH, col_name,
       slot_desc->type().DebugString(), avro_type_name(schema->type));

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/6587c08f/be/src/runtime/decimal-test.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/decimal-test.cc b/be/src/runtime/decimal-test.cc
index ab87a0b..f88c8f5 100644
--- a/be/src/runtime/decimal-test.cc
+++ b/be/src/runtime/decimal-test.cc
@@ -22,6 +22,7 @@
 #include <boost/cstdint.hpp>
 #include <boost/lexical_cast.hpp>
 #include "runtime/decimal-value.inline.h"
+#include "runtime/types.h"
 #include "testutil/gtest-util.h"
 #include "util/string-parser.h"
 
@@ -796,6 +797,23 @@ TEST(DecimalArithmetic, RandTesting) {
   }
 }
 
+TEST(DecimalValidation, PrecisionScaleValidation) {
+  // Valid precision and scale.
+  EXPECT_TRUE(ColumnType::ValidateDecimalParams(1, 0));
+  EXPECT_TRUE(ColumnType::ValidateDecimalParams(1, 1));
+  EXPECT_TRUE(ColumnType::ValidateDecimalParams(38, 38));
+  EXPECT_TRUE(ColumnType::ValidateDecimalParams(38, 0));
+
+  // Out of range precision or scale.
+  EXPECT_FALSE(ColumnType::ValidateDecimalParams(3, -1));
+  EXPECT_FALSE(ColumnType::ValidateDecimalParams(0, 0));
+  EXPECT_FALSE(ColumnType::ValidateDecimalParams(39, 0));
+  EXPECT_FALSE(ColumnType::ValidateDecimalParams(38, 39));
+
+  // Incompatible precision and scale.
+  EXPECT_FALSE(ColumnType::ValidateDecimalParams(15, 16));
+}
+
 }
 
 IMPALA_TEST_MAIN();

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/6587c08f/be/src/runtime/types.h
----------------------------------------------------------------------
diff --git a/be/src/runtime/types.h b/be/src/runtime/types.h
index a30447c..9558792 100644
--- a/be/src/runtime/types.h
+++ b/be/src/runtime/types.h
@@ -124,11 +124,13 @@ struct ColumnType {
     return ret;
   }
 
+  static bool ValidateDecimalParams(int precision, int scale) {
+    return precision >= 1 && precision <= MAX_PRECISION && scale >=
0
+        && scale <= MAX_SCALE && scale <= precision;
+  }
+
   static ColumnType CreateDecimalType(int precision, int scale) {
-    DCHECK_LE(precision, MAX_PRECISION);
-    DCHECK_LE(scale, MAX_SCALE);
-    DCHECK_GE(precision, 0);
-    DCHECK_LE(scale, precision);
+    DCHECK(ValidateDecimalParams(precision, scale)) << precision << ", " <<
scale;
     ColumnType ret;
     ret.type = TYPE_DECIMAL;
     ret.precision = precision;

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/6587c08f/be/src/util/avro-util.cc
----------------------------------------------------------------------
diff --git a/be/src/util/avro-util.cc b/be/src/util/avro-util.cc
index be3e4a2..f03242c 100644
--- a/be/src/util/avro-util.cc
+++ b/be/src/util/avro-util.cc
@@ -103,23 +103,39 @@ ScopedAvroSchemaElement::~ScopedAvroSchemaElement() {
   avro_schema_decref(element_.schema);
 }
 
-ColumnType AvroSchemaToColumnType(const avro_schema_t& schema) {
+Status AvroSchemaToColumnType(
+    const avro_schema_t& schema, const string& column_name, ColumnType* column_type)
{
   switch (schema->type) {
     case AVRO_BYTES:
     case AVRO_STRING:
-      return TYPE_STRING;
-    case AVRO_INT32: return TYPE_INT;
-    case AVRO_INT64: return TYPE_BIGINT;
-    case AVRO_FLOAT: return TYPE_FLOAT;
-    case AVRO_DOUBLE: return TYPE_DOUBLE;
-    case AVRO_BOOLEAN: return TYPE_BOOLEAN;
-    case AVRO_DECIMAL:
-      return ColumnType::CreateDecimalType(
-          avro_schema_decimal_precision(schema), avro_schema_decimal_scale(schema));
+      *column_type = TYPE_STRING;
+      return Status::OK();
+    case AVRO_INT32:
+      *column_type = TYPE_INT;
+      return Status::OK();
+    case AVRO_INT64:
+      *column_type = TYPE_BIGINT;
+      return Status::OK();
+    case AVRO_FLOAT:
+      *column_type = TYPE_FLOAT;
+      return Status::OK();
+    case AVRO_DOUBLE:
+      *column_type = TYPE_DOUBLE;
+      return Status::OK();
+    case AVRO_BOOLEAN:
+      *column_type = TYPE_BOOLEAN;
+      return Status::OK();
+    case AVRO_DECIMAL: {
+      int precision = avro_schema_decimal_precision(schema);
+      int scale = avro_schema_decimal_scale(schema);
+      if (!ColumnType::ValidateDecimalParams(precision, scale)) {
+        return Status(TErrorCode::AVRO_INVALID_DECIMAL, column_name, precision, scale);
+      }
+      *column_type = ColumnType::CreateDecimalType(precision, scale);
+      return Status::OK();
+    }
     default:
-      DCHECK(false) << "NYI: " << avro_type_name(schema->type);
-      return INVALID_TYPE;
+      return Status(TErrorCode::AVRO_UNSUPPORTED_TYPE, column_name, avro_type_name(schema->type));
   }
 }
-
 }

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/6587c08f/be/src/util/avro-util.h
----------------------------------------------------------------------
diff --git a/be/src/util/avro-util.h b/be/src/util/avro-util.h
index 1a9a354..3106279 100644
--- a/be/src/util/avro-util.h
+++ b/be/src/util/avro-util.h
@@ -80,8 +80,11 @@ struct ScopedAvroSchemaElement {
   DISALLOW_COPY_AND_ASSIGN(ScopedAvroSchemaElement);
 };
 
-ColumnType AvroSchemaToColumnType(const avro_schema_t& schema);
-
+/// Converts an Avro schema column to an Impala type. Returns an error status if the Avro
+/// schema does not map to a valid Impala type. 'column_name' is used only for error
+/// messages.
+Status AvroSchemaToColumnType(
+    const avro_schema_t& schema, const std::string& column_name, ColumnType* column_type);
 }
 
 #endif

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/6587c08f/common/thrift/generate_error_codes.py
----------------------------------------------------------------------
diff --git a/common/thrift/generate_error_codes.py b/common/thrift/generate_error_codes.py
index f03b073..e9df146 100755
--- a/common/thrift/generate_error_codes.py
+++ b/common/thrift/generate_error_codes.py
@@ -297,7 +297,12 @@ error_codes = (
 
   ("KUDU_KEY_NOT_FOUND", 96, "Key not found in Kudu table '$0'."),
 
-  ("KUDU_SESSION_ERROR", 97, "Error in Kudu table '$0': $1")
+  ("KUDU_SESSION_ERROR", 97, "Error in Kudu table '$0': $1"),
+
+  ("AVRO_UNSUPPORTED_TYPE", 98, "Column '$0': unsupported Avro type '$1'"),
+
+  ("AVRO_INVALID_DECIMAL", 99,
+      "Column '$0': invalid Avro decimal type with precision = '$1' scale = '$2'"),
 )
 
 import sys

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/6587c08f/testdata/bad_avro_snap/README
----------------------------------------------------------------------
diff --git a/testdata/bad_avro_snap/README b/testdata/bad_avro_snap/README
index a88d0ad..6271967 100644
--- a/testdata/bad_avro_snap/README
+++ b/testdata/bad_avro_snap/README
@@ -1,7 +1,7 @@
-These Avro files were created by modifying Impala's HdfsAvroTableWriter.
-
 String Data
 -----------
+Created by modifying Impala's HdfsAvroTableWriter.
+
 These files' schemas have a single nullable string column 's'.
 
 negative_string_len.avro: contains two values, but the second value has a negative length.
@@ -14,6 +14,15 @@ truncated_string.avro: contains one value, which is missing the last byte.
 
 Float Data
 ----------
+Created by modifying Impala's HdfsAvroTableWriter.
+
 These files' schemas have a single nullable float column 'c1'.
 
 truncated_float.avro: contains two float values. The second is missing the last byte.
+
+Bad Schema
+----------
+Created by editing the schema of a valid Avro file with vim.
+
+invalid_decimal_schema.avro: two columns, name STRING and value DECIMAL(5,7).
+  The DECIMAL value is invalid.

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/6587c08f/testdata/bad_avro_snap/invalid_decimal_schema.avro
----------------------------------------------------------------------
diff --git a/testdata/bad_avro_snap/invalid_decimal_schema.avro b/testdata/bad_avro_snap/invalid_decimal_schema.avro
new file mode 100644
index 0000000..0141f99
Binary files /dev/null and b/testdata/bad_avro_snap/invalid_decimal_schema.avro differ

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/6587c08f/testdata/datasets/functional/functional_schema_template.sql
----------------------------------------------------------------------
diff --git a/testdata/datasets/functional/functional_schema_template.sql b/testdata/datasets/functional/functional_schema_template.sql
index 7b929b7..bfe8f39 100644
--- a/testdata/datasets/functional/functional_schema_template.sql
+++ b/testdata/datasets/functional/functional_schema_template.sql
@@ -1509,6 +1509,16 @@ c1 FLOAT
 LOAD DATA LOCAL INPATH '${{env:IMPALA_HOME}}/testdata/bad_avro_snap/truncated_float.avro'
OVERWRITE INTO TABLE {db_name}{db_suffix}.{table_name};
 ====
 ---- DATASET
+functional
+---- BASE_TABLE_NAME
+bad_avro_decimal_schema
+---- COLUMNS
+name STRING
+value DECIMAL(5,2)
+---- DEPENDENT_LOAD
+LOAD DATA LOCAL INPATH '${{env:IMPALA_HOME}}/testdata/bad_avro_snap/invalid_decimal_schema.avro'
OVERWRITE INTO TABLE {db_name}{db_suffix}.{table_name};
+====
+---- DATASET
 -- IMPALA-694: uses data file produced by parquet-mr version 1.2.5-cdh4.5.0
 -- (can't use LOAD DATA LOCAL with Impala so copied in create-load-data.sh)
 functional

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/6587c08f/testdata/datasets/functional/schema_constraints.csv
----------------------------------------------------------------------
diff --git a/testdata/datasets/functional/schema_constraints.csv b/testdata/datasets/functional/schema_constraints.csv
index 4c9da6d..d6d1111 100644
--- a/testdata/datasets/functional/schema_constraints.csv
+++ b/testdata/datasets/functional/schema_constraints.csv
@@ -37,6 +37,7 @@ table_name:bad_text_gzip, constraint:restrict_to, table_format:text/gzip/block
 table_name:bad_seq_snap, constraint:restrict_to, table_format:seq/snap/block
 table_name:bad_avro_snap_strings, constraint:restrict_to, table_format:avro/snap/block
 table_name:bad_avro_snap_floats, constraint:restrict_to, table_format:avro/snap/block
+table_name:bad_avro_decimal_schema, constraint:restrict_to, table_format:avro/snap/block
 table_name:bad_parquet, constraint:restrict_to, table_format:parquet/none/none
 table_name:bad_parquet_strings_negative_len, constraint:restrict_to, table_format:parquet/none/none
 table_name:bad_parquet_strings_out_of_bounds, constraint:restrict_to, table_format:parquet/none/none

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/6587c08f/testdata/workloads/functional-query/queries/DataErrorsTest/avro-errors.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-query/queries/DataErrorsTest/avro-errors.test b/testdata/workloads/functional-query/queries/DataErrorsTest/avro-errors.test
index 6ca2af6..e396583 100644
--- a/testdata/workloads/functional-query/queries/DataErrorsTest/avro-errors.test
+++ b/testdata/workloads/functional-query/queries/DataErrorsTest/avro-errors.test
@@ -24,3 +24,11 @@ float
 Problem parsing file $NAMENODE/test-warehouse/bad_avro_snap_floats_avro_snap/truncated_float.avro
at 159
 File '$NAMENODE/test-warehouse/bad_avro_snap_floats_avro_snap/truncated_float.avro' is corrupt:
truncated data block at offset 159
 ====
+---- QUERY
+select * from bad_avro_decimal_schema
+---- TYPES
+string,decimal
+---- RESULTS
+---- ERRORS
+Column 'value': invalid Avro decimal type with precision = '5' scale = '7'
+====


Mime
View raw message