impala-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From tarmstr...@apache.org
Subject [2/3] incubator-impala git commit: IMPALA-5664: Unix time to timestamp conversions may crash Impala
Date Mon, 23 Oct 2017 15:44:09 GMT
IMPALA-5664: Unix time to timestamp conversions may crash Impala

TimestampValue::FromSubsecondUnixTime() and UtcFromUnixTimeMicros()
are incorrect only in case of the last second of 1399, because these
sub-second values are rounded first towards 1400-01-01 00:00:00, which
is accepted as a valid date, and the sub-second part is subtracted
afterwards, leading to a date outside the valid interval. The maximum
case, 9999-12-31 59:59:59 is a bit different, because as I understand,
with nanosecond precision posix times, the maximum value is actually
10000-01-01. 00:00:00 minus 1 nanosec.

TimestampValue::FromUnixTimeNanos() can create problematic TimestampValues
both <1400 and 10000<=.

These timestamps can cause problems, because most code assumes that if
HasDate/HasTime is true, then it really is a valid timestamp.

To fix this, the posix times are checked in the constructor of
TimestampValue, and if it is outside the valid interval,both time_
and date_ are set to not_a_date_time.

Test:
select cast(-17987443200-0.1 as timestamp);
This query no longer crashes, but returns NULL, similarly to other
< 1400 timestamps.

Change-Id: I77b2f6284d3a597f57e61c17a67c959eff9e38ff
Reviewed-on: http://gerrit.cloudera.org:8080/7954
Reviewed-by: Lars Volker <lv@cloudera.com>
Tested-by: Impala Public Jenkins


Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/41f0c6a5
Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/41f0c6a5
Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/41f0c6a5

Branch: refs/heads/master
Commit: 41f0c6a5a6be887fbe42ae338274096c27090af6
Parents: 3b5a363
Author: Csaba Ringhofer <csringhofer@cloudera.com>
Authored: Fri Sep 1 17:50:35 2017 +0200
Committer: Impala Public Jenkins <impala-public-jenkins@gerrit.cloudera.org>
Committed: Sat Oct 21 03:13:14 2017 +0000

----------------------------------------------------------------------
 be/src/exec/parquet-column-readers.cc           |  2 +-
 be/src/exec/parquet-column-stats.inline.h       |  2 +-
 be/src/exprs/expr-test.cc                       | 15 ++++
 be/src/exprs/timestamp-functions-ir.cc          |  1 -
 be/src/runtime/timestamp-test.cc                | 26 +++++++
 be/src/runtime/timestamp-value.cc               |  7 ++
 be/src/runtime/timestamp-value.h                | 21 +++--
 .../queries/QueryTest/exprs.test                | 82 ++++++++++++++++++++
 8 files changed, 146 insertions(+), 10 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/41f0c6a5/be/src/exec/parquet-column-readers.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/parquet-column-readers.cc b/be/src/exec/parquet-column-readers.cc
index de63859..ae0c0ca 100644
--- a/be/src/exec/parquet-column-readers.cc
+++ b/be/src/exec/parquet-column-readers.cc
@@ -622,7 +622,7 @@ inline bool ScalarColumnReader<TimestampValue, true>::NeedsValidationInline()
co
 template<>
 bool ScalarColumnReader<TimestampValue, true>::ValidateSlot(
     TimestampValue* src, Tuple* tuple) const {
-  if (UNLIKELY(!src->IsValidDate())) {
+  if (UNLIKELY(!TimestampValue::IsValidDate(src->date()))) {
     ErrorMsg msg(TErrorCode::PARQUET_TIMESTAMP_OUT_OF_RANGE,
         filename(), node_.element->name);
     Status status = parent_->state_->LogOrReturnError(msg);

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/41f0c6a5/be/src/exec/parquet-column-stats.inline.h
----------------------------------------------------------------------
diff --git a/be/src/exec/parquet-column-stats.inline.h b/be/src/exec/parquet-column-stats.inline.h
index 9b81ba8..c0e90aa 100644
--- a/be/src/exec/parquet-column-stats.inline.h
+++ b/be/src/exec/parquet-column-stats.inline.h
@@ -127,7 +127,7 @@ inline bool ColumnStats<TimestampValue>::DecodePlainValue(
   // statistics written by Hive / old versions of parquet-mr. Should Hive add support for
   // writing new statistics for the deprecated timestamp type, we will have to add support
   // for conversion here.
-  return result->IsValidDate();
+  return TimestampValue::IsValidDate(result->date());
 }
 
 /// parquet::Statistics stores string values directly and does not use plain encoding.

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/41f0c6a5/be/src/exprs/expr-test.cc
----------------------------------------------------------------------
diff --git a/be/src/exprs/expr-test.cc b/be/src/exprs/expr-test.cc
index d923b00..8fa252b 100644
--- a/be/src/exprs/expr-test.cc
+++ b/be/src/exprs/expr-test.cc
@@ -6157,6 +6157,21 @@ TEST_F(ExprTest, TimestampFunctions) {
 
   // last_day udf test for IMPALA-5316
   TestLastDayFunction();
+
+  // Test microsecond unix time conversion functions.
+  TestValue("utc_to_unix_micros(\"1400-01-01 00:00:00\")", TYPE_BIGINT,
+      -17987443200000000);
+  TestValue("utc_to_unix_micros(\"1970-01-01 00:00:00\")", TYPE_BIGINT,
+      0);
+  TestValue("utc_to_unix_micros(\"9999-01-01 23:59:59.9999999\")", TYPE_BIGINT,
+      253370851200000000);
+
+  TestStringValue("cast(unix_micros_to_utc_timestamp(-17987443200000000) as string)",
+      "1400-01-01 00:00:00");
+  TestIsNull("unix_micros_to_utc_timestamp(-17987443200000001)", TYPE_TIMESTAMP);
+  TestStringValue("cast(unix_micros_to_utc_timestamp(253402300799999999) as string)",
+      "9999-12-31 23:59:59.999999000");
+  TestIsNull("unix_micros_to_utc_timestamp(253402300800000000)", TYPE_TIMESTAMP);
 }
 
 TEST_F(ExprTest, ConditionalFunctions) {

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/41f0c6a5/be/src/exprs/timestamp-functions-ir.cc
----------------------------------------------------------------------
diff --git a/be/src/exprs/timestamp-functions-ir.cc b/be/src/exprs/timestamp-functions-ir.cc
index c209f21..7f0cc2f 100644
--- a/be/src/exprs/timestamp-functions-ir.cc
+++ b/be/src/exprs/timestamp-functions-ir.cc
@@ -329,7 +329,6 @@ StringVal TimestampFunctions::ToDate(FunctionContext* context,
   // Defensively, return NULL if the timestamp does not have a date portion. Some of
   // our built-in functions might incorrectly return such a malformed timestamp.
   if (!ts_value.HasDate()) return StringVal::null();
-  DCHECK(ts_value.IsValidDate());
   StringVal result(context, 10);
   result.len = 10;
   // Fill in year, month, and day.

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/41f0c6a5/be/src/runtime/timestamp-test.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/timestamp-test.cc b/be/src/runtime/timestamp-test.cc
index b8919cc..3a54026 100644
--- a/be/src/runtime/timestamp-test.cc
+++ b/be/src/runtime/timestamp-test.cc
@@ -661,6 +661,16 @@ TEST(TimestampTest, Basic) {
   EXPECT_FALSE(too_early.HasTime());
   EXPECT_FALSE(too_early.UtcToUnixTimeMicros(&tm_min_micros));
 
+  // Sub-second FromUnixTime functions incorrectly accepted the last second of 1399
+  // as valid, because validation logic checked the nearest second rounded towards 0
+  // (IMPALA-5664).
+  EXPECT_FALSE(TimestampValue::FromSubsecondUnixTime(
+      MIN_DATE_AS_UNIX_TIME - 0.1).HasDate());
+  EXPECT_FALSE(TimestampValue::UtcFromUnixTimeMicros(
+      MIN_DATE_AS_UNIX_TIME * MICROS_PER_SEC - 2000).HasDate());
+  EXPECT_FALSE(TimestampValue::FromUnixTimeNanos(
+      MIN_DATE_AS_UNIX_TIME, -NANOS_PER_MICRO * 100).HasDate());
+
   // Test the max supported date that can be represented in seconds.
   const int64_t MAX_DATE_AS_UNIX_TIME = 253402300799;
   TimestampValue max_date =
@@ -700,6 +710,22 @@ TEST(TimestampTest, Basic) {
   EXPECT_FALSE(too_late.HasDate());
   EXPECT_FALSE(too_late.HasTime());
 
+  // Checking sub-second FromUnixTime functions near 10000.01.01
+  EXPECT_TRUE(TimestampValue::FromSubsecondUnixTime(
+      MAX_DATE_AS_UNIX_TIME + 0.99).HasDate());
+  EXPECT_FALSE(TimestampValue::FromSubsecondUnixTime(
+      MAX_DATE_AS_UNIX_TIME + 1).HasDate());
+
+  EXPECT_TRUE(TimestampValue::UtcFromUnixTimeMicros(
+      MAX_DATE_AS_UNIX_TIME * MICROS_PER_SEC + MICROS_PER_SEC - 1).HasDate());
+  EXPECT_FALSE(TimestampValue::UtcFromUnixTimeMicros(
+      MAX_DATE_AS_UNIX_TIME * MICROS_PER_SEC + MICROS_PER_SEC).HasDate());
+
+  EXPECT_TRUE(TimestampValue::FromUnixTimeNanos(
+      MAX_DATE_AS_UNIX_TIME, NANOS_PER_SEC - 1).HasDate());
+  EXPECT_FALSE(TimestampValue::FromUnixTimeNanos(
+      MAX_DATE_AS_UNIX_TIME, NANOS_PER_SEC).HasDate());
+
   // Regression tests for IMPALA-1676, Unix times overflow int32 during year 2038
   EXPECT_EQ("2038-01-19 03:14:08",
       TimestampValue::FromUnixTime(2147483648).ToString());

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/41f0c6a5/be/src/runtime/timestamp-value.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/timestamp-value.cc b/be/src/runtime/timestamp-value.cc
index 8d27b46..20d6cee 100644
--- a/be/src/runtime/timestamp-value.cc
+++ b/be/src/runtime/timestamp-value.cc
@@ -112,6 +112,13 @@ ostream& operator<<(ostream& os, const TimestampValue&
timestamp_value) {
   return os << timestamp_value.ToString();
 }
 
+void TimestampValue::Validate() {
+    if (HasDate() && UNLIKELY(!IsValidDate(date_))) {
+      time_ = boost::posix_time::time_duration(boost::posix_time::not_a_date_time);
+      date_ = boost::gregorian::date(boost::gregorian::not_a_date_time);
+    }
+}
+
 /// Return a ptime representation of the given Unix time (seconds since the Unix epoch).
 /// The time zone of the resulting ptime is local time. This is called by
 /// UnixTimeToPtime.

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/41f0c6a5/be/src/runtime/timestamp-value.h
----------------------------------------------------------------------
diff --git a/be/src/runtime/timestamp-value.h b/be/src/runtime/timestamp-value.h
index eb33195..0ad7540 100644
--- a/be/src/runtime/timestamp-value.h
+++ b/be/src/runtime/timestamp-value.h
@@ -40,7 +40,7 @@ struct DateTimeFormatContext;
 
 /// Represents either a (1) date and time, (2) a date with an undefined time, or (3)
 /// a time with an undefined date. In all cases, times have up to nanosecond resolution
-/// and the minimum and maximum dates are 1400-01-01 and 10000-12-31.
+/// and the minimum and maximum dates are 1400-01-01 and 9999-12-31.
 //
 /// This type is similar to Postgresql TIMESTAMP WITHOUT TIME ZONE datatype and MySQL's
 /// DATETIME datatype. Note that because TIMESTAMP does not contain time zone
@@ -73,10 +73,14 @@ class TimestampValue {
   TimestampValue(const boost::gregorian::date& d,
       const boost::posix_time::time_duration& t)
       : time_(t),
-        date_(d) {}
+        date_(d) {
+    Validate();
+  }
   TimestampValue(const boost::posix_time::ptime& t)
       : time_(t.time_of_day()),
-        date_(t.date()) {}
+        date_(t.date()) {
+    Validate();
+  }
   TimestampValue(const TimestampValue& tv) : time_(tv.time_), date_(tv.date_) {}
 
   /// Constructors that parse from a date/time string. See TimestampParser for details
@@ -165,8 +169,8 @@ class TimestampValue {
 
   std::string ToString() const;
 
-  /// Verifies that the timestamp date falls into a valid range (years 1400..9999).
-  inline bool IsValidDate() const {
+  /// Verifies that the date falls into a valid range (years 1400..9999).
+  static inline bool IsValidDate(const boost::gregorian::date& date) {
     // Smallest valid day number.
     const static int64_t MIN_DAY_NUMBER = static_cast<int64_t>(
         boost::gregorian::date(boost::date_time::min_date_time).day_number());
@@ -174,8 +178,8 @@ class TimestampValue {
     const static int64_t MAX_DAY_NUMBER = static_cast<int64_t>(
         boost::gregorian::date(boost::date_time::max_date_time).day_number());
 
-    return date_.day_number() >= MIN_DAY_NUMBER
-        && date_.day_number() <= MAX_DAY_NUMBER;
+    return date.day_number() >= MIN_DAY_NUMBER
+        && date.day_number() <= MAX_DAY_NUMBER;
   }
 
   /// Formats the timestamp using the given date/time context and places the result in the
@@ -276,6 +280,9 @@ class TimestampValue {
   /// 4 -bytes - stores the date as a day
   boost::gregorian::date date_;
 
+  /// Sets both date and time to invalid if date is outside the valid range.
+  void Validate();
+
   /// Return a ptime representation of the given Unix time (seconds since the Unix epoch).
   /// The time zone of the resulting ptime is determined by
   /// FLAGS_use_local_tz_for_unix_timestamp_conversions. If the flag is true, the value

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/41f0c6a5/testdata/workloads/functional-query/queries/QueryTest/exprs.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-query/queries/QueryTest/exprs.test b/testdata/workloads/functional-query/queries/QueryTest/exprs.test
index 552dea2..b1abcbd 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/exprs.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/exprs.test
@@ -2718,6 +2718,88 @@ TIMESTAMP
 UDF WARNING: Cannot subtract interval 1: Year is out of valid range: 1400..9999
 ====
 ---- QUERY
+# Test <1400 date handling during double to timestamp conversion.
+select CAST(CAST(CAST('1400-01-01' AS TIMESTAMP) AS DOUBLE) - 0.1 AS TIMESTAMP)
+---- RESULTS
+NULL
+---- TYPES
+TIMESTAMP
+====
+---- QUERY
+# Test >=10000 date handling during double to timestamp conversion.
+select CAST(CAST(CAST('9999-12-31 23:59:59' AS TIMESTAMP) AS DOUBLE) + 1.1 AS TIMESTAMP)
+---- RESULTS
+NULL
+---- TYPES
+TIMESTAMP
+====
+---- QUERY
+# Test <1400 date handling during bigint to timestamp conversion.
+select CAST(CAST(CAST('1400-01-01' AS TIMESTAMP) AS BIGINT) - 1 AS TIMESTAMP)
+---- RESULTS
+NULL
+---- TYPES
+TIMESTAMP
+====
+---- QUERY
+# Test >=10000 date handling during bigint to timestamp conversion.
+select CAST(CAST(CAST('9999-12-31 23:59:59' AS TIMESTAMP) AS BIGINT) + 1 AS TIMESTAMP)
+---- RESULTS
+NULL
+---- TYPES
+TIMESTAMP
+====
+---- QUERY
+# Test <1400 date handling during microsecond unix time to timestamp conversion.
+select unix_micros_to_utc_timestamp(
+  CAST(CAST('1400-01-01' AS TIMESTAMP) AS BIGINT) * 1000000 - 1)
+---- RESULTS
+NULL
+---- TYPES
+TIMESTAMP
+====
+---- QUERY
+# Test >=10000 date handling during microsecond unix time to timestamp conversion.
+select unix_micros_to_utc_timestamp(
+  (CAST(CAST('9999-12-31 23:59:59' AS TIMESTAMP) AS BIGINT) + 1) * 1000000 + 1)
+---- RESULTS
+NULL
+---- TYPES
+TIMESTAMP
+====
+---- QUERY
+# Test <1400 date handling during string to timestamp conversion.
+select CAST("1399-12-31 23:59:59.99999" AS TIMESTAMP);
+---- RESULTS
+NULL
+---- TYPES
+TIMESTAMP
+====
+---- QUERY
+# Test >=10000 date handling during string to timestamp conversion.
+select CAST("10000-01-01" AS TIMESTAMP);
+---- RESULTS
+NULL
+---- TYPES
+TIMESTAMP
+====
+---- QUERY
+# Test leap year checking during string to timestamp conversion.
+select CAST("1900-02-29" AS TIMESTAMP);
+---- RESULTS
+NULL
+---- TYPES
+TIMESTAMP
+====
+---- QUERY
+# Test invalid format handling during string to timestamp conversion.
+select CAST("not a timestamp" AS TIMESTAMP);
+---- RESULTS
+NULL
+---- TYPES
+TIMESTAMP
+====
+---- QUERY
 # IMPALA-4574: UUID() is not a constant expression.
 select count(*) from functional.alltypestiny group by concat(uuid(), "_test")
 ---- RESULTS


Mime
View raw message