impala-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From tarmstr...@apache.org
Subject [2/2] incubator-impala git commit: IMPALA-3894: Change the behavior parsing date "YY"
Date Thu, 03 Aug 2017 04:21:09 GMT
IMPALA-3894: Change the behavior parsing date "YY"

This patch change the behavor when running unix_timestamp(string, string)
function. Before the change Impala directly adds 2000 to the year parsed.
Behavior after change is the same as Hive's, shifting the parsed date
into the interval [current time - 80 years, current time + 20 years).
In 2017, given query
> select from_unixtime(unix_timestamp('31-AUG-94', 'dd-MMM-yy'),'yyyyMMdd');
Impala would output 20940831 before the change and 19940831 with this
patch applied. unix_timestamp function with other forms of parameters
is not affected.
Hive's impelentation can be found at:
https://github.com/apache/hive/blob/39d66a439c02ea2b5c7501b362c0d8f9b8d22cc0/hplsql/src/main/java/org/apache/hive/hplsql/functions/FunctionDatetime.java#L159
It uses SimpleDateFormat, documented at:
https://docs.oracle.com/javase/6/docs/api/java/text/SimpleDateFormat.html

Test: add 4 test cases to timestamp-test.cc, testing the edge cases of
2-year format.

Change-Id: I5da761255915dc741f1dcc488fd4ef6ecc385896
Reviewed-on: http://gerrit.cloudera.org:8080/7530
Reviewed-by: Matthew Jacobs <mj@cloudera.com>
Tested-by: Impala Public Jenkins


Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/5797f859
Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/5797f859
Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/5797f859

Branch: refs/heads/master
Commit: 5797f859692ced6f30a790568b29669c663fa65c
Parents: 748fe94
Author: Tianyi Wang <twang@cloudera.com>
Authored: Thu Jul 27 16:51:23 2017 -0700
Committer: Impala Public Jenkins <impala-public-jenkins@gerrit.cloudera.org>
Committed: Thu Aug 3 02:20:48 2017 +0000

----------------------------------------------------------------------
 be/src/exprs/timestamp-functions.cc    |  1 +
 be/src/runtime/timestamp-parse-util.cc | 31 +++++++++++++++++++++++++++--
 be/src/runtime/timestamp-parse-util.h  | 11 ++++++++++
 be/src/runtime/timestamp-test.cc       | 12 +++++++++++
 4 files changed, 53 insertions(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/5797f859/be/src/exprs/timestamp-functions.cc
----------------------------------------------------------------------
diff --git a/be/src/exprs/timestamp-functions.cc b/be/src/exprs/timestamp-functions.cc
index 93a16cd..9cb5430 100644
--- a/be/src/exprs/timestamp-functions.cc
+++ b/be/src/exprs/timestamp-functions.cc
@@ -184,6 +184,7 @@ void TimestampFunctions::UnixAndFromUnixPrepare(
     // This is much cheaper vs alloc/dealloc'ing a context for each evaluation.
     dt_ctx = new DateTimeFormatContext();
   }
+  dt_ctx->SetCenturyBreak(*context->impl()->state()->now());
   context->SetFunctionState(scope, dt_ctx);
 }
 

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/5797f859/be/src/runtime/timestamp-parse-util.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/timestamp-parse-util.cc b/be/src/runtime/timestamp-parse-util.cc
index d0c595f..9b9e8c8 100644
--- a/be/src/runtime/timestamp-parse-util.cc
+++ b/be/src/runtime/timestamp-parse-util.cc
@@ -19,10 +19,10 @@
 
 #include <boost/assign/list_of.hpp>
 #include <boost/date_time/gregorian/gregorian.hpp>
-#include <boost/date_time/posix_time/posix_time.hpp>
 #include <boost/unordered_map.hpp>
 
 #include "runtime/string-value.inline.h"
+#include "runtime/timestamp-value.h"
 #include "util/string-parser.h"
 
 namespace assign = boost::assign;
@@ -58,6 +58,12 @@ struct DateTimeParseResult {
   }
 };
 
+void DateTimeFormatContext::SetCenturyBreak(const TimestampValue &now) {
+  const date& now_date = now.date();
+  century_break_ptime = boost::posix_time::ptime(
+      date(now_date.year() - 80, now_date.month(), now_date.day()), now.time());
+}
+
 bool TimestampParser::initialized_ = false;
 
 /// Lazily initialized pseudo-constant hashmap for mapping month names to an index.
@@ -422,6 +428,8 @@ bool TimestampParser::ParseDateTime(const char* str, int str_len,
   // Keep track of the number of characters we need to shift token positions by.
   // Variable-length tokens will result in values > 0;
   int shift_len = 0;
+  // Whether to realign the year for 2-digit year format
+  bool realign_year = false;
   for (const DateTimeFormatToken& tok: dt_ctx.toks) {
     const char* tok_val = str + tok.pos + shift_len;
     if (tok.type == SEPARATOR) {
@@ -442,7 +450,9 @@ bool TimestampParser::ParseDateTime(const char* str, int str_len,
         dt_result->year = StringParser::StringToInt<int>(tok_val, tok_len, &status);
         if (UNLIKELY(StringParser::PARSE_SUCCESS != status)) return false;
         if (UNLIKELY(dt_result->year < 1 || dt_result->year > 9999)) return false;
-        if (tok_len < 4 && dt_result->year < 99) dt_result->year += 2000;
+        // Year in "Y" and "YY" format should be in the interval
+        // [current time - 80 years, current time + 20 years)
+        if (tok_len <= 2) realign_year = true;
         break;
       }
       case MONTH_IN_YEAR: {
@@ -536,6 +546,23 @@ bool TimestampParser::ParseDateTime(const char* str, int str_len,
       default: DCHECK(false) << "Unknown date/time format token";
     }
   }
+  // Hive uses Java's SimpleDateFormat to parse timestamp:
+  // In SimpleDateFormat, the century for 2-digit-year breaks at current_time - 80 years.
+  // https://docs.oracle.com/javase/6/docs/api/java/text/SimpleDateFormat.html
+  if (realign_year) {
+    DCHECK(!dt_ctx.century_break_ptime.is_special());
+    // Let the century start at AABB and the year parsed be YY, this gives us AAYY.
+    dt_result->year += (dt_ctx.century_break_ptime.date().year() / 100) * 100;
+    date parsed_date(dt_result->year, dt_result->month, dt_result->day);
+    time_duration parsed_time(dt_result->hour, dt_result->minute, dt_result->second,
+        dt_result->fraction);
+    // Advance 100 years if parsed time is before the century break
+    // For example if the century breaks at 1937 but dt_result->year = 1936,
+    // the correct year would be 2036.
+    if (boost::posix_time::ptime(parsed_date, parsed_time) < dt_ctx.century_break_ptime)
{
+      dt_result->year += 100;
+    }
+  }
   return true;
 }
 

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/5797f859/be/src/runtime/timestamp-parse-util.h
----------------------------------------------------------------------
diff --git a/be/src/runtime/timestamp-parse-util.h b/be/src/runtime/timestamp-parse-util.h
index 8611f87..b68cc85 100644
--- a/be/src/runtime/timestamp-parse-util.h
+++ b/be/src/runtime/timestamp-parse-util.h
@@ -20,6 +20,7 @@
 
 #include <cstddef>
 #include <vector>
+#include <boost/date_time/posix_time/ptime.hpp>
 
 namespace boost {
   namespace gregorian {
@@ -33,6 +34,7 @@ namespace boost {
 namespace impala {
 
 struct DateTimeParseResult;
+class TimestampValue;
 
 /// Add support for dealing with custom date/time formats in Impala. The following
 /// date/time tokens are supported:
@@ -134,6 +136,9 @@ struct DateTimeFormatContext {
   std::vector<DateTimeFormatToken> toks;
   bool has_date_toks;
   bool has_time_toks;
+  /// Current time - 80 years to determine the actual year when
+  /// parsing 1 or 2-digit year token.
+  boost::posix_time::ptime century_break_ptime;
 
   DateTimeFormatContext() {
     Reset(NULL, 0);
@@ -143,6 +148,11 @@ struct DateTimeFormatContext {
     Reset(fmt, fmt_len);
   }
 
+  /// Set the century break when parsing 1 or 2-digit year format.
+  /// When parsing 1 or 2-digit year, the year should be in the interval
+  /// [now - 80 years, now + 20 years), according to Hive.
+  void SetCenturyBreak(const TimestampValue &now);
+
   void Reset(const char* fmt, int fmt_len) {
     this->fmt = fmt;
     this->fmt_len = fmt_len;
@@ -150,6 +160,7 @@ struct DateTimeFormatContext {
     this->has_date_toks = false;
     this->has_time_toks = false;
     this->toks.clear();
+    this->century_break_ptime = boost::posix_time::not_a_date_time;
   }
 };
 

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/5797f859/be/src/runtime/timestamp-test.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/timestamp-test.cc b/be/src/runtime/timestamp-test.cc
index c18313b..5f3e0e6 100644
--- a/be/src/runtime/timestamp-test.cc
+++ b/be/src/runtime/timestamp-test.cc
@@ -230,6 +230,9 @@ void TestTimestampTokens(vector<TimestampToken>* toks, int year,
int month,
 }
 
 TEST(TimestampTest, Basic) {
+  // Fix current time to determine the behavior parsing 2-digit year format
+  TimestampValue now(date(2017, 7, 28), time_duration(16, 14, 24));
+
   char s1[] = "2012-01-20 01:10:01";
   char s2[] = "1990-10-20 10:10:10.123456789  ";
   char s3[] = "  1990-10-20 10:10:10.123456789";
@@ -472,6 +475,14 @@ TEST(TimestampTest, Basic) {
     // Test short year token
     (TimestampTC("y-MM-dd", "2013-11-13", false, true, false, 2013, 11, 13))
     (TimestampTC("y-MM-dd", "13-11-13", false, true, false, 2013, 11, 13))
+    // Test 2-digit year format
+    (TimestampTC("yy-MM-dd", "37-07-28", false, true, false, 2037, 7, 28))
+    (TimestampTC("yy-MM-dd", "37-07-29", false, true, false, 1937, 7, 29))
+    // Test 1-digit year format with time to show the exact boundary
+    (TimestampTC("y-MM-dd HH:mm:ss", "37-07-28 16:14:23", false, true, false,
+                 2037, 7, 28, 16, 14, 23))
+    (TimestampTC("y-MM-dd HH:mm:ss", "37-07-28 16:14:24", false, true, false,
+                 1937, 7, 28, 16, 14, 24))
     // Test short month token
     (TimestampTC("yyyy-M-dd", "2013-11-13", false, true, false, 2013, 11, 13))
     (TimestampTC("yyyy-M-dd", "2013-1-13", false, true, false, 2013, 1, 13))
@@ -501,6 +512,7 @@ TEST(TimestampTest, Basic) {
   for (int i = 0; i < test_cases.size(); ++i) {
     TimestampTC test_case = test_cases[i];
     DateTimeFormatContext dt_ctx(test_case.fmt, strlen(test_case.fmt));
+    dt_ctx.SetCenturyBreak(now);
     bool parse_result = TimestampParser::ParseFormatTokens(&dt_ctx);
     if (test_case.fmt_should_fail) {
       EXPECT_FALSE(parse_result) << "TC: " << i;


Mime
View raw message