orc-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From omal...@apache.org
Subject [1/2] orc git commit: HIVE-13948. Incorrect timezone handling in Writable results in wrong dates in queries
Date Fri, 10 Jun 2016 19:48:26 GMT
Repository: orc
Updated Branches:
  refs/heads/branch-1.1 3b778c8cc -> 662938edc


HIVE-13948. Incorrect timezone handling in Writable results in wrong dates in
queries


Project: http://git-wip-us.apache.org/repos/asf/orc/repo
Commit: http://git-wip-us.apache.org/repos/asf/orc/commit/f79238ec
Tree: http://git-wip-us.apache.org/repos/asf/orc/tree/f79238ec
Diff: http://git-wip-us.apache.org/repos/asf/orc/diff/f79238ec

Branch: refs/heads/branch-1.1
Commit: f79238ec69fe89ff84b8ea9aba8936e116a0a23c
Parents: 3b778c8
Author: Owen O'Malley <omalley@apache.org>
Authored: Fri Jun 10 12:18:54 2016 -0700
Committer: Owen O'Malley <omalley@apache.org>
Committed: Fri Jun 10 12:18:54 2016 -0700

----------------------------------------------------------------------
 .../hadoop/hive/serde2/io/DateWritable.java     | 68 ++++++++++++++++----
 1 file changed, 57 insertions(+), 11 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/orc/blob/f79238ec/java/storage-api/src/java/org/apache/hadoop/hive/serde2/io/DateWritable.java
----------------------------------------------------------------------
diff --git a/java/storage-api/src/java/org/apache/hadoop/hive/serde2/io/DateWritable.java
b/java/storage-api/src/java/org/apache/hadoop/hive/serde2/io/DateWritable.java
index dd2b1d9..637720a 100644
--- a/java/storage-api/src/java/org/apache/hadoop/hive/serde2/io/DateWritable.java
+++ b/java/storage-api/src/java/org/apache/hadoop/hive/serde2/io/DateWritable.java
@@ -22,6 +22,7 @@ import java.io.DataOutput;
 import java.io.IOException;
 import java.sql.Date;
 import java.util.Calendar;
+import java.util.GregorianCalendar;
 import java.util.TimeZone;
 import java.util.concurrent.TimeUnit;
 
@@ -41,7 +42,7 @@ public class DateWritable implements WritableComparable<DateWritable>
{
 
   private static final long MILLIS_PER_DAY = TimeUnit.DAYS.toMillis(1);
 
-  // Local time zone.
+  // Local time zone. Store separately because Calendar would clone it.
   // Java TimeZone has no mention of thread safety. Use thread local instance to be safe.
   private static final ThreadLocal<TimeZone> LOCAL_TIMEZONE = new ThreadLocal<TimeZone>()
{
     @Override
@@ -50,6 +51,19 @@ public class DateWritable implements WritableComparable<DateWritable>
{
     }
   };
 
+  private static final ThreadLocal<Calendar> UTC_CALENDAR = new ThreadLocal<Calendar>()
{
+    @Override
+    protected Calendar initialValue() {
+      return new GregorianCalendar(TimeZone.getTimeZone("UTC"));
+    }
+  };
+  private static final ThreadLocal<Calendar> LOCAL_CALENDAR = new ThreadLocal<Calendar>()
{
+    @Override
+    protected Calendar initialValue() {
+      return Calendar.getInstance();
+    }
+  };
+
   // Internal representation is an integer representing day offset from our epoch value 1970-01-01
   private int daysSinceEpoch = 0;
 
@@ -95,11 +109,16 @@ public class DateWritable implements WritableComparable<DateWritable>
{
   }
 
   /**
-   *
    * @return Date value corresponding to the date in the local time zone
    */
   public Date get() {
-    return new Date(daysToMillis(daysSinceEpoch));
+    return get(true);
+  }
+
+  // TODO: we should call this more often. In theory, for DATE type, time should never matter,
but
+  //       it's hard to tell w/some code paths like UDFs/OIs etc. that are used in many places.
+  public Date get(boolean doesTimeMatter) {
+    return new Date(daysToMillis(daysSinceEpoch, doesTimeMatter));
   }
 
   public int getDays() {
@@ -119,21 +138,47 @@ public class DateWritable implements WritableComparable<DateWritable>
{
   }
 
   public static long daysToMillis(int d) {
-    // Convert from day offset to ms in UTC, then apply local timezone offset.
-    long millisUtc = d * MILLIS_PER_DAY;
-    long tmp =  millisUtc - LOCAL_TIMEZONE.get().getOffset(millisUtc);
-    // Between millisUtc and tmp, the time zone offset may have changed due to DST.
-    // Look up the offset again.
-    return millisUtc - LOCAL_TIMEZONE.get().getOffset(tmp);
+    return daysToMillis(d, true);
+  }
+
+  public static long daysToMillis(int d, boolean doesTimeMatter) {
+    // What we are trying to get is the equivalent of new Date(ymd).getTime() in the local
tz,
+    // where ymd is whatever d represents. How it "works" is this.
+    // First we get the UTC midnight for that day (which always exists, a small island of
sanity).
+    long utcMidnight = d * MILLIS_PER_DAY;
+    // Now we take a local TZ offset at midnight UTC. Say we are in -4; that means (surprise
+    // surprise) that at midnight UTC it was 20:00 in local. So far we are on firm ground.
+    long utcMidnightOffset = LOCAL_TIMEZONE.get().getOffset(utcMidnight);
+    // And now we wander straight into the swamp, when instead of adding, we subtract it
from UTC
+    // midnight to supposedly get local midnight (in the above case, 4:00 UTC). Of course,
given
+    // all the insane DST variations, where we actually end up is anyone's guess.
+    long hopefullyMidnight = utcMidnight - utcMidnightOffset;
+    // Then we determine the local TZ offset at that magical time.
+    long offsetAtHM = LOCAL_TIMEZONE.get().getOffset(hopefullyMidnight);
+    // If the offsets are the same, we assume our initial jump did not cross any DST boundaries,
+    // and is thus valid. Both times flowed at the same pace. We congratulate ourselves and
bail.
+    if (utcMidnightOffset == offsetAtHM) return hopefullyMidnight;
+    // Alas, we crossed some DST boundary. If the time of day doesn't matter to the caller,
we'll
+    // simply get the next day and go back half a day. This is not ideal but seems to work.
+    if (!doesTimeMatter) return daysToMillis(d + 1) - (MILLIS_PER_DAY >> 1);
+    // Now, we could get previous and next day, figure our how many hours were inserted or
removed,
+    // and from which of the days, etc. But at this point our gun is pointing straight at
our foot,
+    // so let's just go the safe, expensive way.
+    Calendar utc = UTC_CALENDAR.get(), local = LOCAL_CALENDAR.get();
+    utc.setTimeInMillis(utcMidnight);
+    local.set(utc.get(Calendar.YEAR), utc.get(Calendar.MONTH), utc.get(Calendar.DAY_OF_MONTH));
+    return local.getTimeInMillis();
   }
 
   public static int millisToDays(long millisLocal) {
+    // We assume millisLocal is midnight of some date. What we are basically trying to do
+    // here is go from local-midnight to UTC-midnight (or whatever time that happens to be).
     long millisUtc = millisLocal + LOCAL_TIMEZONE.get().getOffset(millisLocal);
     int days;
     if (millisUtc >= 0L) {
       days = (int) (millisUtc / MILLIS_PER_DAY);
     } else {
-      days = (int) ((millisUtc - 86399999) / MILLIS_PER_DAY);
+      days = (int) ((millisUtc - 86399999 /*(MILLIS_PER_DAY - 1)*/) / MILLIS_PER_DAY);
     }
     return days;
   }
@@ -169,7 +214,8 @@ public class DateWritable implements WritableComparable<DateWritable>
{
 
   @Override
   public String toString() {
-    return get().toString();
+    // For toString, the time does not matter
+    return get(false).toString();
   }
 
   @Override


Mime
View raw message