crunch-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From gr...@apache.org
Subject crunch git commit: CRUNCH-528 Improve Pair comparison
Date Thu, 28 May 2015 08:04:02 GMT
Repository: crunch
Updated Branches:
  refs/heads/master 06688d55e -> 626853abb


CRUNCH-528 Improve Pair comparison

Resolves the issue of integer overflow when comparing hash codes
of values, as well as providing stable sorting of null values
and non-equal objects with hash collisions.

Description of the underlying problem:

If the hash code values are too large, then the values will wrap
when doing subtraction. This results in a comparison function
that is not transitive.

Among other things, this makes Joins using the in-memory
pipeline not work, since the in-memory shuffler uses a TreeMap
if the key type is Comparable. Since the key in a join is a Pair
of the original key and a join tag, the key is always comparable.
With a non-transitive comparison function, it is possible for
the two join tags of the original key to sort differently,
resulting in the two join tags not being adjacent for the
original key. This results either in either the cross product
erroneously producing no values in the case of an inner join,
since the two join tags are not adjacent, or null values
appearing when they should not in the case of an outer join.


Project: http://git-wip-us.apache.org/repos/asf/crunch/repo
Commit: http://git-wip-us.apache.org/repos/asf/crunch/commit/626853ab
Tree: http://git-wip-us.apache.org/repos/asf/crunch/tree/626853ab
Diff: http://git-wip-us.apache.org/repos/asf/crunch/diff/626853ab

Branch: refs/heads/master
Commit: 626853abb7f996107941875653b8292f462ba3b9
Parents: 06688d5
Author: Brandon Vargo <brandon.vargo@gmail.com>
Authored: Wed May 27 10:42:09 2015 -0600
Committer: Gabriel Reid <greid@apache.org>
Committed: Thu May 28 08:59:28 2015 +0200

----------------------------------------------------------------------
 .../src/main/java/org/apache/crunch/Pair.java   | 25 ++++++++--
 .../test/java/org/apache/crunch/PairTest.java   | 50 ++++++++++++++++++++
 2 files changed, 71 insertions(+), 4 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/crunch/blob/626853ab/crunch-core/src/main/java/org/apache/crunch/Pair.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/Pair.java b/crunch-core/src/main/java/org/apache/crunch/Pair.java
index 462434c..176c4aa 100644
--- a/crunch-core/src/main/java/org/apache/crunch/Pair.java
+++ b/crunch-core/src/main/java/org/apache/crunch/Pair.java
@@ -17,10 +17,12 @@
  */
 package org.apache.crunch;
 
-import org.apache.commons.lang.builder.HashCodeBuilder;
-
 import java.io.Serializable;
 
+import com.google.common.collect.ComparisonChain;
+import com.google.common.collect.Ordering;
+import org.apache.commons.lang.builder.HashCodeBuilder;
+
 /**
  * A convenience class for two-element {@link Tuple}s.
  */
@@ -91,9 +93,24 @@ public class Pair<K, V> implements Tuple, Comparable<Pair<K,
V>>, Serializable {
     if (lhs == rhs) {
       return 0;
     } else if (lhs != null && Comparable.class.isAssignableFrom(lhs.getClass()))
{
-      return ((Comparable) lhs).compareTo(rhs);
+      return Ordering.natural().nullsLast().compare((Comparable) lhs, (Comparable) rhs);//(Comparable)
lhs).compareTo(rhs);
+    }
+    if (lhs == null) {
+      return 1; // nulls last
     }
-    return (lhs == null ? 0 : lhs.hashCode()) - (rhs == null ? 0 : rhs.hashCode());
+    if (rhs == null) {
+      return -1; // nulls last
+    }
+    if (lhs.equals(rhs)) {
+      return 0;
+    }
+
+    // Now we compare based on hash code. We already know that the two sides are not equal,
so
+    // if the hash codes are equal, we just use arbitrary (but consistent) ordering
+    return ComparisonChain.start()
+        .compare(lhs.hashCode(), rhs.hashCode())
+        .compare(lhs, rhs, Ordering.arbitrary())
+        .result();
   }
 
   @Override

http://git-wip-us.apache.org/repos/asf/crunch/blob/626853ab/crunch-core/src/test/java/org/apache/crunch/PairTest.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/test/java/org/apache/crunch/PairTest.java b/crunch-core/src/test/java/org/apache/crunch/PairTest.java
index 106413c..4119201 100644
--- a/crunch-core/src/test/java/org/apache/crunch/PairTest.java
+++ b/crunch-core/src/test/java/org/apache/crunch/PairTest.java
@@ -62,5 +62,55 @@ public class PairTest {
     assertTrue(Pair.of(2, "a").compareTo(Pair.of(1, "a")) > 0);
     assertTrue(Pair.of("a", 2).compareTo(Pair.of("a", 1)) > 0);
     assertTrue(Pair.of(null, 17).compareTo(Pair.of(null, 29)) < 0);
+    assertTrue(Pair.of(Integer.MIN_VALUE, 0).compareTo(Pair.of((Integer)null, 0)) < 0);
+    assertTrue(Pair.of(0, Integer.MIN_VALUE).compareTo(Pair.of(0, (Integer)null)) < 0);
+    assertTrue(Pair.of(2, "a").compareTo(Pair.of(1, "a")) > 0);
+    assertTrue(Pair.of(new HashValue(2), "a").compareTo(Pair.of(new HashValue(1), "a")) >
0);
+    assertTrue(Pair.of(new HashValue(Integer.MAX_VALUE), "a").compareTo(
+          Pair.of(new HashValue(Integer.MIN_VALUE), "a")) > 0);
+    assertTrue(Pair.of(new HashValue(0), "a").compareTo(Pair.of((HashValue) null, "a")) <
0);
+    // Two value whose hash code is the same but are not equal (via equals) should not compare
as the same
+    assertTrue(Pair.of(new HashValue(0, 1), 0).compareTo(Pair.of(new HashValue(0, 2), 0))
!= 0);
+  }
+
+  /**
+   * An object with a known hash value that is not comparable used for testing the comparison
logic
+   * within Pair.
+   */
+  private static final class HashValue {
+    private final int value;
+    private final int additionalValue;  // Additional value which is used for equality but
not hash code
+
+    public HashValue(int value) {
+      this(value, 0);
+    }
+
+    public HashValue(int value, int additionalValue) {
+      this.value = value;
+      this.additionalValue = additionalValue;
+    }
+
+    @Override
+    public boolean equals(Object o) {
+      if (this == o)
+        return true;
+      if (!(o instanceof HashValue))
+        return false;
+
+      HashValue other = (HashValue) o;
+
+      if (value != other.value)
+        return false;
+
+      if (additionalValue != other.additionalValue)
+        return false;
+
+      return true;
+    }
+
+    @Override
+    public int hashCode() {
+      return this.value;
+    }
   }
 }


Mime
View raw message