spark-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From m...@apache.org
Subject spark git commit: [SPARK-5186][branch-1.2] Vector.hashCode is not efficient
Date Thu, 12 Mar 2015 08:40:45 GMT
Repository: spark
Updated Branches:
  refs/heads/branch-1.2 d7c359b49 -> c684e5f9f


[SPARK-5186][branch-1.2] Vector.hashCode is not efficient

Backport hhbyyh 's hasCode implementation to branch-1.2. The old implementation causes performance
issues with PySpark, which calls hashCode (https://issues.apache.org/jira/browse/SPARK-6288).

Author: Yuhao Yang <hhbyyh@gmail.com>

Closes #4985 from mengxr/SPARK-5186-1.2 and squashes the following commits:

155e559 [Yuhao Yang] backport SPARK-5186


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/c684e5f9
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/c684e5f9
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/c684e5f9

Branch: refs/heads/branch-1.2
Commit: c684e5f9ff193ac78d48f214d7f8b87df227c8a6
Parents: d7c359b
Author: Yuhao Yang <hhbyyh@gmail.com>
Authored: Thu Mar 12 01:40:40 2015 -0700
Committer: Xiangrui Meng <meng@databricks.com>
Committed: Thu Mar 12 01:40:40 2015 -0700

----------------------------------------------------------------------
 .../org/apache/spark/mllib/linalg/Vectors.scala | 55 ++++++++++++++++++--
 .../spark/mllib/linalg/VectorsSuite.scala       | 18 +++++++
 2 files changed, 70 insertions(+), 3 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/c684e5f9/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala
index 47d1a76..451fe91 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala
@@ -51,13 +51,35 @@ sealed trait Vector extends Serializable {
 
   override def equals(other: Any): Boolean = {
     other match {
-      case v: Vector =>
-        util.Arrays.equals(this.toArray, v.toArray)
+      case v2: Vector => {
+        if (this.size != v2.size) return false
+        (this, v2) match {
+          case (s1: SparseVector, s2: SparseVector) =>
+            Vectors.equals(s1.indices, s1.values, s2.indices, s2.values)
+          case (s1: SparseVector, d1: DenseVector) =>
+            Vectors.equals(s1.indices, s1.values, 0 until d1.size, d1.values)
+          case (d1: DenseVector, s1: SparseVector) =>
+            Vectors.equals(0 until d1.size, d1.values, s1.indices, s1.values)
+          case (_, _) => util.Arrays.equals(this.toArray, v2.toArray)
+        }
+      }
       case _ => false
     }
   }
 
-  override def hashCode(): Int = util.Arrays.hashCode(this.toArray)
+  override def hashCode(): Int = {
+    var result: Int = size + 31
+    this.foreachActive { case (index, value) =>
+      // ignore explict 0 for comparison between sparse and dense
+      if (value != 0) {
+        result = 31 * result + index
+        // refer to {@link java.util.Arrays.equals} for hash algorithm
+        val bits = java.lang.Double.doubleToLongBits(value)
+        result = 31 * result + (bits ^ (bits >>> 32)).toInt
+      }
+    }
+    return result
+  }
 
   /**
    * Converts the instance to a breeze vector.
@@ -312,6 +334,33 @@ object Vectors {
       math.pow(sum, 1.0 / p)
     }
   }
+
+  /**
+   * Check equality between sparse/dense vectors
+   */
+  private[mllib] def equals(
+      v1Indices: IndexedSeq[Int],
+      v1Values: Array[Double],
+      v2Indices: IndexedSeq[Int],
+      v2Values: Array[Double]): Boolean = {
+    val v1Size = v1Values.size
+    val v2Size = v2Values.size
+    var k1 = 0
+    var k2 = 0
+    var allEqual = true
+    while (allEqual) {
+      while (k1 < v1Size && v1Values(k1) == 0) k1 += 1
+      while (k2 < v2Size && v2Values(k2) == 0) k2 += 1
+
+      if (k1 >= v1Size || k2 >= v2Size) {
+        return k1 >= v1Size && k2 >= v2Size // check end alignment
+      }
+      allEqual = v1Indices(k1) == v2Indices(k2) && v1Values(k1) == v2Values(k2)
+      k1 += 1
+      k2 += 1
+    }
+    allEqual
+  }
 }
 
 /**

http://git-wip-us.apache.org/repos/asf/spark/blob/c684e5f9/mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala
----------------------------------------------------------------------
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala
index f99f014..b9a8aec 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala
@@ -87,6 +87,24 @@ class VectorsSuite extends FunSuite {
     }
   }
 
+  test("vectors equals with explicit 0") {
+    val dv1 = Vectors.dense(Array(0, 0.9, 0, 0.8, 0))
+    val sv1 = Vectors.sparse(5, Array(1, 3), Array(0.9, 0.8))
+    val sv2 = Vectors.sparse(5, Array(0, 1, 2, 3, 4), Array(0, 0.9, 0, 0.8, 0))
+
+    val vectors = Seq(dv1, sv1, sv2)
+    for (v <- vectors; u <- vectors) {
+      assert(v === u)
+      assert(v.## === u.##)
+    }
+
+    val another = Vectors.sparse(5, Array(0, 1, 3), Array(0, 0.9, 0.2))
+    for (v <- vectors) {
+      assert(v != another)
+      assert(v.## != another.##)
+    }
+  }
+
   test("indexing dense vectors") {
     val vec = Vectors.dense(1.0, 2.0, 3.0, 4.0)
     assert(vec(0) === 1.0)


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org


Mime
View raw message