spark-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From r...@apache.org
Subject [15/20] git commit: Added Java API for countApproxDistinctByKey
Date Wed, 01 Jan 2014 01:48:45 GMT
Added Java API for countApproxDistinctByKey


Project: http://git-wip-us.apache.org/repos/asf/incubator-spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-spark/commit/ed06500d
Tree: http://git-wip-us.apache.org/repos/asf/incubator-spark/tree/ed06500d
Diff: http://git-wip-us.apache.org/repos/asf/incubator-spark/diff/ed06500d

Branch: refs/heads/master
Commit: ed06500d300e93ae3129a035a364117adcb7d361
Parents: b75d7c9
Author: Hossein Falaki <falaki@gmail.com>
Authored: Mon Dec 30 19:30:42 2013 -0800
Committer: Hossein Falaki <falaki@gmail.com>
Committed: Mon Dec 30 19:30:42 2013 -0800

----------------------------------------------------------------------
 .../org/apache/spark/api/java/JavaPairRDD.scala | 36 ++++++++++++++++++++
 1 file changed, 36 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-spark/blob/ed06500d/core/src/main/scala/org/apache/spark/api/java/JavaPairRDD.scala
----------------------------------------------------------------------
diff --git a/core/src/main/scala/org/apache/spark/api/java/JavaPairRDD.scala b/core/src/main/scala/org/apache/spark/api/java/JavaPairRDD.scala
index 363667f..55c8745 100644
--- a/core/src/main/scala/org/apache/spark/api/java/JavaPairRDD.scala
+++ b/core/src/main/scala/org/apache/spark/api/java/JavaPairRDD.scala
@@ -611,6 +611,42 @@ class JavaPairRDD[K, V](val rdd: RDD[(K, V)])(implicit val kClassTag:
ClassTag[K
    * Return an RDD with the values of each tuple.
    */
   def values(): JavaRDD[V] = JavaRDD.fromRDD[V](rdd.map(_._2))
+
+  /**
+   * Return approximate number of distinct values for each key in this RDD.
+   * The accuracy of approximation can be controlled through the relative standard deviation
+   * (relativeSD) parameter, which also controls the amount of memory used. Lower values
result in
+   * more accurate counts but increase the memory footprint and vice versa. Uses the provided
+   * Partitioner to partition the output RDD.
+   */
+  def countApproxDistinctByKey(relativeSD: Double, partitioner: Partitioner): JavaRDD[(K,
Long)] = {
+    rdd.countApproxDistinctByKey(relativeSD, partitioner)
+  }
+
+  /**
+   * Return approximate number of distinct values for each key in this RDD.
+   * The accuracy of approximation can be controlled through the relative standard deviation
+   * (relativeSD) parameter, which also controls the amount of memory used. Lower values
result in
+   * more accurate counts but increase the memory footprint and vice versa. The default value
of
+   * relativeSD is 0.05. Hash-partitions the output RDD using the existing partitioner/parallelism
+   * level.
+   */
+  def countApproxDistinctByKey(relativeSD: Double = 0.05): JavaRDD[(K, Long)] = {
+    rdd.countApproxDistinctByKey(relativeSD)
+  }
+
+
+  /**
+   * Return approximate number of distinct values for each key in this RDD.
+   * The accuracy of approximation can be controlled through the relative standard deviation
+   * (relativeSD) parameter, which also controls the amount of memory used. Lower values
result in
+   * more accurate counts but increase the memory footprint and vice versa. Hash-partitions
the
+   * output RDD into numPartitions.
+   *
+   */
+  def countApproxDistinctByKey(relativeSD: Double, numPartitions: Int): JavaRDD[(K, Long)]
= {
+    rdd.countApproxDistinctByKey(relativeSD, numPartitions)
+  }
 }
 
 object JavaPairRDD {


Mime
View raw message