incubator-crunch-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From ki...@apache.org
Subject git commit: CRUNCH-98. Sampling Scala PCollection.
Date Wed, 17 Oct 2012 20:13:49 GMT
Updated Branches:
  refs/heads/master 2576896c9 -> 14132b093


CRUNCH-98. Sampling Scala PCollection.


Project: http://git-wip-us.apache.org/repos/asf/incubator-crunch/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-crunch/commit/14132b09
Tree: http://git-wip-us.apache.org/repos/asf/incubator-crunch/tree/14132b09
Diff: http://git-wip-us.apache.org/repos/asf/incubator-crunch/diff/14132b09

Branch: refs/heads/master
Commit: 14132b093cd5201b0f323d7fb9c7a9ab4a58a679
Parents: 2576896
Author: Kiyan Ahmadizadeh <kiyan@wibidata.com>
Authored: Tue Oct 16 15:29:41 2012 -0700
Committer: Kiyan Ahmadizadeh <kiyan@wibidata.com>
Committed: Tue Oct 16 16:10:47 2012 -0700

----------------------------------------------------------------------
 .../apache/crunch/scrunch/PCollectionTest.scala    |   28 +++++++++++++++
 .../org/apache/crunch/scrunch/PCollection.scala    |    8 ++++
 2 files changed, 36 insertions(+), 0 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-crunch/blob/14132b09/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/PCollectionTest.scala
----------------------------------------------------------------------
diff --git a/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/PCollectionTest.scala b/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/PCollectionTest.scala
index 4c25298..94ac917 100644
--- a/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/PCollectionTest.scala
+++ b/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/PCollectionTest.scala
@@ -69,4 +69,32 @@ class PCollectionTest extends CrunchTestSupport with JUnitSuite {
     assertEquals("Wrong last line in Shakespeare.", lastLineInShakespeare,
         lines(linesInShakespeare - 1))
   }
+
+  /**
+   * Tests sampling elements from a PCollection using some acceptance probability.
+   */
+  @Test def testSampling {
+    // Get the collection and sample ten percent.
+    val shakespeare = shakespeareCollection
+    val sampledCollection = shakespeare.sample(0.10)
+    val length = sampledCollection.length().value()
+    // The number of lines in the sampled collection should be about ten percent of the lines in
+    // the original collection. We use a tolerance of +- 50.
+    val lower = linesInShakespeare * 0.10 - 50
+    val upper = linesInShakespeare * 0.10 + 50
+    assertTrue("Sampled collection contains too few elements.", lower <= length)
+    assertTrue("Sampled collection contains too many elements.", length <= upper)
+  }
+
+  /**
+   * Tests sampling elements from a PCollection using some acceptance probability and a seed.
+   */
+  @Test def testSamplingWithSeed {
+    // Get the collection and sample ten percent.
+    val shakespeare = shakespeareCollection
+    // With a seed of 1L, 380 elements should be sampled.
+    val sampledCollection = shakespeare.sample(0.10, 1L)
+    val length = sampledCollection.length().value()
+    assertEquals("Incorrect number of elements sampled with seed 1L.", 380L, length)
+  }
 }

http://git-wip-us.apache.org/repos/asf/incubator-crunch/blob/14132b09/crunch-scrunch/src/main/scala/org/apache/crunch/scrunch/PCollection.scala
----------------------------------------------------------------------
diff --git a/crunch-scrunch/src/main/scala/org/apache/crunch/scrunch/PCollection.scala b/crunch-scrunch/src/main/scala/org/apache/crunch/scrunch/PCollection.scala
index 89959ea..ac2242f 100644
--- a/crunch-scrunch/src/main/scala/org/apache/crunch/scrunch/PCollection.scala
+++ b/crunch-scrunch/src/main/scala/org/apache/crunch/scrunch/PCollection.scala
@@ -76,6 +76,14 @@ class PCollection[S](val native: JCollection[S]) extends PCollectionLike[S, PCol
 
   def min()(implicit converter: Converter[S, S]) = PObject(Aggregate.min(native))(converter)
 
+  def sample(acceptanceProbability: Double) = {
+    wrap(native.sample(acceptanceProbability))
+  }
+
+  def sample(acceptanceProbability: Double, seed: Long) = {
+    wrap(native.sample(acceptanceProbability, seed))
+  }
+
   def pType = native.getPType()
 }
 


Mime
View raw message