mahout-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From p..@apache.org
Subject git commit: MAHOUT-1541 backed out compatibility with legacy Item Similarity, now outputs raw LLR scores
Date Sat, 05 Jul 2014 20:36:36 GMT
Repository: mahout
Updated Branches:
  refs/heads/master 8b2bec7f5 -> 24cb5576f


MAHOUT-1541 backed out compatibility with legacy Item Similarity, now outputs raw LLR scores


Project: http://git-wip-us.apache.org/repos/asf/mahout/repo
Commit: http://git-wip-us.apache.org/repos/asf/mahout/commit/24cb5576
Tree: http://git-wip-us.apache.org/repos/asf/mahout/tree/24cb5576
Diff: http://git-wip-us.apache.org/repos/asf/mahout/diff/24cb5576

Branch: refs/heads/master
Commit: 24cb5576f720737b73906ebb15be486d540ac629
Parents: 8b2bec7
Author: pferrel <pat@occamsmachete.com>
Authored: Sat Jul 5 13:36:24 2014 -0700
Committer: pferrel <pat@occamsmachete.com>
Committed: Sat Jul 5 13:36:24 2014 -0700

----------------------------------------------------------------------
 .../apache/mahout/cf/CooccurrenceAnalysis.scala |  8 ++--
 .../mahout/cf/CooccurrenceAnalysisSuite.scala   | 50 ++++++++++----------
 .../drivers/ItemSimilarityDriverSuite.scala     | 41 ++++++++--------
 3 files changed, 51 insertions(+), 48 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mahout/blob/24cb5576/spark/src/main/scala/org/apache/mahout/cf/CooccurrenceAnalysis.scala
----------------------------------------------------------------------
diff --git a/spark/src/main/scala/org/apache/mahout/cf/CooccurrenceAnalysis.scala b/spark/src/main/scala/org/apache/mahout/cf/CooccurrenceAnalysis.scala
index b01332c..14cc9d5 100644
--- a/spark/src/main/scala/org/apache/mahout/cf/CooccurrenceAnalysis.scala
+++ b/spark/src/main/scala/org/apache/mahout/cf/CooccurrenceAnalysis.scala
@@ -134,9 +134,11 @@ object CooccurrenceAnalysis extends Serializable {
               val llr = logLikelihoodRatio(numInteractionsB(thingB).toLong, numInteractionsA(thingA).toLong,
                 cooccurrences.toLong, numUsers)
 
-              // matches hadoop code and maps values to range (0..1)
-              val tLLR = 1.0 - (1.0 / (1.0 + llr))
-              val candidate = thingA -> tLLR
+              val candidate = thingA -> llr
+
+              // matches legacy hadoop code and maps values to range (0..1)
+              // val tLLR = 1.0 - (1.0 / (1.0 + llr))
+              //val candidate = thingA -> tLLR
 
               // Enqueue item with score, if belonging to the top-k
               if (topItemsPerThing.size < maxInterestingItemsPerThing) {

http://git-wip-us.apache.org/repos/asf/mahout/blob/24cb5576/spark/src/test/scala/org/apache/mahout/cf/CooccurrenceAnalysisSuite.scala
----------------------------------------------------------------------
diff --git a/spark/src/test/scala/org/apache/mahout/cf/CooccurrenceAnalysisSuite.scala b/spark/src/test/scala/org/apache/mahout/cf/CooccurrenceAnalysisSuite.scala
index e46dad5..065f2f8 100644
--- a/spark/src/test/scala/org/apache/mahout/cf/CooccurrenceAnalysisSuite.scala
+++ b/spark/src/test/scala/org/apache/mahout/cf/CooccurrenceAnalysisSuite.scala
@@ -41,19 +41,19 @@ class CooccurrenceAnalysisSuite extends FunSuite with MahoutSuite with MahoutLoc
 
   // correct cooccurrence with LLR
   final val matrixLLRCoocAtAControl = dense(
-    (0.0,                0.6331745808516107, 0.0,                     0.0,                0.0),
-    (0.6331745808516107, 0.0,                0.0,                     0.0,                0.0),
-    (0.0,                0.0,                0.0,                     0.6331745808516107, 0.0),
-    (0.0,                0.0,                0.6331745808516107,      0.0,                0.0),
+    (0.0,                1.7260924347106847, 0.0,                     0.0,                0.0),
+    (1.7260924347106847, 0.0,                0.0,                     0.0,                0.0),
+    (0.0,                0.0,                0.0,                     1.7260924347106847, 0.0),
+    (0.0,                0.0,                1.7260924347106847,      0.0,                0.0),
     (0.0,                0.0,                0.0,                     0.0,                0.0))
 
   // correct cross-cooccurrence with LLR
   final val matrixLLRCoocBtAControl = dense(
-    (0.6331745808516107, 0.4046187819149094, 0.4046187819149094, 0.6331745808516107, 0.0),
-    (0.6331745808516107, 0.4046187819149094, 0.4046187819149094, 0.6331745808516107, 0.0),
-    (0.6331745808516107, 0.4046187819149094, 0.4046187819149094, 0.6331745808516107, 0.40461878191490940),
-    (0.6331745808516107, 0.4046187819149094, 0.4046187819149094, 0.6331745808516107, 0.0),
-    (0.0,                0.0,                0.0,                0.0,                0.8181382096075936))
+    (1.7260924347106847, 0.6795961471815897, 0.6795961471815897, 1.7260924347106847, 0.0),
+    (1.7260924347106847, 0.6795961471815897, 0.6795961471815897, 1.7260924347106847, 0.0),
+    (1.7260924347106847, 0.6795961471815897, 0.6795961471815897, 1.7260924347106847, 0.6795961471815897),
+    (1.7260924347106847, 0.6795961471815897, 0.6795961471815897, 1.7260924347106847, 0.0),
+    (0.0,                0.0,                0.0,                0.0,                4.498681156950466))
 
 
 
@@ -90,16 +90,16 @@ class CooccurrenceAnalysisSuite extends FunSuite with MahoutSuite with MahoutLoc
 
   test("cooccurrence [A'A], [B'A] double data using LLR") {
     val a = dense(
-        (100000.0D, 1.0D, 0.0D,  0.0D,  0.0D),
-        (0.0D,      0.0D, 10.0D, 1.0D,  0.0D),
-        (0.0D,      0.0D, 0.0D,  0.0D,  1000.0D),
-        (1.0D,      0.0D, 0.0D,  10.0D, 0.0D))
+        (100000.0D, 1.0D,  0.0D,  0.0D,     0.0D),
+        (     0.0D, 0.0D, 10.0D,  1.0D,     0.0D),
+        (     0.0D, 0.0D,  0.0D,  0.0D,  1000.0D),
+        (     1.0D, 0.0D,  0.0D, 10.0D,     0.0D))
 
     val b = dense(
-        (10000.0D, 100.0D, 1000.0D,     1.0D,      0.0D),
-        (10.0D,    1.0D,   10000000.0D, 10.0D,     0.0D),
-        (0.0D,     0.0D,   1000.0D,     0.0D,      100.0D),
-        (100.0D,   1.0D,   0.0D,        100000.0D, 0.0D))
+        (10000.0D, 100.0D,     1000.0D,      1.0D,   0.0D),
+        (   10.0D,   1.0D, 10000000.0D,     10.0D,   0.0D),
+        (    0.0D,   0.0D,     1000.0D,      0.0D, 100.0D),
+        (  100.0D,   1.0D,        0.0D, 100000.0D,   0.0D))
 
     val drmA = drmParallelize(m = a, numPartitions = 2)
     val drmB = drmParallelize(m = b, numPartitions = 2)
@@ -120,16 +120,16 @@ class CooccurrenceAnalysisSuite extends FunSuite with MahoutSuite with MahoutLoc
 
   test("cooccurrence [A'A], [B'A] integer data using LLR") {
     val a = dense(
-        (1000,  10, 0,      0,    0),
-        (0,     0,  -10000, 10,   0),
-        (0,     0,  0,      0,    100),
-        (10000, 0,  0,      1000, 0))
+        ( 1000,  10,       0,    0,   0),
+        (    0,   0,  -10000,   10,   0),
+        (    0,   0,       0,    0, 100),
+        (10000,   0,       0, 1000,   0))
 
     val b = dense(
-        (100,   1000, -10000, 10000, 0),
-        (10000, 1000, 100,    10,    0),
-        (0,     0,    10,     0,     -100),
-        (10,    100,  0,      1000,  0))
+        (  100, 1000, -10000, 10000,    0),
+        (10000, 1000,    100,    10,    0),
+        (    0,    0,     10,     0, -100),
+        (   10,  100,      0,  1000,    0))
 
     val drmA = drmParallelize(m = a, numPartitions = 2)
     val drmB = drmParallelize(m = b, numPartitions = 2)

http://git-wip-us.apache.org/repos/asf/mahout/blob/24cb5576/spark/src/test/scala/org/apache/mahout/drivers/ItemSimilarityDriverSuite.scala
----------------------------------------------------------------------
diff --git a/spark/src/test/scala/org/apache/mahout/drivers/ItemSimilarityDriverSuite.scala b/spark/src/test/scala/org/apache/mahout/drivers/ItemSimilarityDriverSuite.scala
index e4a75de..2827317 100644
--- a/spark/src/test/scala/org/apache/mahout/drivers/ItemSimilarityDriverSuite.scala
+++ b/spark/src/test/scala/org/apache/mahout/drivers/ItemSimilarityDriverSuite.scala
@@ -45,17 +45,18 @@ class ItemSimilarityDriverSuite extends FunSuite with MahoutSuite with MahoutLoc
 */
 
   final val SelfSimilairtyTSV = Set(
-      "galaxy\tnexus:0.6331745808516107",
-      "ipad\tiphone:0.6331745808516107",
-      "nexus\tgalaxy:0.6331745808516107",
-      "iphone\tipad:0.6331745808516107",
-      "surface")
-  final val CrossSimilarityTSV = Set(
-      "galaxy\tnexus:0.4046187819149094,iphone:0.6331745808516107,ipad:0.4046187819149094,galaxy:0.6331745808516107",
-      "surface\tsurface:0.8181382096075936",
-      "nexus\tnexus:0.4046187819149094,iphone:0.6331745808516107,ipad:0.4046187819149094,surface:0.4046187819149094,galaxy:0.6331745808516107",
-      "ipad\tnexus:0.4046187819149094,iphone:0.6331745808516107,ipad:0.4046187819149094,galaxy:0.6331745808516107",
-      "iphone\tnexus:0.4046187819149094,iphone:0.6331745808516107,ipad:0.4046187819149094,galaxy:0.6331745808516107")
+    "galaxy\tnexus:1.7260924347106847",
+    "ipad\tiphone:1.7260924347106847",
+    "nexus\tgalaxy:1.7260924347106847",
+    "iphone\tipad:1.7260924347106847",
+    "surface")
+
+  final val CrossSimilarityTSV = Set("" +
+    "nexus\tnexus:0.6795961471815897,iphone:1.7260924347106847,ipad:0.6795961471815897,surface:0.6795961471815897,galaxy:1.7260924347106847",
+    "ipad\tnexus:0.6795961471815897,iphone:1.7260924347106847,ipad:0.6795961471815897,galaxy:1.7260924347106847",
+    "surface\tsurface:4.498681156950466",
+    "iphone\tnexus:0.6795961471815897,iphone:1.7260924347106847,ipad:0.6795961471815897,galaxy:1.7260924347106847",
+    "galaxy\tnexus:0.6795961471815897,iphone:1.7260924347106847,ipad:0.6795961471815897,galaxy:1.7260924347106847")
 
   final val TmpDir = "tmp/" // all IO going to whatever the default HDFS config is pointing to
 
@@ -88,7 +89,7 @@ class ItemSimilarityDriverSuite extends FunSuite with MahoutSuite with MahoutLoc
     ))
   */
 
-  ignore ("ItemSimilarityDriver, non-full-spec CSV"){
+  test ("ItemSimilarityDriver, non-full-spec CSV"){
 
     val InFile = TmpDir + "in-file.csv/" //using part files, not singel file
     val OutPath = TmpDir + "indicator-matrices/"
@@ -143,7 +144,7 @@ class ItemSimilarityDriverSuite extends FunSuite with MahoutSuite with MahoutLoc
 
 
 
-  ignore ("ItemSimilarityDriver TSV "){
+  test ("ItemSimilarityDriver TSV "){
 
     val InFile = TmpDir + "in-file.tsv/"
     val OutPath = TmpDir + "indicator-matrices/"
@@ -197,7 +198,7 @@ class ItemSimilarityDriverSuite extends FunSuite with MahoutSuite with MahoutLoc
 
   }
 
-  ignore ("ItemSimilarityDriver log-ish files"){
+  test ("ItemSimilarityDriver log-ish files"){
 
     val InFile = TmpDir + "in-file.log/"
     val OutPath = TmpDir + "indicator-matrices/"
@@ -251,7 +252,7 @@ class ItemSimilarityDriverSuite extends FunSuite with MahoutSuite with MahoutLoc
 
   }
 
-  ignore ("ItemSimilarityDriver legacy supported file format"){
+  test ("ItemSimilarityDriver legacy supported file format"){
 
     val InDir = TmpDir + "in-dir/"
     val InFilename = "in-file.tsv"
@@ -269,11 +270,11 @@ class ItemSimilarityDriverSuite extends FunSuite with MahoutSuite with MahoutLoc
         "3,3,1")
 
     val Answer = Set(
-      "0\t1:0.6331745808516107",
-      "3\t2:0.6331745808516107",
-      "1\t0:0.6331745808516107",
+      "0\t1:1.7260924347106847",
+      "3\t2:1.7260924347106847",
+      "1\t0:1.7260924347106847",
       "4",
-      "2\t3:0.6331745808516107")
+      "2\t3:1.7260924347106847")
 
     // this creates one part-0000 file in the directory
     mahoutCtx.parallelize(lines).coalesce(1, shuffle=true).saveAsTextFile(InDir)
@@ -298,7 +299,7 @@ class ItemSimilarityDriverSuite extends FunSuite with MahoutSuite with MahoutLoc
 
   }
 
-  ignore("ItemSimilarityDriver recursive file discovery using filename patterns"){
+  test("ItemSimilarityDriver recursive file discovery using filename patterns"){
     //directory structure using the following
     // tmp/data/m1.tsv
     // tmp/data/more-data/another-dir/m2.tsv


Mime
View raw message