mahout-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From sro...@apache.org
Subject svn commit: r1147431 - in /mahout/trunk: core/src/main/java/org/apache/mahout/cf/taste/impl/model/file/ core/src/test/java/org/apache/mahout/cf/taste/impl/similarity/ math/src/main/java/org/apache/mahout/math/stats/ math/src/test/java/org/apache/mahout...
Date Sat, 16 Jul 2011 14:19:52 GMT
Author: srowen
Date: Sat Jul 16 14:19:51 2011
New Revision: 1147431

URL: http://svn.apache.org/viewvc?rev=1147431&view=rev
Log:
Knock down a few small CPU hotspots. Parse files in FileDataModel with Splitter. Optimize
and even increase accuracy of entropy-like calculation for log-likelihood calc to avoid 5
unnecessary array allocations

Modified:
    mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/model/file/FileDataModel.java
    mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/impl/similarity/LogLikelihoodSimilarityTest.java
    mahout/trunk/math/src/main/java/org/apache/mahout/math/stats/LogLikelihood.java
    mahout/trunk/math/src/test/java/org/apache/mahout/math/stats/LogLikelihoodTest.java

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/model/file/FileDataModel.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/model/file/FileDataModel.java?rev=1147431&r1=1147430&r2=1147431&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/model/file/FileDataModel.java
(original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/model/file/FileDataModel.java
Sat Jul 16 14:19:51 2011
@@ -22,11 +22,12 @@ import java.io.FileNotFoundException;
 import java.io.IOException;
 import java.util.Collection;
 import java.util.Iterator;
+import java.util.List;
 import java.util.Map;
 import java.util.TreeMap;
 import java.util.concurrent.locks.ReentrantLock;
-import java.util.regex.Pattern;
 
+import com.google.common.base.Splitter;
 import com.google.common.collect.Lists;
 import com.google.common.io.Closeables;
 import org.apache.mahout.cf.taste.common.Refreshable;
@@ -127,7 +128,7 @@ public class FileDataModel extends Abstr
   private long lastModified;
   private long lastUpdateFileModified;
   private final char delimiter;
-  private final Pattern delimiterPattern;
+  private final Splitter delimiterPattern;
   private final boolean hasPrefValues;
   private DataModel delegate;
   private final ReentrantLock reloadLock;
@@ -177,10 +178,13 @@ public class FileDataModel extends Abstr
     Closeables.closeQuietly(iterator);
 
     delimiter = determineDelimiter(firstLine);
-    delimiterPattern = Pattern.compile(String.valueOf(delimiter));
-    String[] firstLineSplit = delimiterPattern.split(firstLine);
+    delimiterPattern = Splitter.on(delimiter);
+    List<String> firstLineSplit = Lists.newArrayList();
+    for (String token : delimiterPattern.split(firstLine)) {
+      firstLineSplit.add(token);
+    }
     // If preference value exists and isn't empty then the file is specifying pref values
-    hasPrefValues = firstLineSplit.length >= 3 && firstLineSplit[2].length() >
0;
+    hasPrefValues = firstLineSplit.size() >= 3 && firstLineSplit.get(2).length()
> 0;
 
     this.reloadLock = new ReentrantLock();
     this.transpose = transpose;
@@ -368,15 +372,12 @@ public class FileDataModel extends Abstr
       return;
     }
 
-    // Consume up to 4 tokens, and gather whatever is left in an unused 5th token:
-    String[] tokens = delimiterPattern.split(line, 5);
-
-    Preconditions.checkArgument(tokens.length >= 3, "Bad line: %s", line);
-
-    String userIDString = tokens[0];
-    String itemIDString = tokens[1];
-    String preferenceValueString = tokens[2];
-    String timestampString = tokens.length >= 4 ? tokens[3] : null;
+    Iterator<String> tokens = delimiterPattern.split(line).iterator();
+    String userIDString = tokens.next();
+    String itemIDString = tokens.next();
+    String preferenceValueString = tokens.next();
+    boolean hasTimestamp = tokens.hasNext();
+    String timestampString = hasTimestamp ? tokens.next() : null;
 
     long userID = readUserIDFromString(userIDString);
     long itemID = readItemIDFromString(itemIDString);
@@ -393,7 +394,7 @@ public class FileDataModel extends Abstr
       // Data are PreferenceArray
 
       PreferenceArray prefs = (PreferenceArray) maybePrefs;
-      if (tokens.length == 3 && preferenceValueString.length() == 0) {
+      if (!hasTimestamp && preferenceValueString.length() == 0) {
         // Then line is of form "userID,itemID,", meaning remove
         if (prefs != null) {
           boolean exists = false;
@@ -461,7 +462,7 @@ public class FileDataModel extends Abstr
 
       Collection<Preference> prefs = (Collection<Preference>) maybePrefs;
 
-      if (tokens.length == 3 && preferenceValueString.length() == 0) {
+      if (!hasTimestamp && preferenceValueString.length() == 0) {
         // Then line is of form "userID,itemID,", meaning remove
         if (prefs != null) {
           // remove pref
@@ -532,12 +533,13 @@ public class FileDataModel extends Abstr
       return;
     }
 
-    String[] tokens = delimiterPattern.split(line);
-    Preconditions.checkArgument(tokens.length >= 2, "Bad line: %s", line);
-    String userIDString = tokens[0];
-    String itemIDString = tokens[1];
-    String preferenceValueString = tokens.length >= 3 ? tokens[2] : "";
-    String timestampString = tokens.length >= 4 ? tokens[3] : null;
+    Iterator<String> tokens = delimiterPattern.split(line).iterator();
+    String userIDString = tokens.next();
+    String itemIDString = tokens.next();
+    boolean hasPreference = tokens.hasNext();
+    String preferenceValueString = hasPreference ? tokens.next() : "";
+    boolean hasTimestamp = tokens.hasNext();
+    String timestampString = hasTimestamp ? tokens.next() : null;
 
     long userID = readUserIDFromString(userIDString);
     long itemID = readItemIDFromString(itemIDString);
@@ -548,7 +550,7 @@ public class FileDataModel extends Abstr
       itemID = tmp;
     }
 
-    if (tokens.length == 3 && preferenceValueString.length() == 0) {
+    if (hasPreference && !hasTimestamp && preferenceValueString.length()
== 0) {
       // Then line is of form "userID,itemID,", meaning remove
 
       FastIDSet itemIDs = data.get(userID);

Modified: mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/impl/similarity/LogLikelihoodSimilarityTest.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/impl/similarity/LogLikelihoodSimilarityTest.java?rev=1147431&r1=1147430&r2=1147431&view=diff
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/impl/similarity/LogLikelihoodSimilarityTest.java
(original)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/impl/similarity/LogLikelihoodSimilarityTest.java
Sat Jul 16 14:19:51 2011
@@ -67,8 +67,8 @@ public final class LogLikelihoodSimilari
     assertCorrelationEquals(Double.NaN, similarity.itemSimilarity(1, 0));
     assertCorrelationEquals(Double.NaN, similarity.itemSimilarity(0, 1));
 
-    assertCorrelationEquals(Double.NaN, similarity.itemSimilarity(2, 3));
-    assertCorrelationEquals(Double.NaN, similarity.itemSimilarity(3, 2));
+    assertCorrelationEquals(0.0, similarity.itemSimilarity(2, 3));
+    assertCorrelationEquals(0.0, similarity.itemSimilarity(3, 2));
   }
 
   @Test

Modified: mahout/trunk/math/src/main/java/org/apache/mahout/math/stats/LogLikelihood.java
URL: http://svn.apache.org/viewvc/mahout/trunk/math/src/main/java/org/apache/mahout/math/stats/LogLikelihood.java?rev=1147431&r1=1147430&r2=1147431&view=diff
==============================================================================
--- mahout/trunk/math/src/main/java/org/apache/mahout/math/stats/LogLikelihood.java (original)
+++ mahout/trunk/math/src/main/java/org/apache/mahout/math/stats/LogLikelihood.java Sat Jul
16 14:19:51 2011
@@ -17,6 +17,7 @@
 
 package org.apache.mahout.math.stats;
 
+import com.google.common.base.Preconditions;
 import com.google.common.collect.Lists;
 import com.google.common.collect.Multiset;
 import com.google.common.collect.Ordering;
@@ -48,19 +49,34 @@ public final class LogLikelihood {
    * @return The entropy value for the elements
    */
   public static double entropy(long... elements) {
-    double sum = 0.0;
+    long sum = 0;
     double result = 0.0;
     for (long element : elements) {
-      if (element < 0) {
-        throw new IllegalArgumentException("Should not have negative count for entropy computation:
(" + element + ')');
-      }
-      if (element > 0) {
-        result += element * Math.log(element);
-        sum += element;
-      }
+      Preconditions.checkArgument(element >= 0);
+      result += xLogX(element);
+      sum += element;
     }
-    result -= sum * Math.log(sum);
-    return -result;
+    return xLogX(sum) - result;
+  }
+
+  private static double xLogX(long x) {
+    return x == 0 ? 0.0 : x * Math.log(x);
+  }
+
+  /**
+   * Merely an optimization for the common two argument case of {@link #entropy(long...)}
+   * @see #logLikelihoodRatio(long, long, long, long)
+   */
+  private static double entropy(long a, long b) {
+    return xLogX(a + b) - xLogX(a) - xLogX(b);
+  }
+
+  /**
+   * Merely an optimization for the common four argument case of {@link #entropy(long...)}
+   * @see #logLikelihoodRatio(long, long, long, long)
+   */
+  private static double entropy(long a, long b, long c, long d) {
+    return xLogX(a + b + c + d) - xLogX(a) - xLogX(b) - xLogX(c) - xLogX(d);
   }
 
   /**
@@ -82,6 +98,7 @@ public final class LogLikelihood {
    * Credit to http://tdunning.blogspot.com/2008/03/surprise-and-coincidence.html for the
table and the descriptions.
    */
   public static double logLikelihoodRatio(long k11, long k12, long k21, long k22) {
+    Preconditions.checkArgument(k11 >= 0 && k12 >= 0 && k21 >= 0
&& k22 >= 0);
     // note that we have counts here, not probabilities, and that the entropy is not normalized.
     double rowEntropy = entropy(k11, k12) + entropy(k21, k22);
     double columnEntropy = entropy(k11, k21) + entropy(k12, k22);

Modified: mahout/trunk/math/src/test/java/org/apache/mahout/math/stats/LogLikelihoodTest.java
URL: http://svn.apache.org/viewvc/mahout/trunk/math/src/test/java/org/apache/mahout/math/stats/LogLikelihoodTest.java?rev=1147431&r1=1147430&r2=1147431&view=diff
==============================================================================
--- mahout/trunk/math/src/test/java/org/apache/mahout/math/stats/LogLikelihoodTest.java (original)
+++ mahout/trunk/math/src/test/java/org/apache/mahout/math/stats/LogLikelihoodTest.java Sat
Jul 16 14:19:51 2011
@@ -68,7 +68,7 @@ public final class LogLikelihoodTest ext
 
   @Test
   public void testRootNegativeLLR() {
-    assertEquals(0.0, LogLikelihood.rootLogLikelihoodRatio(6, 7567, 1924, 2426487), 0.00000001);
+    assertTrue(LogLikelihood.rootLogLikelihoodRatio(6, 7567, 1924, 2426487) > 0.0);
   }
 
   @Test



Mime
View raw message