Return-Path: X-Original-To: apmail-mahout-commits-archive@www.apache.org Delivered-To: apmail-mahout-commits-archive@www.apache.org Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by minotaur.apache.org (Postfix) with SMTP id F07D97C8A for ; Sat, 16 Jul 2011 14:20:15 +0000 (UTC) Received: (qmail 99747 invoked by uid 500); 16 Jul 2011 14:20:15 -0000 Delivered-To: apmail-mahout-commits-archive@mahout.apache.org Received: (qmail 99621 invoked by uid 500); 16 Jul 2011 14:20:14 -0000 Mailing-List: contact commits-help@mahout.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@mahout.apache.org Delivered-To: mailing list commits@mahout.apache.org Received: (qmail 99614 invoked by uid 99); 16 Jul 2011 14:20:14 -0000 Received: from athena.apache.org (HELO athena.apache.org) (140.211.11.136) by apache.org (qpsmtpd/0.29) with ESMTP; Sat, 16 Jul 2011 14:20:14 +0000 X-ASF-Spam-Status: No, hits=-2000.0 required=5.0 tests=ALL_TRUSTED X-Spam-Check-By: apache.org Received: from [140.211.11.4] (HELO eris.apache.org) (140.211.11.4) by apache.org (qpsmtpd/0.29) with ESMTP; Sat, 16 Jul 2011 14:20:12 +0000 Received: from eris.apache.org (localhost [127.0.0.1]) by eris.apache.org (Postfix) with ESMTP id 798D02388897; Sat, 16 Jul 2011 14:19:52 +0000 (UTC) Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Subject: svn commit: r1147431 - in /mahout/trunk: core/src/main/java/org/apache/mahout/cf/taste/impl/model/file/ core/src/test/java/org/apache/mahout/cf/taste/impl/similarity/ math/src/main/java/org/apache/mahout/math/stats/ math/src/test/java/org/apache/mahout... Date: Sat, 16 Jul 2011 14:19:52 -0000 To: commits@mahout.apache.org From: srowen@apache.org X-Mailer: svnmailer-1.0.8 Message-Id: <20110716141952.798D02388897@eris.apache.org> Author: srowen Date: Sat Jul 16 14:19:51 2011 New Revision: 1147431 URL: http://svn.apache.org/viewvc?rev=1147431&view=rev Log: Knock down a few small CPU hotspots. Parse files in FileDataModel with Splitter. Optimize and even increase accuracy of entropy-like calculation for log-likelihood calc to avoid 5 unnecessary array allocations Modified: mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/model/file/FileDataModel.java mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/impl/similarity/LogLikelihoodSimilarityTest.java mahout/trunk/math/src/main/java/org/apache/mahout/math/stats/LogLikelihood.java mahout/trunk/math/src/test/java/org/apache/mahout/math/stats/LogLikelihoodTest.java Modified: mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/model/file/FileDataModel.java URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/model/file/FileDataModel.java?rev=1147431&r1=1147430&r2=1147431&view=diff ============================================================================== --- mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/model/file/FileDataModel.java (original) +++ mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/model/file/FileDataModel.java Sat Jul 16 14:19:51 2011 @@ -22,11 +22,12 @@ import java.io.FileNotFoundException; import java.io.IOException; import java.util.Collection; import java.util.Iterator; +import java.util.List; import java.util.Map; import java.util.TreeMap; import java.util.concurrent.locks.ReentrantLock; -import java.util.regex.Pattern; +import com.google.common.base.Splitter; import com.google.common.collect.Lists; import com.google.common.io.Closeables; import org.apache.mahout.cf.taste.common.Refreshable; @@ -127,7 +128,7 @@ public class FileDataModel extends Abstr private long lastModified; private long lastUpdateFileModified; private final char delimiter; - private final Pattern delimiterPattern; + private final Splitter delimiterPattern; private final boolean hasPrefValues; private DataModel delegate; private final ReentrantLock reloadLock; @@ -177,10 +178,13 @@ public class FileDataModel extends Abstr Closeables.closeQuietly(iterator); delimiter = determineDelimiter(firstLine); - delimiterPattern = Pattern.compile(String.valueOf(delimiter)); - String[] firstLineSplit = delimiterPattern.split(firstLine); + delimiterPattern = Splitter.on(delimiter); + List firstLineSplit = Lists.newArrayList(); + for (String token : delimiterPattern.split(firstLine)) { + firstLineSplit.add(token); + } // If preference value exists and isn't empty then the file is specifying pref values - hasPrefValues = firstLineSplit.length >= 3 && firstLineSplit[2].length() > 0; + hasPrefValues = firstLineSplit.size() >= 3 && firstLineSplit.get(2).length() > 0; this.reloadLock = new ReentrantLock(); this.transpose = transpose; @@ -368,15 +372,12 @@ public class FileDataModel extends Abstr return; } - // Consume up to 4 tokens, and gather whatever is left in an unused 5th token: - String[] tokens = delimiterPattern.split(line, 5); - - Preconditions.checkArgument(tokens.length >= 3, "Bad line: %s", line); - - String userIDString = tokens[0]; - String itemIDString = tokens[1]; - String preferenceValueString = tokens[2]; - String timestampString = tokens.length >= 4 ? tokens[3] : null; + Iterator tokens = delimiterPattern.split(line).iterator(); + String userIDString = tokens.next(); + String itemIDString = tokens.next(); + String preferenceValueString = tokens.next(); + boolean hasTimestamp = tokens.hasNext(); + String timestampString = hasTimestamp ? tokens.next() : null; long userID = readUserIDFromString(userIDString); long itemID = readItemIDFromString(itemIDString); @@ -393,7 +394,7 @@ public class FileDataModel extends Abstr // Data are PreferenceArray PreferenceArray prefs = (PreferenceArray) maybePrefs; - if (tokens.length == 3 && preferenceValueString.length() == 0) { + if (!hasTimestamp && preferenceValueString.length() == 0) { // Then line is of form "userID,itemID,", meaning remove if (prefs != null) { boolean exists = false; @@ -461,7 +462,7 @@ public class FileDataModel extends Abstr Collection prefs = (Collection) maybePrefs; - if (tokens.length == 3 && preferenceValueString.length() == 0) { + if (!hasTimestamp && preferenceValueString.length() == 0) { // Then line is of form "userID,itemID,", meaning remove if (prefs != null) { // remove pref @@ -532,12 +533,13 @@ public class FileDataModel extends Abstr return; } - String[] tokens = delimiterPattern.split(line); - Preconditions.checkArgument(tokens.length >= 2, "Bad line: %s", line); - String userIDString = tokens[0]; - String itemIDString = tokens[1]; - String preferenceValueString = tokens.length >= 3 ? tokens[2] : ""; - String timestampString = tokens.length >= 4 ? tokens[3] : null; + Iterator tokens = delimiterPattern.split(line).iterator(); + String userIDString = tokens.next(); + String itemIDString = tokens.next(); + boolean hasPreference = tokens.hasNext(); + String preferenceValueString = hasPreference ? tokens.next() : ""; + boolean hasTimestamp = tokens.hasNext(); + String timestampString = hasTimestamp ? tokens.next() : null; long userID = readUserIDFromString(userIDString); long itemID = readItemIDFromString(itemIDString); @@ -548,7 +550,7 @@ public class FileDataModel extends Abstr itemID = tmp; } - if (tokens.length == 3 && preferenceValueString.length() == 0) { + if (hasPreference && !hasTimestamp && preferenceValueString.length() == 0) { // Then line is of form "userID,itemID,", meaning remove FastIDSet itemIDs = data.get(userID); Modified: mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/impl/similarity/LogLikelihoodSimilarityTest.java URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/impl/similarity/LogLikelihoodSimilarityTest.java?rev=1147431&r1=1147430&r2=1147431&view=diff ============================================================================== --- mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/impl/similarity/LogLikelihoodSimilarityTest.java (original) +++ mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/impl/similarity/LogLikelihoodSimilarityTest.java Sat Jul 16 14:19:51 2011 @@ -67,8 +67,8 @@ public final class LogLikelihoodSimilari assertCorrelationEquals(Double.NaN, similarity.itemSimilarity(1, 0)); assertCorrelationEquals(Double.NaN, similarity.itemSimilarity(0, 1)); - assertCorrelationEquals(Double.NaN, similarity.itemSimilarity(2, 3)); - assertCorrelationEquals(Double.NaN, similarity.itemSimilarity(3, 2)); + assertCorrelationEquals(0.0, similarity.itemSimilarity(2, 3)); + assertCorrelationEquals(0.0, similarity.itemSimilarity(3, 2)); } @Test Modified: mahout/trunk/math/src/main/java/org/apache/mahout/math/stats/LogLikelihood.java URL: http://svn.apache.org/viewvc/mahout/trunk/math/src/main/java/org/apache/mahout/math/stats/LogLikelihood.java?rev=1147431&r1=1147430&r2=1147431&view=diff ============================================================================== --- mahout/trunk/math/src/main/java/org/apache/mahout/math/stats/LogLikelihood.java (original) +++ mahout/trunk/math/src/main/java/org/apache/mahout/math/stats/LogLikelihood.java Sat Jul 16 14:19:51 2011 @@ -17,6 +17,7 @@ package org.apache.mahout.math.stats; +import com.google.common.base.Preconditions; import com.google.common.collect.Lists; import com.google.common.collect.Multiset; import com.google.common.collect.Ordering; @@ -48,19 +49,34 @@ public final class LogLikelihood { * @return The entropy value for the elements */ public static double entropy(long... elements) { - double sum = 0.0; + long sum = 0; double result = 0.0; for (long element : elements) { - if (element < 0) { - throw new IllegalArgumentException("Should not have negative count for entropy computation: (" + element + ')'); - } - if (element > 0) { - result += element * Math.log(element); - sum += element; - } + Preconditions.checkArgument(element >= 0); + result += xLogX(element); + sum += element; } - result -= sum * Math.log(sum); - return -result; + return xLogX(sum) - result; + } + + private static double xLogX(long x) { + return x == 0 ? 0.0 : x * Math.log(x); + } + + /** + * Merely an optimization for the common two argument case of {@link #entropy(long...)} + * @see #logLikelihoodRatio(long, long, long, long) + */ + private static double entropy(long a, long b) { + return xLogX(a + b) - xLogX(a) - xLogX(b); + } + + /** + * Merely an optimization for the common four argument case of {@link #entropy(long...)} + * @see #logLikelihoodRatio(long, long, long, long) + */ + private static double entropy(long a, long b, long c, long d) { + return xLogX(a + b + c + d) - xLogX(a) - xLogX(b) - xLogX(c) - xLogX(d); } /** @@ -82,6 +98,7 @@ public final class LogLikelihood { * Credit to http://tdunning.blogspot.com/2008/03/surprise-and-coincidence.html for the table and the descriptions. */ public static double logLikelihoodRatio(long k11, long k12, long k21, long k22) { + Preconditions.checkArgument(k11 >= 0 && k12 >= 0 && k21 >= 0 && k22 >= 0); // note that we have counts here, not probabilities, and that the entropy is not normalized. double rowEntropy = entropy(k11, k12) + entropy(k21, k22); double columnEntropy = entropy(k11, k21) + entropy(k12, k22); Modified: mahout/trunk/math/src/test/java/org/apache/mahout/math/stats/LogLikelihoodTest.java URL: http://svn.apache.org/viewvc/mahout/trunk/math/src/test/java/org/apache/mahout/math/stats/LogLikelihoodTest.java?rev=1147431&r1=1147430&r2=1147431&view=diff ============================================================================== --- mahout/trunk/math/src/test/java/org/apache/mahout/math/stats/LogLikelihoodTest.java (original) +++ mahout/trunk/math/src/test/java/org/apache/mahout/math/stats/LogLikelihoodTest.java Sat Jul 16 14:19:51 2011 @@ -68,7 +68,7 @@ public final class LogLikelihoodTest ext @Test public void testRootNegativeLLR() { - assertEquals(0.0, LogLikelihood.rootLogLikelihoodRatio(6, 7567, 1924, 2426487), 0.00000001); + assertTrue(LogLikelihood.rootLogLikelihoodRatio(6, 7567, 1924, 2426487) > 0.0); } @Test