Return-Path: X-Original-To: apmail-mahout-commits-archive@www.apache.org Delivered-To: apmail-mahout-commits-archive@www.apache.org Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by minotaur.apache.org (Postfix) with SMTP id E05991089F for ; Sat, 27 Jul 2013 05:08:24 +0000 (UTC) Received: (qmail 28660 invoked by uid 500); 27 Jul 2013 05:08:23 -0000 Delivered-To: apmail-mahout-commits-archive@mahout.apache.org Received: (qmail 28625 invoked by uid 500); 27 Jul 2013 05:08:18 -0000 Mailing-List: contact commits-help@mahout.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@mahout.apache.org Delivered-To: mailing list commits@mahout.apache.org Received: (qmail 28618 invoked by uid 99); 27 Jul 2013 05:08:16 -0000 Received: from nike.apache.org (HELO nike.apache.org) (192.87.106.230) by apache.org (qpsmtpd/0.29) with ESMTP; Sat, 27 Jul 2013 05:08:16 +0000 X-ASF-Spam-Status: No, hits=-2000.0 required=5.0 tests=ALL_TRUSTED X-Spam-Check-By: apache.org Received: from [140.211.11.4] (HELO eris.apache.org) (140.211.11.4) by apache.org (qpsmtpd/0.29) with ESMTP; Sat, 27 Jul 2013 05:08:13 +0000 Received: from eris.apache.org (localhost [127.0.0.1]) by eris.apache.org (Postfix) with ESMTP id 049D823888CD; Sat, 27 Jul 2013 05:07:51 +0000 (UTC) Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Subject: svn commit: r1507575 - in /mahout/trunk: CHANGELOG core/pom.xml core/src/main/java/org/apache/mahout/classifier/sgd/CsvRecordFactory.java Date: Sat, 27 Jul 2013 05:07:51 -0000 To: commits@mahout.apache.org From: smarthi@apache.org X-Mailer: svnmailer-1.0.9 Message-Id: <20130727050752.049D823888CD@eris.apache.org> X-Virus-Checked: Checked by ClamAV on apache.org Author: smarthi Date: Sat Jul 27 05:07:51 2013 New Revision: 1507575 URL: http://svn.apache.org/r1507575 Log: MAHOUT-1287: classifier.sgd.CsvRecordFactory incorrectly parses CSV format Modified: 
mahout/trunk/CHANGELOG mahout/trunk/core/pom.xml mahout/trunk/core/src/main/java/org/apache/mahout/classifier/sgd/CsvRecordFactory.java Modified: mahout/trunk/CHANGELOG URL: http://svn.apache.org/viewvc/mahout/trunk/CHANGELOG?rev=1507575&r1=1507574&r2=1507575&view=diff ============================================================================== --- mahout/trunk/CHANGELOG (original) +++ mahout/trunk/CHANGELOG Sat Jul 27 05:07:51 2013 @@ -12,6 +12,8 @@ Release 0.9 - unreleased MAHOUT-1290: Issue when running Mahout Recommender Demo (Helder Garay Martins via smarthi) + MAHOUT-1287: classifier.sgd.CsvRecordFactory incorrectly parses CSV format (Alex Franchuk via smarthi) + MAHOUT-1275: Dropped bz2 distribution format for source and binaries (sslavic) Release 0.8 - 2013-07-25 Modified: mahout/trunk/core/pom.xml URL: http://svn.apache.org/viewvc/mahout/trunk/core/pom.xml?rev=1507575&r1=1507574&r2=1507575&view=diff ============================================================================== --- mahout/trunk/core/pom.xml (original) +++ mahout/trunk/core/pom.xml Sat Jul 27 05:07:51 2013 @@ -202,6 +202,12 @@ test + + org.apache.solr + solr-commons-csv + 3.5.0 + + Modified: mahout/trunk/core/src/main/java/org/apache/mahout/classifier/sgd/CsvRecordFactory.java URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/sgd/CsvRecordFactory.java?rev=1507575&r1=1507574&r2=1507575&view=diff ============================================================================== --- mahout/trunk/core/src/main/java/org/apache/mahout/classifier/sgd/CsvRecordFactory.java (original) +++ mahout/trunk/core/src/main/java/org/apache/mahout/classifier/sgd/CsvRecordFactory.java Sat Jul 27 05:07:51 2013 @@ -17,14 +17,14 @@ package org.apache.mahout.classifier.sgd; -import com.google.common.base.CharMatcher; import com.google.common.base.Function; import com.google.common.base.Preconditions; -import com.google.common.base.Splitter; import 
com.google.common.collect.Collections2; import com.google.common.collect.ImmutableMap; import com.google.common.collect.Lists; import com.google.common.collect.Maps; + +import org.apache.commons.csv.CSVUtils; import org.apache.mahout.math.Vector; import org.apache.mahout.vectorizer.encoders.ConstantValueEncoder; import org.apache.mahout.vectorizer.encoders.ContinuousValueEncoder; @@ -33,8 +33,10 @@ import org.apache.mahout.vectorizer.enco import org.apache.mahout.vectorizer.encoders.StaticWordValueEncoder; import org.apache.mahout.vectorizer.encoders.TextValueEncoder; +import java.io.IOException; import java.lang.reflect.Constructor; import java.lang.reflect.InvocationTargetException; +import java.util.Arrays; import java.util.Collections; import java.util.List; import java.util.Map; @@ -68,10 +70,6 @@ import java.util.Set; public class CsvRecordFactory implements RecordFactory { private static final String INTERCEPT_TERM = "Intercept Term"; - // crude CSV value splitter. This will fail if any double quoted strings have - // commas inside. Also, escaped quotes will not be unescaped. Good enough for now. - private static final Splitter COMMA = Splitter.on(',').trimResults(CharMatcher.is('"')); - private static final Map> TYPE_DICTIONARY = ImmutableMap.>builder() .put("continuous", ContinuousValueEncoder.class) @@ -103,6 +101,29 @@ public class CsvRecordFactory implements "Unable to construct type converter... shouldn't be possible"; /** + * Parse a single line of csv-formatted text. + * + * Separated to make changing this functionality for the entire class easier + * in the future. 
+ * @param line - CSV formatted text + * @return List<String> + */ + private List<String> parseCsvLine(String line) { + try { + return Arrays.asList(CSVUtils.parseLine(line)); + } + catch (IOException e) { + List<String> list = Lists.newArrayList(); + list.add(line); + return list; + } + } + + private List<String> parseCsvLine(CharSequence line) { + return parseCsvLine(line.toString()); + } + + /** * Construct a parser for CSV lines that encodes the parsed data in vector form. * @param targetName The name of the target variable. * @param typeMap A map describing the types of the predictor variables. @@ -166,7 +187,7 @@ public class CsvRecordFactory implements public void firstLine(String line) { // read variable names, build map of name -> column final Map<String, Integer> vars = Maps.newHashMap(); - variableNames = Lists.newArrayList(COMMA.split(line)); + variableNames = parseCsvLine(line); int column = 0; for (String var : variableNames) { vars.put(var, column++); @@ -240,7 +261,7 @@ public class CsvRecordFactory implements */ @Override public int processLine(String line, Vector featureVector) { - List<String> values = Lists.newArrayList(COMMA.split(line)); + List<String> values = parseCsvLine(line); int targetValue = targetDictionary.intern(values.get(target)); if (targetValue >= maxTargetValue) { @@ -271,7 +292,7 @@ public class CsvRecordFactory implements * @return The value of the target variable. 
*/ public int processLine(CharSequence line, Vector featureVector, boolean returnTarget) { - List<String> values = Lists.newArrayList(COMMA.split(line)); + List<String> values = parseCsvLine(line); int targetValue = -1; if (returnTarget) { targetValue = targetDictionary.intern(values.get(target)); @@ -293,7 +314,7 @@ public class CsvRecordFactory implements * @return the raw target value in the corresponding column of CSV line */ public String getTargetString(CharSequence line) { - List<String> values = Lists.newArrayList(COMMA.split(line)); + List<String> values = parseCsvLine(line); return values.get(target); } @@ -318,7 +339,7 @@ public class CsvRecordFactory implements * @return the id value of the CSV record */ public String getIdString(CharSequence line) { - List<String> values = Lists.newArrayList(COMMA.split(line)); + List<String> values = parseCsvLine(line); return values.get(id); }