mahout-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From s..@apache.org
Subject svn commit: r1576590 - in /mahout/trunk: CHANGELOG core/src/main/java/org/apache/mahout/classifier/df/data/DataLoader.java
Date Wed, 12 Mar 2014 06:19:50 GMT
Author: ssc
Date: Wed Mar 12 06:19:49 2014
New Revision: 1576590

URL: http://svn.apache.org/r1576590
Log:
MAHOUT-1448 In Random Forest, the training does not support multiple input files. The input
dataset must be one single file

Modified:
    mahout/trunk/CHANGELOG
    mahout/trunk/core/src/main/java/org/apache/mahout/classifier/df/data/DataLoader.java

Modified: mahout/trunk/CHANGELOG
URL: http://svn.apache.org/viewvc/mahout/trunk/CHANGELOG?rev=1576590&r1=1576589&r2=1576590&view=diff
==============================================================================
--- mahout/trunk/CHANGELOG (original)
+++ mahout/trunk/CHANGELOG Wed Mar 12 06:19:49 2014
@@ -2,6 +2,8 @@ Mahout Change Log
 
 Release 1.0 - unreleased
 
+  MAHOUT-1448: In Random Forest, the training does not support multiple input files. The input dataset must be one single file. (Manoj Awasthi via ssc)
+
   MAHOUT-1447: ImplicitFeedbackAlternatingLeastSquaresSolver tests and features (Adam Ilardi via ssc)
 
   MAHOUT-1438: "quickstart" tutorial for building a simple recommender (Maciej Mazur and Steve Cook via ssc)

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/classifier/df/data/DataLoader.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/df/data/DataLoader.java?rev=1576590&r1=1576589&r2=1576590&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/classifier/df/data/DataLoader.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/classifier/df/data/DataLoader.java Wed Mar 12 06:19:49 2014
@@ -48,9 +48,9 @@ public final class DataLoader {
 
   private static final Logger log = LoggerFactory.getLogger(DataLoader.class);
 
-  private static final Pattern COMMA_SPACE = Pattern.compile("[, ]");
+  private static final Pattern SEPARATORS = Pattern.compile("[, ]");
 
-  private DataLoader() { }
+  private DataLoader() {}
 
   /**
    * Converts a comma-separated String to a Vector.
@@ -63,7 +63,7 @@ public final class DataLoader {
    */
   private static boolean parseString(Attribute[] attrs, Set<String>[] values, CharSequence string,
     boolean regression) {
-    String[] tokens = COMMA_SPACE.split(string);
+    String[] tokens = SEPARATORS.split(string);
     Preconditions.checkArgument(tokens.length == attrs.length,
         "Wrong number of attributes in the string: " + tokens.length + ". Must be: " + attrs.length);
 
@@ -134,9 +134,21 @@ public final class DataLoader {
     return new Data(dataset, instances);
   }
 
-  /**
-   * Loads the data from a String array
-   */
+
+  /** Loads the instances of every path in {@code pathes}, in order, into a single Data. */
+  public static Data loadData(Dataset dataset, FileSystem fs, Path[] pathes) throws IOException {
+    List<Instance> instances = Lists.newArrayList();
+
+    for (Path path : pathes) {
+      Data loadedData = loadData(dataset, fs, path);
+      for (int index = 0; index < loadedData.size(); index++) {
+        instances.add(loadedData.get(index));
+      }
+    }
+    return new Data(dataset, instances);
+  }
+
+  /** Loads the data from a String array */
   public static Data loadData(Dataset dataset, String[] data) {
     List<Instance> instances = Lists.newArrayList();
 



Mime
View raw message