mahout-commits mailing list archives

From robina...@apache.org
Subject svn commit: r909900 [2/4] - in /lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df: ./ builder/ callback/ data/ data/conditions/ mapred/ mapred/inmem/ mapred/partial/ mapreduce/ mapreduce/inmem/ mapreduce/partial/ node/ ref/ split/ tools/
Date Sat, 13 Feb 2010 20:27:30 GMT
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/mapred/inmem/InMemBuilder.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/mapred/inmem/InMemBuilder.java?rev=909900&r1=909899&r2=909900&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/mapred/inmem/InMemBuilder.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/mapred/inmem/InMemBuilder.java Sat Feb 13 20:27:25 2010
@@ -45,51 +45,51 @@
  * in-memory. The forest trees are splitted across all the mappers
  */
 public class InMemBuilder extends Builder {
-
+  
   public InMemBuilder(TreeBuilder treeBuilder, Path dataPath, Path datasetPath,
-      Long seed, Configuration conf) {
+                      Long seed, Configuration conf) {
     super(treeBuilder, dataPath, datasetPath, seed, conf);
   }
-
+  
   public InMemBuilder(TreeBuilder treeBuilder, Path dataPath, Path datasetPath) {
     this(treeBuilder, dataPath, datasetPath, null, new Configuration());
   }
-
+  
   @Override
   protected void configureJob(JobConf conf, int nbTrees, boolean oobEstimate)
-      throws IOException {
+  throws IOException {
     FileOutputFormat.setOutputPath(conf, getOutputPath(conf));
-
+    
     // put the data in the DistributedCache
     DistributedCache.addCacheFile(getDataPath().toUri(), conf);
-
+    
     conf.setOutputKeyClass(IntWritable.class);
     conf.setOutputValueClass(MapredOutput.class);
-
+    
     conf.setMapperClass(InMemMapper.class);
     conf.setNumReduceTasks(0); // no reducers
-
+    
     conf.setInputFormat(InMemInputFormat.class);
     conf.setOutputFormat(SequenceFileOutputFormat.class);
   }
-
+  
   @Override
   protected DecisionForest parseOutput(JobConf conf, PredictionCallback callback)
-      throws IOException {
+  throws IOException {
     Map<Integer, MapredOutput> output = new HashMap<Integer, MapredOutput>();
-
+    
     Path outputPath = getOutputPath(conf);
     FileSystem fs = outputPath.getFileSystem(conf);
-
+    
     Path[] outfiles = DFUtils.listOutputFiles(fs, outputPath);
-
+    
     // import the InMemOutputs
     IntWritable key = new IntWritable();
     MapredOutput value = new MapredOutput();
-
+    
     for (Path path : outfiles) {
       Reader reader = new Reader(fs, path, conf);
-
+      
       try {
         while (reader.next(key, value)) {
           output.put(key.get(), value.clone());
@@ -98,10 +98,10 @@
         reader.close();
       }
     }
-
-    return processOutput(output, callback);
+    
+    return InMemBuilder.processOutput(output, callback);
   }
-
+  
   /**
    * Process the output, extracting the trees and passing the predictions to the
    * callback
@@ -111,14 +111,14 @@
    * @return
    */
   private static DecisionForest processOutput(Map<Integer, MapredOutput> output,
-      PredictionCallback callback) {
+                                              PredictionCallback callback) {
     List<Node> trees = new ArrayList<Node>();
-
+    
     for (Map.Entry<Integer, MapredOutput> entry : output.entrySet()) {
       MapredOutput value = entry.getValue();
-
+      
       trees.add(value.getTree());
-
+      
       if (callback != null) {
         int[] predictions = value.getPredictions();
         for (int index = 0; index < predictions.length; index++) {
@@ -126,7 +126,7 @@
         }
       }
     }
-
+    
     return new DecisionForest(trees);
   }
 }
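
The core of this class is the processOutput() step near the end of the diff: one MapredOutput per tree id is collected from the job's sequence files, each output's tree is added to the forest, and any out-of-bag predictions are replayed to the PredictionCallback. Below is a minimal, self-contained sketch of that merge step; TreeOutput and the callback wiring (tree id taken from the map key, instance id from the array index) are illustrative stand-ins, not the actual Mahout classes.

    // Illustrative sketch of the merge step in InMemBuilder.processOutput().
    // TreeOutput and PredictionCallback here are stand-ins, not Mahout types.
    import java.util.ArrayList;
    import java.util.LinkedHashMap;
    import java.util.List;
    import java.util.Map;

    public class ForestMergeSketch {

      /** Stand-in for MapredOutput: one grown tree plus its oob predictions. */
      static class TreeOutput {
        final String tree;          // placeholder for a Node
        final int[] predictions;    // oob prediction per training instance

        TreeOutput(String tree, int[] predictions) {
          this.tree = tree;
          this.predictions = predictions;
        }
      }

      interface PredictionCallback {
        void prediction(int treeId, int instanceId, int prediction);
      }

      static List<String> processOutput(Map<Integer, TreeOutput> output, PredictionCallback callback) {
        List<String> trees = new ArrayList<>();
        for (Map.Entry<Integer, TreeOutput> entry : output.entrySet()) {
          TreeOutput value = entry.getValue();
          trees.add(value.tree);                  // the tree always joins the forest
          if (callback != null) {                 // oob predictions are optional
            int[] predictions = value.predictions;
            for (int index = 0; index < predictions.length; index++) {
              callback.prediction(entry.getKey(), index, predictions[index]);
            }
          }
        }
        return trees;
      }

      public static void main(String[] args) {
        Map<Integer, TreeOutput> output = new LinkedHashMap<>();
        output.put(0, new TreeOutput("tree-0", new int[] {1, 0, 1}));
        output.put(1, new TreeOutput("tree-1", new int[] {0, 0, 1}));
        List<String> forest = processOutput(output,
            (treeId, instanceId, p) -> System.out.printf("tree %d, instance %d -> %d%n", treeId, instanceId, p));
        System.out.println("forest size: " + forest.size()); // 2
      }
    }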

Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/mapred/inmem/InMemInputFormat.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/mapred/inmem/InMemInputFormat.java?rev=909900&r1=909899&r2=909900&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/mapred/inmem/InMemInputFormat.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/mapred/inmem/InMemInputFormat.java Sat Feb 13 20:27:25 2010
@@ -42,15 +42,15 @@
  * The number of splits is equal to the number of requested splits
  */
 public class InMemInputFormat implements InputFormat<IntWritable, NullWritable> {
-
+  
   private static final Logger log = LoggerFactory.getLogger(InMemInputSplit.class);
-
+  
   private Random rng;
-
+  
   private Long seed;
-
+  
   private boolean isSingleSeed;
-
+  
   /**
    * Used for DEBUG purposes only. if true and a seed is available, all the
    * mappers use the same seed, thus all the mapper should take the same time to
@@ -62,95 +62,97 @@
   private static boolean isSingleSeed(Configuration conf) {
     return conf.getBoolean("debug.mahout.rf.single.seed", false);
   }
-
+  
   @Override
   public RecordReader<IntWritable, NullWritable> getRecordReader(
-      InputSplit split, JobConf conf, Reporter reporter) throws IOException {
+    InputSplit split, JobConf conf, Reporter reporter) throws IOException {
     return new InMemRecordReader((InMemInputSplit) split);
   }
-
+  
   @Override
   public InputSplit[] getSplits(JobConf conf, int numSplits) throws IOException {
     int nbTrees = Builder.getNbTrees(conf);
     int splitSize = nbTrees / numSplits;
-
+    
     seed = Builder.getRandomSeed(conf);
-    isSingleSeed = isSingleSeed(conf);
-
+    isSingleSeed = InMemInputFormat.isSingleSeed(conf);
+    
     if (rng != null && seed != null) {
-      log.warn("getSplits() was called more than once and the 'seed' is set, "
-          + "this can lead to no-repeatable behavior");
+      InMemInputFormat.log.warn("getSplits() was called more than once and the 'seed' is set, "
+        + "this can lead to no-repeatable behavior");
     }
-
-    rng = (seed == null || isSingleSeed) ? null : RandomUtils.getRandom();
-
+    
+    rng = seed == null || isSingleSeed ? null : RandomUtils.getRandom();
+    
     int id = 0;
-
+    
     InputSplit[] splits = new InputSplit[numSplits];
-
+    
     for (int index = 0; index < numSplits - 1; index++) {
       splits[index] = new InMemInputSplit(id, splitSize, nextSeed());
       id += splitSize;
     }
-
+    
     // take care of the remainder
     splits[numSplits - 1] = new InMemInputSplit(id, nbTrees - id, nextSeed());
-
+    
     return splits;
   }
-
+  
   /**
    * Return the seed for the next InputSplit
    * 
    * @return
    */
   private Long nextSeed() {
-    if (seed == null)
+    if (seed == null) {
       return null;
-    else if (isSingleSeed)
+    } else if (isSingleSeed) {
       return seed;
-    else
+    } else {
       return rng.nextLong();
+    }
   }
-
+  
   public static class InMemRecordReader implements
-      RecordReader<IntWritable, NullWritable> {
-
+  RecordReader<IntWritable, NullWritable> {
+    
     private final InMemInputSplit split;
-
+    
     private int pos;
-
+    
     public InMemRecordReader(InMemInputSplit split) {
       this.split = split;
     }
-
+    
     @Override
     public void close() throws IOException {
     }
-
+    
     @Override
     public IntWritable createKey() {
       return new IntWritable();
     }
-
+    
     @Override
     public NullWritable createValue() {
       return NullWritable.get();
     }
-
+    
     @Override
     public long getPos() throws IOException {
       return pos;
     }
-
+    
     @Override
     public float getProgress() throws IOException {
-      if (pos == 0)
+      if (pos == 0) {
         return 0.0f;
-      else
+      } else {
         return (float) (pos - 1) / split.nbTrees;
+      }
     }
-
+    
     @Override
     public boolean next(IntWritable key, NullWritable value) throws IOException {
       if (pos < split.nbTrees) {
@@ -161,32 +163,32 @@
         return false;
       }
     }
-
+    
   }
-
+  
   /**
    * Custom InputSplit that indicates how many trees are built by each mapper
    */
   public static class InMemInputSplit implements InputSplit {
-
+    
     private static final String[] NO_LOCATIONS = new String[0];
-
+    
     /** Id of the first tree of this split */
     private int firstId;
-
+    
     private int nbTrees;
-
+    
     private Long seed;
-
+    
     public InMemInputSplit() {
     }
-
+    
     public InMemInputSplit(int firstId, int nbTrees, Long seed) {
       this.firstId = firstId;
       this.nbTrees = nbTrees;
       this.seed = seed;
     }
-
+    
     /**
      * Return the Id of the first tree of this split
      * 
@@ -195,7 +197,7 @@
     public int getFirstId() {
       return firstId;
     }
-
+    
     /**
      * Return the number of trees
      * 
@@ -204,7 +206,7 @@
     public int getNbTrees() {
       return nbTrees;
     }
-
+    
     /**
      * Return the random seed
      * 
@@ -213,52 +215,55 @@
     public Long getSeed() {
       return seed;
     }
-
+    
     @Override
     public long getLength() throws IOException {
       return nbTrees;
     }
-
+    
     @Override
     public String[] getLocations() throws IOException {
-      return NO_LOCATIONS;
+      return InMemInputSplit.NO_LOCATIONS;
     }
-
+    
     @Override
     public boolean equals(Object obj) {
-      if (this == obj)
+      if (this == obj) {
         return true;
-      if (obj == null || !(obj instanceof InMemInputSplit))
+      }
+      if (obj == null || !(obj instanceof InMemInputSplit)) {
         return false;
-
+      }
+      
       InMemInputSplit split = (InMemInputSplit) obj;
-
-      if (seed == null && split.seed != null)
+      
+      if (seed == null && split.seed != null) {
         return false;
-
+      }
+      
       return firstId == split.firstId && nbTrees == split.nbTrees
-          && (seed == null || seed.equals(split.seed));
+      && (seed == null || seed.equals(split.seed));
     }
-
+    
     @Override
     public int hashCode() {
       return firstId + nbTrees + (seed == null ? 0 : seed.intValue());
     }
-
+    
     @Override
     public String toString() {
       return String.format("[firstId:%d, nbTrees:%d, seed:%d]", firstId,
-          nbTrees, seed);
+        nbTrees, seed);
     }
-
+    
     @Override
     public void readFields(DataInput in) throws IOException {
       firstId = in.readInt();
       nbTrees = in.readInt();
       boolean isSeed = in.readBoolean();
-      seed = (isSeed) ? in.readLong() : null;
+      seed = isSeed ? in.readLong() : null;
     }
-
+    
     @Override
     public void write(DataOutput out) throws IOException {
       out.writeInt(firstId);
@@ -268,12 +273,12 @@
         out.writeLong(seed);
       }
     }
-
+    
     public static InMemInputSplit read(DataInput in) throws IOException {
       InMemInputSplit split = new InMemInputSplit();
       split.readFields(in);
       return split;
     }
   }
-
+  
 }
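
getSplits() above distributes the requested number of trees over the requested number of splits: each of the first numSplits - 1 splits gets nbTrees / numSplits trees, the last split absorbs the remainder, and each split may carry its own random seed. Here is a standalone sketch of just that arithmetic; the demo class is hypothetical, while the loop mirrors the one in the diff.

    // Sketch of the split arithmetic in InMemInputFormat.getSplits():
    // the first numSplits - 1 splits get nbTrees / numSplits trees each,
    // the last split takes whatever remains.
    public class SplitArithmeticDemo {
      public static void main(String[] args) {
        int nbTrees = 10;
        int numSplits = 3;
        int splitSize = nbTrees / numSplits;   // 3 trees per regular split

        int id = 0;
        for (int index = 0; index < numSplits - 1; index++) {
          System.out.printf("split %d: firstId=%d, nbTrees=%d%n", index, id, splitSize);
          id += splitSize;
        }
        // remainder goes to the last split: firstId=6, nbTrees=4
        System.out.printf("split %d: firstId=%d, nbTrees=%d%n", numSplits - 1, id, nbTrees - id);
      }
    }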

Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/mapred/inmem/InMemMapper.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/mapred/inmem/InMemMapper.java?rev=909900&r1=909899&r2=909900&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/mapred/inmem/InMemMapper.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/mapred/inmem/InMemMapper.java Sat Feb 13 20:27:25 2010
@@ -44,92 +44,92 @@
 import org.slf4j.LoggerFactory;
 
 /**
- * In-memory mapper that grows the trees using a full copy of the data loaded
- * in-memory. The number of trees to grow is determined by the current
- * InMemInputSplit.
+ * In-memory mapper that grows the trees using a full copy of the data loaded in-memory. The number of trees
+ * to grow is determined by the current InMemInputSplit.
  */
 public class InMemMapper extends MapredMapper implements
-    Mapper<IntWritable, NullWritable, IntWritable, MapredOutput> {
-
+    Mapper<IntWritable,NullWritable,IntWritable,MapredOutput> {
+  
   private static final Logger log = LoggerFactory.getLogger(InMemMapper.class);
-
+  
   private Bagging bagging;
-
+  
   private Random rng;
-
+  
   private Data data;
-
+  
   /**
    * Load the training data
    * 
    * @param conf
    * @return
-   * @throws RuntimeException if the data could not be loaded
+   * @throws RuntimeException
+   *           if the data could not be loaded
    */
   private static Data loadData(JobConf conf, Dataset dataset) throws IOException {
     Path dataPath = Builder.getDistributedCacheFile(conf, 1);
     FileSystem fs = FileSystem.get(dataPath.toUri(), conf);
     return DataLoader.loadData(dataset, fs, dataPath);
   }
-
+  
   @Override
   public void configure(JobConf conf) {
     super.configure(conf);
-
-    log.info("Loading the data...");
+    
+    InMemMapper.log.info("Loading the data...");
     try {
-      data = loadData(conf, getDataset());
+      data = InMemMapper.loadData(conf, getDataset());
     } catch (IOException e) {
-    throw new IllegalStateException("Exception caught while loading the data: "
-        + StringUtils.stringifyException(e));
+      throw new IllegalStateException("Exception caught while loading the data: "
+                                      + StringUtils.stringifyException(e));
     }
-    log.info("Data loaded : {} instances", data.size());
-
+    InMemMapper.log.info("Data loaded : {} instances", data.size());
+    
     bagging = new Bagging(getTreeBuilder(), data);
   }
-
+  
   @Override
-  public void map(IntWritable key, NullWritable value,
-      OutputCollector<IntWritable, MapredOutput> output, Reporter reporter)
-      throws IOException {
+  public void map(IntWritable key,
+                  NullWritable value,
+                  OutputCollector<IntWritable,MapredOutput> output,
+                  Reporter reporter) throws IOException {
     map(key, output, (InMemInputSplit) reporter.getInputSplit());
   }
-
-  public void map(IntWritable key,
-      OutputCollector<IntWritable, MapredOutput> output, InMemInputSplit split)
-      throws IOException {
-
+  
+  public void map(IntWritable key, OutputCollector<IntWritable,MapredOutput> output, InMemInputSplit split) throws IOException {
+    
     SingleTreePredictions callback = null;
     int[] predictions = null;
-
+    
     if (isOobEstimate() && !isNoOutput()) {
       callback = new SingleTreePredictions(data.size());
       predictions = callback.getPredictions();
     }
-
+    
     initRandom(split);
-
-    log.debug("Building...");
+    
+    InMemMapper.log.debug("Building...");
     Node tree = bagging.build(key.get(), rng, callback);
-
+    
     if (!isNoOutput()) {
-      log.debug("Outputing...");
+      InMemMapper.log.debug("Outputing...");
       MapredOutput mrOut = new MapredOutput(tree, predictions);
-
+      
       output.collect(key, mrOut);
     }
   }
-
+  
   protected void initRandom(InMemInputSplit split) {
     if (rng == null) { // first execution of this mapper
       Long seed = split.getSeed();
-      log.debug("Initialising rng with seed {}: ", seed);
-
-      if (seed == null)
+      InMemMapper.log.debug("Initialising rng with seed {}: ", seed);
+      
+      if (seed == null) {
         rng = RandomUtils.getRandom();
-      else
+      } else {
         rng = RandomUtils.getRandom(seed);
+      }
     }
   }
-
+  
 }

Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/mapred/partial/PartialBuilder.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/mapred/partial/PartialBuilder.java?rev=909900&r1=909899&r2=909900&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/mapred/partial/PartialBuilder.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/mapred/partial/PartialBuilder.java Sat Feb 13 20:27:25 2010
@@ -42,13 +42,12 @@
 import org.slf4j.LoggerFactory;
 
 /**
- * Builds a random forest using partial data. Each mapper uses only the data
- * given by its InputSplit
+ * Builds a random forest using partial data. Each mapper uses only the data given by its InputSplit
  */
 public class PartialBuilder extends Builder {
-
+  
   private static final Logger log = LoggerFactory.getLogger(PartialBuilder.class);
-
+  
   /**
    * Indicates if we should run the second step of the builder.<br>
    * This parameter is only meant for debuging, so we keep it protected.
@@ -59,100 +58,107 @@
   protected static boolean isStep2(Configuration conf) {
     return conf.getBoolean("debug.mahout.rf.partial.step2", true);
   }
-
+  
   /**
    * Should run the second step of the builder ?
    * 
    * @param conf
-   * @param value true to indicate that the second step will be launched
+   * @param value
+   *          true to indicate that the second step will be launched
    * 
    */
   protected static void setStep2(Configuration conf, boolean value) {
     conf.setBoolean("debug.mahout.rf.partial.step2", value);
   }
-
-  public PartialBuilder(TreeBuilder treeBuilder, Path dataPath,
-      Path datasetPath, Long seed) {
+  
+  public PartialBuilder(TreeBuilder treeBuilder, Path dataPath, Path datasetPath, Long seed) {
     this(treeBuilder, dataPath, datasetPath, seed, new Configuration());
   }
-
-  public PartialBuilder(TreeBuilder treeBuilder, Path dataPath,
-      Path datasetPath, Long seed, Configuration conf) {
+  
+  public PartialBuilder(TreeBuilder treeBuilder,
+                        Path dataPath,
+                        Path datasetPath,
+                        Long seed,
+                        Configuration conf) {
     super(treeBuilder, dataPath, datasetPath, seed, conf);
   }
-
+  
   @Override
-  protected void configureJob(JobConf job, int nbTrees, boolean oobEstimate)
-      throws IOException {
+  protected void configureJob(JobConf job, int nbTrees, boolean oobEstimate) throws IOException {
     FileInputFormat.setInputPaths(job, getDataPath());
     FileOutputFormat.setOutputPath(job, getOutputPath(job));
-
+    
     job.setOutputKeyClass(TreeID.class);
     job.setOutputValueClass(MapredOutput.class);
-
+    
     job.setMapperClass(Step1Mapper.class);
     job.setNumReduceTasks(0); // no reducers
-
+    
     job.setInputFormat(TextInputFormat.class);
     job.setOutputFormat(SequenceFileOutputFormat.class);
-
+    
     // if we are in 'local' mode, correct the number of maps
     // or the mappers won't be able to compute the right indexes
     String tracker = job.get("mapred.job.tracker", "local");
     if ("local".equals(tracker)) {
-      log.warn("Hadoop running in 'local' mode, only one map task will be launched");
+      PartialBuilder.log.warn("Hadoop running in 'local' mode, only one map task will be launched");
       job.setNumMapTasks(1);
     }
   }
-
+  
   @Override
-  protected DecisionForest parseOutput(JobConf job, PredictionCallback callback)
-      throws IOException {
+  protected DecisionForest parseOutput(JobConf job, PredictionCallback callback) throws IOException {
     int numMaps = job.getNumMapTasks();
-    int numTrees = getNbTrees(job);
-
+    int numTrees = Builder.getNbTrees(job);
+    
     Path outputPath = getOutputPath(job);
-
-    log.info("Computing partitions' first ids...");
+    
+    PartialBuilder.log.info("Computing partitions' first ids...");
     Step0Job step0 = new Step0Job(getOutputPath(job), getDataPath(), getDatasetPath());
     Step0Output[] partitions = step0.run(getConf());
-
-    log.info("Processing the output...");
+    
+    PartialBuilder.log.info("Processing the output...");
     TreeID[] keys = new TreeID[numTrees];
     Node[] trees = new Node[numTrees];
     int[] firstIds = Step0Output.extractFirstIds(partitions);
-    processOutput(job, outputPath, firstIds, keys, trees, callback);
-
+    PartialBuilder.processOutput(job, outputPath, firstIds, keys, trees, callback);
+    
     // call the second step in order to complete the oob predictions
-    if (callback != null && numMaps > 1 && isStep2(getConf())) {
-      log.info("*****************************");
-      log.info("Second Step");
-      log.info("*****************************");
+    if ((callback != null) && (numMaps > 1) && PartialBuilder.isStep2(getConf())) {
+      PartialBuilder.log.info("*****************************");
+      PartialBuilder.log.info("Second Step");
+      PartialBuilder.log.info("*****************************");
       Step2Job step2 = new Step2Job(getOutputPath(job), getDataPath(), getDatasetPath(), partitions);
-
+      
       step2.run(job, keys, trees, callback);
     }
-
+    
     return new DecisionForest(Arrays.asList(trees));
   }
-
+  
   /**
    * Processes the output from the output path.<br>
    * 
    * @param job
-   * @param outputPath directory that contains the output of the job
-   * @param firstIds partitions' first ids in hadoop's order
+   * @param outputPath
+   *          directory that contains the output of the job
+   * @param firstIds
+   *          partitions' first ids in hadoop's order
    * @param keys
-   * @param callback can be null
+   * @param callback
+   *          can be null
    * @throws IOException
    */
-  protected static void processOutput(JobConf job, Path outputPath,
-      int[] firstIds, TreeID[] keys, Node[] trees, PredictionCallback callback)
-      throws IOException {
+  protected static void processOutput(JobConf job,
+                                      Path outputPath,
+                                      int[] firstIds,
+                                      TreeID[] keys,
+                                      Node[] trees,
+                                      PredictionCallback callback) throws IOException {
     FileSystem fs = outputPath.getFileSystem(job);
-
+    
     Path[] outfiles = DFUtils.listOutputFiles(fs, outputPath);
-
+    
     // read all the outputs
     TreeID key = new TreeID();
     MapredOutput value = new MapredOutput();
@@ -160,7 +166,7 @@
     int index = 0;
     for (Path path : outfiles) {
       Reader reader = new Reader(fs, path, job);
-
+      
       try {
         while (reader.next(key, value)) {
           if (keys != null) {
@@ -171,7 +177,7 @@
             trees[index] = value.getTree();
           }
           
-          processOutput(firstIds, key, value, callback);
+          PartialBuilder.processOutput(firstIds, key, value, callback);
           
           index++;
         }
@@ -179,30 +185,31 @@
         reader.close();
       }
     }
-
+    
     // make sure we got all the keys/values
     if (index != keys.length) {
       throw new IllegalStateException();
     }
   }
-
+  
   /**
-   * Process the output, extracting the trees and passing the predictions to the
-   * callback
+   * Process the output, extracting the trees and passing the predictions to the callback
    * 
-   * @param firstIds partitions' first ids in hadoop's order
+   * @param firstIds
+   *          partitions' first ids in hadoop's order
    * @param callback
    * @return
    */
-  private static void processOutput(int[] firstIds, TreeID key,
-      MapredOutput value, PredictionCallback callback) {
-
+  private static void processOutput(int[] firstIds,
+                                    TreeID key,
+                                    MapredOutput value,
+                                    PredictionCallback callback) {
+    
     if (callback != null) {
       int[] predictions = value.getPredictions();
-
+      
       for (int instanceId = 0; instanceId < predictions.length; instanceId++) {
-        callback.prediction(key.treeId(), firstIds[key.partition()] + instanceId, 
-            predictions[instanceId]);
+        callback.prediction(key.treeId(), firstIds[key.partition()] + instanceId, predictions[instanceId]);
       }
     }
   }
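
The private processOutput() at the end of this file does the index bookkeeping for the oob callback: every mapper numbers its instances from 0, so a prediction's partition-local index has to be shifted by its partition's first global instance id, i.e. firstIds[key.partition()] + instanceId. A small sketch of that offset, with plain arrays standing in for TreeID and PredictionCallback:

    // Illustrative sketch of the index shift in PartialBuilder.processOutput().
    // firstIds holds, for each map partition, the global id of its first instance.
    public class PartitionOffsetSketch {
      public static void main(String[] args) {
        int[] firstIds = {0, 40, 75};        // e.g. partition sizes 40, 35, 25
        int partition = 1;                   // predictions coming from partition 1
        int[] predictions = {1, 0, 1};       // partition-local instance ids 0, 1, 2

        for (int instanceId = 0; instanceId < predictions.length; instanceId++) {
          int globalId = firstIds[partition] + instanceId;   // 40, 41, 42
          System.out.printf("global instance %d -> prediction %d%n", globalId, predictions[instanceId]);
        }
      }
    }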

Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/mapred/partial/Step0Job.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/mapred/partial/Step0Job.java?rev=909900&r1=909899&r2=909900&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/mapred/partial/Step0Job.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/mapred/partial/Step0Job.java Sat Feb 13 20:27:25 2010
@@ -46,23 +46,24 @@
 import org.apache.mahout.df.DFUtils;
 
 /**
- * preparation step of the partial mapreduce builder. Computes some stats that
- * will be used by the builder.
+ * preparation step of the partial mapreduce builder. Computes some stats that will be used by the builder.
  */
 public class Step0Job {
-
+  
   /** directory that will hold this job's output */
   private final Path outputPath;
-
+  
   /** file that contains the serialized dataset */
   private final Path datasetPath;
-
+  
   /** directory that contains the data used in the first step */
   private final Path dataPath;
-
+  
   /**
-   * @param base base directory
-   * @param dataPath data used in the first step
+   * @param base
+   *          base directory
+   * @param dataPath
+   *          data used in the first step
    * @param datasetPath
    */
   public Step0Job(Path base, Path dataPath, Path datasetPath) {
@@ -70,45 +71,47 @@
     this.dataPath = dataPath;
     this.datasetPath = datasetPath;
   }
-
+  
   /**
    * Computes the partitions' first ids in Hadoop's order
    * 
-   * @param conf configuration
+   * @param conf
+   *          configuration
    * @return first ids for all the partitions
    * @throws IOException
    */
   public Step0Output[] run(Configuration conf) throws IOException {
-
+    
     JobConf job = new JobConf(conf, Step0Job.class);
-
+    
     // check the output
-    if (outputPath.getFileSystem(job).exists(outputPath))
+    if (outputPath.getFileSystem(job).exists(outputPath)) {
       throw new IOException("Output path already exists : " + outputPath);
-
+    }
+    
     // put the dataset into the DistributedCache
     // use setCacheFiles() to overwrite the first-step cache files
-    URI[] files = { datasetPath.toUri() };
+    URI[] files = {datasetPath.toUri()};
     DistributedCache.setCacheFiles(files, job);
-
+    
     FileInputFormat.setInputPaths(job, dataPath);
     FileOutputFormat.setOutputPath(job, outputPath);
-
+    
     job.setOutputKeyClass(IntWritable.class);
     job.setOutputValueClass(Step0Output.class);
-
+    
     job.setMapperClass(Step0Mapper.class);
     job.setNumReduceTasks(0); // no reducers
-
+    
     job.setInputFormat(TextInputFormat.class);
     job.setOutputFormat(SequenceFileOutputFormat.class);
-
+    
     // run the job
     JobClient.runJob(job);
-
+    
     return parseOutput(job);
   }
-
+  
   /**
    * Extracts the output and processes it
    * 
@@ -120,35 +123,35 @@
   protected Step0Output[] parseOutput(JobConf job) throws IOException {
     int numMaps = job.getNumMapTasks();
     FileSystem fs = outputPath.getFileSystem(job);
-
+    
     Path[] outfiles = DFUtils.listOutputFiles(fs, outputPath);
-
+    
     int[] keys = new int[numMaps];
     Step0Output[] values = new Step0Output[numMaps];
-
+    
     // read all the outputs
     IntWritable key = new IntWritable();
     Step0Output value = new Step0Output(0L, 0);
-
+    
     int index = 0;
     for (Path path : outfiles) {
       Reader reader = new Reader(fs, path, job);
-
+      
       try {
         while (reader.next(key, value)) {
           keys[index] = key.get();
           values[index] = value.clone();
-
+          
           index++;
         }
       } finally {
         reader.close();
       }
     }
-
-    return processOutput(keys, values);
+    
+    return Step0Job.processOutput(keys, values);
   }
-
+  
   /**
    * Replaces the first id for each partition in Hadoop's order
    * 
@@ -158,18 +161,18 @@
    */
   protected static Step0Output[] processOutput(int[] keys, Step0Output[] values) {
     int numMaps = values.length;
-
+    
     // sort the values using firstId
     Step0Output[] sorted = Arrays.copyOf(values, numMaps);
     Arrays.sort(sorted);
-
+    
     // compute the partitions firstIds (file order)
     int[] orderedIds = new int[numMaps];
     orderedIds[0] = 0;
     for (int p = 1; p < numMaps; p++) {
       orderedIds[p] = orderedIds[p - 1] + sorted[p - 1].size;
     }
-
+    
     // update the values' first ids
     for (int p = 0; p < numMaps; p++) {
       int order = ArrayUtils.indexOf(sorted, values[p]);
@@ -181,30 +184,29 @@
     for (int p = 0; p < numMaps; p++) {
       reordered[keys[p]] = values[p];
     }
-
+    
     return reordered;
   }
-
+  
   /**
    * Outputs the first key and the size of the partition
    * 
    */
-  static class Step0Mapper extends MapReduceBase implements
-      Mapper<LongWritable, Text, IntWritable, Step0Output> {
-
+  static class Step0Mapper extends MapReduceBase implements Mapper<LongWritable,Text,IntWritable,Step0Output> {
+    
     private int partition;
-
+    
     private int size;
-
+    
     private Long firstId;
-
-    private OutputCollector<IntWritable, Step0Output> collector;
-
+    
+    private OutputCollector<IntWritable,Step0Output> collector;
+    
     @Override
     public void configure(JobConf job) {
       configure(job.getInt("mapred.task.partition", -1));
     }
-
+    
     /**
      * Useful when testing
      * 
@@ -216,76 +218,75 @@
         throw new IllegalArgumentException("Wrong partition id : " + partition);
       }
     }
-
+    
     @Override
-    public void map(LongWritable key, Text value,
-        OutputCollector<IntWritable, Step0Output> output, Reporter reporter)
-        throws IOException {
+    public void map(LongWritable key,
+                    Text value,
+                    OutputCollector<IntWritable,Step0Output> output,
+                    Reporter reporter) throws IOException {
       if (firstId == null) {
         firstId = key.get();
       }
-
+      
       if (collector == null) {
         collector = output;
       }
-
+      
       size++;
     }
-
+    
     @Override
     public void close() throws IOException {
-      collector.collect(new IntWritable(partition), new Step0Output(firstId,
-          size));
+      collector.collect(new IntWritable(partition), new Step0Output(firstId, size));
     }
-
+    
   }
-
+  
   /**
    * Output of the step0's mappers
    * 
    */
-  public static class Step0Output implements Writable,
-      Comparable<Step0Output>, Cloneable {
-
+  public static class Step0Output implements Writable, Comparable<Step0Output>, Cloneable {
+    
     /**
      * first key of the partition<br>
      * used to sort the partition
      */
     private long firstId;
-
+    
     /** number of instances in the partition */
     private int size;
-
+    
     protected Step0Output(long firstId, int size) {
       this.firstId = firstId;
       this.size = size;
     }
-
+    
     protected long getFirstId() {
       return firstId;
     }
-
+    
     protected int getSize() {
       return size;
     }
-
+    
     @Override
     public void readFields(DataInput in) throws IOException {
       firstId = in.readLong();
       size = in.readInt();
     }
-
+    
     @Override
     public void write(DataOutput out) throws IOException {
       out.writeLong(firstId);
       out.writeInt(size);
     }
-
+    
     @Override
     protected Step0Output clone() {
       return new Step0Output(firstId, size);
     }
-
+    
     @Override
     public boolean equals(Object other) {
       if (!(other instanceof Step0Output)) {
@@ -293,39 +294,40 @@
       }
       return firstId == ((Step0Output) other).firstId;
     }
-
+    
     @Override
     public int hashCode() {
       return (int) firstId;
     }
-
+    
     @Override
     public int compareTo(Step0Output obj) {
-      if (firstId < obj.firstId)
+      if (firstId < obj.firstId) {
         return -1;
-      else if (firstId > obj.firstId)
+      } else if (firstId > obj.firstId) {
         return 1;
-      else
+      } else {
         return 0;
+      }
     }
-
+    
     public static int[] extractFirstIds(Step0Output[] partitions) {
       int[] ids = new int[partitions.length];
       
       for (int p = 0; p < partitions.length; p++) {
         ids[p] = (int) partitions[p].firstId;
       }
-
+      
       return ids;
     }
-
+    
     public static int[] extractSizes(Step0Output[] partitions) {
       int[] sizes = new int[partitions.length];
       
       for (int p = 0; p < partitions.length; p++) {
         sizes[p] = partitions[p].size;
       }
-
+      
       return sizes;
     }
   }
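
Step0Job.processOutput() turns the per-partition (first byte offset, size) pairs emitted by Step0Mapper into each partition's first instance id: partitions are sorted by the byte offset of their first record to recover file order, the first instance id of a partition is the running sum of the sizes that precede it in that order, and the results stay addressable by Hadoop partition id. The following self-contained sketch shows the same computation in simplified form; the Part class is a stand-in for Step0Output, not the Mahout type.

    // Illustrative sketch of Step0Job.processOutput(): convert per-partition
    // (first byte offset, size) pairs into each partition's first instance id.
    import java.util.Arrays;
    import java.util.Comparator;

    public class FirstIdSketch {

      static class Part {
        final int partition;     // Hadoop partition id
        final long firstOffset;  // byte offset of the partition's first record
        final int size;          // number of instances in the partition
        int firstInstanceId;     // computed below

        Part(int partition, long firstOffset, int size) {
          this.partition = partition;
          this.firstOffset = firstOffset;
          this.size = size;
        }
      }

      public static void main(String[] args) {
        // partitions reported in Hadoop order, but the file order is 2, 0, 1
        Part[] parts = {new Part(0, 5000L, 30), new Part(1, 9000L, 20), new Part(2, 0L, 50)};

        // sort by byte offset to recover file order, then cumulate the sizes
        Part[] sorted = parts.clone();
        Arrays.sort(sorted, Comparator.comparingLong((Part p) -> p.firstOffset));
        int running = 0;
        for (Part p : sorted) {
          p.firstInstanceId = running;
          running += p.size;
        }

        for (Part p : parts) {
          System.out.printf("partition %d starts at instance %d%n", p.partition, p.firstInstanceId);
        }
        // partition 0 starts at 50, partition 1 at 80, partition 2 at 0
      }
    }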

Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/mapred/partial/Step1Mapper.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/mapred/partial/Step1Mapper.java?rev=909900&r1=909899&r2=909900&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/mapred/partial/Step1Mapper.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/mapred/partial/Step1Mapper.java Sat Feb 13 20:27:25 2010
@@ -43,93 +43,97 @@
 import org.slf4j.LoggerFactory;
 
 /**
- * First step of the Partial Data Builder. Builds the trees using the data
- * available in the InputSplit. Predict the oob classes for each tree in its
- * growing partition (input split).
+ * First step of the Partial Data Builder. Builds the trees using the data available in the InputSplit.
+ * Predict the oob classes for each tree in its growing partition (input split).
  */
-public class Step1Mapper extends MapredMapper implements
-    Mapper<LongWritable, Text, TreeID, MapredOutput> {
-
+public class Step1Mapper extends MapredMapper implements Mapper<LongWritable,Text,TreeID,MapredOutput> {
+  
   private static final Logger log = LoggerFactory.getLogger(Step1Mapper.class);
   
   /** used to convert input values to data instances */
   private DataConverter converter;
-
+  
   private Random rng;
-
+  
   /** number of trees to be built by this mapper */
   private int nbTrees;
-
+  
   /** id of the first tree */
   private int firstTreeId;
-
+  
   /** mapper's partition */
   private int partition;
-
+  
   /** used by close() */
-  private OutputCollector<TreeID, MapredOutput> output;
-
+  private OutputCollector<TreeID,MapredOutput> output;
+  
   /** will contain all instances if this mapper's split */
   private final List<Instance> instances = new ArrayList<Instance>();
-
+  
   public int getFirstTreeId() {
     return firstTreeId;
   }
-
+  
   @Override
   public void configure(JobConf job) {
     super.configure(job);
-
-    configure(Builder.getRandomSeed(job), job.getInt("mapred.task.partition",
-        -1), job.getNumMapTasks(), Builder.getNbTrees(job));
+    
+    configure(Builder.getRandomSeed(job), job.getInt("mapred.task.partition", -1), job.getNumMapTasks(),
+      Builder.getNbTrees(job));
   }
-
+  
   /**
    * Useful when testing
    * 
    * @param seed
-   * @param partition current mapper inputSplit partition
-   * @param numMapTasks number of running map tasks
-   * @param numTrees total number of trees in the forest
+   * @param partition
+   *          current mapper inputSplit partition
+   * @param numMapTasks
+   *          number of running map tasks
+   * @param numTrees
+   *          total number of trees in the forest
    */
-  protected void configure(Long seed, int partition, int numMapTasks,
-      int numTrees) {
+  protected void configure(Long seed, int partition, int numMapTasks, int numTrees) {
     converter = new DataConverter(getDataset());
-
+    
     // prepare random-numders generator
-    log.debug("seed : {}", seed);
-    if (seed == null)
-        rng = RandomUtils.getRandom();
-      else
-        rng = RandomUtils.getRandom(seed);
-
+    Step1Mapper.log.debug("seed : {}", seed);
+    if (seed == null) {
+      rng = RandomUtils.getRandom();
+    } else {
+      rng = RandomUtils.getRandom(seed);
+    }
+    
     // mapper's partition
     if (partition < 0) {
       throw new IllegalArgumentException("Wrong partition ID");
     }
     this.partition = partition;
-
+    
     // compute number of trees to build
-    nbTrees = nbTrees(numMapTasks, numTrees, partition);
-
+    nbTrees = Step1Mapper.nbTrees(numMapTasks, numTrees, partition);
+    
     // compute first tree id
     firstTreeId = 0;
     for (int p = 0; p < partition; p++) {
-      firstTreeId += nbTrees(numMapTasks, numTrees, p);
+      firstTreeId += Step1Mapper.nbTrees(numMapTasks, numTrees, p);
     }
-
-    log.debug("partition : {}", partition);
-    log.debug("nbTrees : {}", nbTrees);
-    log.debug("firstTreeId : {}", firstTreeId);
+    
+    Step1Mapper.log.debug("partition : {}", partition);
+    Step1Mapper.log.debug("nbTrees : {}", nbTrees);
+    Step1Mapper.log.debug("firstTreeId : {}", firstTreeId);
   }
-
+  
   /**
-   * Compute the number of trees for a given partition. The first partition (0)
-   * may be longer than the rest of partition because of the remainder.
+   * Compute the number of trees for a given partition. The first partition (0) may be longer than the rest of
+   * partition because of the remainder.
    * 
-   * @param numMaps total number of maps (partitions)
-   * @param numTrees total number of trees to build
-   * @param partition partition to compute the number of trees for
+   * @param numMaps
+   *          total number of maps (partitions)
+   * @param numTrees
+   *          total number of trees to build
+   * @param partition
+   *          partition to compute the number of trees for
    * @return
    */
   public static int nbTrees(int numMaps, int numTrees, int partition) {
@@ -137,45 +141,43 @@
     if (partition == 0) {
       nbTrees += numTrees - nbTrees * numMaps;
     }
-
+    
     return nbTrees;
   }
-
+  
   @Override
-  public void map(LongWritable key, Text value,
-      OutputCollector<TreeID, MapredOutput> output, Reporter reporter)
-      throws IOException {
+  public void map(LongWritable key, Text value, OutputCollector<TreeID,MapredOutput> output, Reporter reporter) throws IOException {
     if (this.output == null) {
       this.output = output;
     }
-
+    
     instances.add(converter.convert((int) key.get(), value.toString()));
   }
-
+  
   @Override
   public void close() throws IOException {
     // prepare the data
-    log.debug("partition: {} numInstances: {}", partition, instances.size());
+    Step1Mapper.log.debug("partition: {} numInstances: {}", partition, instances.size());
     
     Data data = new Data(getDataset(), instances);
     Bagging bagging = new Bagging(getTreeBuilder(), data);
-
+    
     TreeID key = new TreeID();
-
-    log.debug("Building {} trees", nbTrees);
+    
+    Step1Mapper.log.debug("Building {} trees", nbTrees);
     SingleTreePredictions callback = null;
     int[] predictions = null;
     for (int treeId = 0; treeId < nbTrees; treeId++) {
-      log.debug("Building tree number: {}", treeId);
+      Step1Mapper.log.debug("Building tree number: {}", treeId);
       if (isOobEstimate() && !isNoOutput()) {
         callback = new SingleTreePredictions(data.size());
         predictions = callback.getPredictions();
       }
-
+      
       Node tree = bagging.build(treeId, rng, callback);
-
+      
       key.set(partition, firstTreeId + treeId);
-
+      
       if (!isNoOutput()) {
         MapredOutput emOut = new MapredOutput(tree, predictions);
         output.collect(key, emOut);
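
The partitioning arithmetic in Step1Mapper is compact but important: nbTrees(numMaps, numTrees, partition) gives every partition numTrees / numMaps trees, partition 0 additionally absorbs the remainder, and a partition's first tree id is the sum of the tree counts of all earlier partitions. A short sketch follows; the nbTrees() body mirrors the method in the diff, while the demo class around it is not part of Mahout.

    // Sketch of Step1Mapper's per-partition tree counts and first tree ids.
    public class TreePartitionSketch {

      // same arithmetic as Step1Mapper.nbTrees(): even share, remainder to partition 0
      static int nbTrees(int numMaps, int numTrees, int partition) {
        int nbTrees = numTrees / numMaps;
        if (partition == 0) {
          nbTrees += numTrees - nbTrees * numMaps;
        }
        return nbTrees;
      }

      public static void main(String[] args) {
        int numMaps = 4;
        int numTrees = 10;

        int firstTreeId = 0;
        for (int partition = 0; partition < numMaps; partition++) {
          int count = nbTrees(numMaps, numTrees, partition);
          System.out.printf("partition %d: %d trees, first id %d%n", partition, count, firstTreeId);
          firstTreeId += count;
        }
        // partition 0: 4 trees (2 plus remainder 2), partitions 1-3: 2 trees each
      }
    }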

Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/mapred/partial/Step2Job.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/mapred/partial/Step2Job.java?rev=909900&r1=909899&r2=909900&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/mapred/partial/Step2Job.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/mapred/partial/Step2Job.java Sat Feb 13 20:27:25 2010
@@ -38,31 +38,33 @@
 import org.apache.mahout.df.node.Node;
 
 /**
- * 2nd step of the partial mapreduce builder. Computes the oob predictions using
- * all the trees of the forest
+ * 2nd step of the partial mapreduce builder. Computes the oob predictions using all the trees of the forest
  */
 public class Step2Job {
-
+  
   /** directory that will hold this job's output */
   private final Path outputPath;
-
+  
   /** file that will contains the forest, passed to the maps */
   private final Path forestPath;
-
+  
   /** file that contains the serialized dataset */
   private final Path datasetPath;
-
+  
   /** directory that contains the data used in the first step */
   private final Path dataPath;
-
+  
   /** partitions info in Hadoop's order */
   private final Step0Output[] partitions;
   
   /**
-   * @param base base directory
-   * @param dataPath data used in the first step
+   * @param base
+   *          base directory
+   * @param dataPath
+   *          data used in the first step
    * @param datasetPath
-   * @param partitions partitions' infos in hadoop's order
+   * @param partitions
+   *          partitions' infos in hadoop's order
    */
   public Step2Job(Path base, Path dataPath, Path datasetPath, Step0Output[] partitions) {
     this.outputPath = new Path(base, "step2.output");
@@ -71,61 +73,64 @@
     this.datasetPath = datasetPath;
     this.partitions = partitions;
   }
-
+  
   /**
    * Run the second step.
    * 
-   * @param conf configuration
-   * @param keys keys returned by the first step
-   * @param trees trees returned by the first step
+   * @param conf
+   *          configuration
+   * @param keys
+   *          keys returned by the first step
+   * @param trees
+   *          trees returned by the first step
    * @param callback
    * @throws IOException
    */
-  public void run(Configuration conf, TreeID[] keys, Node[] trees, PredictionCallback callback) 
-      throws IOException {
+  public void run(Configuration conf, TreeID[] keys, Node[] trees, PredictionCallback callback) throws IOException {
     if (callback == null) {
       // no need to launch the job
       return;
     }
-
+    
     int numTrees = keys.length;
-
+    
     JobConf job = new JobConf(conf, Step2Job.class);
-
+    
     // check the output
-    if (outputPath.getFileSystem(job).exists(outputPath))
+    if (outputPath.getFileSystem(job).exists(outputPath)) {
       throw new IOException("Output path already exists : " + outputPath);
-
+    }
+    
     int[] sizes = Step0Output.extractSizes(partitions);
     
     InterResults.store(forestPath.getFileSystem(job), forestPath, keys, trees, sizes);
-
+    
     // needed by the mapper
     Builder.setNbTrees(job, numTrees);
-
+    
     // put the dataset and the forest into the DistributedCache
     // use setCacheFiles() to overwrite the first-step cache files
-    URI[] files = { datasetPath.toUri(), forestPath.toUri() };
+    URI[] files = {datasetPath.toUri(), forestPath.toUri()};
     DistributedCache.setCacheFiles(files, job);
-
+    
     FileInputFormat.setInputPaths(job, dataPath);
     FileOutputFormat.setOutputPath(job, outputPath);
-
+    
     job.setOutputKeyClass(TreeID.class);
     job.setOutputValueClass(MapredOutput.class);
-
+    
     job.setMapperClass(Step2Mapper.class);
     job.setNumReduceTasks(0); // no reducers
-
+    
     job.setInputFormat(TextInputFormat.class);
     job.setOutputFormat(SequenceFileOutputFormat.class);
-
+    
     // run the job
     JobClient.runJob(job);
-
+    
     parseOutput(job, callback);
   }
-
+  
   /**
    * Extracts the output and processes it
    * 
@@ -133,17 +138,16 @@
    * @param callback
    * @throws IOException
    */
-  protected void parseOutput(JobConf job, PredictionCallback callback)
-      throws IOException {
+  protected void parseOutput(JobConf job, PredictionCallback callback) throws IOException {
     int numMaps = job.getNumMapTasks();
     int numTrees = Builder.getNbTrees(job);
-
+    
     // compute the total number of output values
     int total = 0;
     for (int partition = 0; partition < numMaps; partition++) {
       total += Step2Mapper.nbConcerned(numMaps, numTrees, partition);
     }
-
+    
     int[] firstIds = Step0Output.extractFirstIds(partitions);
     PartialBuilder.processOutput(job, outputPath, firstIds, null, null, callback);
   }

Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/mapred/partial/Step2Mapper.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/mapred/partial/Step2Mapper.java?rev=909900&r1=909899&r2=909900&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/mapred/partial/Step2Mapper.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/mapred/partial/Step2Mapper.java Sat Feb 13 20:27:25 2010
@@ -43,31 +43,29 @@
 import org.slf4j.LoggerFactory;
 
 /**
- * Second step of PartialBuilder. Using the trees of the first step, computes
- * the oob predictions for each tree, except those of its own partition, on all
- * instancesof the partition.
+ * Second step of PartialBuilder. Using the trees of the first step, computes the oob predictions for each
+ * tree, except those of its own partition, on all instancesof the partition.
  */
-public class Step2Mapper extends MapReduceBase implements
-    Mapper<LongWritable, Text, TreeID, MapredOutput> {
-
+public class Step2Mapper extends MapReduceBase implements Mapper<LongWritable,Text,TreeID,MapredOutput> {
+  
   private static final Logger log = LoggerFactory.getLogger(Step2Mapper.class);
-
+  
   private TreeID[] keys;
-
+  
   private Node[] trees;
-
+  
   private SingleTreePredictions[] callbacks;
-
+  
   private DataConverter converter;
-
+  
   private int partition = -1;
-
+  
   /** used by close() */
-  private OutputCollector<TreeID, MapredOutput> output;
-
+  private OutputCollector<TreeID,MapredOutput> output;
+  
   /** num treated instances */
   private int instanceId;
-
+  
   @Override
   public void configure(JobConf job) {
     // get the cached files' paths
@@ -77,11 +75,11 @@
     } catch (IOException e) {
       throw new IllegalStateException("Exception while getting the cache files : ", e);
     }
-
-    if (files == null || files.length < 2) {
+    
+    if ((files == null) || (files.length < 2)) {
       throw new IllegalArgumentException("missing paths from the DistributedCache");
     }
-
+    
     Dataset dataset;
     try {
       Path datasetPath = new Path(files[0].getPath());
@@ -89,43 +87,44 @@
     } catch (IOException e) {
       throw new IllegalStateException("Exception while loading the dataset : ", e);
     }
-
+    
     int numMaps = job.getNumMapTasks();
     int p = job.getInt("mapred.task.partition", -1);
-
+    
     // total number of trees in the forest
     int numTrees = Builder.getNbTrees(job);
     if (numTrees == -1) {
       throw new IllegalArgumentException("numTrees not found !");
     }
-
-    int nbConcerned = nbConcerned(numMaps, numTrees, p);
+    
+    int nbConcerned = Step2Mapper.nbConcerned(numMaps, numTrees, p);
     keys = new TreeID[nbConcerned];
     trees = new Node[nbConcerned];
-
+    
     int numInstances;
-
+    
     try {
       Path forestPath = new Path(files[1].getPath());
       FileSystem fs = forestPath.getFileSystem(job);
-      numInstances = InterResults.load(fs, forestPath, numMaps, numTrees,
-          p, keys, trees);
-
-      log.debug("partition: {} numInstances: {}", p, numInstances);
+      numInstances = InterResults.load(fs, forestPath, numMaps, numTrees, p, keys, trees);
+      
+      Step2Mapper.log.debug("partition: {} numInstances: {}", p, numInstances);
     } catch (IOException e) {
       throw new IllegalStateException("Exception while loading the forest : ", e);
     }
-
+    
     configure(p, dataset, keys, trees, numInstances);
   }
-
+  
   /**
-   * Compute the number of trees that need to classify the instances of this
-   * mapper's partition
+   * Compute the number of trees that need to classify the instances of this mapper's partition
    * 
-   * @param numMaps total number of map tasks
-   * @param numTrees total number of trees in the forest
-   * @param partition mapper's partition
+   * @param numMaps
+   *          total number of map tasks
+   * @param numTrees
+   *          total number of trees in the forest
+   * @param partition
+   *          mapper's partition
    * @return
    */
   public static int nbConcerned(int numMaps, int numTrees, int partition) {
@@ -135,67 +134,68 @@
     // the trees of the mapper's partition are not concerned
     return numTrees - Step1Mapper.nbTrees(numMaps, numTrees, partition);
   }
-
+  
   /**
    * Useful for testing. Configures the mapper without using a JobConf<br>
    * TODO we don't need the keys partitions, the tree ids should suffice
    * 
-   * @param partition mapper's partition
+   * @param partition
+   *          mapper's partition
    * @param dataset
-   * @param keys keys returned by the first step
-   * @param trees trees returned by the first step
-   * @param numInstances number of instances in the mapper's partition
+   * @param keys
+   *          keys returned by the first step
+   * @param trees
+   *          trees returned by the first step
+   * @param numInstances
+   *          number of instances in the mapper's partition
    */
-  public void configure(int partition, Dataset dataset, TreeID[] keys,
-      Node[] trees, int numInstances) {
+  public void configure(int partition, Dataset dataset, TreeID[] keys, Node[] trees, int numInstances) {
     this.partition = partition;
     if (partition < 0) {
       throw new IllegalArgumentException("Wrong partition id : " + partition);
     }
-
+    
     converter = new DataConverter(dataset);
-
+    
     if (keys.length != trees.length) {
       throw new IllegalArgumentException("keys.length != trees.length");
     }
     int nbConcerned = keys.length;
-
+    
     this.keys = keys;
     this.trees = trees;
-
+    
     // make sure the trees are not from this partition
     for (TreeID key : keys) {
       if (key.partition() == partition) {
         throw new IllegalArgumentException("a tree from this partition was found !");
       }
     }
-
+    
     // init the callbacks
     callbacks = new SingleTreePredictions[nbConcerned];
     for (int index = 0; index < nbConcerned; index++) {
       callbacks[index] = new SingleTreePredictions(numInstances);
     }
-
+    
   }
-
+  
   @Override
-  public void map(LongWritable key, Text value,
-      OutputCollector<TreeID, MapredOutput> output, Reporter reporter)
-      throws IOException {
+  public void map(LongWritable key, Text value, OutputCollector<TreeID,MapredOutput> output, Reporter reporter) throws IOException {
     if (this.output == null) {
       this.output = output;
     }
-
+    
     Instance instance = converter.convert(instanceId, value.toString());
-
+    
     for (int index = 0; index < keys.length; index++) {
       int prediction = trees[index].classify(instance);
       callbacks[index].prediction(index, instanceId, prediction);
     }
-
+    
     instanceId++;
   }
-
+  
   @Override
   public void close() throws IOException {
     for (int index = 0; index < keys.length; index++) {
@@ -203,5 +203,5 @@
       output.collect(key, new MapredOutput(callbacks[index].getPredictions()));
     }
   }
-
+  
 }
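
Step2Mapper.nbConcerned() decides how many trees take part in the second pass over a partition: all trees grown by other partitions classify this partition's instances, while the partition's own trees are skipped (the configure() code above explicitly rejects them). A small sketch of that count, reusing the nbTrees() arithmetic from Step1Mapper; the demo class itself is hypothetical.

    // Sketch of Step2Mapper.nbConcerned(): trees from *other* partitions
    // classify this partition's instances; its own trees are excluded.
    public class NbConcernedSketch {

      // same arithmetic as Step1Mapper.nbTrees()
      static int nbTrees(int numMaps, int numTrees, int partition) {
        int nbTrees = numTrees / numMaps;
        if (partition == 0) {
          nbTrees += numTrees - nbTrees * numMaps;
        }
        return nbTrees;
      }

      static int nbConcerned(int numMaps, int numTrees, int partition) {
        return numTrees - nbTrees(numMaps, numTrees, partition);
      }

      public static void main(String[] args) {
        int numMaps = 4;
        int numTrees = 10;
        for (int partition = 0; partition < numMaps; partition++) {
          System.out.printf("partition %d is classified by %d foreign trees%n",
              partition, nbConcerned(numMaps, numTrees, partition));
        }
        // partition 0: 6 foreign trees, partitions 1-3: 8 each
      }
    }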

Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/mapreduce/Builder.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/mapreduce/Builder.java?rev=909900&r1=909899&r2=909900&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/mapreduce/Builder.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/mapreduce/Builder.java Sat Feb 13 20:27:25 2010
@@ -17,6 +17,11 @@
 
 package org.apache.mahout.df.mapreduce;
 
+import java.io.IOException;
+import java.net.URI;
+import java.util.Arrays;
+import java.util.Comparator;
+
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.filecache.DistributedCache;
 import org.apache.hadoop.fs.FileSystem;
@@ -31,63 +36,55 @@
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import java.io.IOException;
-import java.net.URI;
-import java.util.Arrays;
-import java.util.Comparator;
-
 /**
- * Base class for Mapred DecisionForest builders. Takes care of storing the
- * parameters common to the mapred implementations.<br>
+ * Base class for Mapred DecisionForest builders. Takes care of storing the parameters common to the mapred
+ * implementations.<br>
  * The child classes must implement at least :
  * <ul>
- * <li> void configureJob(Job) : to further configure the job before its
- * launch; and </li>
- * <li> DecisionForest parseOutput(Job, PredictionCallback) : in order to
- * convert the job outputs into a DecisionForest and its corresponding oob
- * predictions </li>
+ * <li>void configureJob(Job) : to further configure the job before its launch; and</li>
+ * <li>DecisionForest parseOutput(Job, PredictionCallback) : in order to convert the job outputs into a
+ * DecisionForest and its corresponding oob predictions</li>
  * </ul>
  * 
  */
 public abstract class Builder {
-
+  
   private static final Logger log = LoggerFactory.getLogger(Builder.class);
-
+  
   /** Tree Builder Component */
   private final TreeBuilder treeBuilder;
-
+  
   private final Path dataPath;
-
+  
   private final Path datasetPath;
-
+  
   private final Long seed;
-
+  
   private final Configuration conf;
-
+  
   private String outputDirName = "output";
-
+  
   protected TreeBuilder getTreeBuilder() {
     return treeBuilder;
   }
-
+  
   protected Path getDataPath() {
     return dataPath;
   }
-
+  
   protected Path getDatasetPath() {
     return datasetPath;
   }
-
+  
   protected Long getSeed() {
     return seed;
   }
-
-
+  
   /**
-   * Return the value of "mapred.map.tasks". In case the 'local' runner is
-   * detected, returns 1
+   * Return the value of "mapred.map.tasks". In case the 'local' runner is detected, returns 1
    * 
-   * @param conf configuration
+   * @param conf
+   *          configuration
    * @return number of map tasks
    */
   public static int getNumMaps(Configuration conf) {
@@ -95,109 +92,121 @@
     // or the mappers won't be able to compute the right indexes
     String tracker = conf.get("mapred.job.tracker", "local");
     if ("local".equals(tracker)) {
-      log
-          .warn("Hadoop running in 'local' mode, only one map task will be launched");
+      Builder.log.warn("Hadoop running in 'local' mode, only one map task will be launched");
       return 1;
     }
-
+    
     return conf.getInt("mapred.map.tasks", -1);
   }
-
+  
   /**
-   * Used only for DEBUG purposes. if false, the mappers doesn't output
-   * anything, so the builder has nothing to process
+   * Used only for DEBUG purposes. if false, the mappers doesn't output anything, so the builder has nothing
+   * to process
    * 
-   * @param conf configuration
+   * @param conf
+   *          configuration
    * @return true if the builder has to return output. false otherwise
    */
   protected static boolean isOutput(Configuration conf) {
     return conf.getBoolean("debug.mahout.rf.output", true);
   }
-
+  
   protected static boolean isOobEstimate(Configuration conf) {
     return conf.getBoolean("mahout.rf.oob", false);
   }
-
+  
   private static void setOobEstimate(Configuration conf, boolean value) {
     conf.setBoolean("mahout.rf.oob", value);
   }
-
+  
   /**
    * Returns the random seed
    * 
-   * @param conf configuration
+   * @param conf
+   *          configuration
    * @return null if no seed is available
    */
   public static Long getRandomSeed(Configuration conf) {
     String seed = conf.get("mahout.rf.random.seed");
-    if (seed == null)
+    if (seed == null) {
       return null;
-
+    }
+    
     return Long.valueOf(seed);
   }
-
+  
   /**
    * Sets the random seed value
    * 
-   * @param conf configuration
-   * @param seed random seed
+   * @param conf
+   *          configuration
+   * @param seed
+   *          random seed
    */
   private static void setRandomSeed(Configuration conf, long seed) {
     conf.setLong("mahout.rf.random.seed", seed);
   }
-
+  
   public static TreeBuilder getTreeBuilder(Configuration conf) {
     String string = conf.get("mahout.rf.treebuilder");
-    if (string == null)
+    if (string == null) {
       return null;
-
+    }
+    
     return StringUtils.fromString(string);
   }
-
-  private static void setTreeBuilder(Configuration conf,
-      TreeBuilder treeBuilder) {
+  
+  private static void setTreeBuilder(Configuration conf, TreeBuilder treeBuilder) {
     conf.set("mahout.rf.treebuilder", StringUtils.toString(treeBuilder));
   }
-
+  
   /**
    * Get the number of trees for the map-reduce job.
    * 
-   * @param conf configuration
+   * @param conf
+   *          configuration
    * @return number of trees to build
    */
   public static int getNbTrees(Configuration conf) {
     return conf.getInt("mahout.rf.nbtrees", -1);
   }
-
+  
   /**
    * Set the number of trees to grow for the map-reduce job
    * 
-   * @param conf configuration
-   * @param nbTrees number of trees to build
-   * @throws IllegalArgumentException if (nbTrees <= 0)
+   * @param conf
+   *          configuration
+   * @param nbTrees
+   *          number of trees to build
+   * @throws IllegalArgumentException
+   *           if (nbTrees <= 0)
    */
   public static void setNbTrees(Configuration conf, int nbTrees) {
-    if (nbTrees <= 0)
+    if (nbTrees <= 0) {
       throw new IllegalArgumentException("nbTrees should be greater than 0");
-
+    }
+    
     conf.setInt("mahout.rf.nbtrees", nbTrees);
   }
-
+  
   /**
   * Sets the Output directory name, which will be created in the working directory
    * 
-   * @param name output dir. name
+   * @param name
+   *          output dir. name
    */
   public void setOutputDirName(String name) {
     outputDirName = name;
   }
-
+  
   /**
    * Output Directory name
    * 
-   * @param conf configuration
+   * @param conf
+   *          configuration
    * @return output dir. path (%WORKING_DIRECTORY%/OUTPUT_DIR_NAME%)
-   * @throws IOException if we cannot get the default FileSystem
+   * @throws IOException
+   *           if we cannot get the default FileSystem
    */
   public Path getOutputPath(Configuration conf) throws IOException {
     // the output directory is accessed only by this class, so use the default
@@ -205,130 +214,145 @@
     FileSystem fs = FileSystem.get(conf);
     return new Path(fs.getWorkingDirectory(), outputDirName);
   }
-
+  
   /**
    * Helper method. Get a path from the DistributedCache
    * 
-   * @param conf configuration
-   * @param index index of the path in the DistributedCache files
+   * @param conf
+   *          configuration
+   * @param index
+   *          index of the path in the DistributedCache files
    * @return path from the DistributedCache
-   * @throws IOException if no path is found
+   * @throws IOException
+   *           if no path is found
    */
-  public static Path getDistributedCacheFile(Configuration conf, int index)
-      throws IOException {
+  public static Path getDistributedCacheFile(Configuration conf, int index) throws IOException {
     URI[] files = DistributedCache.getCacheFiles(conf);
-
-    if (files == null || files.length <= index) {
+    
+    if ((files == null) || (files.length <= index)) {
       throw new IOException("path not found in the DistributedCache");
     }
-
+    
     return new Path(files[index].getPath());
   }
-
+  
   /**
    * Helper method. Load a Dataset stored in the DistributedCache
    * 
-   * @param conf configuration
+   * @param conf
+   *          configuration
    * @return loaded Dataset
-   * @throws IOException if we cannot retrieve the Dataset path from the DistributedCache, or the Dataset could not be loaded
+   * @throws IOException
+   *           if we cannot retrieve the Dataset path from the DistributedCache, or the Dataset could not be
+   *           loaded
    */
   public static Dataset loadDataset(Configuration conf) throws IOException {
-    Path datasetPath = getDistributedCacheFile(conf, 0);
-
+    Path datasetPath = Builder.getDistributedCacheFile(conf, 0);
+    
     return Dataset.load(conf, datasetPath);
   }
-
-  protected Builder(TreeBuilder treeBuilder, Path dataPath, Path datasetPath,
-      Long seed, Configuration conf) {
+  
+  protected Builder(TreeBuilder treeBuilder, Path dataPath, Path datasetPath, Long seed, Configuration conf) {
     this.treeBuilder = treeBuilder;
     this.dataPath = dataPath;
     this.datasetPath = datasetPath;
     this.seed = seed;
     this.conf = new Configuration(conf);
   }
-
+  
   /**
    * Used by the inheriting classes to configure the job
    * 
-   * @param job Hadoop's Job
-   * @param nbTrees number of trees to grow
-   * @param oobEstimate true, if oob error should be estimated
-   * @throws IOException if anything goes wrong while configuring the job
+   * @param job
+   *          Hadoop's Job
+   * @param nbTrees
+   *          number of trees to grow
+   * @param oobEstimate
+   *          true, if oob error should be estimated
+   * @throws IOException
+   *           if anything goes wrong while configuring the job
    */
-  protected abstract void configureJob(Job job, int nbTrees, boolean oobEstimate)
-      throws IOException;
-
+  protected abstract void configureJob(Job job, int nbTrees, boolean oobEstimate) throws IOException;
+  
   /**
-   * Sequential implementation should override this method to simulate the job
-   * execution
+   * Sequential implementations should override this method to simulate the job execution
    * 
-   * @param job Hadoop's job
+   * @param job
+   *          Hadoop's job
    * @return true if the job succeeded
    */
   protected boolean runJob(Job job) throws ClassNotFoundException, IOException, InterruptedException {
     return job.waitForCompletion(true);
   }
-
+  
   /**
-   * Parse the output files to extract the trees and pass the predictions to the
-   * callback
+   * Parse the output files to extract the trees and pass the predictions to the callback
    * 
-   * @param job Hadoop's job
-   * @param callback can be null
+   * @param job
+   *          Hadoop's job
+   * @param callback
+   *          can be null
    * @return Built DecisionForest
-   * @throws IOException if anything goes wrong while parsing the output
+   * @throws IOException
+   *           if anything goes wrong while parsing the output
    */
-  protected abstract DecisionForest parseOutput(Job job, PredictionCallback callback) throws IOException, ClassNotFoundException, InterruptedException;
-
-  public DecisionForest build(int nbTrees, PredictionCallback callback)
-      throws IOException, ClassNotFoundException, InterruptedException {
-    //int numTrees = getNbTrees(conf);
-
+  protected abstract DecisionForest parseOutput(Job job, PredictionCallback callback) throws IOException,
+                                                                                     ClassNotFoundException,
+                                                                                     InterruptedException;
+  
+  public DecisionForest build(int nbTrees, PredictionCallback callback) throws IOException,
+                                                                       ClassNotFoundException,
+                                                                       InterruptedException {
+    // int numTrees = getNbTrees(conf);
+    
     Path outputPath = getOutputPath(conf);
     FileSystem fs = outputPath.getFileSystem(conf);
-
+    
     // check the output
-    if (fs.exists(outputPath))
+    if (fs.exists(outputPath)) {
       throw new IOException("Output path already exists : " + outputPath);
-
-    if (seed != null)
-      setRandomSeed(conf, seed);
-    setNbTrees(conf, nbTrees);
-    setTreeBuilder(conf, treeBuilder);
-    setOobEstimate(conf, callback != null);
-
+    }
+    
+    if (seed != null) {
+      Builder.setRandomSeed(conf, seed);
+    }
+    Builder.setNbTrees(conf, nbTrees);
+    Builder.setTreeBuilder(conf, treeBuilder);
+    Builder.setOobEstimate(conf, callback != null);
+    
     // put the dataset into the DistributedCache
     DistributedCache.addCacheFile(datasetPath.toUri(), conf);
-
+    
     Job job = new Job(conf, "decision forest builder");
-
-    log.debug("Configuring the job...");
+    
+    Builder.log.debug("Configuring the job...");
     configureJob(job, nbTrees, callback != null);
-
-    log.debug("Running the job...");
+    
+    Builder.log.debug("Running the job...");
     if (!runJob(job)) {
-      log.error("Job failed!");
+      Builder.log.error("Job failed!");
       return null;
     }
-
-    if (isOutput(conf)) {
-      log.debug("Parsing the output...");
+    
+    if (Builder.isOutput(conf)) {
+      Builder.log.debug("Parsing the output...");
       DecisionForest forest = parseOutput(job, callback);
-
+      
       // delete the output path
       fs.delete(outputPath, true);
-
+      
       return forest;
     }
-
+    
     return null;
   }
-
+  
   /**
   * Sort the splits into order based on size, so that the biggest ones go first.<br>
    * This is the same code used by Hadoop's JobClient.
    * 
-   * @param splits input splits
+   * @param splits
+   *          input splits
    */
   public static void sortSplits(InputSplit[] splits) {
     Arrays.sort(splits, new Comparator<InputSplit>() {
@@ -352,5 +376,5 @@
       }
     });
   }
-
+  
 }
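
A minimal driver sketch (not part of this commit) may help illustrate the Builder contract described above: a concrete subclass is constructed with a TreeBuilder and the data/dataset paths, and build() then drives configureJob(), runJob() and parseOutput(). The DefaultTreeBuilder and the two input paths below are assumptions used only for illustration.

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.mahout.df.DecisionForest;
    import org.apache.mahout.df.builder.DefaultTreeBuilder;
    import org.apache.mahout.df.mapreduce.inmem.InMemBuilder;

    public class ForestDriver {
      public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path dataPath = new Path("testdata/train.data");    // placeholder path
        Path datasetPath = new Path("testdata/train.info"); // placeholder path

        // InMemBuilder is one of the concrete Builders modified in this commit
        InMemBuilder builder =
            new InMemBuilder(new DefaultTreeBuilder(), dataPath, datasetPath, 1L, conf);
        builder.setOutputDirName("forest-output");

        // null callback: no oob estimation, build() simply returns the forest
        DecisionForest forest = builder.build(10, null);
        System.out.println(forest != null ? "forest built" : "job failed");
      }
    }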

Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/mapreduce/MapredMapper.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/mapreduce/MapredMapper.java?rev=909900&r1=909899&r2=909900&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/mapreduce/MapredMapper.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/mapreduce/MapredMapper.java Sat Feb 13 20:27:25 2010
@@ -27,17 +27,16 @@
 /**
  * Base class for Mapred mappers. Loads common parameters from the job
  */
-public class MapredMapper<KEYIN, VALUEIN, KEYOUT, VALUEOUT> extends
-    Mapper<KEYIN, VALUEIN, KEYOUT, VALUEOUT> {
-
+public class MapredMapper<KEYIN,VALUEIN,KEYOUT,VALUEOUT> extends Mapper<KEYIN,VALUEIN,KEYOUT,VALUEOUT> {
+  
   private boolean noOutput;
-
+  
   private boolean oobEstimate;
-
+  
   private TreeBuilder treeBuilder;
-
+  
   private Dataset dataset;
-
+  
   /**
    * 
   * @return true if the mapper should estimate and output the oob predictions
@@ -45,7 +44,7 @@
   protected boolean isOobEstimate() {
     return oobEstimate;
   }
-
+  
   /**
    * 
   * @return true if the mapper should not output anything
@@ -53,15 +52,14 @@
   protected boolean isNoOutput() {
     return noOutput;
   }
-
+  
   protected TreeBuilder getTreeBuilder() {
     return treeBuilder;
   }
-
+  
   protected Dataset getDataset() {
     return dataset;
   }
-
   
   @Override
   protected void setup(Context context) throws IOException, InterruptedException {
@@ -69,10 +67,10 @@
     
     Configuration conf = context.getConfiguration();
     
-    configure(!Builder.isOutput(conf), Builder.isOobEstimate(conf), Builder
-          .getTreeBuilder(conf), Builder.loadDataset(conf));
+    configure(!Builder.isOutput(conf), Builder.isOobEstimate(conf), Builder.getTreeBuilder(conf), Builder
+        .loadDataset(conf));
   }
-
+  
   /**
    * Useful for testing
    * 
@@ -81,16 +79,15 @@
    * @param treeBuilder
    * @param dataset
    */
-  protected void configure(boolean noOutput, boolean oobEstimate,
-      TreeBuilder treeBuilder, Dataset dataset) {
+  protected void configure(boolean noOutput, boolean oobEstimate, TreeBuilder treeBuilder, Dataset dataset) {
     this.noOutput = noOutput;
     this.oobEstimate = oobEstimate;
-
+    
     if (treeBuilder == null) {
       throw new IllegalArgumentException("TreeBuilder not found in the Job parameters");
     }
     this.treeBuilder = treeBuilder;
-
+    
     this.dataset = dataset;
   }
 }
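
For context, a sketch of how a concrete mapper built on MapredMapper typically uses the parameters that setup() loads from the job configuration; the key/value types and the empty map() body below are placeholders, not code from this commit.

    import java.io.IOException;
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.mahout.df.mapreduce.MapredMapper;
    import org.apache.mahout.df.mapreduce.MapredOutput;

    public class ExampleMapper extends MapredMapper<LongWritable,Text,IntWritable,MapredOutput> {
      @Override
      protected void map(LongWritable key, Text value, Context context)
          throws IOException, InterruptedException {
        if (isNoOutput()) {
          return; // debug mode: the builder expects no output at all
        }
        // getTreeBuilder() and getDataset() were loaded by MapredMapper.setup();
        // a real mapper would grow one or more trees here and write MapredOutputs
      }
    }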

Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/mapreduce/MapredOutput.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/mapreduce/MapredOutput.java?rev=909900&r1=909899&r2=909900&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/mapreduce/MapredOutput.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/mapreduce/MapredOutput.java Sat Feb 13 20:27:25 2010
@@ -31,87 +31,89 @@
  * Contains a grown tree and its oob predictions.
  */
 public class MapredOutput implements Writable, Cloneable {
-
+  
   private Node tree;
-
+  
   private int[] predictions;
-
+  
   public Node getTree() {
     return tree;
   }
-
+  
   public int[] getPredictions() {
     return predictions;
   }
-
-  public MapredOutput() {
-  }
-
+  
+  public MapredOutput() { }
+  
   public MapredOutput(Node tree, int[] predictions) {
     this.tree = tree;
     this.predictions = predictions;
   }
-
+  
   public MapredOutput(Node tree) {
     this(tree, null);
   }
-
+  
   public MapredOutput(int[] predictions) {
     this(null, predictions);
   }
-
+  
   public static MapredOutput read(DataInput in) throws IOException {
     MapredOutput rfOutput = new MapredOutput();
     rfOutput.readFields(in);
     return rfOutput;
   }
-
+  
   @Override
   public void readFields(DataInput in) throws IOException {
     boolean readTree = in.readBoolean();
     if (readTree) {
       tree = Node.read(in);
     }
-
+    
     boolean readPredictions = in.readBoolean();
     if (readPredictions) {
       predictions = DFUtils.readIntArray(in);
     }
   }
-
+  
   @Override
   public void write(DataOutput out) throws IOException {
     out.writeBoolean(tree != null);
     if (tree != null) {
       tree.write(out);
     }
-
+    
     out.writeBoolean(predictions != null);
     if (predictions != null) {
       DFUtils.writeArray(out, predictions);
     }
   }
-
+  
   @Override
   public MapredOutput clone() {
     return new MapredOutput(tree, predictions);
   }
-
+  
   @Override
   public boolean equals(Object obj) {
-    if (this == obj)
+    if (this == obj) {
       return true;
-    if (obj == null || !(obj instanceof MapredOutput))
+    }
+    if ((obj == null) || !(obj instanceof MapredOutput)) {
       return false;
-
+    }
+    
     MapredOutput mo = (MapredOutput) obj;
-
-    if (tree != null && tree.equals(mo.getTree()) == false)
+    
+    if ((tree != null) && (tree.equals(mo.getTree()) == false)) {
       return false;
-
+    }
+    
     return Arrays.equals(predictions, mo.getPredictions());
   }
-
+  
   @Override
   public int hashCode() {
     int hashCode = tree == null ? 1 : tree.hashCode();
@@ -120,10 +122,10 @@
     }
     return hashCode;
   }
-
+  
   @Override
   public String toString() {
     return "{" + tree + " | " + Arrays.toString(predictions) + '}';
   }
-
+  
 }
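
Since MapredOutput is the Writable exchanged between the mappers and the builder, a small round-trip sketch (not from this commit) may be useful; only the predictions array is populated here, which write()/readFields() above explicitly allow.

    import java.io.ByteArrayInputStream;
    import java.io.ByteArrayOutputStream;
    import java.io.DataInputStream;
    import java.io.DataOutputStream;
    import org.apache.mahout.df.mapreduce.MapredOutput;

    public class MapredOutputRoundTrip {
      public static void main(String[] args) throws Exception {
        MapredOutput original = new MapredOutput(new int[] {0, 1, 1, 0});

        // serialize through the Writable interface
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        original.write(new DataOutputStream(bytes));

        // deserialize with the static read() helper
        MapredOutput copy = MapredOutput.read(
            new DataInputStream(new ByteArrayInputStream(bytes.toByteArray())));

        System.out.println(original.equals(copy)); // expected: true
      }
    }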

Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/mapreduce/inmem/InMemBuilder.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/mapreduce/inmem/InMemBuilder.java?rev=909900&r1=909899&r2=909900&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/mapreduce/inmem/InMemBuilder.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/mapreduce/inmem/InMemBuilder.java Sat Feb 13 20:27:25 2010
@@ -41,62 +41,59 @@
 import org.apache.mahout.df.node.Node;
 
 /**
- * MapReduce implementation where each mapper loads a full copy of the data
- * in-memory. The forest trees are splitted across all the mappers
+ * MapReduce implementation where each mapper loads a full copy of the data in-memory. The forest trees are
+ * split across all the mappers
  */
 public class InMemBuilder extends Builder {
-
-  public InMemBuilder(TreeBuilder treeBuilder, Path dataPath, Path datasetPath,
-      Long seed, Configuration conf) {
+  
+  public InMemBuilder(TreeBuilder treeBuilder, Path dataPath, Path datasetPath, Long seed, Configuration conf) {
     super(treeBuilder, dataPath, datasetPath, seed, conf);
   }
-
+  
   public InMemBuilder(TreeBuilder treeBuilder, Path dataPath, Path datasetPath) {
     this(treeBuilder, dataPath, datasetPath, null, new Configuration());
   }
-
+  
   @Override
-  protected void configureJob(Job job, int nbTrees, boolean oobEstimate)
-      throws IOException {
+  protected void configureJob(Job job, int nbTrees, boolean oobEstimate) throws IOException {
     Configuration conf = job.getConfiguration();
     
     job.setJarByClass(InMemBuilder.class);
     
     FileOutputFormat.setOutputPath(job, getOutputPath(conf));
-
+    
     // put the data in the DistributedCache
     DistributedCache.addCacheFile(getDataPath().toUri(), conf);
-
+    
     job.setOutputKeyClass(IntWritable.class);
     job.setOutputValueClass(MapredOutput.class);
-
+    
     job.setMapperClass(InMemMapper.class);
     job.setNumReduceTasks(0); // no reducers
-
+    
     job.setInputFormatClass(InMemInputFormat.class);
     job.setOutputFormatClass(SequenceFileOutputFormat.class);
-
+    
   }
-
+  
   @Override
-  protected DecisionForest parseOutput(Job job, PredictionCallback callback)
-      throws IOException {
+  protected DecisionForest parseOutput(Job job, PredictionCallback callback) throws IOException {
     Configuration conf = job.getConfiguration();
     
-    Map<Integer, MapredOutput> output = new HashMap<Integer, MapredOutput>();
-
+    Map<Integer,MapredOutput> output = new HashMap<Integer,MapredOutput>();
+    
     Path outputPath = getOutputPath(conf);
     FileSystem fs = outputPath.getFileSystem(conf);
-
+    
     Path[] outfiles = DFUtils.listOutputFiles(fs, outputPath);
-
+    
     // import the InMemOutputs
     IntWritable key = new IntWritable();
     MapredOutput value = new MapredOutput();
-
+    
     for (Path path : outfiles) {
       Reader reader = new Reader(fs, path, conf);
-
+      
       try {
         while (reader.next(key, value)) {
           output.put(key.get(), value.clone());
@@ -105,27 +102,25 @@
         reader.close();
       }
     }
-
-    return processOutput(output, callback);
+    
+    return InMemBuilder.processOutput(output, callback);
   }
-
+  
   /**
-   * Process the output, extracting the trees and passing the predictions to the
-   * callback
+   * Process the output, extracting the trees and passing the predictions to the callback
    * 
    * @param output
    * @param callback
    * @return
    */
-  private static DecisionForest processOutput(Map<Integer, MapredOutput> output,
-      PredictionCallback callback) {
+  private static DecisionForest processOutput(Map<Integer,MapredOutput> output, PredictionCallback callback) {
     List<Node> trees = new ArrayList<Node>();
-
-    for (Map.Entry<Integer, MapredOutput> entry : output.entrySet()) {
+    
+    for (Map.Entry<Integer,MapredOutput> entry : output.entrySet()) {
       MapredOutput value = entry.getValue();
-
+      
       trees.add(value.getTree());
-
+      
       if (callback != null) {
         int[] predictions = value.getPredictions();
         for (int index = 0; index < predictions.length; index++) {
@@ -133,7 +128,7 @@
         }
       }
     }
-
+    
     return new DecisionForest(trees);
   }
 }
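
The trees end up in a SequenceFile keyed by tree id, as configured in configureJob() above. A rough sketch of reading that output by hand, mirroring what parseOutput() does; the part file name is a placeholder for whatever the map-only job actually produced.

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.SequenceFile;
    import org.apache.mahout.df.mapreduce.MapredOutput;

    public class DumpForestOutput {
      public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path part = new Path("forest-output/part-m-00000"); // placeholder path
        FileSystem fs = part.getFileSystem(conf);

        SequenceFile.Reader reader = new SequenceFile.Reader(fs, part, conf);
        try {
          IntWritable key = new IntWritable();     // tree id
          MapredOutput value = new MapredOutput(); // grown tree + oob predictions
          while (reader.next(key, value)) {
            System.out.println("tree " + key.get() + " -> " + value);
          }
        } finally {
          reader.close();
        }
      }
    }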

Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/mapreduce/inmem/InMemInputFormat.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/mapreduce/inmem/InMemInputFormat.java?rev=909900&r1=909899&r2=909900&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/mapreduce/inmem/InMemInputFormat.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/mapreduce/inmem/InMemInputFormat.java Sat Feb 13 20:27:25 2010
@@ -39,26 +39,23 @@
 import org.slf4j.LoggerFactory;
 
 /**
- * Custom InputFormat that generates InputSplits given the desired number of
- * trees.<br>
+ * Custom InputFormat that generates InputSplits given the desired number of trees.<br>
  * Each input split contains a subset of the trees.<br>
  * The number of splits is equal to the number of requested splits
  */
-public class InMemInputFormat extends InputFormat<IntWritable, NullWritable> {
-
-  private static final Logger log = LoggerFactory
-      .getLogger(InMemInputSplit.class);
-
+public class InMemInputFormat extends InputFormat<IntWritable,NullWritable> {
+  
+  private static final Logger log = LoggerFactory.getLogger(InMemInputSplit.class);
+  
   private Random rng;
-
+  
   private Long seed;
-
+  
   private boolean isSingleSeed;
-
+  
   /**
-   * Used for DEBUG purposes only. if true and a seed is available, all the
-   * mappers use the same seed, thus all the mapper should take the same time to
-   * build their trees.
+   * Used for DEBUG purposes only. If true and a seed is available, all the mappers use the same seed, thus
+   * all the mappers should take the same time to build their trees.
    * 
    * @param conf
    * @return
@@ -66,69 +63,68 @@
   private static boolean isSingleSeed(Configuration conf) {
     return conf.getBoolean("debug.mahout.rf.single.seed", false);
   }
-
+  
   @Override
-  public RecordReader<IntWritable, NullWritable> createRecordReader(
-      InputSplit split, TaskAttemptContext context) throws IOException,
-      InterruptedException {
+  public RecordReader<IntWritable,NullWritable> createRecordReader(InputSplit split,
+                                                                   TaskAttemptContext context) throws IOException,
+                                                                                              InterruptedException {
     return new InMemRecordReader((InMemInputSplit) split);
   }
-
+  
   @Override
-  public List<InputSplit> getSplits(JobContext context) throws IOException,
-      InterruptedException {
+  public List<InputSplit> getSplits(JobContext context) throws IOException, InterruptedException {
     Configuration conf = context.getConfiguration();
     int numSplits = conf.getInt("mapred.map.tasks", -1);
-
+    
     return getSplits(conf, numSplits);
   }
-
+  
   public List<InputSplit> getSplits(Configuration conf, int numSplits) {
     int nbTrees = Builder.getNbTrees(conf);
     int splitSize = nbTrees / numSplits;
-
+    
     seed = Builder.getRandomSeed(conf);
-    isSingleSeed = isSingleSeed(conf);
-
-    if (rng != null && seed != null) {
-      log.warn("getSplits() was called more than once and the 'seed' is set, "
-          + "this can lead to no-repeatable behavior");
+    isSingleSeed = InMemInputFormat.isSingleSeed(conf);
+    
+    if ((rng != null) && (seed != null)) {
+      InMemInputFormat.log.warn("getSplits() was called more than once and the 'seed' is set, "
+                                + "this can lead to non-repeatable behavior");
     }
-
-    rng = (seed == null || isSingleSeed) ? null : RandomUtils.getRandom(seed);
-
+    
+    rng = (seed == null) || isSingleSeed ? null : RandomUtils.getRandom(seed);
+    
     int id = 0;
-
+    
     List<InputSplit> splits = new ArrayList<InputSplit>(numSplits);
-
+    
     for (int index = 0; index < numSplits - 1; index++) {
       splits.add(new InMemInputSplit(id, splitSize, nextSeed()));
       id += splitSize;
     }
-
+    
     // take care of the remainder
     splits.add(new InMemInputSplit(id, nbTrees - id, nextSeed()));
-
+    
     return splits;
   }
-
+  
   /**
    * Return the seed for the next InputSplit
    * 
    * @return
    */
   private Long nextSeed() {
-    if (seed == null)
+    if (seed == null) {
       return null;
-    else if (isSingleSeed)
+    } else if (isSingleSeed) {
       return seed;
-    else
+    } else {
       return rng.nextLong();
+    }
   }
-
-  public static class InMemRecordReader extends
-      RecordReader<IntWritable, NullWritable> {
-
+  
+  public static class InMemRecordReader extends RecordReader<IntWritable,NullWritable> {
+    
     private final InMemInputSplit split;
     private int pos;
     private IntWritable key;
@@ -137,31 +133,32 @@
     public InMemRecordReader(InMemInputSplit split) {
       this.split = split;
     }
-
+    
     @Override
     public float getProgress() throws IOException {
-      if (pos == 0)
+      if (pos == 0) {
         return 0.0f;
-      else
+      } else {
         return (float) (pos - 1) / split.nbTrees;
+      }
     }
-
+    
     @Override
     public IntWritable getCurrentKey() throws IOException, InterruptedException {
       return key;
     }
-
+    
     @Override
     public NullWritable getCurrentValue() throws IOException, InterruptedException {
       return value;
     }
-
+    
     @Override
     public void initialize(InputSplit arg0, TaskAttemptContext arg1) throws IOException, InterruptedException {
       key = new IntWritable();
       value = NullWritable.get();
     }
-
+    
     @Override
     public boolean nextKeyValue() throws IOException, InterruptedException {
       if (pos < split.nbTrees) {
@@ -172,36 +169,34 @@
         return false;
       }
     }
-
+    
     @Override
-    public void close() throws IOException {
-    }
-
+    public void close() throws IOException {}
+    
   }
-
+  
   /**
    * Custom InputSplit that indicates how many trees are built by each mapper
    */
   public static class InMemInputSplit extends InputSplit implements Writable {
-
+    
     private static final String[] NO_LOCATIONS = new String[0];
-
+    
     /** Id of the first tree of this split */
     private int firstId;
-
+    
     private int nbTrees;
-
+    
     private Long seed;
-
-    public InMemInputSplit() {
-    }
-
+    
+    public InMemInputSplit() {}
+    
     public InMemInputSplit(int firstId, int nbTrees, Long seed) {
       this.firstId = firstId;
       this.nbTrees = nbTrees;
       this.seed = seed;
     }
-
+    
     /**
      * Return the Id of the first tree of this split
      * 
@@ -210,7 +205,7 @@
     public int getFirstId() {
       return firstId;
     }
-
+    
     /**
      * Return the number of trees
      * 
@@ -219,7 +214,7 @@
     public int getNbTrees() {
       return nbTrees;
     }
-
+    
     /**
      * Return the random seed
      * 
@@ -228,52 +223,54 @@
     public Long getSeed() {
       return seed;
     }
-
+    
     @Override
     public long getLength() throws IOException {
       return nbTrees;
     }
-
+    
     @Override
     public String[] getLocations() throws IOException {
-      return NO_LOCATIONS;
+      return InMemInputSplit.NO_LOCATIONS;
     }
-
+    
     @Override
     public boolean equals(Object obj) {
-      if (this == obj)
+      if (this == obj) {
         return true;
-      if (obj == null || !(obj instanceof InMemInputSplit))
+      }
+      if ((obj == null) || !(obj instanceof InMemInputSplit)) {
         return false;
-
+      }
+      
       InMemInputSplit split = (InMemInputSplit) obj;
-
-      if (seed == null && split.seed != null)
+      
+      if ((seed == null) && (split.seed != null)) {
         return false;
-
-      return firstId == split.firstId && nbTrees == split.nbTrees
-          && (seed == null || seed.equals(split.seed));
+      }
+      
+      return (firstId == split.firstId) && (nbTrees == split.nbTrees)
+             && ((seed == null) || seed.equals(split.seed));
     }
-
+    
     @Override
     public int hashCode() {
       return firstId + nbTrees + (seed == null ? 0 : seed.intValue());
     }
-
+    
     @Override
     public String toString() {
-      return String.format("[firstId:%d, nbTrees:%d, seed:%d]", firstId,
-          nbTrees, seed);
+      return String.format("[firstId:%d, nbTrees:%d, seed:%d]", firstId, nbTrees, seed);
     }
-
+    
     @Override
     public void readFields(DataInput in) throws IOException {
       firstId = in.readInt();
       nbTrees = in.readInt();
       boolean isSeed = in.readBoolean();
-      seed = (isSeed) ? in.readLong() : null;
+      seed = isSeed ? in.readLong() : null;
     }
-
+    
     @Override
     public void write(DataOutput out) throws IOException {
       out.writeInt(firstId);
@@ -283,7 +280,7 @@
         out.writeLong(seed);
       }
     }
-
+    
     public static InMemInputSplit read(DataInput in) throws IOException {
       InMemInputSplit split = new InMemInputSplit();
       split.readFields(in);
@@ -291,5 +288,5 @@
     }
     
   }
-
+  
 }
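
The split arithmetic in getSplits() above (splitSize = nbTrees / numSplits, with the remainder folded into the last split) can be exercised locally without running a job. A sketch, assuming only the configuration keys already shown in Builder; the seed and tree counts are arbitrary.

    import java.util.List;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.mapreduce.InputSplit;
    import org.apache.mahout.df.mapreduce.Builder;
    import org.apache.mahout.df.mapreduce.inmem.InMemInputFormat;

    public class SplitDemo {
      public static void main(String[] args) {
        Configuration conf = new Configuration();
        Builder.setNbTrees(conf, 10);               // 10 trees in total
        conf.setLong("mahout.rf.random.seed", 17L); // same key setRandomSeed() writes

        List<InputSplit> splits = new InMemInputFormat().getSplits(conf, 3);
        for (InputSplit split : splits) {
          // with 10 trees and 3 splits: firstId 0, 3, 6 and nbTrees 3, 3, 4
          System.out.println(split);
        }
      }
    }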

Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/mapreduce/inmem/InMemMapper.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/mapreduce/inmem/InMemMapper.java?rev=909900&r1=909899&r2=909900&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/mapreduce/inmem/InMemMapper.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/mapreduce/inmem/InMemMapper.java Sat Feb 13 20:27:25 2010
@@ -40,87 +40,86 @@
 import org.slf4j.LoggerFactory;
 
 /**
- * In-memory mapper that grows the trees using a full copy of the data loaded
- * in-memory. The number of trees to grow is determined by the current
- * InMemInputSplit.
+ * In-memory mapper that grows the trees using a full copy of the data loaded in-memory. The number of trees
+ * to grow is determined by the current InMemInputSplit.
  */
-public class InMemMapper extends
-    MapredMapper<IntWritable, NullWritable, IntWritable, MapredOutput> {
-
+public class InMemMapper extends MapredMapper<IntWritable,NullWritable,IntWritable,MapredOutput> {
+  
   private static final Logger log = LoggerFactory.getLogger(InMemMapper.class);
-
+  
   private Bagging bagging;
-
+  
   private Random rng;
-
+  
   private Data data;
-
+  
   /**
    * Load the training data
    * 
    * @param conf
    * @return
-   * @throws RuntimeException if the data could not be loaded
+   * @throws RuntimeException
+   *           if the data could not be loaded
    */
   private static Data loadData(Configuration conf, Dataset dataset) throws IOException {
     Path dataPath = Builder.getDistributedCacheFile(conf, 1);
     FileSystem fs = FileSystem.get(dataPath.toUri(), conf);
     return DataLoader.loadData(dataset, fs, dataPath);
   }
-
+  
   @Override
-  protected void setup(Context context) throws IOException,
-      InterruptedException {
+  protected void setup(Context context) throws IOException, InterruptedException {
     super.setup(context);
-
+    
     Configuration conf = context.getConfiguration();
-
-    log.info("Loading the data...");
-    data = loadData(conf, getDataset());
-    log.info("Data loaded : {} instances", data.size());
-
+    
+    InMemMapper.log.info("Loading the data...");
+    data = InMemMapper.loadData(conf, getDataset());
+    InMemMapper.log.info("Data loaded : {} instances", data.size());
+    
     bagging = new Bagging(getTreeBuilder(), data);
   }
-
+  
   @Override
-  protected void map(IntWritable key, NullWritable value, Context context)
-      throws IOException, InterruptedException {
+  protected void map(IntWritable key, NullWritable value, Context context) throws IOException,
+                                                                          InterruptedException {
     map(key, context);
   }
-
+  
   public void map(IntWritable key, Context context) throws IOException, InterruptedException {
-
+    
     SingleTreePredictions callback = null;
     int[] predictions = null;
-
+    
     if (isOobEstimate() && !isNoOutput()) {
       callback = new SingleTreePredictions(data.size());
       predictions = callback.getPredictions();
     }
-
-    initRandom((InMemInputSplit)context.getInputSplit());
-
-    log.debug("Building...");
+    
+    initRandom((InMemInputSplit) context.getInputSplit());
+    
+    InMemMapper.log.debug("Building...");
     Node tree = bagging.build(key.get(), rng, callback);
-
+    
     if (!isNoOutput()) {
-      log.debug("Outputing...");
+      InMemMapper.log.debug("Outputing...");
       MapredOutput mrOut = new MapredOutput(tree, predictions);
-
+      
       context.write(key, mrOut);
     }
   }
-
+  
   protected void initRandom(InMemInputSplit split) {
     if (rng == null) { // first execution of this mapper
       Long seed = split.getSeed();
-      log.debug("Initialising rng with seed : {}", seed);
-
-      if (seed == null)
+      InMemMapper.log.debug("Initialising rng with seed : {}", seed);
+      
+      if (seed == null) {
         rng = RandomUtils.getRandom();
-      else
+      } else {
         rng = RandomUtils.getRandom(seed);
+      }
     }
   }
-
+  
 }
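
Outside Hadoop, the same Bagging call issued by map() above can be used directly. A sketch assuming a Data instance has already been loaded (for example via DataLoader, as in loadData() above), and assuming DefaultTreeBuilder and the package locations of Bagging and SingleTreePredictions as they appear in this source tree.

    import java.util.Random;
    import org.apache.mahout.common.RandomUtils;
    import org.apache.mahout.df.Bagging;
    import org.apache.mahout.df.builder.DefaultTreeBuilder;
    import org.apache.mahout.df.callback.SingleTreePredictions;
    import org.apache.mahout.df.data.Data;
    import org.apache.mahout.df.node.Node;

    public class GrowOneTree {
      static Node grow(Data data) throws Exception {
        Bagging bagging = new Bagging(new DefaultTreeBuilder(), data);
        // collects the out-of-bag predictions of this single tree
        SingleTreePredictions callback = new SingleTreePredictions(data.size());
        Random rng = RandomUtils.getRandom(17L); // arbitrary seed

        Node tree = bagging.build(0, rng, callback); // tree id 0
        int[] oob = callback.getPredictions();
        System.out.println("oob prediction slots: " + oob.length);
        return tree;
      }
    }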


