mahout-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From robina...@apache.org
Subject svn commit: r896311 [3/4] - in /lucene/mahout/trunk: core/src/main/java/org/apache/mahout/classifier/ core/src/main/java/org/apache/mahout/classifier/bayes/algorithm/ core/src/main/java/org/apache/mahout/classifier/bayes/common/ core/src/main/java/org/...
Date Wed, 06 Jan 2010 02:46:23 GMT
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/mapreduce/common/BayesJob.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/mapreduce/common/BayesJob.java?rev=896311&r1=896310&r2=896311&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/mapreduce/common/BayesJob.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/mapreduce/common/BayesJob.java Wed Jan  6 02:46:22 2010
@@ -16,22 +16,26 @@
  */
 package org.apache.mahout.classifier.bayes.mapreduce.common;
 
-import org.apache.mahout.classifier.bayes.common.BayesParameters;
-
 import java.io.IOException;
 
+import org.apache.mahout.classifier.bayes.common.BayesParameters;
+
 /**
- * Implementors of this interface provide a way for running bayes training jobs on
- * a hadoop cluster.
+ * Implementors of this interface provide a way for running bayes training jobs
+ * on a hadoop cluster.
  * */
 public interface BayesJob {
-
+  
   /**
    * Execute a classification job on a cluster.
-   * @param input path to training documents.
-   * @param output path to output directory.
+   * 
+   * @param input
+   *          path to training documents.
+   * @param output
+   *          path to output directory.
    * */
-  void runJob(String input, String output, BayesParameters params)
-      throws IOException, ClassNotFoundException, InterruptedException;
-
+  void runJob(String input, String output, BayesParameters params) throws IOException,
+                                                                  ClassNotFoundException,
+                                                                  InterruptedException;
+  
 }

Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/mapreduce/common/BayesTfIdfDriver.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/mapreduce/common/BayesTfIdfDriver.java?rev=896311&r1=896310&r2=896311&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/mapreduce/common/BayesTfIdfDriver.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/mapreduce/common/BayesTfIdfDriver.java Wed Jan  6 02:46:22 2010
@@ -17,6 +17,9 @@
 
 package org.apache.mahout.classifier.bayes.mapreduce.common;
 
+import java.io.IOException;
+import java.util.Map;
+
 import org.apache.hadoop.conf.Configurable;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileSystem;
@@ -39,87 +42,94 @@
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import java.io.IOException;
-import java.util.Map;
-
 /** The Driver which drives the Tf-Idf Generation */
 public class BayesTfIdfDriver implements BayesJob {
-
-  private static final Logger log = LoggerFactory.getLogger(BayesTfIdfDriver.class);
-
+  
+  private static final Logger log = LoggerFactory
+      .getLogger(BayesTfIdfDriver.class);
+  
   /**
    * Run the job
    * 
-   * @param input the input pathname String
-   * @param output the output pathname String
+   * @param input
+   *          the input pathname String
+   * @param output
+   *          the output pathname String
    * @throws ClassNotFoundException
    */
   @Override
   public void runJob(String input, String output, BayesParameters params) throws IOException {
-
+    
     Configurable client = new JobClient();
     JobConf conf = new JobConf(BayesWeightSummerDriver.class);
     conf.setJobName("TfIdf Driver running over input: " + input);
-
+    
     conf.setOutputKeyClass(StringTuple.class);
     conf.setOutputValueClass(DoubleWritable.class);
-
-    FileInputFormat.addInputPath(conf, new Path(output + "/trainer-termDocCount"));
+    
+    FileInputFormat.addInputPath(conf, new Path(output
+                                                + "/trainer-termDocCount"));
     FileInputFormat.addInputPath(conf, new Path(output + "/trainer-wordFreq"));
-    FileInputFormat.addInputPath(conf, new Path(output + "/trainer-featureCount"));
+    FileInputFormat.addInputPath(conf, new Path(output
+                                                + "/trainer-featureCount"));
     Path outPath = new Path(output + "/trainer-tfIdf/");
     FileOutputFormat.setOutputPath(conf, outPath);
-
+    
     // conf.setNumMapTasks(100);
-
+    
     conf.setJarByClass(BayesTfIdfDriver.class);
-
+    
     conf.setMapperClass(BayesTfIdfMapper.class);
     conf.setInputFormat(SequenceFileInputFormat.class);
     conf.setCombinerClass(BayesTfIdfReducer.class);
-
+    
     conf.setReducerClass(BayesTfIdfReducer.class);
-
+    
     conf.setOutputFormat(BayesTfIdfOutputFormat.class);
-
-    conf.set("io.serializations",
-        "org.apache.hadoop.io.serializer.JavaSerialization,org.apache.hadoop.io.serializer.WritableSerialization");
+    
+    conf
+        .set(
+          "io.serializations",
+          "org.apache.hadoop.io.serializer.JavaSerialization,org.apache.hadoop.io.serializer.WritableSerialization");
     // Don't ever forget this. People should keep track of how hadoop conf
     // parameters can make or break a piece of code
-
+    
     FileSystem dfs = FileSystem.get(outPath.toUri(), conf);
     if (dfs.exists(outPath)) {
       dfs.delete(outPath, true);
     }
-
+    
     Path interimFile = new Path(output + "/trainer-docCount/part-*");
-
-    Map<String, Double> labelDocumentCounts = SequenceFileModelReader.readLabelDocumentCounts(dfs, interimFile, conf);
-
-    DefaultStringifier<Map<String, Double>> mapStringifier = new DefaultStringifier<Map<String, Double>>(conf,
-        GenericsUtil.getClass(labelDocumentCounts));
-
-    String labelDocumentCountString = mapStringifier.toString(labelDocumentCounts);
+    
+    Map<String,Double> labelDocumentCounts = SequenceFileModelReader
+        .readLabelDocumentCounts(dfs, interimFile, conf);
+    
+    DefaultStringifier<Map<String,Double>> mapStringifier = new DefaultStringifier<Map<String,Double>>(
+        conf, GenericsUtil.getClass(labelDocumentCounts));
+    
+    String labelDocumentCountString = mapStringifier
+        .toString(labelDocumentCounts);
     log.info("Counts of documents in Each Label");
-    Map<String, Double> c = mapStringifier.fromString(labelDocumentCountString);
+    Map<String,Double> c = mapStringifier.fromString(labelDocumentCountString);
     log.info("{}", c);
-
+    
     conf.set("cnaivebayes.labelDocumentCounts", labelDocumentCountString);
     log.info(params.print());
     if (params.get("dataSource").equals("hbase")) {
       HBaseConfiguration hc = new HBaseConfiguration(new Configuration());
       HTableDescriptor ht = new HTableDescriptor(output);
-      HColumnDescriptor hcd = new HColumnDescriptor(BayesConstants.HBASE_COLUMN_FAMILY+ ':');
+      HColumnDescriptor hcd = new HColumnDescriptor(
+          BayesConstants.HBASE_COLUMN_FAMILY + ':');
       hcd.setBloomfilter(true);
       hcd.setInMemory(true);
       hcd.setMaxVersions(1);
       hcd.setBlockCacheEnabled(true);
       ht.addFamily(hcd);
-
+      
       log.info("{}", "Connecting to hbase...");
       HBaseAdmin hba = new HBaseAdmin(hc);
       log.info("{}", "Creating Table " + output);
-
+      
       if (hba.tableExists(output)) {
         hba.disableTable(output);
         hba.deleteTable(output);
@@ -129,9 +139,9 @@
       conf.set("output.table", output);
     }
     conf.set("bayes.parameters", params.toString());
-
+    
     client.setConf(conf);
-
+    
     JobClient.runJob(conf);
   }
 }

Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/mapreduce/common/BayesTfIdfMapper.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/mapreduce/common/BayesTfIdfMapper.java?rev=896311&r1=896310&r2=896311&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/mapreduce/common/BayesTfIdfMapper.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/mapreduce/common/BayesTfIdfMapper.java Wed Jan  6 02:46:22 2010
@@ -17,6 +17,10 @@
 
 package org.apache.mahout.classifier.bayes.mapreduce.common;
 
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.Map;
+
 import org.apache.hadoop.io.DefaultStringifier;
 import org.apache.hadoop.io.DoubleWritable;
 import org.apache.hadoop.mapred.JobConf;
@@ -29,34 +33,36 @@
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import java.io.IOException;
-import java.util.HashMap;
-import java.util.Map;
-
+/**
+ * Naive Bayes Tfidf Mapper. Calculates per document statistics
+ * 
+ */
 public class BayesTfIdfMapper extends MapReduceBase implements
-    Mapper<StringTuple, DoubleWritable, StringTuple, DoubleWritable> {
-
+    Mapper<StringTuple,DoubleWritable,StringTuple,DoubleWritable> {
+  
   private static final Logger log = LoggerFactory
       .getLogger(BayesTfIdfMapper.class);
-
-  private Map<String, Double> labelDocumentCounts = null;
-
-  private static final StringTuple vocabCount = new StringTuple(
+  
+  private static final StringTuple VOCAB_COUNT = new StringTuple(
       BayesConstants.FEATURE_SET_SIZE);
-
-  private static final DoubleWritable one = new DoubleWritable(1.0);
-
+  
+  private static final DoubleWritable ONE = new DoubleWritable(1.0);
+  
+  private Map<String,Double> labelDocumentCounts;
+  
   /**
    * We need to calculate the Tf-Idf of each feature in each label
    * 
-   * @param key The label,feature pair (can either be the freq Count or the term
-   *        Document count
+   * @param key
+   *          The label,feature pair (can either be the freq Count or the term
+   *          Document count
    */
   @Override
-  public void map(StringTuple key, DoubleWritable value,
-      OutputCollector<StringTuple, DoubleWritable> output, Reporter reporter)
-      throws IOException {
-
+  public void map(StringTuple key,
+                  DoubleWritable value,
+                  OutputCollector<StringTuple,DoubleWritable> output,
+                  Reporter reporter) throws IOException {
+    
     if (key.length() == 3) {
       if (key.stringAt(0).equals(BayesConstants.WEIGHT)) {
         reporter.setStatus("Bayes TfIdf Mapper: Tf: " + key);
@@ -68,32 +74,30 @@
         key.replaceAt(0, BayesConstants.WEIGHT);
         output.collect(key, new DoubleWritable(logIdf));
         reporter.setStatus("Bayes TfIdf Mapper: log(Idf): " + key);
-      } else
-        throw new IllegalArgumentException("Unrecognized Tuple: " + key);
+      } else throw new IllegalArgumentException("Unrecognized Tuple: " + key);
     } else if (key.length() == 2) {
       if (key.stringAt(0).equals(BayesConstants.FEATURE_COUNT)) {
-        output.collect(vocabCount, one);
+        output.collect(VOCAB_COUNT, ONE);
         reporter.setStatus("Bayes TfIdf Mapper: vocabCount");
-      } else
-        throw new IllegalArgumentException("Unexpected Tuple: " + key);
+      } else throw new IllegalArgumentException("Unexpected Tuple: " + key);
     }
-
+    
   }
-
+  
   @Override
   public void configure(JobConf job) {
     try {
       if (labelDocumentCounts == null) {
-        labelDocumentCounts = new HashMap<String, Double>();
-
-        DefaultStringifier<Map<String, Double>> mapStringifier = new DefaultStringifier<Map<String, Double>>(
+        labelDocumentCounts = new HashMap<String,Double>();
+        
+        DefaultStringifier<Map<String,Double>> mapStringifier = new DefaultStringifier<Map<String,Double>>(
             job, GenericsUtil.getClass(labelDocumentCounts));
-
+        
         String labelDocumentCountString = mapStringifier
             .toString(labelDocumentCounts);
         labelDocumentCountString = job.get("cnaivebayes.labelDocumentCounts",
-            labelDocumentCountString);
-
+          labelDocumentCountString);
+        
         labelDocumentCounts = mapStringifier
             .fromString(labelDocumentCountString);
       }
@@ -101,5 +105,5 @@
       log.warn(ex.toString(), ex);
     }
   }
-
+  
 }

Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/mapreduce/common/BayesTfIdfOutputFormat.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/mapreduce/common/BayesTfIdfOutputFormat.java?rev=896311&r1=896310&r2=896311&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/mapreduce/common/BayesTfIdfOutputFormat.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/mapreduce/common/BayesTfIdfOutputFormat.java Wed Jan  6 02:46:22 2010
@@ -17,6 +17,8 @@
 
 package org.apache.mahout.classifier.bayes.mapreduce.common;
 
+import java.io.IOException;
+
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.io.Writable;
 import org.apache.hadoop.io.WritableComparable;
@@ -27,37 +29,36 @@
 import org.apache.hadoop.util.Progressable;
 import org.apache.mahout.common.StringTuple;
 
-import java.io.IOException;
-
 /**
  * This class extends the MultipleOutputFormat, allowing to write the output
  * data to different output files in sequence file output format.
  */
 public class BayesTfIdfOutputFormat extends
-    MultipleOutputFormat<WritableComparable<?>, Writable> {
-
-  private SequenceFileOutputFormat<WritableComparable<?>, Writable> theSequenceFileOutputFormat = null;
-
+    MultipleOutputFormat<WritableComparable<?>,Writable> {
+  
+  private SequenceFileOutputFormat<WritableComparable<?>,Writable> theSequenceFileOutputFormat;
+  
   @Override
-  protected RecordWriter<WritableComparable<?>, Writable> getBaseRecordWriter(
-      FileSystem fs, JobConf job, String name, Progressable arg3)
-      throws IOException {
+  protected RecordWriter<WritableComparable<?>,Writable> getBaseRecordWriter(FileSystem fs,
+                                                                             JobConf job,
+                                                                             String name,
+                                                                             Progressable arg3) throws IOException {
     if (theSequenceFileOutputFormat == null) {
-      theSequenceFileOutputFormat = new SequenceFileOutputFormat<WritableComparable<?>, Writable>();
+      theSequenceFileOutputFormat = new SequenceFileOutputFormat<WritableComparable<?>,Writable>();
     }
     return theSequenceFileOutputFormat.getRecordWriter(fs, job, name, arg3);
   }
-
+  
   @Override
   protected String generateFileNameForKeyValue(WritableComparable<?> k,
-      Writable v, String name) {
+                                               Writable v,
+                                               String name) {
     StringTuple key = (StringTuple) k;
-
+    
     if (key.length() == 1
-        && key.stringAt(0).equals(BayesConstants.FEATURE_SET_SIZE))
+        && key.stringAt(0).equals(BayesConstants.FEATURE_SET_SIZE)) {
       return "trainer-vocabCount/" + name;
-    else
-      return "trainer-tfIdf/" + name;
+    } else return "trainer-tfIdf/" + name;
   }
-
+  
 }

Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/mapreduce/common/BayesTfIdfReducer.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/mapreduce/common/BayesTfIdfReducer.java?rev=896311&r1=896310&r2=896311&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/mapreduce/common/BayesTfIdfReducer.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/mapreduce/common/BayesTfIdfReducer.java Wed Jan  6 02:46:22 2010
@@ -17,6 +17,9 @@
 
 package org.apache.mahout.classifier.bayes.mapreduce.common;
 
+import java.io.IOException;
+import java.util.Iterator;
+
 import org.apache.hadoop.hbase.HBaseConfiguration;
 import org.apache.hadoop.hbase.client.HTable;
 import org.apache.hadoop.hbase.client.Put;
@@ -32,37 +35,36 @@
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import java.io.IOException;
-import java.util.Iterator;
-
 /**
  * Can also be used as a local Combiner because only two values should be there
  * inside the values
  */
 public class BayesTfIdfReducer extends MapReduceBase implements
-    Reducer<StringTuple, DoubleWritable, StringTuple, DoubleWritable> {
-
-  private static final Logger log = LoggerFactory.getLogger(BayesTfIdfReducer.class);
-
+    Reducer<StringTuple,DoubleWritable,StringTuple,DoubleWritable> {
+  
+  private static final Logger log = LoggerFactory
+      .getLogger(BayesTfIdfReducer.class);
+  
   private HTable table;
   
-  private boolean useHbase = false;
-
+  private boolean useHbase;
+  
   @Override
-  public void reduce(StringTuple key, Iterator<DoubleWritable> values,
-      OutputCollector<StringTuple, DoubleWritable> output, Reporter reporter)
-      throws IOException {
+  public void reduce(StringTuple key,
+                     Iterator<DoubleWritable> values,
+                     OutputCollector<StringTuple,DoubleWritable> output,
+                     Reporter reporter) throws IOException {
     // Key is label,word, value is the number of times we've seen this label
     // word per local node. Output is the same
-
+    
     if (key.stringAt(0).equals(BayesConstants.FEATURE_SET_SIZE)) {
       double vocabCount = 0.0;
-
+      
       while (values.hasNext()) {
         reporter.setStatus("Bayes TfIdf Reducer: vocabCount " + vocabCount);
         vocabCount += values.next().get();
       }
-
+      
       log.info("{}\t{}", key, vocabCount);
       if (useHbase) {
         Put bu = new Put(Bytes.toBytes(BayesConstants.HBASE_COUNTS_ROW));
@@ -73,10 +75,10 @@
       }
       output.collect(key, new DoubleWritable(vocabCount));
     } else if (key.stringAt(0).equals(BayesConstants.WEIGHT)) {
-      double idfTimes_D_ij = 1.0;
+      double idfTimesDIJ = 1.0;
       int numberofValues = 0;
       while (values.hasNext()) {
-        idfTimes_D_ij *= values.next().get();
+        idfTimesDIJ *= values.next().get();
         numberofValues++;
       }
       if (numberofValues == 2) { // Found TFIdf
@@ -84,39 +86,38 @@
         String feature = key.stringAt(2);
         if (useHbase) {
           Put bu = new Put(Bytes.toBytes(feature));
-          bu.add(Bytes.toBytes(BayesConstants.HBASE_COLUMN_FAMILY),
-                 Bytes.toBytes(label), Bytes.toBytes(idfTimes_D_ij));
+          bu.add(Bytes.toBytes(BayesConstants.HBASE_COLUMN_FAMILY), Bytes
+              .toBytes(label), Bytes.toBytes(idfTimesDIJ));
           table.put(bu);
         }
-
+        
       }
-      reporter.setStatus("Bayes TfIdf Reducer: " + key + " => " + idfTimes_D_ij);
-      output.collect(key, new DoubleWritable(idfTimes_D_ij));
+      reporter
+          .setStatus("Bayes TfIdf Reducer: " + key + " => " + idfTimesDIJ);
+      output.collect(key, new DoubleWritable(idfTimesDIJ));
     } else {
       throw new IllegalArgumentException("Unexpected StringTuple: " + key);
     }
   }
-
+  
   @Override
   public void configure(JobConf job) {
     try {
       Parameters params = Parameters
           .fromString(job.get("bayes.parameters", ""));
-      if (params.get("dataSource").equals("hbase"))
-        useHbase = true;
-      else
-        return;
-
-      HBaseConfiguration HBconf = new HBaseConfiguration(job);
-
-      table = new HTable(HBconf, job.get("output.table"));
-
+      if (params.get("dataSource").equals("hbase")) useHbase = true;
+      else return;
+      
+      HBaseConfiguration hBconf = new HBaseConfiguration(job);
+      
+      table = new HTable(hBconf, job.get("output.table"));
+      
     } catch (IOException e) {
       log.error("Unexpected error during configuration", e);
     }
-
+    
   }
-
+  
   @Override
   public void close() throws IOException {
     if (useHbase) {

Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/mapreduce/common/BayesWeightSummerDriver.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/mapreduce/common/BayesWeightSummerDriver.java?rev=896311&r1=896310&r2=896311&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/mapreduce/common/BayesWeightSummerDriver.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/mapreduce/common/BayesWeightSummerDriver.java Wed Jan  6 02:46:22 2010
@@ -17,6 +17,8 @@
 
 package org.apache.mahout.classifier.bayes.mapreduce.common;
 
+import java.io.IOException;
+
 import org.apache.hadoop.conf.Configurable;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
@@ -29,34 +31,35 @@
 import org.apache.mahout.classifier.bayes.common.BayesParameters;
 import org.apache.mahout.common.StringTuple;
 
-import java.io.IOException;
-
 /** Create and run the Bayes Trainer. */
 public class BayesWeightSummerDriver implements BayesJob {
-
+  
   /**
    * Run the job
-   *
-   * @param input  the input pathname String
-   * @param output the output pathname String
+   * 
+   * @param input
+   *          the input pathname String
+   * @param output
+   *          the output pathname String
    */
   @Override
   public void runJob(String input, String output, BayesParameters params) throws IOException {
     Configurable client = new JobClient();
     JobConf conf = new JobConf(BayesWeightSummerDriver.class);
-    conf.setJobName("Bayes Weight Summer Driver running over input: " +  input);
-
-
+    conf.setJobName("Bayes Weight Summer Driver running over input: " + input);
+    
     conf.setOutputKeyClass(StringTuple.class);
     conf.setOutputValueClass(DoubleWritable.class);
-
-    FileInputFormat.addInputPath(conf, new Path(output + "/trainer-tfIdf/trainer-tfIdf"));
+    
+    FileInputFormat.addInputPath(conf, new Path(
+        output + "/trainer-tfIdf/trainer-tfIdf"));
     Path outPath = new Path(output + "/trainer-weights");
     FileOutputFormat.setOutputPath(conf, outPath);
-    //conf.setNumReduceTasks(1);
-    //conf.setNumMapTasks(100);
+    // conf.setNumReduceTasks(1);
+    // conf.setNumMapTasks(100);
     conf.setMapperClass(BayesWeightSummerMapper.class);
-    //see the javadoc for the spec for file input formats: first token is key, rest is input.  Whole document on one line
+    // see the javadoc for the spec for file input formats: first token is key,
+    // rest is input. Whole document on one line
     conf.setInputFormat(SequenceFileInputFormat.class);
     conf.setCombinerClass(BayesWeightSummerReducer.class);
     conf.setReducerClass(BayesWeightSummerReducer.class);
@@ -66,11 +69,11 @@
       dfs.delete(outPath, true);
     }
     conf.set("bayes.parameters", params.toString());
-
+    
     conf.set("output.table", output);
-
+    
     client.setConf(conf);
-
+    
     JobClient.runJob(conf);
   }
 }

Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/mapreduce/common/BayesWeightSummerMapper.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/mapreduce/common/BayesWeightSummerMapper.java?rev=896311&r1=896310&r2=896311&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/mapreduce/common/BayesWeightSummerMapper.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/mapreduce/common/BayesWeightSummerMapper.java Wed Jan  6 02:46:22 2010
@@ -17,6 +17,8 @@
 
 package org.apache.mahout.classifier.bayes.mapreduce.common;
 
+import java.io.IOException;
+
 import org.apache.hadoop.io.DoubleWritable;
 import org.apache.hadoop.mapred.MapReduceBase;
 import org.apache.hadoop.mapred.Mapper;
@@ -24,34 +26,39 @@
 import org.apache.hadoop.mapred.Reporter;
 import org.apache.mahout.common.StringTuple;
 
-import java.io.IOException;
-
+/**
+ * 
+ * Calculates the weight sum for a unique label and feature
+ * 
+ */
 public class BayesWeightSummerMapper extends MapReduceBase implements
-    Mapper<StringTuple, DoubleWritable, StringTuple, DoubleWritable> {
-
+    Mapper<StringTuple,DoubleWritable,StringTuple,DoubleWritable> {
+  
   /**
    * We need to calculate the weight sums across each label and each feature
    * 
-   * @param key The label,feature tuple containing the tfidf value
+   * @param key
+   *          The label,feature tuple containing the tfidf value
    */
   @Override
-  public void map(StringTuple key, DoubleWritable value,
-      OutputCollector<StringTuple, DoubleWritable> output, Reporter reporter)
-      throws IOException {
+  public void map(StringTuple key,
+                  DoubleWritable value,
+                  OutputCollector<StringTuple,DoubleWritable> output,
+                  Reporter reporter) throws IOException {
     String label = key.stringAt(1);
     String feature = key.stringAt(2);
     reporter.setStatus("Bayes Weight Summer Mapper: " + key);
     StringTuple featureSum = new StringTuple(BayesConstants.FEATURE_SUM);
     featureSum.add(feature);
-    output.collect(featureSum, value);// sum of weight for all labels for a
-                                      // feature Sigma_j
+    output.collect(featureSum, value); // sum of weight for all labels for a
+                                       // feature Sigma_j
     StringTuple labelSum = new StringTuple(BayesConstants.LABEL_SUM);
     labelSum.add(label);
-    output.collect(labelSum, value);// sum of weight for all features for a
-                                    // label Sigma_k
+    output.collect(labelSum, value); // sum of weight for all features for a
+                                     // label Sigma_k
     StringTuple totalSum = new StringTuple(BayesConstants.TOTAL_SUM);
-    output.collect(totalSum, value);// sum of weight of all features for all
-                                    // label Sigma_kSigma_j
-
+    output.collect(totalSum, value); // sum of weight of all features for all
+                                     // label Sigma_kSigma_j
+    
   }
 }

Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/mapreduce/common/BayesWeightSummerOutputFormat.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/mapreduce/common/BayesWeightSummerOutputFormat.java?rev=896311&r1=896310&r2=896311&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/mapreduce/common/BayesWeightSummerOutputFormat.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/mapreduce/common/BayesWeightSummerOutputFormat.java Wed Jan  6 02:46:22 2010
@@ -17,6 +17,8 @@
 
 package org.apache.mahout.classifier.bayes.mapreduce.common;
 
+import java.io.IOException;
+
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.io.Writable;
 import org.apache.hadoop.io.WritableComparable;
@@ -27,45 +29,41 @@
 import org.apache.hadoop.util.Progressable;
 import org.apache.mahout.common.StringTuple;
 
-import java.io.IOException;
-
 /**
- * This class extends the MultipleOutputFormat, allowing to write the output data to different output files in sequence
- * file output format.
+ * This class extends the MultipleOutputFormat, allowing to write the output
+ * data to different output files in sequence file output format.
  */
-public class BayesWeightSummerOutputFormat extends MultipleOutputFormat<WritableComparable<?>, Writable> {
-
-  private SequenceFileOutputFormat<WritableComparable<?>, Writable> theSequenceFileOutputFormat = null;
-
+public class BayesWeightSummerOutputFormat extends
+    MultipleOutputFormat<WritableComparable<?>,Writable> {
+  
+  private SequenceFileOutputFormat<WritableComparable<?>,Writable> theSequenceFileOutputFormat;
+  
   @Override
-  protected RecordWriter<WritableComparable<?>, Writable> getBaseRecordWriter(
-      FileSystem fs, JobConf job, String name, Progressable arg3)
-      throws IOException {
+  protected RecordWriter<WritableComparable<?>,Writable> getBaseRecordWriter(FileSystem fs,
+                                                                             JobConf job,
+                                                                             String name,
+                                                                             Progressable arg3) throws IOException {
     if (theSequenceFileOutputFormat == null) {
-      theSequenceFileOutputFormat = new SequenceFileOutputFormat<WritableComparable<?>, Writable>();
+      theSequenceFileOutputFormat = new SequenceFileOutputFormat<WritableComparable<?>,Writable>();
     }
     return theSequenceFileOutputFormat.getRecordWriter(fs, job, name, arg3);
   }
-
+  
   @Override
-  protected String generateFileNameForKeyValue(WritableComparable<?> k, Writable v,
+  protected String generateFileNameForKeyValue(WritableComparable<?> k,
+                                               Writable v,
                                                String name) {
     StringTuple key = (StringTuple) k;
-
-    if(key.length() == 1 && key.stringAt(0).equals(BayesConstants.TOTAL_SUM))
-    {
+    
+    if (key.length() == 1 && key.stringAt(0).equals(BayesConstants.TOTAL_SUM)) {
       return "Sigma_kSigma_j/" + name;
-    }
-    else{
-      if(key.stringAt(0).equals(BayesConstants.FEATURE_SUM))
-      {
+    } else {
+      if (key.stringAt(0).equals(BayesConstants.FEATURE_SUM)) {
         return "Sigma_j/" + name;
-      }
-      else if(key.stringAt(0).equals(BayesConstants.LABEL_SUM))
-        return "Sigma_k/" + name;
-      else
-        throw new IllegalArgumentException("Unexpected StringTuple: " + key);
+      } else if (key.stringAt(0).equals(BayesConstants.LABEL_SUM)) return "Sigma_k/"
+                                                                          + name;
+      else throw new IllegalArgumentException("Unexpected StringTuple: " + key);
     }
   }
-
+  
 }

Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/mapreduce/common/BayesWeightSummerReducer.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/mapreduce/common/BayesWeightSummerReducer.java?rev=896311&r1=896310&r2=896311&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/mapreduce/common/BayesWeightSummerReducer.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/mapreduce/common/BayesWeightSummerReducer.java Wed Jan  6 02:46:22 2010
@@ -17,6 +17,9 @@
 
 package org.apache.mahout.classifier.bayes.mapreduce.common;
 
+import java.io.IOException;
+import java.util.Iterator;
+
 import org.apache.hadoop.hbase.HBaseConfiguration;
 import org.apache.hadoop.hbase.client.HTable;
 import org.apache.hadoop.hbase.client.Put;
@@ -32,33 +35,32 @@
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import java.io.IOException;
-import java.util.Iterator;
-
 /** Can also be used as a local Combiner */
 public class BayesWeightSummerReducer extends MapReduceBase implements
-    Reducer<StringTuple, DoubleWritable, StringTuple, DoubleWritable> {
-
-  private static final Logger log = LoggerFactory.getLogger(BayesWeightSummerReducer.class);
-
+    Reducer<StringTuple,DoubleWritable,StringTuple,DoubleWritable> {
+  
+  private static final Logger log = LoggerFactory
+      .getLogger(BayesWeightSummerReducer.class);
+  
   private HTable table;
-
-  private boolean useHbase = false;
-
+  
+  private boolean useHbase;
+  
   @Override
-  public void reduce(StringTuple key, Iterator<DoubleWritable> values,
-      OutputCollector<StringTuple, DoubleWritable> output, Reporter reporter)
-      throws IOException {
+  public void reduce(StringTuple key,
+                     Iterator<DoubleWritable> values,
+                     OutputCollector<StringTuple,DoubleWritable> output,
+                     Reporter reporter) throws IOException {
     // Key is label,word, value is the tfidf of the feature of times we've seen
     // this label word per local node. Output is the same
-
+    
     double sum = 0.0;
     while (values.hasNext()) {
       reporter.setStatus("Weight Summer Reducer: " + key);
       sum += values.next().get();
     }
     reporter.setStatus("Bayes Weight Summer Reducer: " + key + " => " + sum);
-    //char firstChar = key.toString().charAt(0);
+    // char firstChar = key.toString().charAt(0);
     if (useHbase) {
       if (key.stringAt(0).equals(BayesConstants.FEATURE_SUM)) { // sum of weight
         // for all
@@ -66,12 +68,12 @@
         // feature
         // Sigma_j
         String feature = key.stringAt(1);
-
+        
         Put bu = new Put(Bytes.toBytes(feature));
         bu.add(Bytes.toBytes(BayesConstants.HBASE_COLUMN_FAMILY), Bytes
             .toBytes(BayesConstants.FEATURE_SUM), Bytes.toBytes(sum));
         table.put(bu);
-
+        
       } else if (key.stringAt(0).equals(BayesConstants.LABEL_SUM)) {
         String label = key.stringAt(1);
         Put bu = new Put(Bytes.toBytes(BayesConstants.LABEL_SUM));
@@ -85,28 +87,26 @@
         table.put(bu);
       }
     }
-
+    
     output.collect(key, new DoubleWritable(sum));
   }
-
+  
   @Override
   public void configure(JobConf job) {
     try {
       Parameters params = Parameters
           .fromString(job.get("bayes.parameters", ""));
-      if (params.get("dataSource").equals("hbase"))
-        useHbase = true;
-      else
-        return;
-
-      HBaseConfiguration HBconf = new HBaseConfiguration(job);
-      table = new HTable(HBconf, job.get("output.table"));
+      if (params.get("dataSource").equals("hbase")) useHbase = true;
+      else return;
+      
+      HBaseConfiguration hBconf = new HBaseConfiguration(job);
+      table = new HTable(hBconf, job.get("output.table"));
     } catch (IOException e) {
       log.error("Unexpected error during configuration", e);
     }
-
+    
   }
-
+  
   @Override
   public void close() throws IOException {
     if (useHbase) {

Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/mapreduce/common/JobExecutor.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/mapreduce/common/JobExecutor.java?rev=896311&r1=896310&r2=896311&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/mapreduce/common/JobExecutor.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/mapreduce/common/JobExecutor.java Wed Jan  6 02:46:22 2010
@@ -16,6 +16,8 @@
  */
 package org.apache.mahout.classifier.bayes.mapreduce.common;
 
+import java.io.IOException;
+
 import org.apache.commons.cli2.CommandLine;
 import org.apache.commons.cli2.Group;
 import org.apache.commons.cli2.Option;
@@ -28,48 +30,52 @@
 import org.apache.mahout.common.commandline.DefaultOptionCreator;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
-
-import java.io.IOException;
-
-public class JobExecutor {
+/**
+ * Base class for executing the Bayes Map/Reduce Jobs
+ *
+ */
+public final class JobExecutor {
   /** Logger for this class. */
   private static final Logger log = LoggerFactory.getLogger(BayesDriver.class);
-
-  private JobExecutor() {
-  }
-
+  
+  private JobExecutor() { }
+  
   /**
    * Execute a bayes classification job. Input and output path are parsed from
    * the input parameters.
-   * @param args input parameters.
-   * @param job the job to execute. 
-   * @throws Exception any exception thrown at job execution.
+   * 
+   * @param args
+   *          input parameters.
+   * @param job
+   *          the job to execute.
+   * @throws Exception
+   *           any exception thrown at job execution.
    * */
-  public static void execute(String[] args, BayesJob job)
-      throws ClassNotFoundException, IOException, InterruptedException {
+  public static void execute(String[] args, BayesJob job) throws ClassNotFoundException,
+                                                         IOException,
+                                                         InterruptedException {
     GroupBuilder gbuilder = new GroupBuilder();
-
+    
     Option inputOpt = DefaultOptionCreator.inputOption().create();
     Option outputOpt = DefaultOptionCreator.outputOption().create();
     Option helpOpt = DefaultOptionCreator.helpOption();
-
-    Group group = gbuilder.withName("Options").withOption(inputOpt).withOption(outputOpt)
-        .withOption(helpOpt).create();
-
-
+    
+    Group group = gbuilder.withName("Options").withOption(inputOpt).withOption(
+      outputOpt).withOption(helpOpt).create();
+    
     try {
       Parser parser = new Parser();
       parser.setGroup(group);
       CommandLine cmdLine = parser.parse(args);
-
+      
       if (cmdLine.hasOption(helpOpt)) {
         CommandLineUtil.printHelp(group);
         return;
       }
-
+      
       String input = cmdLine.getValue(inputOpt).toString();
       String output = cmdLine.getValue(outputOpt).toString();
-
+      
       job.runJob(input, output, new BayesParameters(1));
     } catch (OptionException e) {
       log.error(e.getMessage());

Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/model/ClassifierContext.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/model/ClassifierContext.java?rev=896311&r1=896310&r2=896311&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/model/ClassifierContext.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/model/ClassifierContext.java Wed Jan  6 02:46:22 2010
@@ -23,9 +23,12 @@
 import org.apache.mahout.classifier.bayes.exceptions.InvalidDatastoreException;
 import org.apache.mahout.classifier.bayes.interfaces.Algorithm;
 import org.apache.mahout.classifier.bayes.interfaces.Datastore;
-
+/**
+ * The Classifier Wrapper used for choosing the {@link Algorithm} and {@link Datastore}
+ *
+ */
 public class ClassifierContext {
-
+  
   private final Algorithm algorithm;
   private final Datastore datastore;
   
@@ -35,50 +38,62 @@
   }
   
   /**
-   * Initializes the Context. Gets the necessary data and checks if the Datastore is valid 
+   * Initializes the Context. Gets the necessary data and checks if the
+   * Datastore is valid
+   * 
    * @throws InvalidDatastoreException
    */
-  public void initialize() throws InvalidDatastoreException{
+  public void initialize() throws InvalidDatastoreException {
     datastore.initialize();
     algorithm.initialize(this.datastore);
   }
   
   /**
    * Classify the document and return the Result
-   *
-   * @param document        The document to classify
-   * @param defaultCategory The default category to assign
-   * Ties are broken by comparing the category
-   * @return A Collection of {@link org.apache.mahout.classifier.ClassifierResult}s.
-   * @throws InvalidDatastoreException 
+   * 
+   * @param document
+   *          The document to classify
+   * @param defaultCategory
+   *          The default category to assign Ties are broken by comparing the
+   *          category
+   * @return A Collection of
+   *         {@link org.apache.mahout.classifier.ClassifierResult}s.
+   * @throws InvalidDatastoreException
    */
-  public ClassifierResult classifyDocument(String[] document, String defaultCategory)
-      throws InvalidDatastoreException {
-    return algorithm.classifyDocument(document, datastore, defaultCategory);   
+  public ClassifierResult classifyDocument(String[] document,
+                                           String defaultCategory) throws InvalidDatastoreException {
+    return algorithm.classifyDocument(document, datastore, defaultCategory);
   }
-
+  
   /**
    * Classify the document and return the top <code>numResults</code>
-   *
-   * @param document        The document to classify
-   * @param defaultCategory The default category to assign
-   * @param numResults      The maximum number of results to return, ranked by score.
-   * Ties are broken by comparing the category
-   * @return A Collection of {@link org.apache.mahout.classifier.ClassifierResult}s.
-   * @throws InvalidDatastoreException 
-   */ 
-  public ClassifierResult[] classifyDocument(String[] document, String defaultCategory, int numResults)
-      throws InvalidDatastoreException{
-    return algorithm.classifyDocument(document, datastore, defaultCategory, numResults);
+   * 
+   * @param document
+   *          The document to classify
+   * @param defaultCategory
+   *          The default category to assign
+   * @param numResults
+   *          The maximum number of results to return, ranked by score. Ties are
+   *          broken by comparing the category
+   * @return A Collection of
+   *         {@link org.apache.mahout.classifier.ClassifierResult}s.
+   * @throws InvalidDatastoreException
+   */
+  public ClassifierResult[] classifyDocument(String[] document,
+                                             String defaultCategory,
+                                             int numResults) throws InvalidDatastoreException {
+    return algorithm.classifyDocument(document, datastore, defaultCategory,
+      numResults);
   }
+  
   /**
    * Gets the labels in the given model
+   * 
    * @return Collection of Labels
-   * @throws InvalidDatastoreException 
+   * @throws InvalidDatastoreException
    */
   public Collection<String> getLabels() throws InvalidDatastoreException {
-   return algorithm.getLabels(datastore);
+    return algorithm.getLabels(datastore);
   }
-
   
 }

Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/common/FileLineIterator.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/common/FileLineIterator.java?rev=896311&r1=896310&r2=896311&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/common/FileLineIterator.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/common/FileLineIterator.java Wed Jan  6 02:46:22 2010
@@ -17,8 +17,6 @@
 
 package org.apache.mahout.common;
 
-import org.apache.mahout.cf.taste.impl.common.SkippingIterator;
-
 import java.io.BufferedReader;
 import java.io.Closeable;
 import java.io.File;
@@ -32,6 +30,8 @@
 import java.util.zip.GZIPInputStream;
 import java.util.zip.ZipInputStream;
 
+import org.apache.mahout.cf.taste.impl.common.SkippingIterator;
+
 /**
  * Iterates over the lines of a text file. This assumes the text file's lines are delimited in a manner consistent with
  * how {@link BufferedReader} defines lines.

Modified: lucene/mahout/trunk/core/src/test/java/org/apache/mahout/classifier/bayes/BayesClassifierTest.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/test/java/org/apache/mahout/classifier/bayes/BayesClassifierTest.java?rev=896311&r1=896310&r2=896311&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/test/java/org/apache/mahout/classifier/bayes/BayesClassifierTest.java (original)
+++ lucene/mahout/trunk/core/src/test/java/org/apache/mahout/classifier/bayes/BayesClassifierTest.java Wed Jan  6 02:46:22 2010
@@ -27,15 +27,15 @@
 import org.apache.mahout.classifier.bayes.model.ClassifierContext;
 
 public class BayesClassifierTest extends TestCase {
-
+  
   protected Algorithm algorithm;
-
+  
   protected InMemoryBayesDatastore store;
-
+  
   public BayesClassifierTest(String s) {
     super(s);
   }
-
+  
   @Override
   protected void setUp() throws Exception {
     super.setUp();
@@ -44,69 +44,59 @@
     // String[] labels = new String[]{"a", "b", "c", "d", "e"};
     // long[] labelCounts = new long[]{6, 20, 60, 100, 200};
     // String[] features = new String[]{"aa", "bb", "cc", "dd", "ee"};
-    store.setSigma_jSigma_k(100.0);
-
+    store.setSigmaJSigmaK(100.0);
+    
     store.setSumFeatureWeight("aa", 100);
     store.setSumFeatureWeight("bb", 100);
     store.setSumFeatureWeight("cc", 100);
     store.setSumFeatureWeight("dd", 100);
     store.setSumFeatureWeight("ee", 100);
-
+    
     store.setSumLabelWeight("a", 1);
     store.setSumLabelWeight("b", 1);
     store.setSumLabelWeight("c", 1);
     store.setSumLabelWeight("d", 1);
     store.setSumLabelWeight("e", 1);
-
+    
     store.loadFeatureWeight("aa", "a", 5);
     store.loadFeatureWeight("bb", "a", 1);
-
+    
     store.loadFeatureWeight("bb", "b", 20);
-
+    
     store.loadFeatureWeight("cc", "c", 30);
     store.loadFeatureWeight("aa", "c", 25);
     store.loadFeatureWeight("dd", "c", 5);
-
+    
     store.loadFeatureWeight("dd", "d", 60);
     store.loadFeatureWeight("cc", "d", 40);
-
+    
     store.loadFeatureWeight("ee", "e", 100);
     store.loadFeatureWeight("aa", "e", 50);
     store.loadFeatureWeight("dd", "e", 50);
     store.updateVocabCount();
   }
-
+  
   public void test() throws InvalidDatastoreException {
     ClassifierContext classifier = new ClassifierContext(algorithm, store);
-    String[] document = { "aa", "ff" };
+    String[] document = {"aa", "ff"};
     ClassifierResult result = classifier.classifyDocument(document, "unknown");
     assertNotNull("category is null and it shouldn't be", result);
     assertEquals(result + " is not equal to e", "e", result.getLabel());
-
-    document = new String[] { "ff" };
+    
+    document = new String[] {"ff"};
     result = classifier.classifyDocument(document, "unknown");
     assertNotNull("category is null and it shouldn't be", result);
-    assertEquals(result + " is not equal to d", "d", result.getLabel());// GSI:
-                                                                        // was
-                                                                        // unknown,
-                                                                        // but
-                                                                        // we
-                                                                        // now
-                                                                        // just
-                                                                        // pick
-                                                                        // the
-                                                                        // first
-                                                                        // cat
-
-    document = new String[] { "cc" };
+    assertEquals(result + " is not equal to d", "d", result.getLabel());
+    
+    document = new String[] {"cc"};
     result = classifier.classifyDocument(document, "unknown");
     assertNotNull("category is null and it shouldn't be", result);
     assertEquals(result + " is not equal to d", "d", result.getLabel());
   }
-
+  
   public void testResults() throws Exception {
     ClassifierContext classifier = new ClassifierContext(algorithm, store);
-    String[] document = { "aa", "ff" };
+    String[] document = {"aa", "ff"};
     ClassifierResult result = classifier.classifyDocument(document, "unknown");
     assertNotNull("category is null and it shouldn't be", result);
     System.out.println("Result: " + result);

Modified: lucene/mahout/trunk/core/src/test/java/org/apache/mahout/classifier/bayes/BayesFeatureMapperTest.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/test/java/org/apache/mahout/classifier/bayes/BayesFeatureMapperTest.java?rev=896311&r1=896310&r2=896311&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/test/java/org/apache/mahout/classifier/bayes/BayesFeatureMapperTest.java (original)
+++ lucene/mahout/trunk/core/src/test/java/org/apache/mahout/classifier/bayes/BayesFeatureMapperTest.java Wed Jan  6 02:46:22 2010
@@ -17,41 +17,46 @@
 
 package org.apache.mahout.classifier.bayes;
 
+import java.util.List;
+import java.util.Map;
+
 import junit.framework.TestCase;
+
 import org.apache.hadoop.io.DoubleWritable;
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.mapred.JobConf;
 import org.apache.hadoop.mapred.Reporter;
-import org.apache.mahout.classifier.bayes.mapreduce.common.BayesFeatureMapper;
 import org.apache.mahout.classifier.bayes.common.BayesParameters;
+import org.apache.mahout.classifier.bayes.mapreduce.common.BayesFeatureMapper;
 import org.apache.mahout.common.DummyOutputCollector;
 import org.apache.mahout.common.StringTuple;
 
-import java.util.List;
-import java.util.Map;
-
 public class BayesFeatureMapperTest extends TestCase {
-
+  
   public void test() throws Exception {
     BayesFeatureMapper mapper = new BayesFeatureMapper();
     JobConf conf = new JobConf();
     conf.set("io.serializations",
-        "org.apache.hadoop.io.serializer.JavaSerialization,org.apache.hadoop.io.serializer.WritableSerialization");
+      "org.apache.hadoop.io.serializer.JavaSerialization,"
+          + "org.apache.hadoop.io.serializer.WritableSerialization");
     conf.set("bayes.parameters", new BayesParameters(3).toString());
     mapper.configure(conf);
-
-    DummyOutputCollector<StringTuple, DoubleWritable> output = new DummyOutputCollector<StringTuple, DoubleWritable>();
-    mapper.map(new Text("foo"), new Text("big brown shoe"), output, Reporter.NULL);
-    Map<String, List<DoubleWritable>> outMap = output.getData();
+    
+    DummyOutputCollector<StringTuple,DoubleWritable> output = new DummyOutputCollector<StringTuple,DoubleWritable>();
+    mapper.map(new Text("foo"), new Text("big brown shoe"), output,
+      Reporter.NULL);
+    Map<String,List<DoubleWritable>> outMap = output.getData();
     System.out.println("Map: " + outMap);
     assertNotNull("outMap is null and it shouldn't be", outMap);
-    //TODO: How about not such a lame test here?
-    for (Map.Entry<String, List<DoubleWritable>> entry : outMap.entrySet()) {
-      assertTrue("entry.getKey() Size: " + entry.getKey().length() + " is not greater than: 0", entry.getKey().length() > 0);
-      assertEquals("entry.getValue() Size: " + entry.getValue().size() + " is not: 1", 1, entry.getValue().size());
+    // TODO: How about not such a lame test here?
+    for (Map.Entry<String,List<DoubleWritable>> entry : outMap.entrySet()) {
+      assertTrue("entry.getKey() Size: " + entry.getKey().length()
+                 + " is not greater than: 0", entry.getKey().length() > 0);
+      assertEquals("entry.getValue() Size: " + entry.getValue().size()
+                   + " is not: 1", 1, entry.getValue().size());
       assertTrue("value is not valie", entry.getValue().get(0).get() > 0);
     }
-
+    
   }
-
+  
 }

Modified: lucene/mahout/trunk/core/src/test/java/org/apache/mahout/classifier/bayes/BayesFileFormatterTest.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/test/java/org/apache/mahout/classifier/bayes/BayesFileFormatterTest.java?rev=896311&r1=896310&r2=896311&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/test/java/org/apache/mahout/classifier/bayes/BayesFileFormatterTest.java (original)
+++ lucene/mahout/trunk/core/src/test/java/org/apache/mahout/classifier/bayes/BayesFileFormatterTest.java Wed Jan  6 02:46:22 2010
@@ -17,13 +17,6 @@
 
 package org.apache.mahout.classifier.bayes;
 
-import junit.framework.TestCase;
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.WhitespaceAnalyzer;
-import org.apache.mahout.classifier.BayesFileFormatter;
-import org.apache.mahout.common.FileLineIterator;
-import org.apache.mahout.common.FileLineIterable;
-
 import java.io.File;
 import java.io.FileOutputStream;
 import java.io.IOException;
@@ -31,6 +24,14 @@
 import java.io.Writer;
 import java.nio.charset.Charset;
 
+import junit.framework.TestCase;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.WhitespaceAnalyzer;
+import org.apache.mahout.classifier.BayesFileFormatter;
+import org.apache.mahout.common.FileLineIterable;
+import org.apache.mahout.common.FileLineIterator;
+
 public class BayesFileFormatterTest extends TestCase {
 
   protected File input;

Modified: lucene/mahout/trunk/core/src/test/java/org/apache/mahout/classifier/bayes/CBayesClassifierTest.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/test/java/org/apache/mahout/classifier/bayes/CBayesClassifierTest.java?rev=896311&r1=896310&r2=896311&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/test/java/org/apache/mahout/classifier/bayes/CBayesClassifierTest.java (original)
+++ lucene/mahout/trunk/core/src/test/java/org/apache/mahout/classifier/bayes/CBayesClassifierTest.java Wed Jan  6 02:46:22 2010
@@ -27,83 +27,82 @@
 import org.apache.mahout.classifier.bayes.model.ClassifierContext;
 
 public class CBayesClassifierTest extends TestCase {
-
+  
   protected Algorithm algorithm;
   protected InMemoryBayesDatastore store;
-
+  
   public CBayesClassifierTest(String s) {
     super(s);
   }
-
+  
   @Override
   protected void setUp() throws Exception {
     super.setUp();
     algorithm = new CBayesAlgorithm();
     store = new InMemoryBayesDatastore(new BayesParameters(1));
-    //String[] labels = new String[]{"a", "b", "c", "d", "e"};
-    //long[] labelCounts = new long[]{6, 20, 60, 100, 200};
-    //String[] features = new String[]{"aa", "bb", "cc", "dd", "ee"};
-    store.setSigma_jSigma_k(500.0);
-
+    // String[] labels = new String[]{"a", "b", "c", "d", "e"};
+    // long[] labelCounts = new long[]{6, 20, 60, 100, 200};
+    // String[] features = new String[]{"aa", "bb", "cc", "dd", "ee"};
+    store.setSigmaJSigmaK(500.0);
+    
     store.setSumFeatureWeight("aa", 80);
     store.setSumFeatureWeight("bb", 21);
     store.setSumFeatureWeight("cc", 60);
     store.setSumFeatureWeight("dd", 115);
     store.setSumFeatureWeight("ee", 100);
-
+    
     store.setSumLabelWeight("a", 100);
     store.setSumLabelWeight("b", 100);
     store.setSumLabelWeight("c", 100);
     store.setSumLabelWeight("d", 100);
     store.setSumLabelWeight("e", 100);
-
+    
     store.setThetaNormalizer("a", -100);
     store.setThetaNormalizer("b", -100);
     store.setThetaNormalizer("c", -100);
     store.setThetaNormalizer("d", -100);
     store.setThetaNormalizer("e", -100);
-
-
+    
     store.loadFeatureWeight("aa", "a", 5);
     store.loadFeatureWeight("bb", "a", 1);
-
+    
     store.loadFeatureWeight("bb", "b", 20);
-
+    
     store.loadFeatureWeight("cc", "c", 30);
     store.loadFeatureWeight("aa", "c", 25);
     store.loadFeatureWeight("dd", "c", 5);
-
+    
     store.loadFeatureWeight("dd", "d", 60);
     store.loadFeatureWeight("cc", "d", 40);
-
+    
     store.loadFeatureWeight("ee", "e", 100);
     store.loadFeatureWeight("aa", "e", 50);
     store.loadFeatureWeight("dd", "e", 50);
     store.updateVocabCount();
   }
-
+  
   public void test() throws InvalidDatastoreException {
     ClassifierContext classifier = new ClassifierContext(algorithm, store);
     String[] document = {"aa", "ff"};
-    ClassifierResult result = classifier.classifyDocument( document, "unknown");
+    ClassifierResult result = classifier.classifyDocument(document, "unknown");
     assertNotNull("category is null and it shouldn't be", result);
     assertEquals(result + " is not equal to e", "e", result.getLabel());
-
-    document = new String[]{"ff"};
+    
+    document = new String[] {"ff"};
     result = classifier.classifyDocument(document, "unknown");
     assertNotNull("category is null and it shouldn't be", result);
     assertEquals(result + " is not equal to d", "d", result.getLabel());
-
-    document = new String[]{"cc"};
-    result = classifier.classifyDocument( document, "unknown");
+    
+    document = new String[] {"cc"};
+    result = classifier.classifyDocument(document, "unknown");
     assertNotNull("category is null and it shouldn't be", result);
     assertEquals(result + " is not equal to d", "d", result.getLabel());
   }
-
+  
   public void testResults() throws Exception {
     ClassifierContext classifier = new ClassifierContext(algorithm, store);
     String[] document = {"aa", "ff"};
-    ClassifierResult result = classifier.classifyDocument( document, "unknown");
+    ClassifierResult result = classifier.classifyDocument(document, "unknown");
     assertNotNull("category is null and it shouldn't be", result);
     System.out.println("Result: " + result);
   }

Modified: lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/PrepareTwentyNewsgroups.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/PrepareTwentyNewsgroups.java?rev=896311&r1=896310&r2=896311&view=diff
==============================================================================
--- lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/PrepareTwentyNewsgroups.java (original)
+++ lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/PrepareTwentyNewsgroups.java Wed Jan  6 02:46:22 2010
@@ -17,74 +17,89 @@
 
 package org.apache.mahout.classifier.bayes;
 
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.mahout.classifier.BayesFileFormatter;
-import org.apache.commons.cli2.Option;
+import java.io.File;
+import java.io.IOException;
+import java.nio.charset.Charset;
+
 import org.apache.commons.cli2.CommandLine;
 import org.apache.commons.cli2.Group;
+import org.apache.commons.cli2.Option;
 import org.apache.commons.cli2.OptionException;
-import org.apache.commons.cli2.commandline.Parser;
-import org.apache.commons.cli2.builder.DefaultOptionBuilder;
 import org.apache.commons.cli2.builder.ArgumentBuilder;
+import org.apache.commons.cli2.builder.DefaultOptionBuilder;
 import org.apache.commons.cli2.builder.GroupBuilder;
-
-import java.io.File;
-import java.io.IOException;
-import java.nio.charset.Charset;
-
+import org.apache.commons.cli2.commandline.Parser;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.mahout.classifier.BayesFileFormatter;
 
 /**
- * Prepare the 20 Newsgroups files for training using the {@link org.apache.mahout.classifier.BayesFileFormatter}.
- *
- * This class takes the directory containing the extracted newsgroups and collapses them into a single file per category, with
- * one document per line (first token on each line is the label) 
- *
+ * Prepare the 20 Newsgroups files for training using the
+ * {@link org.apache.mahout.classifier.BayesFileFormatter}.
+ * 
+ * This class takes the directory containing the extracted newsgroups and
+ * collapses them into a single file per category, with one document per line
+ * (first token on each line is the label)
+ * 
  */
-public class PrepareTwentyNewsgroups {
-
-  private PrepareTwentyNewsgroups() {
-  }
-
+public final class PrepareTwentyNewsgroups {
+  
+  private PrepareTwentyNewsgroups() { }
+  
   public static void main(String[] args) throws IOException,
-          ClassNotFoundException, InstantiationException, IllegalAccessException, OptionException {
+                                        ClassNotFoundException,
+                                        InstantiationException,
+                                        IllegalAccessException,
+                                        OptionException {
     DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
     ArgumentBuilder abuilder = new ArgumentBuilder();
     GroupBuilder gbuilder = new GroupBuilder();
-
-    Option parentOpt = obuilder.withLongName("parent").withRequired(true).withArgument(
-            abuilder.withName("parent").withMinimum(1).withMaximum(1).create()).
-            withDescription("Parent dir containing the newsgroups").withShortName("p").create();
-
-    Option outputDirOpt = obuilder.withLongName("outputDir").withRequired(true).withArgument(
-            abuilder.withName("outputDir").withMinimum(1).withMaximum(1).create()).
-            withDescription("The output directory").withShortName("o").create();
-
-    Option analyzerNameOpt = obuilder.withLongName("analyzerName").withRequired(true).withArgument(
-            abuilder.withName("analyzerName").withMinimum(1).withMaximum(1).create()).
-            withDescription("The class name of the analyzer").withShortName("a").create();
-
-    Option charsetOpt = obuilder.withLongName("charset").withRequired(true).withArgument(
-            abuilder.withName("charset").withMinimum(1).withMaximum(1).create()).
-            withDescription("The name of the character encoding of the input files").withShortName("c").create();
-
-    Group group = gbuilder.withName("Options").withOption(analyzerNameOpt).withOption(charsetOpt).withOption(outputDirOpt).withOption(parentOpt).create();
-
+    
+    Option parentOpt = obuilder.withLongName("parent").withRequired(true)
+        .withArgument(
+          abuilder.withName("parent").withMinimum(1).withMaximum(1).create())
+        .withDescription("Parent dir containing the newsgroups").withShortName(
+          "p").create();
+    
+    Option outputDirOpt = obuilder
+        .withLongName("outputDir")
+        .withRequired(true)
+        .withArgument(
+          abuilder.withName("outputDir").withMinimum(1).withMaximum(1).create())
+        .withDescription("The output directory").withShortName("o").create();
+    
+    Option analyzerNameOpt = obuilder.withLongName("analyzerName")
+        .withRequired(true).withArgument(
+          abuilder.withName("analyzerName").withMinimum(1).withMaximum(1)
+              .create()).withDescription("The class name of the analyzer")
+        .withShortName("a").create();
+    
+    Option charsetOpt = obuilder.withLongName("charset").withRequired(true)
+        .withArgument(
+          abuilder.withName("charset").withMinimum(1).withMaximum(1).create())
+        .withDescription(
+          "The name of the character encoding of the input files")
+        .withShortName("c").create();
+    
+    Group group = gbuilder.withName("Options").withOption(analyzerNameOpt)
+        .withOption(charsetOpt).withOption(outputDirOpt).withOption(parentOpt)
+        .create();
+    
     Parser parser = new Parser();
     parser.setGroup(group);
     CommandLine cmdLine = parser.parse(args);
-
-
+    
     File parentDir = new File((String) cmdLine.getValue(parentOpt));
     File outputDir = new File((String) cmdLine.getValue(outputDirOpt));
     String analyzerName = (String) cmdLine.getValue(analyzerNameOpt);
     Charset charset = Charset.forName((String) cmdLine.getValue(charsetOpt));
     Analyzer analyzer = (Analyzer) Class.forName(analyzerName).newInstance();
-    //parent dir contains dir by category
-    File [] categoryDirs = parentDir.listFiles();
+    // parent dir contains dir by category
+    File[] categoryDirs = parentDir.listFiles();
     for (File dir : categoryDirs) {
-      if (dir.isDirectory()){
+      if (dir.isDirectory()) {
         File outputFile = new File(outputDir, dir.getName() + ".txt");
-        BayesFileFormatter.collapse(dir.getName(), analyzer, dir, charset, outputFile);
+        BayesFileFormatter.collapse(dir.getName(), analyzer, dir, charset,
+          outputFile);
       }
     }
   }

Modified: lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/TestClassifier.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/TestClassifier.java?rev=896311&r1=896310&r2=896311&view=diff
==============================================================================
--- lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/TestClassifier.java (original)
+++ lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/TestClassifier.java Wed Jan  6 02:46:22 2010
@@ -17,8 +17,22 @@
 
 package org.apache.mahout.classifier.bayes;
 
-import org.apache.mahout.classifier.ClassifierResult;
+import java.io.File;
+import java.io.FilenameFilter;
+import java.io.IOException;
+import java.nio.charset.Charset;
+import java.util.List;
+import java.util.Map;
 
+import org.apache.commons.cli2.CommandLine;
+import org.apache.commons.cli2.Group;
+import org.apache.commons.cli2.Option;
+import org.apache.commons.cli2.OptionException;
+import org.apache.commons.cli2.builder.ArgumentBuilder;
+import org.apache.commons.cli2.builder.DefaultOptionBuilder;
+import org.apache.commons.cli2.builder.GroupBuilder;
+import org.apache.commons.cli2.commandline.Parser;
+import org.apache.mahout.classifier.ClassifierResult;
 import org.apache.mahout.classifier.ResultAnalyzer;
 import org.apache.mahout.classifier.bayes.algorithm.BayesAlgorithm;
 import org.apache.mahout.classifier.bayes.algorithm.CBayesAlgorithm;
@@ -31,187 +45,168 @@
 import org.apache.mahout.classifier.bayes.mapreduce.bayes.BayesClassifierDriver;
 import org.apache.mahout.classifier.bayes.model.ClassifierContext;
 import org.apache.mahout.common.CommandLineUtil;
+import org.apache.mahout.common.FileLineIterable;
 import org.apache.mahout.common.TimingStatistics;
 import org.apache.mahout.common.commandline.DefaultOptionCreator;
 import org.apache.mahout.common.nlp.NGrams;
-import org.apache.mahout.common.FileLineIterable;
-import org.apache.commons.cli2.Option;
-import org.apache.commons.cli2.CommandLine;
-import org.apache.commons.cli2.Group;
-import org.apache.commons.cli2.OptionException;
-import org.apache.commons.cli2.commandline.Parser;
-import org.apache.commons.cli2.builder.DefaultOptionBuilder;
-import org.apache.commons.cli2.builder.ArgumentBuilder;
-import org.apache.commons.cli2.builder.GroupBuilder;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import java.io.File;
-import java.io.FilenameFilter;
-import java.io.IOException;
-import java.util.List;
-import java.util.Map;
-import java.nio.charset.Charset;
-
 /**
  * Test the Naive Bayes classifier with improved weighting
  * <p/>
  * To run the twenty newsgroups example: refer
  * http://cwiki.apache.org/MAHOUT/twentynewsgroups.html
  */
-public class TestClassifier {
-
+public final class TestClassifier {
+  
   private static final Logger log = LoggerFactory
       .getLogger(TestClassifier.class);
-
+  
   private TestClassifier() {
-    // do nothing
+  // do nothing
   }
-
-  public static void main(String[] args) throws IOException, InvalidDatastoreException {
+  
+  public static void main(String[] args) throws IOException,
+                                        InvalidDatastoreException {
     DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
     ArgumentBuilder abuilder = new ArgumentBuilder();
     GroupBuilder gbuilder = new GroupBuilder();
-
+    
     Option pathOpt = obuilder
         .withLongName("model")
         .withRequired(true)
         .withArgument(
-            abuilder.withName("model").withMinimum(1).withMaximum(1).create())
+          abuilder.withName("model").withMinimum(1).withMaximum(1).create())
         .withDescription(
-            "The path on HDFS / Name of Hbase Table as defined by the -source parameter")
+          "The path on HDFS / Name of Hbase Table as defined by the -source parameter")
         .withShortName("m").create();
-
-    Option dirOpt = obuilder
-        .withLongName("testDir")
-        .withRequired(true)
+    
+    Option dirOpt = obuilder.withLongName("testDir").withRequired(true)
         .withArgument(
-            abuilder.withName("testDir").withMinimum(1).withMaximum(1).create())
+          abuilder.withName("testDir").withMinimum(1).withMaximum(1).create())
         .withDescription("The directory where test documents resides in")
         .withShortName("d").create();
-
+    
     Option helpOpt = DefaultOptionCreator.helpOption();
-
+    
     Option encodingOpt = obuilder.withLongName("encoding").withArgument(
-        abuilder.withName("encoding").withMinimum(1).withMaximum(1).create())
+      abuilder.withName("encoding").withMinimum(1).withMaximum(1).create())
         .withDescription("The file encoding.  Defaults to UTF-8")
         .withShortName("e").create();
-
+    
     Option defaultCatOpt = obuilder.withLongName("defaultCat").withArgument(
-        abuilder.withName("defaultCat").withMinimum(1).withMaximum(1).create())
+      abuilder.withName("defaultCat").withMinimum(1).withMaximum(1).create())
         .withDescription("The default category Default Value: unknown")
         .withShortName("default").create();
-
+    
     Option gramSizeOpt = obuilder.withLongName("gramSize").withRequired(true)
         .withArgument(
-            abuilder.withName("gramSize").withMinimum(1).withMaximum(1)
-                .create()).withDescription(
-            "Size of the n-gram. Default Value: 1").withShortName("ng")
-        .create();
-
+          abuilder.withName("gramSize").withMinimum(1).withMaximum(1).create())
+        .withDescription("Size of the n-gram. Default Value: 1").withShortName(
+          "ng").create();
+    
     Option alphaOpt = obuilder.withLongName("alpha").withRequired(false)
         .withArgument(
-            abuilder.withName("a").withMinimum(1).withMaximum(1).create())
+          abuilder.withName("a").withMinimum(1).withMaximum(1).create())
         .withDescription("Smoothing parameter Default Value: 1.0")
         .withShortName("a").create();
-
+    
     Option verboseOutputOpt = obuilder.withLongName("verbose").withRequired(
-        false).withDescription(
-        "Output which values were correctly and incorrectly classified")
+      false).withDescription(
+      "Output which values were correctly and incorrectly classified")
         .withShortName("v").create();
-
+    
     Option typeOpt = obuilder.withLongName("classifierType").withRequired(true)
         .withArgument(
-            abuilder.withName("classifierType").withMinimum(1).withMaximum(1)
-                .create()).withDescription(
-            "Type of classifier: bayes|cbayes. Default Value: bayes")
+          abuilder.withName("classifierType").withMinimum(1).withMaximum(1)
+              .create()).withDescription(
+          "Type of classifier: bayes|cbayes. Default Value: bayes")
         .withShortName("type").create();
-
+    
     Option dataSourceOpt = obuilder.withLongName("dataSource").withRequired(
-        true).withArgument(
-        abuilder.withName("dataSource").withMinimum(1).withMaximum(1).create())
+      true).withArgument(
+      abuilder.withName("dataSource").withMinimum(1).withMaximum(1).create())
         .withDescription("Location of model: hdfs|hbase Default Value: hdfs")
         .withShortName("source").create();
-
+    
     Option methodOpt = obuilder
         .withLongName("method")
         .withRequired(true)
         .withArgument(
-            abuilder.withName("method").withMinimum(1).withMaximum(1).create())
+          abuilder.withName("method").withMinimum(1).withMaximum(1).create())
         .withDescription(
-            "Method of Classification: sequential|mapreduce. Default Value: sequential")
+          "Method of Classification: sequential|mapreduce. Default Value: sequential")
         .withShortName("method").create();
-
+    
     Group group = gbuilder.withName("Options").withOption(defaultCatOpt)
         .withOption(dirOpt).withOption(encodingOpt).withOption(gramSizeOpt)
         .withOption(pathOpt).withOption(typeOpt).withOption(dataSourceOpt)
         .withOption(helpOpt).withOption(methodOpt).withOption(verboseOutputOpt)
         .withOption(alphaOpt).create();
-
+    
     try {
       Parser parser = new Parser();
       parser.setGroup(group);
       CommandLine cmdLine = parser.parse(args);
-
+      
       if (cmdLine.hasOption(helpOpt)) {
         CommandLineUtil.printHelp(group);
         return;
       }
-
+      
       int gramSize = 1;
       if (cmdLine.hasOption(gramSizeOpt)) {
         gramSize = Integer.parseInt((String) cmdLine.getValue(gramSizeOpt));
-
+        
       }
       BayesParameters params = new BayesParameters(gramSize);
-
+      
       String modelBasePath = (String) cmdLine.getValue(pathOpt);
-
+      
       String classifierType = (String) cmdLine.getValue(typeOpt);
       String dataSource = (String) cmdLine.getValue(dataSourceOpt);
-
+      
       String defaultCat = "unknown";
       if (cmdLine.hasOption(defaultCatOpt)) {
         defaultCat = (String) cmdLine.getValue(defaultCatOpt);
       }
-
+      
       String encoding = "UTF-8";
       if (cmdLine.hasOption(encodingOpt)) {
         encoding = (String) cmdLine.getValue(encodingOpt);
       }
-
-      String alpha_i = "1.0";
+      
+      String alphaI = "1.0";
       if (cmdLine.hasOption(alphaOpt)) {
-        alpha_i = (String) cmdLine.getValue(alphaOpt);
+        alphaI = (String) cmdLine.getValue(alphaOpt);
       }
-
+      
       boolean verbose = cmdLine.hasOption(verboseOutputOpt);
-
+      
       String testDirPath = (String) cmdLine.getValue(dirOpt);
-
+      
       String classificationMethod = (String) cmdLine.getValue(methodOpt);
-
+      
       params.set("verbose", Boolean.toString(verbose));
       params.set("basePath", modelBasePath);
       params.set("classifierType", classifierType);
       params.set("dataSource", dataSource);
       params.set("defaultCat", defaultCat);
       params.set("encoding", encoding);
-      params.set("alpha_i", alpha_i);
+      params.set("alpha_i", alphaI);
       params.set("testDirPath", testDirPath);
-
-      if (classificationMethod.equalsIgnoreCase("sequential"))
-        classifySequential(params);
-      else if (classificationMethod.equalsIgnoreCase("mapreduce"))
-        classifyParallel(params);
+      
+      if (classificationMethod.equalsIgnoreCase("sequential")) classifySequential(params);
+      else if (classificationMethod.equalsIgnoreCase("mapreduce")) classifyParallel(params);
     } catch (OptionException e) {
       CommandLineUtil.printHelp(group);
       return;
     }
   }
-
-  public static void classifySequential(BayesParameters params)
-      throws IOException, InvalidDatastoreException {
+  
+  public static void classifySequential(BayesParameters params) throws IOException,
+                                                               InvalidDatastoreException {
     log.info("Loading model from: {}", params.print());
     boolean verbose = Boolean.valueOf(params.get("verbose"));
     File dir = new File(params.get("testDirPath"));
@@ -221,10 +216,10 @@
         return s.startsWith(".") == false;
       }
     });
-
+    
     Algorithm algorithm;
     Datastore datastore;
-
+    
     if (params.get("dataSource").equals("hdfs")) {
       if (params.get("classifierType").equalsIgnoreCase("bayes")) {
         log.info("Testing Bayes Classifier");
@@ -236,9 +231,9 @@
         datastore = new InMemoryBayesDatastore(params);
       } else {
         throw new IllegalArgumentException("Unrecognized classifier type: "
-            + params.get("classifierType"));
+                                           + params.get("classifierType"));
       }
-
+      
     } else if (params.get("dataSource").equals("hbase")) {
       if (params.get("classifierType").equalsIgnoreCase("bayes")) {
         log.info("Testing Bayes Classifier");
@@ -250,12 +245,12 @@
         datastore = new HBaseBayesDatastore(params.get("basePath"), params);
       } else {
         throw new IllegalArgumentException("Unrecognized classifier type: "
-            + params.get("classifierType"));
+                                           + params.get("classifierType"));
       }
-
+      
     } else {
       throw new IllegalArgumentException("Unrecognized dataSource type: "
-          + params.get("dataSource"));
+                                         + params.get("dataSource"));
     }
     ClassifierContext classifier = new ClassifierContext(algorithm, datastore);
     classifier.initialize();
@@ -263,57 +258,63 @@
         params.get("defaultCat"));
     TimingStatistics totalStatistics = new TimingStatistics();
     if (subdirs != null) {
-
+      
       for (File file : subdirs) {
         log.info("--------------");
         log.info("Testing: " + file);
         String correctLabel = file.getName().split(".txt")[0];
         TimingStatistics operationStats = new TimingStatistics();
-
+        
         long lineNum = 0;
         for (String line : new FileLineIterable(new File(file.getPath()),
             Charset.forName(params.get("encoding")), false)) {
-
-          Map<String, List<String>> document = new NGrams(line, Integer
+          
+          Map<String,List<String>> document = new NGrams(line, Integer
               .parseInt(params.get("gramSize"))).generateNGrams();
-          for (Map.Entry<String, List<String>> stringListEntry : document
+          for (Map.Entry<String,List<String>> stringListEntry : document
               .entrySet()) {
             List<String> strings = stringListEntry.getValue();
             TimingStatistics.Call call = operationStats.newCall();
             TimingStatistics.Call outercall = totalStatistics.newCall();
             ClassifierResult classifiedLabel = classifier.classifyDocument(
-                strings.toArray(new String[strings.size()]), params
-                    .get("defaultCat"));
+              strings.toArray(new String[strings.size()]), params
+                  .get("defaultCat"));
             call.end();
             outercall.end();
             boolean correct = resultAnalyzer.addInstance(correctLabel,
-                classifiedLabel);
+              classifiedLabel);
             if (verbose) {
               // We have one document per line
               log.info("Line Number: " + lineNum + " Line(30): "
-                  + (line.length() > 30 ? line.substring(0, 30) : line)
-                  + " Expected Label: " + correctLabel + " Classified Label: "
-                  + classifiedLabel.getLabel() + " Correct: " + correct);
+                       + (line.length() > 30 ? line.substring(0, 30) : line)
+                       + " Expected Label: " + correctLabel
+                       + " Classified Label: " + classifiedLabel.getLabel()
+                       + " Correct: " + correct);
             }
             // log.info("{} {}", correctLabel, classifiedLabel);
-
+            
           }
           lineNum++;
         }
-        log.info("{}\t{}\t{}/{}", new Object[] { correctLabel,
-            resultAnalyzer.getConfusionMatrix().getAccuracy(correctLabel),
-            resultAnalyzer.getConfusionMatrix().getCorrect(correctLabel),
-            resultAnalyzer.getConfusionMatrix().getTotal(correctLabel) });
+        log.info("{}\t{}\t{}/{}", new Object[] {correctLabel,
+                                                resultAnalyzer
+                                                    .getConfusionMatrix()
+                                                    .getAccuracy(correctLabel),
+                                                resultAnalyzer
+                                                    .getConfusionMatrix()
+                                                    .getCorrect(correctLabel),
+                                                resultAnalyzer
+                                                    .getConfusionMatrix()
+                                                    .getTotal(correctLabel)});
         log.info("{}", operationStats.toString());
       }
-
+      
     }
     log.info("{}", totalStatistics.toString());
     log.info(resultAnalyzer.summarize());
   }
-
-  public static void classifyParallel(BayesParameters params)
-      throws IOException {
+  
+  public static void classifyParallel(BayesParameters params) throws IOException {
     BayesClassifierDriver.runJob(params);
   }
 }

Modified: lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/TrainClassifier.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/TrainClassifier.java?rev=896311&r1=896310&r2=896311&view=diff
==============================================================================
--- lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/TrainClassifier.java (original)
+++ lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/TrainClassifier.java Wed Jan  6 02:46:22 2010
@@ -17,6 +17,8 @@
 
 package org.apache.mahout.classifier.bayes;
 
+import java.io.IOException;
+
 import org.apache.commons.cli2.CommandLine;
 import org.apache.commons.cli2.Group;
 import org.apache.commons.cli2.Option;
@@ -33,127 +35,127 @@
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import java.io.IOException;
-
 /**
  * Train the Naive Bayes classifier with improved weighting
  * <p/>
  * To run the twenty newsgroups example: refer
  * http://cwiki.apache.org/MAHOUT/twentynewsgroups.html
  */
-public class TrainClassifier {
-
+public final class TrainClassifier {
+  
   private static final Logger log = LoggerFactory
       .getLogger(TrainClassifier.class);
-
-  private TrainClassifier() {
-  }
-
-  public static void trainNaiveBayes(String dir, String outputDir,
-      BayesParameters params) throws IOException, InterruptedException,
-      ClassNotFoundException {
+  
+  private TrainClassifier() { }
+  
+  public static void trainNaiveBayes(String dir,
+                                     String outputDir,
+                                     BayesParameters params) throws IOException,
+                                                            InterruptedException,
+                                                            ClassNotFoundException {
     BayesDriver driver = new BayesDriver();
     driver.runJob(dir, outputDir, params);
   }
-
-  public static void trainCNaiveBayes(String dir, String outputDir,
-      BayesParameters params) throws IOException, InterruptedException,
-      ClassNotFoundException {
+  
+  public static void trainCNaiveBayes(String dir,
+                                      String outputDir,
+                                      BayesParameters params) throws IOException,
+                                                             InterruptedException,
+                                                             ClassNotFoundException {
     CBayesDriver driver = new CBayesDriver();
     driver.runJob(dir, outputDir, params);
   }
-
+  
   public static void main(String[] args) throws IOException,
-      NumberFormatException, IllegalStateException, InterruptedException,
-      ClassNotFoundException {
+                                        NumberFormatException,
+                                        IllegalStateException,
+                                        InterruptedException,
+                                        ClassNotFoundException {
     DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
     ArgumentBuilder abuilder = new ArgumentBuilder();
     GroupBuilder gbuilder = new GroupBuilder();
-
+    
     Option helpOpt = DefaultOptionCreator.helpOption();
-
+    
     Option inputDirOpt = obuilder
         .withLongName("input")
         .withRequired(true)
         .withArgument(
-            abuilder.withName("input").withMinimum(1).withMaximum(1).create())
+          abuilder.withName("input").withMinimum(1).withMaximum(1).create())
         .withDescription(
-            "The Directory on HDFS containing the collapsed, properly formatted files")
+          "The Directory on HDFS containing the collapsed, properly formatted files")
         .withShortName("i").create();
-
+    
     Option outputOpt = obuilder.withLongName("output").withRequired(true)
         .withArgument(
-            abuilder.withName("output").withMinimum(1).withMaximum(1).create())
+          abuilder.withName("output").withMinimum(1).withMaximum(1).create())
         .withDescription("The location of the model on the HDFS").withShortName(
-            "o").create();
-
+          "o").create();
+    
     Option gramSizeOpt = obuilder.withLongName("gramSize").withRequired(true)
         .withArgument(
-            abuilder.withName("gramSize").withMinimum(1).withMaximum(1)
-                .create()).withDescription(
-            "Size of the n-gram. Default Value: 1 ").withShortName("ng")
-        .create();
-
+          abuilder.withName("gramSize").withMinimum(1).withMaximum(1).create())
+        .withDescription("Size of the n-gram. Default Value: 1 ")
+        .withShortName("ng").create();
+    
     Option alphaOpt = obuilder.withLongName("alpha").withRequired(false)
         .withArgument(
-            abuilder.withName("a").withMinimum(1).withMaximum(1).create())
+          abuilder.withName("a").withMinimum(1).withMaximum(1).create())
         .withDescription("Smoothing parameter Default Value: 1.0")
         .withShortName("a").create();
-
+    
     Option typeOpt = obuilder.withLongName("classifierType").withRequired(true)
         .withArgument(
-            abuilder.withName("classifierType").withMinimum(1).withMaximum(1)
-                .create()).withDescription(
-            "Type of classifier: bayes|cbayes. Default: bayes").withShortName(
-            "type").create();
+          abuilder.withName("classifierType").withMinimum(1).withMaximum(1)
+              .create()).withDescription(
+          "Type of classifier: bayes|cbayes. Default: bayes").withShortName(
+          "type").create();
     Option dataSourceOpt = obuilder.withLongName("dataSource").withRequired(
-        true).withArgument(
-        abuilder.withName("dataSource").withMinimum(1).withMaximum(1).create())
+      true).withArgument(
+      abuilder.withName("dataSource").withMinimum(1).withMaximum(1).create())
         .withDescription("Location of model: hdfs|hbase. Default Value: hdfs")
         .withShortName("source").create();
-
+    
     Group group = gbuilder.withName("Options").withOption(gramSizeOpt)
         .withOption(helpOpt).withOption(inputDirOpt).withOption(outputOpt)
         .withOption(typeOpt).withOption(dataSourceOpt).withOption(alphaOpt)
         .create();
     try {
       Parser parser = new Parser();
-
+      
       parser.setGroup(group);
       CommandLine cmdLine = parser.parse(args);
       if (cmdLine.hasOption(helpOpt)) {
         CommandLineUtil.printHelp(group);
         return;
       }
-
+      
       String classifierType = (String) cmdLine.getValue(typeOpt);
       String dataSourceType = (String) cmdLine.getValue(dataSourceOpt);
-
+      
       BayesParameters params = new BayesParameters(Integer
           .parseInt((String) cmdLine.getValue(gramSizeOpt)));
-
-      String alpha_i = "1.0";
+      
+      String alphaI = "1.0";
       if (cmdLine.hasOption(alphaOpt)) {
-        alpha_i = (String) cmdLine.getValue(alphaOpt);
+        alphaI = (String) cmdLine.getValue(alphaOpt);
       }
-
-      params.set("alpha_i", alpha_i);
-
-      if (dataSourceType.equals("hbase"))
-        params.set("dataSource", "hbase");
-      else
-        params.set("dataSource", "hdfs");
-
+      
+      params.set("alpha_i", alphaI);
+      
+      if (dataSourceType.equals("hbase")) params.set("dataSource", "hbase");
+      else params.set("dataSource", "hdfs");
+      
       if (classifierType.equalsIgnoreCase("bayes")) {
         log.info("Training Bayes Classifier");
         trainNaiveBayes((String) cmdLine.getValue(inputDirOpt),
-            (String) cmdLine.getValue(outputOpt), params);
-
+          (String) cmdLine.getValue(outputOpt), params);
+        
       } else if (classifierType.equalsIgnoreCase("cbayes")) {
         log.info("Training Complementary Bayes Classifier");
         // setup the HDFS and copy the files there, then run the trainer
         trainCNaiveBayes((String) cmdLine.getValue(inputDirOpt),
-            (String) cmdLine.getValue(outputOpt), params);
+          (String) cmdLine.getValue(outputOpt), params);
       }
     } catch (OptionException e) {
       log.info("{}", e);



Mime
View raw message