mahout-commits mailing list archives

From: sro...@apache.org
Subject: svn commit: r688522 [1/2] - in /lucene/mahout/trunk: core/src/main/java/org/apache/mahout/cf/taste/impl/correlation/ core/src/main/java/org/apache/mahout/classifier/ core/src/main/java/org/apache/mahout/classifier/bayes/ core/src/main/java/org/apache/m...
Date: Sun, 24 Aug 2008 16:10:44 GMT
Author: srowen
Date: Sun Aug 24 09:10:42 2008
New Revision: 688522

URL: http://svn.apache.org/viewvc?rev=688522&view=rev
Log:
Change printStackTrace() / System.{out,err}.println() in core to use a logger. Also replace many broad uses of Exception with more specific exception classes.
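
The pattern applied throughout the files below is roughly the following sketch (the class name and message are illustrative, not taken from any of the modified files): a per-class SLF4J logger replaces System.out/System.err, exceptions are logged with their stack trace instead of printStackTrace(), and methods declare the specific checked exception instead of catching and swallowing it.

    import org.slf4j.Logger;
    import org.slf4j.LoggerFactory;

    import java.io.FileInputStream;
    import java.io.IOException;

    public class ExampleDriver {

      // One static logger per class replaces System.out/System.err.
      private static final Logger log = LoggerFactory.getLogger(ExampleDriver.class);

      public static void runJob(String input) throws IOException {  // declare the specific checked exception
        log.info("Reading features from {}", input);                // parameterized message, no string concatenation
        try {
          new FileInputStream(input).close();                       // stand-in for work that can fail with IOException
        } catch (IOException ioe) {
          log.warn(ioe.toString(), ioe);                            // message plus full stack trace, not printStackTrace()
          throw ioe;                                                 // propagate instead of swallowing
        }
      }
    }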

Modified:
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/correlation/AbstractCorrelation.java
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/BayesFileFormatter.java
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/Classify.java
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/ConfusionMatrix.java
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/ResultAnalyzer.java
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/BayesDriver.java
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/BayesModel.java
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/BayesThetaNormalizerDriver.java
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/BayesThetaNormalizerMapper.java
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/BayesThetaNormalizerReducer.java
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/common/BayesFeatureDriver.java
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/common/BayesFeatureMapper.java
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/common/BayesTfIdfDriver.java
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/common/BayesTfIdfMapper.java
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/common/BayesTfIdfReducer.java
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/common/BayesWeightSummerDriver.java
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/io/SequenceFileModelReader.java
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/cbayes/CBayesDriver.java
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/cbayes/CBayesModel.java
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/cbayes/CBayesNormalizedWeightDriver.java
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/cbayes/CBayesNormalizedWeightMapper.java
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/cbayes/CBayesNormalizedWeightReducer.java
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/cbayes/CBayesThetaDriver.java
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/cbayes/CBayesThetaMapper.java
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/cbayes/CBayesThetaNormalizerDriver.java
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/cbayes/CBayesThetaNormalizerMapper.java
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/cbayes/CBayesThetaNormalizerReducer.java
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/cbayes/CBayesThetaReducer.java
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/Canopy.java
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/CanopyClusteringJob.java
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/CanopyDriver.java
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/ClusterDriver.java
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansDriver.java
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansJob.java
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansReducer.java
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/SoftCluster.java
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/Cluster.java
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansDriver.java
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansJob.java
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopy.java
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyDriver.java
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyJob.java
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/common/Model.java
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/common/Summarizable.java
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/ga/watchmaker/EvalMapper.java
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/matrix/SparseVector.java
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/matrix/VectorView.java
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/utils/ManhattanDistanceMeasure.java
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/utils/WeightedDistanceMeasure.java
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/utils/parameters/AbstractParameter.java
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/utils/parameters/ClassParameter.java
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/utils/parameters/CompositeParameter.java
    lucene/mahout/trunk/core/src/test/java/org/apache/mahout/classifier/bayes/BayesClassifierTest.java
    lucene/mahout/trunk/core/src/test/java/org/apache/mahout/classifier/bayes/CBayesClassifierTest.java
    lucene/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/meanshift/TestMeanShift.java
    lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/TrainClassifier.java

Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/correlation/AbstractCorrelation.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/correlation/AbstractCorrelation.java?rev=688522&r1=688521&r2=688522&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/correlation/AbstractCorrelation.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/correlation/AbstractCorrelation.java Sun Aug 24 09:10:42 2008
@@ -73,7 +73,7 @@
     this.cachedNumItems = dataModel.getNumItems();
     this.cachedNumUsers = dataModel.getNumUsers();
     this.refreshHelper = new RefreshHelper(new Callable<Object>() {
-      public Object call() throws Exception {
+      public Object call() throws TasteException {
         cachedNumItems = AbstractCorrelation.this.dataModel.getNumItems();
         cachedNumUsers = AbstractCorrelation.this.dataModel.getNumUsers();
         return null;
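
The narrowed signature above relies on a standard Java rule: an implementation of Callable.call(), which the interface declares as "throws Exception", may declare a more specific checked exception. A minimal sketch (the exception class here is a stand-in for the Taste API's TasteException):

    import java.util.concurrent.Callable;

    // Stand-in for a checked exception such as the Taste API's TasteException.
    class RefreshFailedException extends Exception {
      RefreshFailedException(String message) {
        super(message);
      }
    }

    class RefreshCallable implements Callable<Object> {
      // Callable.call() is declared "throws Exception"; an implementing method may
      // narrow that to a more specific checked exception, as the hunk above does.
      public Object call() throws RefreshFailedException {
        return null; // refresh cached counts here
      }
    }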

Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/BayesFileFormatter.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/BayesFileFormatter.java?rev=688522&r1=688521&r2=688522&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/BayesFileFormatter.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/BayesFileFormatter.java Sun Aug 24 09:10:42 2008
@@ -28,6 +28,8 @@
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 import java.io.File;
 import java.io.FileFilter;
@@ -49,6 +51,8 @@
  */
 public class BayesFileFormatter {
 
+  private static final Logger log = LoggerFactory.getLogger(BayesFileFormatter.class);
+
   private static final String LINE_SEP = System.getProperty("line.separator");
 
   /**
@@ -275,7 +279,7 @@
       PosixParser parser = new PosixParser();
       cmdLine = parser.parse(options, args);
       if (cmdLine.hasOption(helpOpt.getOpt())) {
-        System.out.println("Options: " + options);
+        log.info("Options: {}", options);
         return;
       }
       File input = new File(cmdLine.getOptionValue(inputOpt.getOpt()));
@@ -301,8 +305,8 @@
       }
 
     } catch (ParseException exp) {
-      exp.printStackTrace();
-      System.out.println("Options: " + options);
+      log.warn(exp.toString(), exp);
+      log.info("Options: {}", options);
     }
   }
 }

Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/Classify.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/Classify.java?rev=688522&r1=688521&r2=688522&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/Classify.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/Classify.java Sun Aug 24 09:10:42 2008
@@ -34,6 +34,8 @@
 import org.apache.mahout.classifier.cbayes.CBayesModel;
 import org.apache.mahout.common.Classifier;
 import org.apache.mahout.common.Model;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 import java.io.File;
 import java.io.FileInputStream;
@@ -43,15 +45,13 @@
 import java.util.List;
 import java.util.Map;
 
-
-/**
- *
- *
- **/
 public class Classify {
 
-  @SuppressWarnings({ "static-access", "unchecked" })
-  public static void main(String[] args) throws IOException, ClassNotFoundException, IllegalAccessException, InstantiationException {
+  private static final Logger log = LoggerFactory.getLogger(Classify.class);
+
+  @SuppressWarnings({ "static-access" })
+  public static void main(String[] args)
+      throws IOException, ClassNotFoundException, IllegalAccessException, InstantiationException, ParseException {
     Options options = new Options();
     Option pathOpt = OptionBuilder.withLongOpt("path").isRequired().hasArg().withDescription("The local file system path").create("p");
     options.addOption(pathOpt);
@@ -67,92 +67,84 @@
     options.addOption(gramSizeOpt);
     Option typeOpt = OptionBuilder.withLongOpt("classifierType").isRequired().hasArg().withDescription("Type of classifier").create("type");
     options.addOption(typeOpt);
-    
-    
-    CommandLine cmdLine;
-    try {
-      PosixParser parser = new PosixParser();
-      cmdLine = parser.parse(options, args);
-      SequenceFileModelReader reader = new SequenceFileModelReader();
-      JobConf conf = new JobConf(Classify.class);
-      
-
-      Map<String, Path> modelPaths = new HashMap<String, Path>();
-      String modelBasePath = cmdLine.getOptionValue(pathOpt.getOpt());
-      modelPaths.put("sigma_j", new Path(modelBasePath + "/trainer-weights/Sigma_j/part-*"));
-      modelPaths.put("sigma_k", new Path(modelBasePath + "/trainer-weights/Sigma_k/part-*"));
-      modelPaths.put("sigma_kSigma_j", new Path(modelBasePath + "/trainer-weights/Sigma_kSigma_j/part-*"));
-      modelPaths.put("thetaNormalizer", new Path(modelBasePath + "/trainer-thetaNormalizer/part-*"));
-      modelPaths.put("weight", new Path(modelBasePath + "/trainer-tfIdf/trainer-tfIdf/part-*"));
-
-      FileSystem fs = FileSystem.get(conf);
-
-      System.out.println("Loading model from: " + modelPaths);
-
-      Model model = null;
-      Classifier classifier = null;
-      
-      String classifierType = cmdLine.getOptionValue(typeOpt.getOpt());
-      
-      if (classifierType.equalsIgnoreCase("bayes")) {
-        System.out.println("Testing Bayes Classifier");
-        model = new BayesModel();
-        classifier = new BayesClassifier();
-      } else if (classifierType.equalsIgnoreCase("cbayes")) {
-        System.out.println("Testing Complementary Bayes Classifier");
-        model = new CBayesModel();
-        classifier = new CBayesClassifier();
-      }
-     
-      model = reader.loadModel(model, fs, modelPaths, conf);
-
-      System.out.println("Done loading model: # labels: "
-          + model.getLabels().size());
-
-      System.out.println("Done generating Model ");
-      
-      
-      String defaultCat = "unknown";
-      if (cmdLine.hasOption(defaultCatOpt.getOpt())) {
-        defaultCat = cmdLine.getOptionValue(defaultCatOpt.getOpt());
-      }
-      File docPath = new File(cmdLine.getOptionValue(classifyOpt.getOpt()));
-      String encoding = "UTF-8";
-      if (cmdLine.hasOption(encodingOpt.getOpt())) {
-        encoding = cmdLine.getOptionValue(encodingOpt.getOpt());
-      }
-      Analyzer analyzer = null;
-      if (cmdLine.hasOption(analyzerOpt.getOpt())) {
-        String className = cmdLine.getOptionValue(analyzerOpt.getOpt());
-        Class clazz = Class.forName(className);
-        analyzer = (Analyzer) clazz.newInstance();
-      }
-      if (analyzer == null) {
-        analyzer = new StandardAnalyzer();
-      }
-      
-      int gramSize = 1;
-      if (cmdLine.hasOption(gramSizeOpt.getOpt())) {
-        gramSize = Integer.parseInt(cmdLine
-            .getOptionValue(gramSizeOpt.getOpt()));
-
-      }
-      
-      System.out.println("Converting input document to proper format");
-      String [] document = BayesFileFormatter.readerToDocument(analyzer, new InputStreamReader(new FileInputStream(docPath), encoding));      
-      StringBuilder line = new StringBuilder();
-      for(String token : document)
-      {
-        line.append(token).append(' ');
-      }
-      List<String> doc = Model.generateNGramsWithoutLabel(line.toString(), gramSize) ;
-      System.out.println("Done converting");
-      System.out.println("Classifying document: " + docPath);      
-      ClassifierResult category = classifier.classify(model, doc.toArray(new String[doc.size()]), defaultCat);
-      System.out.println("Category for " + docPath + " is " + category);
+
+    PosixParser parser = new PosixParser();
+    CommandLine cmdLine = parser.parse(options, args);
+    SequenceFileModelReader reader = new SequenceFileModelReader();
+    JobConf conf = new JobConf(Classify.class);
+
+    Map<String, Path> modelPaths = new HashMap<String, Path>();
+    String modelBasePath = cmdLine.getOptionValue(pathOpt.getOpt());
+    modelPaths.put("sigma_j", new Path(modelBasePath + "/trainer-weights/Sigma_j/part-*"));
+    modelPaths.put("sigma_k", new Path(modelBasePath + "/trainer-weights/Sigma_k/part-*"));
+    modelPaths.put("sigma_kSigma_j", new Path(modelBasePath + "/trainer-weights/Sigma_kSigma_j/part-*"));
+    modelPaths.put("thetaNormalizer", new Path(modelBasePath + "/trainer-thetaNormalizer/part-*"));
+    modelPaths.put("weight", new Path(modelBasePath + "/trainer-tfIdf/trainer-tfIdf/part-*"));
+
+    FileSystem fs = FileSystem.get(conf);
+
+    log.info("Loading model from: {}", modelPaths);
+
+    Model model = null;
+    Classifier classifier = null;
+
+    String classifierType = cmdLine.getOptionValue(typeOpt.getOpt());
+
+    if (classifierType.equalsIgnoreCase("bayes")) {
+      log.info("Testing Bayes Classifier");
+      model = new BayesModel();
+      classifier = new BayesClassifier();
+    } else if (classifierType.equalsIgnoreCase("cbayes")) {
+      log.info("Testing Complementary Bayes Classifier");
+      model = new CBayesModel();
+      classifier = new CBayesClassifier();
+    }
+
+    model = reader.loadModel(model, fs, modelPaths, conf);
+
+    log.info("Done loading model: # labels: {}", model.getLabels().size());
+
+    log.info("Done generating Model");
+
+
+    String defaultCat = "unknown";
+    if (cmdLine.hasOption(defaultCatOpt.getOpt())) {
+      defaultCat = cmdLine.getOptionValue(defaultCatOpt.getOpt());
     }
-    catch (ParseException exp) {
-      exp.printStackTrace(System.err);
+    File docPath = new File(cmdLine.getOptionValue(classifyOpt.getOpt()));
+    String encoding = "UTF-8";
+    if (cmdLine.hasOption(encodingOpt.getOpt())) {
+      encoding = cmdLine.getOptionValue(encodingOpt.getOpt());
     }
+    Analyzer analyzer = null;
+    if (cmdLine.hasOption(analyzerOpt.getOpt())) {
+      String className = cmdLine.getOptionValue(analyzerOpt.getOpt());
+      Class clazz = Class.forName(className);
+      analyzer = (Analyzer) clazz.newInstance();
+    }
+    if (analyzer == null) {
+      analyzer = new StandardAnalyzer();
+    }
+
+    int gramSize = 1;
+    if (cmdLine.hasOption(gramSizeOpt.getOpt())) {
+      gramSize = Integer.parseInt(cmdLine
+          .getOptionValue(gramSizeOpt.getOpt()));
+
+    }
+
+    log.info("Converting input document to proper format");
+    String[] document = BayesFileFormatter.readerToDocument(analyzer, new InputStreamReader(new FileInputStream(docPath), encoding));
+    StringBuilder line = new StringBuilder();
+    for(String token : document)
+    {
+      line.append(token).append(' ');
+    }
+    List<String> doc = Model.generateNGramsWithoutLabel(line.toString(), gramSize) ;
+    log.info("Done converting");
+    log.info("Classifying document: {}", docPath);
+    ClassifierResult category = classifier.classify(model, doc.toArray(new String[doc.size()]), defaultCat);
+    log.info("Category for {} is {}", docPath, category);
+
   }
 }

Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/ConfusionMatrix.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/ConfusionMatrix.java?rev=688522&r1=688521&r2=688522&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/ConfusionMatrix.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/ConfusionMatrix.java Sun Aug 24 09:10:42 2008
@@ -77,31 +77,28 @@
     }
   }
   
-  public void addInstance(String correctLabel, ClassifierResult classifiedResult) throws Exception{
+  public void addInstance(String correctLabel, ClassifierResult classifiedResult) {
     incrementCount(correctLabel, classifiedResult.getLabel());
   }  
   
-  public void addInstance(String correctLabel, String classifiedLabel) throws Exception{
+  public void addInstance(String correctLabel, String classifiedLabel) {
     incrementCount(correctLabel, classifiedLabel);
   }
   
-  public int getCount(String correctLabel, String classifiedLabel)
-      throws Exception {
+  public int getCount(String correctLabel, String classifiedLabel) {
     if (this.getLabels().contains(correctLabel)
         && this.getLabels().contains(classifiedLabel) == false) {
-      //System.err.println(correctLabel + " " + classifiedLabel);
-      throw new Exception("Label not found " +correctLabel + " " +classifiedLabel );
+      throw new IllegalArgumentException("Label not found " +correctLabel + " " +classifiedLabel );
     }
     int correctId = labelMap.get(correctLabel).intValue();
     int classifiedId = labelMap.get(classifiedLabel).intValue();
     return confusionMatrix[correctId][classifiedId];
   }
 
-  public void putCount(String correctLabel, String classifiedLabel, int count)
-      throws Exception {
+  public void putCount(String correctLabel, String classifiedLabel, int count) {
     if (this.getLabels().contains(correctLabel)
         && this.getLabels().contains(classifiedLabel) == false) {
-      throw new Exception("Label not found");
+      throw new IllegalArgumentException("Label not found");
     }
     int correctId = labelMap.get(correctLabel).intValue();
     int classifiedId = labelMap.get(classifiedLabel).intValue();
@@ -109,19 +106,18 @@
   }
 
   public void incrementCount(String correctLabel, String classifiedLabel,
-      int count) throws Exception {
+      int count) {
     putCount(correctLabel, classifiedLabel, count
         + getCount(correctLabel, classifiedLabel));
   }
 
-  public void incrementCount(String correctLabel, String classifiedLabel)
-      throws Exception {
+  public void incrementCount(String correctLabel, String classifiedLabel) {
     incrementCount(correctLabel, classifiedLabel, 1);
   }
 
-  public ConfusionMatrix Merge(ConfusionMatrix b) throws Exception {
+  public ConfusionMatrix Merge(ConfusionMatrix b) {
     if (this.getLabels().size() != b.getLabels().size())
-      throw new Exception("The Labels do not Match");
+      throw new IllegalArgumentException("The Labels do not Match");
 
     //if (this.getLabels().containsAll(b.getLabels()))
     //  ;
@@ -134,7 +130,7 @@
     return this;
   }
 
-  public String summarize() throws Exception {
+  public String summarize() {
     StringBuilder returnString = new StringBuilder();
     returnString
         .append("=======================================================\n");
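
The idiom in this file is to signal bad arguments with the unchecked IllegalArgumentException, so callers are no longer forced to declare or catch a broad checked Exception. A generic sketch of the check (class and method names are illustrative, not from the commit):

    import java.util.Collection;

    final class LabelPreconditions {

      private LabelPreconditions() {
      }

      // Throwing the unchecked IllegalArgumentException flags a caller error
      // without adding "throws Exception" to every signature in the call chain.
      static void checkLabelKnown(Collection<String> labels, String label) {
        if (!labels.contains(label)) {
          throw new IllegalArgumentException("Label not found: " + label);
        }
      }
    }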

Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/ResultAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/ResultAnalyzer.java?rev=688522&r1=688521&r2=688522&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/ResultAnalyzer.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/ResultAnalyzer.java Sun Aug 24 09:10:42 2008
@@ -45,8 +45,7 @@
   public ConfusionMatrix getConfusionMatrix(){
     return this.confusionMatrix;
   }
-  public void addInstance(String correctLabel, ClassifierResult classifiedResult)
-      throws Exception {
+  public void addInstance(String correctLabel, ClassifierResult classifiedResult) {
     if (correctLabel.equals(classifiedResult.getLabel()))
       correctlyClassified++;
     else
@@ -58,7 +57,7 @@
     return "";
   }
 
-  public String summarize() throws Exception {
+  public String summarize() {
     StringBuilder returnString = new StringBuilder();
 
     returnString

Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/BayesDriver.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/BayesDriver.java?rev=688522&r1=688521&r2=688522&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/BayesDriver.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/BayesDriver.java Sun Aug 24 09:10:42 2008
@@ -23,12 +23,18 @@
 import org.apache.mahout.classifier.bayes.common.BayesFeatureDriver;
 import org.apache.mahout.classifier.bayes.common.BayesTfIdfDriver;
 import org.apache.mahout.classifier.bayes.common.BayesWeightSummerDriver;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
 
 /**
  * Create and run the Bayes Trainer.
- * 
  */
 public class BayesDriver {
+
+  private static final Logger log = LoggerFactory.getLogger(BayesDriver.class);
+
   /**
    * Takes in two arguments:
    * <ol>
@@ -41,7 +47,7 @@
    * 
    * @param args The args
    */
-  public static void main(String[] args) {
+  public static void main(String[] args) throws IOException {
     String input = args[0];
     String output = args[1];
 
@@ -53,74 +59,66 @@
    * 
    * @param input the input pathname String
    * @param output the output pathname String
-   * 
    */
-  @SuppressWarnings("deprecation")
-  public static void runJob(String input, String output, int gramSize) {
+  public static void runJob(String input, String output, int gramSize) throws IOException {
     JobConf conf = new JobConf(BayesDriver.class);
-    try {
-      FileSystem dfs = FileSystem.get(conf);
-      Path outPath = new Path(output);
-      if (dfs.exists(outPath))
-        dfs.delete(outPath);
-      
-      System.out.println("Reading features...");
-      //Read the features in each document normalized by length of each document
-      BayesFeatureDriver.runJob(input, output, gramSize);
-      
-      System.out.println("Calculating Tf-Idf...");
-      //Calculate the TfIdf for each word in each label
-      BayesTfIdfDriver.runJob(input, output);
-      
-      System.out.println("Calculating weight sums for labels and features...");
-      //Calculate the Sums of weights for each label, for each feature and for each feature and for each label
-      BayesWeightSummerDriver.runJob(input, output);
-      
-      //System.out.println("Calculating the weight of the features of each label in the complement class...");
-      //Calculate the W_ij = log(Theta) for each label, feature. This step actually generates the complement class
-      //CBayesThetaDriver.runJob(input, output);
-      
-      System.out.println("Calculating the weight Normalisation factor for each class...");
-      //Calculate the normalization factor Sigma_W_ij for each complement class. 
-      BayesThetaNormalizerDriver.runJob(input, output);
-      
-      //System.out.println("Calculating the final Weight Normalized Complementary Naive Bayes Model...");
-      //Calculate the normalization factor Sigma_W_ij for each complement class. 
-      //CBayesNormalizedWeightDriver.runJob(input, output);
-      
-      Path docCountOutPath = new Path(output+ "/trainer-docCount");
-      if (dfs.exists(docCountOutPath))
-        dfs.delete(docCountOutPath, true);
-      Path termDocCountOutPath = new Path(output+ "/trainer-termDocCount");
-      if (dfs.exists(termDocCountOutPath))
-        dfs.delete(termDocCountOutPath, true);
-      Path featureCountOutPath = new Path(output+ "/trainer-featureCount");
-      if (dfs.exists(featureCountOutPath))
-        dfs.delete(featureCountOutPath, true);
-      Path wordFreqOutPath = new Path(output+ "/trainer-wordFreq");
-      if (dfs.exists(wordFreqOutPath))
-        dfs.delete(wordFreqOutPath, true);
-      Path vocabCountPath = new Path(output+ "/trainer-tfIdf/trainer-vocabCount");
-      if (dfs.exists(vocabCountPath))
-        dfs.delete(vocabCountPath, true);
-      /*Path tfIdfOutPath = new Path(output+ "/trainer-tfIdf");
-      if (dfs.exists(tfIdfOutPath))
-        dfs.delete(tfIdfOutPath, true);*/
-      Path vocabCountOutPath = new Path(output+ "/trainer-vocabCount");
-      if (dfs.exists(vocabCountOutPath))
-        dfs.delete(vocabCountOutPath, true);
-     /* Path weightsOutPath = new Path(output+ "/trainer-weights");
-      if (dfs.exists(weightsOutPath))
-        dfs.delete(weightsOutPath, true);*/
-      /*Path thetaOutPath = new Path(output+ "/trainer-theta");
-      if (dfs.exists(thetaOutPath))
-        dfs.delete(thetaOutPath, true);*/
-      /*Path thetaNormalizerOutPath = new Path(output+ "/trainer-thetaNormalizer");
-      if (dfs.exists(thetaNormalizerOutPath))
-        dfs.delete(thetaNormalizerOutPath, true);*/
-      
-    } catch (Exception e) {
-      throw new RuntimeException(e);
-    }
+    FileSystem dfs = FileSystem.get(conf);
+    Path outPath = new Path(output);
+    if (dfs.exists(outPath))
+      dfs.delete(outPath, true);
+
+    log.info("Reading features...");
+    //Read the features in each document normalized by length of each document
+    BayesFeatureDriver.runJob(input, output, gramSize);
+
+    log.info("Calculating Tf-Idf...");
+    //Calculate the TfIdf for each word in each label
+    BayesTfIdfDriver.runJob(input, output);
+
+    log.info("Calculating weight sums for labels and features...");
+    //Calculate the Sums of weights for each label, for each feature and for each feature and for each label
+    BayesWeightSummerDriver.runJob(input, output);
+
+    //Calculate the W_ij = log(Theta) for each label, feature. This step actually generates the complement class
+    //CBayesThetaDriver.runJob(input, output);
+
+    log.info("Calculating the weight Normalisation factor for each class...");
+    //Calculate the normalization factor Sigma_W_ij for each complement class.
+    BayesThetaNormalizerDriver.runJob(input, output);
+
+    //Calculate the normalization factor Sigma_W_ij for each complement class.
+    //CBayesNormalizedWeightDriver.runJob(input, output);
+
+    Path docCountOutPath = new Path(output+ "/trainer-docCount");
+    if (dfs.exists(docCountOutPath))
+      dfs.delete(docCountOutPath, true);
+    Path termDocCountOutPath = new Path(output+ "/trainer-termDocCount");
+    if (dfs.exists(termDocCountOutPath))
+      dfs.delete(termDocCountOutPath, true);
+    Path featureCountOutPath = new Path(output+ "/trainer-featureCount");
+    if (dfs.exists(featureCountOutPath))
+      dfs.delete(featureCountOutPath, true);
+    Path wordFreqOutPath = new Path(output+ "/trainer-wordFreq");
+    if (dfs.exists(wordFreqOutPath))
+      dfs.delete(wordFreqOutPath, true);
+    Path vocabCountPath = new Path(output+ "/trainer-tfIdf/trainer-vocabCount");
+    if (dfs.exists(vocabCountPath))
+      dfs.delete(vocabCountPath, true);
+    /*Path tfIdfOutPath = new Path(output+ "/trainer-tfIdf");
+    if (dfs.exists(tfIdfOutPath))
+      dfs.delete(tfIdfOutPath, true);*/
+    Path vocabCountOutPath = new Path(output+ "/trainer-vocabCount");
+    if (dfs.exists(vocabCountOutPath))
+      dfs.delete(vocabCountOutPath, true);
+   /* Path weightsOutPath = new Path(output+ "/trainer-weights");
+    if (dfs.exists(weightsOutPath))
+      dfs.delete(weightsOutPath, true);*/
+    /*Path thetaOutPath = new Path(output+ "/trainer-theta");
+    if (dfs.exists(thetaOutPath))
+      dfs.delete(thetaOutPath, true);*/
+    /*Path thetaNormalizerOutPath = new Path(output+ "/trainer-thetaNormalizer");
+    if (dfs.exists(thetaNormalizerOutPath))
+      dfs.delete(thetaNormalizerOutPath, true);*/
+
   }
 }
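
The drivers share the same clean-up idiom: delete any previous output directory and let the IOException from the file system call propagate to the caller instead of wrapping it in RuntimeException. A minimal sketch of that idiom (class and method names are illustrative):

    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.mapred.JobConf;

    import java.io.IOException;

    final class OutputCleaner {

      // Remove a previous run's output directory before resubmitting the job;
      // the IOException from HDFS propagates rather than being wrapped.
      static void deleteIfExists(JobConf conf, String output) throws IOException {
        FileSystem dfs = FileSystem.get(conf);
        Path outPath = new Path(output);
        if (dfs.exists(outPath)) {
          dfs.delete(outPath, true); // recursive delete, replacing the deprecated delete(Path)
        }
      }
    }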

Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/BayesModel.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/BayesModel.java?rev=688522&r1=688521&r2=688522&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/BayesModel.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/BayesModel.java Sun Aug 24 09:10:42 2008
@@ -19,16 +19,15 @@
 
 
 import org.apache.mahout.common.Model;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 import java.util.Map;
 
-
-/**
- * 
- * 
- */
 public class BayesModel extends Model {
 
+  private static final Logger log = LoggerFactory.getLogger(BayesModel.class);
+
   @Override
   protected float getWeight(Integer label, Integer feature) {
     float result = 0.0f;
@@ -69,7 +68,7 @@
   public void InitializeNormalizer() {
     float perLabelWeightSumNormalisationFactor = Float.MAX_VALUE;
 
-    System.out.println(thetaNormalizer);
+    log.info("{}", thetaNormalizer);
     for (Integer label : thetaNormalizer.keySet()) {
       float Sigma_W_ij = thetaNormalizer.get(label);
       if (perLabelWeightSumNormalisationFactor > Math.abs(Sigma_W_ij)) {
@@ -82,12 +81,11 @@
       thetaNormalizer.put(label, Sigma_W_ij
           / perLabelWeightSumNormalisationFactor);
     }
-    System.out.println(thetaNormalizer);
+    log.info("{}", thetaNormalizer);
   }
 
   @Override
   public void GenerateModel() {
-    try {
       float vocabCount = featureList.size();
 
       float[] perLabelThetaNormalizer = new float[labelList.size()];
@@ -114,7 +112,7 @@
 
         }
       }
-      System.out.println("Normalizing Weights");
+      log.info("Normalizing Weights");
       for (int label = 0, maxLabels = labelList.size(); label < maxLabels; label++) {
         float Sigma_W_ij = perLabelThetaNormalizer[label];
         if (perLabelWeightSumNormalisationFactor > Math.abs(Sigma_W_ij)) {
@@ -138,10 +136,7 @@
           setWeight(label, feature, normalizedWeight);
         }
       }
-    } catch (Exception e) {
-      // TODO Auto-generated catch block
-      e.printStackTrace();
-    }
+
   }
 
   

Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/BayesThetaNormalizerDriver.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/BayesThetaNormalizerDriver.java?rev=688522&r1=688521&r2=688522&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/BayesThetaNormalizerDriver.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/BayesThetaNormalizerDriver.java Sun Aug 24 09:10:42 2008
@@ -27,16 +27,20 @@
 import org.apache.hadoop.mapred.SequenceFileOutputFormat;
 import org.apache.hadoop.util.GenericsUtil;
 import org.apache.mahout.classifier.bayes.io.SequenceFileModelReader;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 import java.util.HashMap;
 import java.util.Map;
-
+import java.io.IOException;
 
 /**
  * Create and run the Bayes Theta Normalization Step.
- *
- **/
+ */
 public class BayesThetaNormalizerDriver {
+
+  private static final Logger log = LoggerFactory.getLogger(BayesThetaNormalizerDriver.class);
+
   /**
    * Takes in two arguments:
    * <ol>
@@ -45,7 +49,7 @@
    * </ol>
    * @param args The args
    */
-  public static void main(String[] args) {
+  public static void main(String[] args) throws IOException {
     String input = args[0];
     String output = args[1];
 
@@ -58,7 +62,7 @@
    * @param input            the input pathname String
    * @param output           the output pathname String
    */
-  public static void runJob(String input, String output) {
+  public static void runJob(String input, String output) throws IOException {
     JobClient client = new JobClient();
     JobConf conf = new JobConf(BayesThetaNormalizerDriver.class);
     
@@ -77,50 +81,45 @@
     conf.setOutputFormat(SequenceFileOutputFormat.class);
     conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,org.apache.hadoop.io.serializer.WritableSerialization"); // Dont ever forget this. People should keep track of how hadoop conf parameters and make or break a piece of code
     
-    try {
-      FileSystem dfs = FileSystem.get(conf);
-      if (dfs.exists(outPath))
-        dfs.delete(outPath, true);
-      
- SequenceFileModelReader reader = new SequenceFileModelReader();
-      
-      Path Sigma_kFiles = new Path(output+"/trainer-weights/Sigma_k/*");         
-      HashMap<String,Float> labelWeightSum= reader.readLabelSums(dfs, Sigma_kFiles, conf);
-      DefaultStringifier<HashMap<String,Float>> mapStringifier = new DefaultStringifier<HashMap<String,Float>>(conf, GenericsUtil.getClass(labelWeightSum));     
-      String labelWeightSumString = mapStringifier.toString(labelWeightSum);
-      
-      System.out.println("Sigma_k for Each Label");
-      Map<String,Float> c = mapStringifier.fromString(labelWeightSumString);
-      System.out.println(c);
-      conf.set("cnaivebayes.sigma_k", labelWeightSumString);
-      
-      
-      Path sigma_kSigma_jFile = new Path(output+"/trainer-weights/Sigma_kSigma_j/*");         
-      Float sigma_jSigma_k = reader.readSigma_jSigma_k(dfs, sigma_kSigma_jFile, conf);
-      DefaultStringifier<Float> floatStringifier = new DefaultStringifier<Float>(conf, Float.class);     
-      String sigma_jSigma_kString = floatStringifier.toString(sigma_jSigma_k);
-      
-      System.out.println("Sigma_kSigma_j for each Label and for each Features");
-      Float retSigma_jSigma_k = floatStringifier.fromString(sigma_jSigma_kString);      
-      System.out.println(retSigma_jSigma_k);
-      conf.set("cnaivebayes.sigma_jSigma_k", sigma_jSigma_kString);
-      
-      Path vocabCountFile = new Path(output+"/trainer-tfIdf/trainer-vocabCount/*"); 
-      Float vocabCount = reader.readVocabCount(dfs, vocabCountFile, conf);
-      String vocabCountString = floatStringifier.toString(vocabCount);
-      
-      System.out.println("Vocabulary Count");
-      conf.set("cnaivebayes.vocabCount", vocabCountString);
-      Float retvocabCount = floatStringifier.fromString(vocabCountString);
-      System.out.println(retvocabCount);
-      
-      client.setConf(conf);    
-    
-      JobClient.runJob(conf);      
-      
-    } catch (Exception e) {
-      throw new RuntimeException(e);
-    }
+    FileSystem dfs = FileSystem.get(conf);
+    if (dfs.exists(outPath))
+      dfs.delete(outPath, true);
+
+    SequenceFileModelReader reader = new SequenceFileModelReader();
+
+    Path Sigma_kFiles = new Path(output+"/trainer-weights/Sigma_k/*");
+    HashMap<String,Float> labelWeightSum= reader.readLabelSums(dfs, Sigma_kFiles, conf);
+    DefaultStringifier<HashMap<String,Float>> mapStringifier = new DefaultStringifier<HashMap<String,Float>>(conf, GenericsUtil.getClass(labelWeightSum));
+    String labelWeightSumString = mapStringifier.toString(labelWeightSum);
+
+    log.info("Sigma_k for Each Label");
+    Map<String,Float> c = mapStringifier.fromString(labelWeightSumString);
+    log.info("{}", c);
+    conf.set("cnaivebayes.sigma_k", labelWeightSumString);
+
+
+    Path sigma_kSigma_jFile = new Path(output+"/trainer-weights/Sigma_kSigma_j/*");
+    Float sigma_jSigma_k = reader.readSigma_jSigma_k(dfs, sigma_kSigma_jFile, conf);
+    DefaultStringifier<Float> floatStringifier = new DefaultStringifier<Float>(conf, Float.class);
+    String sigma_jSigma_kString = floatStringifier.toString(sigma_jSigma_k);
+
+    log.info("Sigma_kSigma_j for each Label and for each Features");
+    Float retSigma_jSigma_k = floatStringifier.fromString(sigma_jSigma_kString);
+    log.info("{}", retSigma_jSigma_k);
+    conf.set("cnaivebayes.sigma_jSigma_k", sigma_jSigma_kString);
+
+    Path vocabCountFile = new Path(output+"/trainer-tfIdf/trainer-vocabCount/*");
+    Float vocabCount = reader.readVocabCount(dfs, vocabCountFile, conf);
+    String vocabCountString = floatStringifier.toString(vocabCount);
+
+    log.info("Vocabulary Count");
+    conf.set("cnaivebayes.vocabCount", vocabCountString);
+    Float retvocabCount = floatStringifier.fromString(vocabCountString);
+    log.info("{}", retvocabCount);
+
+    client.setConf(conf);
+
+    JobClient.runJob(conf);
     
   }
 }
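
The driver hands precomputed maps to the mappers by serializing them into the job configuration with DefaultStringifier; the io.serializations setting above (JavaSerialization plus WritableSerialization) is what makes that round trip work. A condensed sketch of the store and load halves (class and method names are illustrative; the configuration key is the one used above):

    import org.apache.hadoop.io.DefaultStringifier;
    import org.apache.hadoop.mapred.JobConf;
    import org.apache.hadoop.util.GenericsUtil;

    import java.io.IOException;
    import java.util.HashMap;

    final class SigmaKConfPassing {

      // Driver side: serialize the label weight sums into the job configuration.
      static void store(JobConf conf, HashMap<String, Float> labelWeightSum) throws IOException {
        DefaultStringifier<HashMap<String, Float>> stringifier =
            new DefaultStringifier<HashMap<String, Float>>(conf, GenericsUtil.getClass(labelWeightSum));
        conf.set("cnaivebayes.sigma_k", stringifier.toString(labelWeightSum));
      }

      // Mapper side: read the map back, typically from configure(JobConf).
      static HashMap<String, Float> load(JobConf conf) throws IOException {
        HashMap<String, Float> empty = new HashMap<String, Float>();
        DefaultStringifier<HashMap<String, Float>> stringifier =
            new DefaultStringifier<HashMap<String, Float>>(conf, GenericsUtil.getClass(empty));
        return stringifier.fromString(conf.get("cnaivebayes.sigma_k"));
      }
    }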

Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/BayesThetaNormalizerMapper.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/BayesThetaNormalizerMapper.java?rev=688522&r1=688521&r2=688522&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/BayesThetaNormalizerMapper.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/BayesThetaNormalizerMapper.java Sun Aug 24 09:10:42 2008
@@ -26,6 +26,8 @@
 import org.apache.hadoop.mapred.OutputCollector;
 import org.apache.hadoop.mapred.Reporter;
 import org.apache.hadoop.util.GenericsUtil;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 import java.io.IOException;
 import java.util.HashMap;
@@ -33,6 +35,8 @@
 public class BayesThetaNormalizerMapper extends MapReduceBase implements
     Mapper<Text, FloatWritable, Text, FloatWritable> {
 
+  private static final Logger log = LoggerFactory.getLogger(BayesThetaNormalizerMapper.class);
+
   public HashMap<String, Float> labelWeightSum = null;
 
   String labelWeightSumString = " ";
@@ -93,8 +97,7 @@
 
       }
     } catch (IOException ex) {
-
-      ex.printStackTrace();
+      log.warn(ex.toString(), ex);
     }
   }
 

Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/BayesThetaNormalizerReducer.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/BayesThetaNormalizerReducer.java?rev=688522&r1=688521&r2=688522&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/BayesThetaNormalizerReducer.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/BayesThetaNormalizerReducer.java Sun Aug 24 09:10:42 2008
@@ -31,9 +31,7 @@
 /**
  * Can also be used as a local Combiner beacuse only two values should be there
  * inside the values
- * 
  */
-
 public class BayesThetaNormalizerReducer extends MapReduceBase implements
     Reducer<Text, FloatWritable, Text, FloatWritable> {
 
@@ -49,21 +47,19 @@
 
   String vocabCountString = " ";
   
-  @SuppressWarnings("unused")
   public void reduce(Text key, Iterator<FloatWritable> values,
       OutputCollector<Text, FloatWritable> output, Reporter reporter)
       throws IOException {
     // Key is label,word, value is the number of times we've seen this label
     // word per local node. Output is the same
     
-    String token = key.toString();
+    //String token = key.toString();
 
     float weightSumPerLabel = 0.0f;
 
     while (values.hasNext()) {
       weightSumPerLabel += values.next().get();
     }
-    // System.out.println(token + "=>"+ weightSumPerLabel);
     output.collect(key, new FloatWritable(weightSumPerLabel));
 
   }

Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/common/BayesFeatureDriver.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/common/BayesFeatureDriver.java?rev=688522&r1=688521&r2=688522&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/common/BayesFeatureDriver.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/common/BayesFeatureDriver.java Sun Aug 24 09:10:42 2008
@@ -24,13 +24,20 @@
 import org.apache.hadoop.mapred.JobClient;
 import org.apache.hadoop.mapred.JobConf;
 import org.apache.hadoop.mapred.KeyValueTextInputFormat;
+import org.apache.hadoop.mapred.FileInputFormat;
+import org.apache.hadoop.mapred.FileOutputFormat;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
+import java.io.IOException;
 
 /**
  * Create and run the Bayes Feature Reader Step.
- *
- **/
+ */
 public class BayesFeatureDriver {
+
+  private static final Logger log = LoggerFactory.getLogger(BayesFeatureDriver.class);  
+
   /**
    * Takes in two arguments:
    * <ol>
@@ -39,7 +46,7 @@
    * </ol>
    * @param args The args
    */
-  public static void main(String[] args) {
+  public static void main(String[] args) throws IOException {
     String input = args[0];
     String output = args[1];
 
@@ -52,18 +59,16 @@
    * @param input            the input pathname String
    * @param output           the output pathname String
    */
-
-  @SuppressWarnings("deprecation")
-  public static void runJob(String input, String output, int gramSize) {
+  public static void runJob(String input, String output, int gramSize) throws IOException {
     JobClient client = new JobClient();
     JobConf conf = new JobConf(BayesFeatureDriver.class);
 
     conf.setOutputKeyClass(Text.class);
     conf.setOutputValueClass(FloatWritable.class);
 
-    conf.setInputPath(new Path(input));
+    FileInputFormat.setInputPaths(conf, new Path(input));
     Path outPath = new Path(output);
-    conf.setOutputPath(outPath);
+    FileOutputFormat.setOutputPath(conf, outPath);
     conf.setNumMapTasks(100);
     //conf.setNumReduceTasks(1);
     conf.setMapperClass(BayesFeatureMapper.class);
@@ -73,27 +78,23 @@
     conf.setReducerClass(BayesFeatureReducer.class);    
     conf.setOutputFormat(BayesFeatureOutputFormat.class);
 
-    conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,org.apache.hadoop.io.serializer.WritableSerialization"); // Dont ever forget this. People should keep track of how hadoop conf parameters and make or break a piece of code
-    
-    try {
-      FileSystem dfs = FileSystem.get(conf);
-      if (dfs.exists(outPath))
-        dfs.delete(outPath, true);
-      
-      DefaultStringifier<Integer> intStringifier = new DefaultStringifier<Integer>(conf, Integer.class);     
-      String gramSizeString = intStringifier.toString(new Integer(gramSize));
-      
-      Integer retGramSize = intStringifier.fromString(gramSizeString);      
-      System.out.println(retGramSize);
-      conf.set("bayes.gramSize", gramSizeString);
-      
-      client.setConf(conf);    
-      JobClient.runJob(conf);      
-      
-      
-    } catch (Exception e) {
-      throw new RuntimeException(e);
-    }
+    conf.set("io.serializations",
+             "org.apache.hadoop.io.serializer.JavaSerialization,org.apache.hadoop.io.serializer.WritableSerialization"); 
+    // Dont ever forget this. People should keep track of how hadoop conf parameters and make or break a piece of code
+
+    FileSystem dfs = FileSystem.get(conf);
+    if (dfs.exists(outPath))
+      dfs.delete(outPath, true);
+
+    DefaultStringifier<Integer> intStringifier = new DefaultStringifier<Integer>(conf, Integer.class);
+    String gramSizeString = intStringifier.toString(new Integer(gramSize));
+
+    Integer retGramSize = intStringifier.fromString(gramSizeString);
+    log.info("{}", retGramSize);
+    conf.set("bayes.gramSize", gramSizeString);
+
+    client.setConf(conf);
+    JobClient.runJob(conf);
     
   }
 }
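
This hunk also swaps the deprecated JobConf.setInputPath/setOutputPath calls for the static helpers on FileInputFormat and FileOutputFormat in the old mapred API. A minimal sketch of that wiring (class and method names are illustrative):

    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.mapred.FileInputFormat;
    import org.apache.hadoop.mapred.FileOutputFormat;
    import org.apache.hadoop.mapred.JobConf;

    final class JobPathSetup {

      // The non-deprecated way to wire input and output paths in the old mapred API.
      static void configurePaths(JobConf conf, String input, String output) {
        FileInputFormat.setInputPaths(conf, new Path(input));   // replaces conf.setInputPath(new Path(input))
        FileOutputFormat.setOutputPath(conf, new Path(output)); // replaces conf.setOutputPath(outPath)
      }
    }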

Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/common/BayesFeatureMapper.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/common/BayesFeatureMapper.java?rev=688522&r1=688521&r2=688522&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/common/BayesFeatureMapper.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/common/BayesFeatureMapper.java Sun Aug 24 09:10:42 2008
@@ -27,6 +27,8 @@
 import org.apache.hadoop.mapred.Reporter;
 import org.apache.mahout.classifier.BayesFileFormatter;
 import org.apache.mahout.common.Model;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 import java.io.IOException;
 import java.util.HashMap;
@@ -34,12 +36,14 @@
 import java.util.List;
 
 /**
- * Reads the input train set(preprocessed using the {@link BayesFileFormatter}). 
- * 
+ * Reads the input train set(preprocessed using the {@link BayesFileFormatter}).
  */
 public class BayesFeatureMapper extends MapReduceBase implements
     Mapper<Text, Text, Text, FloatWritable> {
-  private final static FloatWritable one = new FloatWritable(1.0f);
+
+  private static final Logger log = LoggerFactory.getLogger(BayesFeatureMapper.class);
+
+  private static final FloatWritable one = new FloatWritable(1.0f);
 
   private final Text labelWord = new Text();
 
@@ -47,8 +51,10 @@
 
   /**
    * We need to count the number of times we've seen a term with a given label
-   * and we need to output that. But this Mapper does more than just outputing the count. It first does weight normalisation.
-   * Secondly, it outputs for each unique word in a document value 1 for summing up as the Term Document Frequency. Which later is used to calculate the Idf
+   * and we need to output that. But this Mapper does more than just outputing the count. It first does weight
+   * normalisation.
+   * Secondly, it outputs for each unique word in a document value 1 for summing up as the Term Document Frequency.
+   * Which later is used to calculate the Idf
    * Thirdly, it outputs for each label the number of times a document was seen(Also used in Idf Calculation)
    * 
    * @param key The label
@@ -123,8 +129,7 @@
       gramSize = intStringifier.fromString(gramSizeString);
 
     } catch (IOException ex) {
-
-      ex.printStackTrace();
+      log.warn(ex.toString(), ex);
     }
   }
 }

Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/common/BayesTfIdfDriver.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/common/BayesTfIdfDriver.java?rev=688522&r1=688521&r2=688522&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/common/BayesTfIdfDriver.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/common/BayesTfIdfDriver.java Sun Aug 24 09:10:42 2008
@@ -27,15 +27,20 @@
 import org.apache.hadoop.mapred.SequenceFileOutputFormat;
 import org.apache.hadoop.util.GenericsUtil;
 import org.apache.mahout.classifier.bayes.io.SequenceFileModelReader;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 import java.util.HashMap;
 import java.util.Map;
+import java.io.IOException;
 
 /**
  * The Driver which drives the Tf-Idf Generation
- *
- **/
+ */
 public class BayesTfIdfDriver {
+
+  private static final Logger log = LoggerFactory.getLogger(BayesTfIdfDriver.class);
+
   /**
    * Takes in two arguments:
    * <ol>
@@ -44,7 +49,7 @@
    * </ol>
    * @param args The args
    */
-  public static void main(String[] args) {
+  public static void main(String[] args) throws IOException {
     String input = args[0];
     String output = args[1];
 
@@ -57,7 +62,7 @@
    * @param input            the input pathname String
    * @param output           the output pathname String
    */
-  public static void runJob(String input, String output) {
+  public static void runJob(String input, String output) throws IOException {
     JobClient client = new JobClient();
     JobConf conf = new JobConf(BayesTfIdfDriver.class);
     
@@ -79,33 +84,27 @@
     conf.setOutputFormat(BayesTfIdfOutputFormat.class);
     
     conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,org.apache.hadoop.io.serializer.WritableSerialization"); // Dont ever forget this. People should keep track of how hadoop conf parameters and make or break a piece of code
-     try {
-      FileSystem dfs = FileSystem.get(conf);
-      if (dfs.exists(outPath))
-        dfs.delete(outPath, true);
-      
-      SequenceFileModelReader reader = new SequenceFileModelReader();
-      
-      Path interimFile = new Path(output+"/trainer-docCount/part-*");      
-      
-      HashMap<String,Float> labelDocumentCounts= reader.readLabelDocumentCounts(dfs, interimFile, conf);
-
-      DefaultStringifier<HashMap<String,Float>> mapStringifier = new DefaultStringifier<HashMap<String,Float>>(conf,GenericsUtil.getClass(labelDocumentCounts));
-      
-      String labelDocumentCountString = mapStringifier.toString(labelDocumentCounts);
-      System.out.println("Counts of documents in Each Label");
-      Map<String,Float> c = mapStringifier.fromString(labelDocumentCountString);
-      System.out.println(c);
-      
-      conf.set("cnaivebayes.labelDocumentCounts", labelDocumentCountString);
-      
-      client.setConf(conf);    
-    
-      JobClient.runJob(conf);      
-      
-    } catch (Exception e) {
-      throw new RuntimeException(e);
-    }
-    
+    FileSystem dfs = FileSystem.get(conf);
+    if (dfs.exists(outPath))
+      dfs.delete(outPath, true);
+
+    SequenceFileModelReader reader = new SequenceFileModelReader();
+
+    Path interimFile = new Path(output+"/trainer-docCount/part-*");
+
+    HashMap<String,Float> labelDocumentCounts= reader.readLabelDocumentCounts(dfs, interimFile, conf);
+
+    DefaultStringifier<HashMap<String,Float>> mapStringifier = new DefaultStringifier<HashMap<String,Float>>(conf,GenericsUtil.getClass(labelDocumentCounts));
+
+    String labelDocumentCountString = mapStringifier.toString(labelDocumentCounts);
+    log.info("Counts of documents in Each Label");
+    Map<String,Float> c = mapStringifier.fromString(labelDocumentCountString);
+    log.info("{}", c);
+
+    conf.set("cnaivebayes.labelDocumentCounts", labelDocumentCountString);
+
+    client.setConf(conf);
+
+    JobClient.runJob(conf);
   }
 }

Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/common/BayesTfIdfMapper.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/common/BayesTfIdfMapper.java?rev=688522&r1=688521&r2=688522&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/common/BayesTfIdfMapper.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/common/BayesTfIdfMapper.java Sun Aug 24 09:10:42 2008
@@ -17,7 +17,6 @@
  * limitations under the License.
  */
 
-
 import org.apache.hadoop.io.DefaultStringifier;
 import org.apache.hadoop.io.FloatWritable;
 import org.apache.hadoop.io.Text;
@@ -27,6 +26,8 @@
 import org.apache.hadoop.mapred.OutputCollector;
 import org.apache.hadoop.mapred.Reporter;
 import org.apache.hadoop.util.GenericsUtil;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 import java.io.IOException;
 import java.util.HashMap;
@@ -34,8 +35,11 @@
 public class BayesTfIdfMapper extends MapReduceBase implements
     Mapper<Text, FloatWritable, Text, FloatWritable> {
 
+  private static final Logger log = LoggerFactory.getLogger(BayesTfIdfMapper.class);  
+
   public HashMap<String, Float> labelDocumentCounts = null;
   String labelDocumentCountString =" ";
+
   /**
    * We need to calculate the Tf-Idf of each feature in each label
    * 
@@ -51,7 +55,6 @@
       throws IOException {
  
     String labelFeaturePair = key.toString();
-   
 
     if (labelFeaturePair.startsWith("-")) { // if it is the termDocumentCount
       labelFeaturePair = labelFeaturePair.substring(1);
@@ -66,34 +69,29 @@
       float logIdf = (float)Math.log(labelDocumentCount.floatValue()  / value.get());
       
       output.collect(new Text(labelFeaturePair), new FloatWritable(logIdf));
-    } 
-    else if (labelFeaturePair.startsWith(",")) {
+    } else if (labelFeaturePair.startsWith(",")) {
       output.collect(new Text("*vocabCount"), new FloatWritable(1.0f));
-    }
-    else {
+    } else {
       output.collect(key, value);
     }
   }
   
   @Override
   public void configure(JobConf job) {
-    try
-    {
-      if(labelDocumentCounts ==null){
+    try {
+      if (labelDocumentCounts == null){
         labelDocumentCounts = new HashMap<String, Float>();
 
-        DefaultStringifier<HashMap<String,Float>> mapStringifier = new DefaultStringifier<HashMap<String,Float>>(job,GenericsUtil.getClass(labelDocumentCounts));
+        DefaultStringifier<HashMap<String,Float>> mapStringifier =
+            new DefaultStringifier<HashMap<String,Float>>(job,GenericsUtil.getClass(labelDocumentCounts));
 
         labelDocumentCountString = mapStringifier.toString(labelDocumentCounts);  
         labelDocumentCountString = job.get("cnaivebayes.labelDocumentCounts", labelDocumentCountString);
         
-        
         labelDocumentCounts = mapStringifier.fromString(labelDocumentCountString);
       }
-    }
-    catch(IOException ex){
-      
-      ex.printStackTrace();
+    } catch(IOException ex){
+      log.warn(ex.toString(), ex);
     }
   }
 }

Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/common/BayesTfIdfReducer.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/common/BayesTfIdfReducer.java?rev=688522&r1=688521&r2=688522&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/common/BayesTfIdfReducer.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/common/BayesTfIdfReducer.java Sun Aug 24 09:10:42 2008
@@ -22,38 +22,39 @@
 import org.apache.hadoop.mapred.OutputCollector;
 import org.apache.hadoop.mapred.Reducer;
 import org.apache.hadoop.mapred.Reporter;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 import java.io.IOException;
 import java.util.Iterator;
 
-
 /**
  *  Can also be used as a local Combiner because only two values should be present in the values
- *
- **/
-
+ */
 public class BayesTfIdfReducer extends MapReduceBase implements Reducer<Text, FloatWritable, Text, FloatWritable> {
-  public void reduce(Text key, Iterator<FloatWritable> values, OutputCollector<Text, FloatWritable> output, Reporter reporter) throws IOException {
+
+  private static final Logger log = LoggerFactory.getLogger(BayesTfIdfReducer.class);
+
+  public void reduce(Text key,
+                     Iterator<FloatWritable> values,
+                     OutputCollector<Text, FloatWritable> output,
+                     Reporter reporter) throws IOException {
     //Key is label,word, value is the number of times we've seen this label word per local node.  Output is the same
     String token = key.toString();  
-    if(token.startsWith("*vocabCount"))
-    {
+    if(token.startsWith("*vocabCount")) {
       float vocabCount = 0;
       while (values.hasNext()) {
         vocabCount += values.next().get();
       }
-      System.out.println(token + "\t"+vocabCount);
+      log.info("{}\t{}", token, vocabCount);
       output.collect(key, new FloatWritable(vocabCount));
-    }
-    else
-    {
+    } else {
       float idfTimes_D_ij = 1.0f;
-      int numberofValues = 0;
+      //int numberofValues = 0;
       while (values.hasNext()) {
         idfTimes_D_ij *= values.next().get();
-        numberofValues ++;
+        //numberofValues ++;
       }
-      //System.out.println(token + "\t" + numberofValues + "\t"+idfTimes_D_ij);
       //if(numberofValues!=2) throw new IOException("Number of values should be exactly 2");
       
       output.collect(key, new FloatWritable(idfTimes_D_ij));

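The reducer above also moves to SLF4J's parameterized messages, so the log string is only assembled when the INFO level is enabled. Below is a hedged sketch of the same two code paths stripped of the Hadoop plumbing; the class name and sample values are invented for illustration.

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.Arrays;
import java.util.Iterator;

// Illustrative only: mirrors the reducer's two code paths without the Hadoop
// plumbing, to show the parameterized logging that replaces string concatenation.
public class TfIdfReduceSketch {

  private static final Logger log = LoggerFactory.getLogger(TfIdfReduceSketch.class);

  static float reduce(String token, Iterator<Float> values) {
    if (token.startsWith("*vocabCount")) {
      float vocabCount = 0.0f;
      while (values.hasNext()) {
        vocabCount += values.next();         // sum the per-node vocabulary counts
      }
      log.info("{}\t{}", token, vocabCount); // placeholders defer string building
      return vocabCount;
    }
    float idfTimesDij = 1.0f;
    while (values.hasNext()) {
      idfTimesDij *= values.next();          // product of the (expected two) values
    }
    return idfTimesDij;
  }

  public static void main(String[] args) {
    log.info("{}", reduce("*vocabCount", Arrays.asList(1.0f, 1.0f, 1.0f).iterator()));
    log.info("{}", reduce("label,feature", Arrays.asList(0.5f, 2.0f).iterator()));
  }
}
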
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/common/BayesWeightSummerDriver.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/common/BayesWeightSummerDriver.java?rev=688522&r1=688521&r2=688522&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/common/BayesWeightSummerDriver.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/common/BayesWeightSummerDriver.java Sun Aug 24 09:10:42 2008
@@ -25,12 +25,13 @@
 import org.apache.hadoop.mapred.SequenceFileInputFormat;
 import org.apache.hadoop.mapred.SequenceFileOutputFormat;
 
+import java.io.IOException;
 
 /**
  * Create and run the Bayes Trainer.
- *
- **/
+ */
 public class BayesWeightSummerDriver {
+
   /**
    * Takes in two arguments:
    * <ol>
@@ -39,7 +40,7 @@
    * </ol>
    * @param args The args
    */
-  public static void main(String[] args) {
+  public static void main(String[] args) throws IOException {
     String input = args[0];
     String output = args[1];
 
@@ -52,7 +53,7 @@
    * @param input            the input pathname String
    * @param output           the output pathname String
    */
-  public static void runJob(String input, String output) {
+  public static void runJob(String input, String output) throws IOException {
     JobClient client = new JobClient();
     JobConf conf = new JobConf(BayesWeightSummerDriver.class);
     
@@ -71,17 +72,11 @@
     conf.setCombinerClass(BayesWeightSummerReducer.class);
     conf.setReducerClass(BayesWeightSummerReducer.class);    
     conf.setOutputFormat(BayesWeightSummerOutputFormat.class);
-      try {
-      FileSystem dfs = FileSystem.get(conf);
-      if (dfs.exists(outPath))
-        dfs.delete(outPath, true);
-      client.setConf(conf);    
-    
-      JobClient.runJob(conf);      
-      
-    } catch (Exception e) {
-      throw new RuntimeException(e);
-    }
-    
+    FileSystem dfs = FileSystem.get(conf);
+    if (dfs.exists(outPath))
+      dfs.delete(outPath, true);
+    client.setConf(conf);
+
+    JobClient.runJob(conf);
   }
 }

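The driver change follows the pattern applied throughout this commit: drop the broad catch (Exception e) plus RuntimeException wrapper and declare IOException on main and runJob instead. A minimal sketch of that driver shape against the old org.apache.hadoop.mapred API used here; the class name and the omitted job wiring are illustrative.

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;

import java.io.IOException;

// Illustrative skeleton, not the commit's code: the shape the drivers converge
// on -- declare IOException rather than catching Exception and rethrowing.
public class DriverSketch {

  public static void main(String[] args) throws IOException {
    runJob(args[0], args[1]);
  }

  public static void runJob(String input, String output) throws IOException {
    JobConf conf = new JobConf(DriverSketch.class);
    // ... mapper, reducer, input/output formats and the input path would be set here ...

    Path outPath = new Path(output);
    FileSystem dfs = FileSystem.get(conf);
    if (dfs.exists(outPath)) {
      dfs.delete(outPath, true);   // clear any stale output before re-running
    }
    JobClient.runJob(conf);        // an IOException now propagates to the caller
  }
}
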
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/io/SequenceFileModelReader.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/io/SequenceFileModelReader.java?rev=688522&r1=688521&r2=688522&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/io/SequenceFileModelReader.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/io/SequenceFileModelReader.java Sun Aug 24 09:10:42 2008
@@ -27,6 +27,8 @@
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.io.Writable;
 import org.apache.mahout.common.Model;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 import java.io.IOException;
 import java.util.HashMap;
@@ -34,10 +36,11 @@
 
 /**
  * This Class reads the different interim  files created during the Training stage as well as the Model File during testing.
- * 
  */
 public class SequenceFileModelReader {
 
+  private static final Logger log = LoggerFactory.getLogger(SequenceFileModelReader.class);  
+
   public Model loadModel(Model model, FileSystem fs, Map<String, Path> pathPatterns,
       Configuration conf) throws IOException {
 
@@ -63,7 +66,7 @@
     FileStatus[] outputFiles = fs.globStatus(pathPattern);
     for (FileStatus fileStatus : outputFiles) {
       Path path = fileStatus.getPath();
-      System.out.println(path.toString());
+      log.info("{}", path);
       SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
 
       // the key is either _label_ or label,feature
@@ -90,7 +93,7 @@
     FileStatus[] outputFiles = fs.globStatus(pathPattern);
     for (FileStatus fileStatus : outputFiles) {
       Path path = fileStatus.getPath();
-      System.out.println(path.toString());
+      log.info("{}", path);
       SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
 
       // the key is either _label_ or label,feature
@@ -114,7 +117,7 @@
     FileStatus[] outputFiles = fs.globStatus(pathPattern);
     for (FileStatus fileStatus : outputFiles) {
       Path path = fileStatus.getPath();
-      System.out.println(path.toString());
+      log.info("{}", path);
       SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
 
       // the key is either _label_ or label,feature
@@ -139,7 +142,7 @@
     FileStatus[] outputFiles = fs.globStatus(pathPattern);
     for (FileStatus fileStatus : outputFiles) {
       Path path = fileStatus.getPath();
-      System.out.println(path.toString());
+      log.info("{}", path);
       SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
 
       // the key is either _label_ or label,feature
@@ -164,7 +167,7 @@
     FileStatus[] outputFiles = fs.globStatus(pathPattern);
     for (FileStatus fileStatus : outputFiles) {
       Path path = fileStatus.getPath();
-      System.out.println(path.toString());
+      log.info("{}", path);
       SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
 
       // the key is either _label_ or label,feature
@@ -174,7 +177,7 @@
         if (keyStr.startsWith("*")) { // Sum of weights for all Feature
           // and all Labels
           model.setSigma_jSigma_k(value.get());
-          System.out.println(value.get());
+          log.info("{}", value.get());
         }
       }
     }
@@ -193,7 +196,7 @@
     FileStatus[] outputFiles = fs.globStatus(pathPattern);
     for (FileStatus fileStatus : outputFiles) {
       Path path = fileStatus.getPath();
-      System.out.println(path.toString());
+      log.info("{}", path);
       SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
       // the key is either _label_ or label,feature
       while (reader.next(key, value)) {

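All of the reader methods above share the same glob-then-scan loop over SequenceFile part files; they differ only in how the key prefix is interpreted and aggregated. A simplified sketch of that loop, with the path logged through SLF4J as in the patch; the class and method names are mine, not the commit's.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

// Illustrative only: the glob-then-read loop shared by the reader methods above,
// with the visited path logged through SLF4J rather than System.out.
public class SequenceFileScanSketch {

  private static final Logger log = LoggerFactory.getLogger(SequenceFileScanSketch.class);

  public static Map<String, Float> readSums(FileSystem fs, Path pathPattern, Configuration conf)
      throws IOException {
    Map<String, Float> sums = new HashMap<String, Float>();
    Text key = new Text();
    FloatWritable value = new FloatWritable();
    FileStatus[] outputFiles = fs.globStatus(pathPattern);   // e.g. .../part*
    for (FileStatus fileStatus : outputFiles) {
      Path path = fileStatus.getPath();
      log.info("{}", path);
      SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
      while (reader.next(key, value)) {
        // the real methods branch on the key prefix; here we simply sum per key
        String keyStr = key.toString();
        Float prev = sums.get(keyStr);
        sums.put(keyStr, prev == null ? value.get() : prev + value.get());
      }
      reader.close();
    }
    return sums;
  }
}
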
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/cbayes/CBayesDriver.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/cbayes/CBayesDriver.java?rev=688522&r1=688521&r2=688522&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/cbayes/CBayesDriver.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/cbayes/CBayesDriver.java Sun Aug 24 09:10:42 2008
@@ -23,12 +23,18 @@
 import org.apache.mahout.classifier.bayes.common.BayesFeatureDriver;
 import org.apache.mahout.classifier.bayes.common.BayesTfIdfDriver;
 import org.apache.mahout.classifier.bayes.common.BayesWeightSummerDriver;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
 
 /**
  * Create and run the Bayes Trainer.
- * 
  */
 public class CBayesDriver {
+
+  private static final Logger log = LoggerFactory.getLogger(CBayesDriver.class);    
+
   /**
    * Takes in two arguments:
    * <ol>
@@ -41,7 +47,7 @@
    * 
    * @param args The args
    */
-  public static void main(String[] args) {
+  public static void main(String[] args) throws IOException {
     String input = args[0];
     String output = args[1];
 
@@ -53,74 +59,66 @@
    * 
    * @param input the input pathname String
    * @param output the output pathname String
-   * 
    */
-  @SuppressWarnings("deprecation")
-  public static void runJob(String input, String output, int gramSize) {
+  public static void runJob(String input, String output, int gramSize) throws IOException {
     JobConf conf = new JobConf(CBayesDriver.class);
-    try {
-      FileSystem dfs = FileSystem.get(conf);
-      Path outPath = new Path(output);
-      if (dfs.exists(outPath))
-        dfs.delete(outPath);
-      
-      System.out.println("Reading features...");
-      //Read the features in each document normalized by length of each document
-      BayesFeatureDriver.runJob(input, output, gramSize);
-      
-      System.out.println("Calculating Tf-Idf...");
-      //Calculate the TfIdf for each word in each label
-      BayesTfIdfDriver.runJob(input, output);
-      
-      System.out.println("Calculating weight sums for labels and features...");
-      //Calculate the Sums of weights for each label, for each feature and for each feature and for each label
-      BayesWeightSummerDriver.runJob(input, output);
-      
-      //System.out.println("Calculating the weight of the features of each label in the complement class...");
-      //Calculate the W_ij = log(Theta) for each label, feature. This step actually generates the complement class
-      //CBayesThetaDriver.runJob(input, output);
-      
-      System.out.println("Calculating the weight Normalisation factor for each complement class...");
-      //Calculate the normalization factor Sigma_W_ij for each complement class. 
-      CBayesThetaNormalizerDriver.runJob(input, output);
-      
-      //System.out.println("Calculating the final Weight Normalized Complementary Naive Bayes Model...");
-      //Calculate the normalization factor Sigma_W_ij for each complement class. 
-      //CBayesNormalizedWeightDriver.runJob(input, output);
-      
-      Path docCountOutPath = new Path(output+ "/trainer-docCount");
-      if (dfs.exists(docCountOutPath))
-        dfs.delete(docCountOutPath, true);
-      Path termDocCountOutPath = new Path(output+ "/trainer-termDocCount");
-      if (dfs.exists(termDocCountOutPath))
-        dfs.delete(termDocCountOutPath, true);
-      Path featureCountOutPath = new Path(output+ "/trainer-featureCount");
-      if (dfs.exists(featureCountOutPath))
-        dfs.delete(featureCountOutPath, true);
-      Path wordFreqOutPath = new Path(output+ "/trainer-wordFreq");
-      if (dfs.exists(wordFreqOutPath))
-        dfs.delete(wordFreqOutPath, true);
-      Path vocabCountPath = new Path(output+ "/trainer-tfIdf/trainer-vocabCount");
-      if (dfs.exists(vocabCountPath))
-        dfs.delete(vocabCountPath, true);
-      /*Path tfIdfOutPath = new Path(output+ "/trainer-tfIdf");
-      if (dfs.exists(tfIdfOutPath))
-        dfs.delete(tfIdfOutPath, true);*/
-      Path vocabCountOutPath = new Path(output+ "/trainer-vocabCount");
-      if (dfs.exists(vocabCountOutPath))
-        dfs.delete(vocabCountOutPath, true);
-     /* Path weightsOutPath = new Path(output+ "/trainer-weights");
-      if (dfs.exists(weightsOutPath))
-        dfs.delete(weightsOutPath, true);*/
-      /*Path thetaOutPath = new Path(output+ "/trainer-theta");
-      if (dfs.exists(thetaOutPath))
-        dfs.delete(thetaOutPath, true);*/
-      /*Path thetaNormalizerOutPath = new Path(output+ "/trainer-thetaNormalizer");
-      if (dfs.exists(thetaNormalizerOutPath))
-        dfs.delete(thetaNormalizerOutPath, true);*/
-      
-    } catch (Exception e) {
-      throw new RuntimeException(e);
-    }
+    FileSystem dfs = FileSystem.get(conf);
+    Path outPath = new Path(output);
+    if (dfs.exists(outPath))
+      dfs.delete(outPath, true);
+
+    log.info("Reading features...");
+    //Read the features in each document normalized by length of each document
+    BayesFeatureDriver.runJob(input, output, gramSize);
+
+    log.info("Calculating Tf-Idf...");
+    //Calculate the TfIdf for each word in each label
+    BayesTfIdfDriver.runJob(input, output);
+
+    log.info("Calculating weight sums for labels and features...");
+    //Calculate the sums of weights for each label and for each feature
+    BayesWeightSummerDriver.runJob(input, output);
+
+    //Calculate the W_ij = log(Theta) for each label, feature. This step actually generates the complement class
+    //CBayesThetaDriver.runJob(input, output);
+
+    log.info("Calculating the weight Normalisation factor for each complement class...");
+    //Calculate the normalization factor Sigma_W_ij for each complement class.
+    CBayesThetaNormalizerDriver.runJob(input, output);
+
+    //Calculate the normalization factor Sigma_W_ij for each complement class.
+    //CBayesNormalizedWeightDriver.runJob(input, output);
+
+    Path docCountOutPath = new Path(output+ "/trainer-docCount");
+    if (dfs.exists(docCountOutPath))
+      dfs.delete(docCountOutPath, true);
+    Path termDocCountOutPath = new Path(output+ "/trainer-termDocCount");
+    if (dfs.exists(termDocCountOutPath))
+      dfs.delete(termDocCountOutPath, true);
+    Path featureCountOutPath = new Path(output+ "/trainer-featureCount");
+    if (dfs.exists(featureCountOutPath))
+      dfs.delete(featureCountOutPath, true);
+    Path wordFreqOutPath = new Path(output+ "/trainer-wordFreq");
+    if (dfs.exists(wordFreqOutPath))
+      dfs.delete(wordFreqOutPath, true);
+    Path vocabCountPath = new Path(output+ "/trainer-tfIdf/trainer-vocabCount");
+    if (dfs.exists(vocabCountPath))
+      dfs.delete(vocabCountPath, true);
+    /*Path tfIdfOutPath = new Path(output+ "/trainer-tfIdf");
+    if (dfs.exists(tfIdfOutPath))
+      dfs.delete(tfIdfOutPath, true);*/
+    Path vocabCountOutPath = new Path(output+ "/trainer-vocabCount");
+    if (dfs.exists(vocabCountOutPath))
+      dfs.delete(vocabCountOutPath, true);
+   /* Path weightsOutPath = new Path(output+ "/trainer-weights");
+    if (dfs.exists(weightsOutPath))
+      dfs.delete(weightsOutPath, true);*/
+    /*Path thetaOutPath = new Path(output+ "/trainer-theta");
+    if (dfs.exists(thetaOutPath))
+      dfs.delete(thetaOutPath, true);*/
+    /*Path thetaNormalizerOutPath = new Path(output+ "/trainer-thetaNormalizer");
+    if (dfs.exists(thetaNormalizerOutPath))
+      dfs.delete(thetaNormalizerOutPath, true);*/
+
   }
 }

Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/cbayes/CBayesModel.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/cbayes/CBayesModel.java?rev=688522&r1=688521&r2=688522&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/cbayes/CBayesModel.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/cbayes/CBayesModel.java Sun Aug 24 09:10:42 2008
@@ -18,11 +18,15 @@
  */
 
 import org.apache.mahout.common.Model;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 import java.util.Map;
 
 public class CBayesModel extends Model {
 
+  private static final Logger log = LoggerFactory.getLogger(CBayesModel.class);
+
   @Override
   protected float getWeight(Integer label, Integer feature) {
     float result = 0.0f;
@@ -61,7 +65,7 @@
     float perLabelWeightSumNormalisationFactor = Float.MAX_VALUE;
 
     
-    System.out.println(thetaNormalizer);
+    log.info("{}", thetaNormalizer);
     for (Integer label : thetaNormalizer.keySet()) {
       float Sigma_W_ij = thetaNormalizer.get(label);
       if (perLabelWeightSumNormalisationFactor > Math.abs(Sigma_W_ij)) {
@@ -74,7 +78,7 @@
       thetaNormalizer.put(label, Sigma_W_ij
           / perLabelWeightSumNormalisationFactor);
     }
-    System.out.println(thetaNormalizer);
+    log.info("{}", thetaNormalizer);
     
     /*for (int label = 0, maxLabels = labelList.size(); label < maxLabels; label++) {
       thetaNormalizer.put(label, new Float(0));
@@ -97,7 +101,7 @@
       }
     }
     perLabelWeightSumNormalisationFactor = Float.MAX_VALUE;
-    System.out.println(thetaNormalizer);
+    log.info("{}", thetaNormalizer);
     for (Integer label : thetaNormalizer.keySet()) {
       float Sigma_W_ij = thetaNormalizer.get(label);
       if (perLabelWeightSumNormalisationFactor > Math.abs(Sigma_W_ij)) {
@@ -110,12 +114,11 @@
       thetaNormalizer.put(label, Sigma_W_ij
           / perLabelWeightSumNormalisationFactor);
     }
-    System.out.println(thetaNormalizer);*/
+    log.info("{}", thetaNormalizer);*/
   }
 
   @Override
   public void GenerateModel() {
-    try {
       float vocabCount = featureList.size();
 
       float[] perLabelThetaNormalizer = new float[labelList.size()];
@@ -141,7 +144,7 @@
 
         }
       }
-      System.out.println("Normalizing Weights");
+      log.info("Normalizing Weights");
       for (int label = 0, maxLabels = labelList.size(); label < maxLabels; label++) {
         float Sigma_W_ij = perLabelThetaNormalizer[label];
         if (perLabelWeightSumNormalisationFactor > Math.abs(Sigma_W_ij)) {
@@ -165,10 +168,6 @@
           setWeight(label, feature, normalizedWeight);
         }
       }
-    } catch (Exception e) {
-      // TODO Auto-generated catch block
-      e.printStackTrace();
-    }
   }
 
   

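The normalization in CBayesModel divides every per-label weight sum Sigma_W_ij by the smallest absolute sum over all labels, so each normalized value ends up with magnitude at least 1. A small worked sketch of just that arithmetic, with invented labels and values:

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.HashMap;
import java.util.Map;

// Illustrative arithmetic only (labels and values invented): each per-label sum
// Sigma_W_ij is divided by the smallest |Sigma_W_ij|, so every normalized value
// has magnitude >= 1.
public class ThetaNormalizerSketch {

  private static final Logger log = LoggerFactory.getLogger(ThetaNormalizerSketch.class);

  public static void main(String[] args) {
    Map<Integer, Float> thetaNormalizer = new HashMap<Integer, Float>();
    thetaNormalizer.put(0, -12.0f);
    thetaNormalizer.put(1, -3.0f);
    thetaNormalizer.put(2, -6.0f);

    float factor = Float.MAX_VALUE;
    for (Float sigmaWij : thetaNormalizer.values()) {
      factor = Math.min(factor, Math.abs(sigmaWij));   // smallest |Sigma_W_ij| = 3.0
    }
    for (Map.Entry<Integer, Float> entry : thetaNormalizer.entrySet()) {
      entry.setValue(entry.getValue() / factor);       // -4.0, -1.0, -2.0
    }
    log.info("{}", thetaNormalizer);
  }
}
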
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/cbayes/CBayesNormalizedWeightDriver.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/cbayes/CBayesNormalizedWeightDriver.java?rev=688522&r1=688521&r2=688522&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/cbayes/CBayesNormalizedWeightDriver.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/cbayes/CBayesNormalizedWeightDriver.java Sun Aug 24 09:10:42 2008
@@ -27,15 +27,20 @@
 import org.apache.hadoop.mapred.SequenceFileOutputFormat;
 import org.apache.hadoop.util.GenericsUtil;
 import org.apache.mahout.classifier.bayes.io.SequenceFileModelReader;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 import java.util.HashMap;
 import java.util.Map;
+import java.io.IOException;
 
 /**
  * Create and run the Bayes Trainer.
- *
- **/
+ */
 public class CBayesNormalizedWeightDriver {
+
+  private static final Logger log = LoggerFactory.getLogger(CBayesNormalizedWeightDriver.class);      
+
   /**
    * Takes in two arguments:
    * <ol>
@@ -44,7 +49,7 @@
    * </ol>
    * @param args The args
    */
-  public static void main(String[] args) {
+  public static void main(String[] args) throws IOException {
     String input = args[0];
     String output = args[1];
 
@@ -57,7 +62,7 @@
    * @param input            the input pathname String
    * @param output           the output pathname String
    */
-  public static void runJob(String input, String output) {
+  public static void runJob(String input, String output) throws IOException {
     JobClient client = new JobClient();
     JobConf conf = new JobConf(CBayesNormalizedWeightDriver.class);
     
@@ -75,48 +80,45 @@
     conf.setReducerClass(CBayesNormalizedWeightReducer.class);    
     conf.setOutputFormat(SequenceFileOutputFormat.class);
     
-    conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,org.apache.hadoop.io.serializer.WritableSerialization"); // Dont ever forget this. People should keep track of how hadoop conf parameters and make or break a piece of code
-     try {
-      FileSystem dfs = FileSystem.get(conf);
-      if (dfs.exists(outPath))
-        dfs.delete(outPath, true);
-      
-      SequenceFileModelReader reader = new SequenceFileModelReader();
-      
-      Path thetaNormalizationsFiles = new Path(output+"/trainer-thetaNormalizer/part*");         
-      HashMap<String,Float> thetaNormalizer= reader.readLabelSums(dfs, thetaNormalizationsFiles, conf);
-      float perLabelWeightSumNormalisationFactor = Float.MAX_VALUE;
-      for(String label: thetaNormalizer.keySet())
-      {
-        
-        float Sigma_W_ij = thetaNormalizer.get(label);
-        if(perLabelWeightSumNormalisationFactor > Math.abs(Sigma_W_ij)){
-          perLabelWeightSumNormalisationFactor = Math.abs(Sigma_W_ij);
-        }
-      } 
-      
-      for(String label: thetaNormalizer.keySet())
-      {        
-        float Sigma_W_ij = thetaNormalizer.get(label);
-        thetaNormalizer.put(label, Sigma_W_ij / perLabelWeightSumNormalisationFactor) ;      
+    conf.set("io.serializations",
+             "org.apache.hadoop.io.serializer.JavaSerialization,org.apache.hadoop.io.serializer.WritableSerialization");
+    // Don't ever forget this. People should keep track of how Hadoop conf parameters can make or break a piece of code
+    FileSystem dfs = FileSystem.get(conf);
+    if (dfs.exists(outPath))
+      dfs.delete(outPath, true);
+
+    SequenceFileModelReader reader = new SequenceFileModelReader();
+
+    Path thetaNormalizationsFiles = new Path(output+"/trainer-thetaNormalizer/part*");
+    HashMap<String,Float> thetaNormalizer= reader.readLabelSums(dfs, thetaNormalizationsFiles, conf);
+    float perLabelWeightSumNormalisationFactor = Float.MAX_VALUE;
+    for(String label: thetaNormalizer.keySet())
+    {
+
+      float Sigma_W_ij = thetaNormalizer.get(label);
+      if(perLabelWeightSumNormalisationFactor > Math.abs(Sigma_W_ij)){
+        perLabelWeightSumNormalisationFactor = Math.abs(Sigma_W_ij);
       }
-      
-      
-      DefaultStringifier<HashMap<String,Float>> mapStringifier = new DefaultStringifier<HashMap<String,Float>>(conf, GenericsUtil.getClass(thetaNormalizer));     
-      String thetaNormalizationsString = mapStringifier.toString(thetaNormalizer);
-      
-      Map<String,Float> c = mapStringifier.fromString(thetaNormalizationsString);
-      System.out.println(c);
-      conf.set("cnaivebayes.thetaNormalizations", thetaNormalizationsString);
-      
-     
-      client.setConf(conf);    
-    
-      JobClient.runJob(conf);      
-      
-    } catch (Exception e) {
-      throw new RuntimeException(e);
     }
-    
+
+    for(String label: thetaNormalizer.keySet())
+    {
+      float Sigma_W_ij = thetaNormalizer.get(label);
+      thetaNormalizer.put(label, Sigma_W_ij / perLabelWeightSumNormalisationFactor) ;
+    }
+
+
+    DefaultStringifier<HashMap<String,Float>> mapStringifier = new DefaultStringifier<HashMap<String,Float>>(conf, GenericsUtil.getClass(thetaNormalizer));
+    String thetaNormalizationsString = mapStringifier.toString(thetaNormalizer);
+
+    Map<String,Float> c = mapStringifier.fromString(thetaNormalizationsString);
+    log.info("{}", c);
+    conf.set("cnaivebayes.thetaNormalizations", thetaNormalizationsString);
+
+
+    client.setConf(conf);
+
+    JobClient.runJob(conf);
+
   }
 }

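The driver hands the theta-normalizer map to the mappers through the JobConf via DefaultStringifier, which is why io.serializations must list JavaSerialization. A hedged round-trip sketch follows; the property name example.thetaNormalizations is invented for the sketch, whereas the commit itself uses cnaivebayes.thetaNormalizations.

import org.apache.hadoop.io.DefaultStringifier;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.util.GenericsUtil;

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

// Illustrative only: the DefaultStringifier round trip used to ship a HashMap
// through the JobConf. "example.thetaNormalizations" is a made-up property name.
public class StringifierRoundTripSketch {

  public static void main(String[] args) throws IOException {
    JobConf conf = new JobConf(StringifierRoundTripSketch.class);
    // DefaultStringifier needs JavaSerialization registered to serialize a HashMap.
    conf.set("io.serializations",
             "org.apache.hadoop.io.serializer.JavaSerialization,"
             + "org.apache.hadoop.io.serializer.WritableSerialization");

    HashMap<String, Float> thetaNormalizer = new HashMap<String, Float>();
    thetaNormalizer.put("rec.motorcycles", -1.0f);

    DefaultStringifier<HashMap<String, Float>> mapStringifier =
        new DefaultStringifier<HashMap<String, Float>>(conf, GenericsUtil.getClass(thetaNormalizer));

    // driver side: serialize the map into the configuration
    conf.set("example.thetaNormalizations", mapStringifier.toString(thetaNormalizer));

    // mapper side (configure()): read it back out of the configuration
    Map<String, Float> restored =
        mapStringifier.fromString(conf.get("example.thetaNormalizations"));
    // restored now holds the same label -> normalizer entries as thetaNormalizer
  }
}
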
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/cbayes/CBayesNormalizedWeightMapper.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/cbayes/CBayesNormalizedWeightMapper.java?rev=688522&r1=688521&r2=688522&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/cbayes/CBayesNormalizedWeightMapper.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/cbayes/CBayesNormalizedWeightMapper.java Sun Aug 24 09:10:42 2008
@@ -26,6 +26,8 @@
 import org.apache.hadoop.mapred.OutputCollector;
 import org.apache.hadoop.mapred.Reporter;
 import org.apache.hadoop.util.GenericsUtil;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 import java.io.IOException;
 import java.util.HashMap;
@@ -34,6 +36,8 @@
 public class CBayesNormalizedWeightMapper extends MapReduceBase implements
     Mapper<Text, FloatWritable, Text, FloatWritable> {
 
+  private static final Logger log = LoggerFactory.getLogger(CBayesNormalizedWeightMapper.class);    
+
   public HashMap<String, Float> thetaNormalizer = null;
 
   String thetaNormalizationsString = " ";
@@ -75,8 +79,7 @@
 
       }
     } catch (IOException ex) {
-
-      ex.printStackTrace();
+      log.warn(ex.toString(), ex);
     }
   }
 }

Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/cbayes/CBayesNormalizedWeightReducer.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/cbayes/CBayesNormalizedWeightReducer.java?rev=688522&r1=688521&r2=688522&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/cbayes/CBayesNormalizedWeightReducer.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/cbayes/CBayesNormalizedWeightReducer.java Sun Aug 24 09:10:42 2008
@@ -23,6 +23,8 @@
 import org.apache.hadoop.mapred.OutputCollector;
 import org.apache.hadoop.mapred.Reducer;
 import org.apache.hadoop.mapred.Reporter;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 import java.io.IOException;
 import java.util.Iterator;
@@ -30,10 +32,11 @@
 
 /**
  *  Can also be used as a local Combiner because only two values should be present in the values
- *
  */
 public class CBayesNormalizedWeightReducer extends MapReduceBase implements Reducer<Text, FloatWritable, Text, FloatWritable> {
-  
+
+  private static final Logger log = LoggerFactory.getLogger(CBayesNormalizedWeightReducer.class);      
+
   public void reduce(Text key, Iterator<FloatWritable> values, OutputCollector<Text, FloatWritable> output, Reporter reporter) throws IOException {
     //Key is label,word, value is the number of times we've seen this label word per local node.  Output is the same
     String token = key.toString();  
@@ -42,7 +45,7 @@
       weight += values.next().get();
     }
     if(token.equalsIgnoreCase("rec.motorcycles,miller"))
-      System.out.println(token + "=>" + weight);
+      log.info("{}=>{}", token, weight);
     output.collect(key, new FloatWritable(weight));
   }
 


