mahout-commits mailing list archives

From gsing...@apache.org
Subject svn commit: r789843 - in /lucene/mahout/trunk/core/src: main/java/org/apache/mahout/clustering/fuzzykmeans/ main/java/org/apache/mahout/clustering/kmeans/ main/java/org/apache/mahout/utils/ test/java/org/apache/mahout/clustering/kmeans/
Date Tue, 30 Jun 2009 17:39:24 GMT
Author: gsingers
Date: Tue Jun 30 17:39:24 2009
New Revision: 789843

URL: http://svn.apache.org/viewvc?rev=789843&view=rev
Log:
Convert Fuzzy KMeans to CLI2, hook in the random initialization option, and refactor the random seed generation code slightly to improve consumability
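
As an illustration of the new CLI2 entry point, a hypothetical invocation from Java might look
like the following. The class name and all paths/values are placeholders; the flags correspond
to the options registered in the FuzzyKMeansDriver.main changes below.

    import org.apache.mahout.clustering.fuzzykmeans.FuzzyKMeansDriver;

    public class FuzzyKMeansCliExample {
      public static void main(String[] args) throws Exception {
        // Placeholder paths and values; --k triggers random seed selection via
        // RandomSeedGenerator.buildRandom, and --overwrite clears the output
        // directory via the new HadoopUtil.overwriteOutput before the job runs.
        FuzzyKMeansDriver.main(new String[] {
            "--input", "testdata/points",
            "--clusters", "testdata/clusters",
            "--output", "output",
            "--k", "3",
            "--m", "2.0",
            "--max", "10",
            "--overwrite"
        });
      }
    }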

Added:
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/utils/HadoopUtil.java
Modified:
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansDriver.java
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansUtil.java
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansDriver.java
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/RandomSeedGenerator.java
    lucene/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/kmeans/TestKmeansClustering.java

Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansDriver.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansDriver.java?rev=789843&r1=789842&r2=789843&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansDriver.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansDriver.java Tue Jun 30 17:39:24 2009
@@ -17,86 +17,195 @@
 
 package org.apache.mahout.clustering.fuzzykmeans;
 
-import java.io.File;
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.List;
-
+import org.apache.commons.cli2.CommandLine;
+import org.apache.commons.cli2.Group;
+import org.apache.commons.cli2.Option;
+import org.apache.commons.cli2.OptionException;
+import org.apache.commons.cli2.builder.ArgumentBuilder;
+import org.apache.commons.cli2.builder.DefaultOptionBuilder;
+import org.apache.commons.cli2.builder.GroupBuilder;
+import org.apache.commons.cli2.commandline.Parser;
 import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.FileUtil;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.fs.PathFilter;
-import org.apache.hadoop.io.Text;
 import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Text;
 import org.apache.hadoop.mapred.FileInputFormat;
 import org.apache.hadoop.mapred.FileOutputFormat;
-import org.apache.hadoop.mapred.FileSplit;
 import org.apache.hadoop.mapred.JobClient;
 import org.apache.hadoop.mapred.JobConf;
-import org.apache.hadoop.mapred.KeyValueLineRecordReader;
 import org.apache.hadoop.mapred.SequenceFileInputFormat;
 import org.apache.hadoop.mapred.SequenceFileOutputFormat;
-import org.apache.mahout.clustering.kmeans.Cluster;
+import org.apache.mahout.clustering.kmeans.RandomSeedGenerator;
+import org.apache.mahout.matrix.SparseVector;
 import org.apache.mahout.matrix.Vector;
+import org.apache.mahout.utils.CommandLineUtil;
+import org.apache.mahout.utils.HadoopUtil;
+import org.apache.mahout.utils.SquaredEuclideanDistanceMeasure;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
 public class FuzzyKMeansDriver {
 
   private static final Logger log = LoggerFactory
-      .getLogger(FuzzyKMeansDriver.class);
+          .getLogger(FuzzyKMeansDriver.class);
 
 
   private FuzzyKMeansDriver() {
   }
 
-  private static void printMessage() {
-    System.out
-        .println("Usage: input clusterIn output measureClass convergenceDelta maxIterations
m [doClusteringOnly]");
-  }
 
-  public static void main(String[] args) throws ClassNotFoundException {
-    if (args.length < 8) {
-      System.out.println("Expected number of arguments: 7 or 8 : received:"
-          + args.length);
-      printMessage();
-    }
-    int index = 0;
-    String input = args[index++];
-    String clusters = args[index++];
-    String output = args[index++];
-    String measureClass = args[index++];
-    double convergenceDelta = Double.parseDouble(args[index++]);
-    int maxIterations = Integer.parseInt(args[index++]);
-    float m = Float.parseFloat(args[index++]);
-    String vectorClassName = args[index++];
-    Class<? extends Vector> vectorClass = (Class<? extends Vector>) Class.forName(vectorClassName);
-    boolean doClustering = false;
-    if (args.length > 8){
-      doClustering = Boolean.parseBoolean(args[index++]);
-    }
-    if (doClustering) {
-      runClustering(input, clusters, output, measureClass, Double
-          .toString(convergenceDelta), 500, m, vectorClass);
-    } else {
-      runJob(input, clusters, output, measureClass, convergenceDelta,
-          maxIterations, 10, 10, m, vectorClass);
+  public static void main(String[] args) throws ClassNotFoundException, IOException, IllegalAccessException, InstantiationException {
+    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
+    ArgumentBuilder abuilder = new ArgumentBuilder();
+    GroupBuilder gbuilder = new GroupBuilder();
+    Option inputOpt = obuilder.withLongName("input").withRequired(true).withArgument(
+            abuilder.withName("input").withMinimum(1).withMaximum(1).create()).
+            withDescription("The Path for input Vectors. Must be a SequenceFile of Writable,
Vector").withShortName("i").create();
+
+    Option clustersOpt = obuilder.withLongName("clusters").withRequired(true).withArgument(
+            abuilder.withName("clusters").withMinimum(1).withMaximum(1).create()).
+            withDescription("The input centroids, as Vectors.  Must be a SequenceFile of
Writable, Cluster/Canopy.  " +
+                    "If k is also specified, then a random set of vectors will be selected
and written out to this path first").withShortName("c").create();
+
+    Option kOpt = obuilder.withLongName("k").withRequired(false).withArgument(
+            abuilder.withName("k").withMinimum(1).withMaximum(1).create()).
+            withDescription("The k in k-Means.  If specified, then a random selection of
k Vectors will be chosen as the Centroid and written to the clusters output path.").withShortName("k").create();
+
+    Option outputOpt = obuilder.withLongName("output").withRequired(true).withArgument(
+            abuilder.withName("output").withMinimum(1).withMaximum(1).create()).
+            withDescription("The Path to put the output in").withShortName("o").create();
+
+
+    Option measureClassOpt = obuilder.withLongName("distance").withRequired(false).withArgument(
+            abuilder.withName("distance").withMinimum(1).withMaximum(1).create()).
+            withDescription("The Distance Measure to use.  Default is SquaredEuclidean").withShortName("dm").create();
+
+    Option convergenceDeltaOpt = obuilder.withLongName("convergence").withRequired(false).withArgument(
+            abuilder.withName("convergence").withMinimum(1).withMaximum(1).create()).
+            withDescription("The threshold below which the clusters are considered to be
converged.  Default is 0.5").withShortName("d").create();
+
+    Option maxIterationsOpt = obuilder.withLongName("max").withRequired(false).withArgument(
+            abuilder.withName("max").withMinimum(1).withMaximum(1).create()).
+            withDescription("The maximum number of iterations to perform.  Default is 20").withShortName("x").create();
+
+    Option vectorClassOpt = obuilder.withLongName("vectorClass").withRequired(false).withArgument(
+            abuilder.withName("vectorClass").withMinimum(1).withMaximum(1).create()).
+            withDescription("The Vector implementation class name.  Default is SparseVector.class").withShortName("v").create();
+
+    Option helpOpt = obuilder.withLongName("help").
+            withDescription("Print out help").withShortName("h").create();
+
+    Option overwriteOutput = obuilder.withLongName("overwrite").withRequired(false).
+            withDescription("If set, overwrite the output directory").withShortName("w").create();
+
+    Option clusteringOpt = obuilder.withLongName("clustering").withRequired(false).
+            withDescription("If true, run clustering only (assumes the iterations have already
taken place").withShortName("l").create();
+
+    Option mOpt = obuilder.withLongName("m").withRequired(true).withArgument(
+            abuilder.withName("m").withMinimum(1).withMaximum(1).create()).
+            withDescription("coefficient normalization factor, must be greater than 1").withShortName("m").create();
+
+    Option numReduceTasksOpt = obuilder.withLongName("numReduce").withRequired(false).withArgument(
+            abuilder.withName("numReduce").withMinimum(1).withMaximum(1).create()).
+            withDescription("The number of reduce tasks").withShortName("r").create();
+
+
+    Option numMapTasksOpt = obuilder.withLongName("numMap").withRequired(false).withArgument(
+            abuilder.withName("numMap").withMinimum(1).withMaximum(1).create()).
+            withDescription("The number of map tasks").withShortName("u").create();
+
+
+    Group group = gbuilder.withName("Options").withOption(inputOpt).withOption(clustersOpt).withOption(outputOpt).withOption(measureClassOpt)
+            .withOption(convergenceDeltaOpt).withOption(maxIterationsOpt).withOption(kOpt).withOption(mOpt)
+            .withOption(vectorClassOpt).withOption(overwriteOutput).withOption(helpOpt).create();
+
+    try {
+      Parser parser = new Parser();
+      parser.setGroup(group);
+      CommandLine cmdLine = parser.parse(args);
+      if (cmdLine.hasOption(helpOpt)) {
+        CommandLineUtil.printHelp(group);
+        return;
+      }
+      String input = cmdLine.getValue(inputOpt).toString();
+      String clusters = cmdLine.getValue(clustersOpt).toString();
+      String output = cmdLine.getValue(outputOpt).toString();
+      String measureClass = SquaredEuclideanDistanceMeasure.class.getName();
+      if (cmdLine.hasOption(measureClassOpt)) {
+        measureClass = cmdLine.getValue(measureClassOpt).toString();
+      }
+      double convergenceDelta = 0.5;
+      if (cmdLine.hasOption(convergenceDeltaOpt)) {
+        convergenceDelta = Double.parseDouble(cmdLine.getValue(convergenceDeltaOpt).toString());
+      }
+      float m = Float.parseFloat(cmdLine.getValue(mOpt).toString());
+
+      Class<? extends Vector> vectorClass = cmdLine.hasOption(vectorClassOpt) == false ?
+              SparseVector.class
+              : (Class<? extends Vector>) Class.forName(cmdLine.getValue(vectorClassOpt).toString());
+
+
+      int numReduceTasks = 10;
+      if (cmdLine.hasOption(numReduceTasksOpt)) {
+        numReduceTasks = Integer.parseInt(cmdLine.getValue(numReduceTasksOpt).toString());
+      }
+
+      int numMapTasks = 50;
+      if (cmdLine.hasOption(numMapTasksOpt)) {
+        numMapTasks = Integer.parseInt(cmdLine.getValue(numMapTasksOpt).toString());
+      }
+
+      int maxIterations = 20;
+      if (cmdLine.hasOption(maxIterationsOpt)) {
+        maxIterations = Integer.parseInt(cmdLine.getValue(maxIterationsOpt).toString());
+      }
+
+      if (cmdLine.hasOption(overwriteOutput) == true) {
+        HadoopUtil.overwriteOutput(output);
+      }
+
+      if (cmdLine.hasOption(kOpt)) {
+        clusters = RandomSeedGenerator.buildRandom(input, clusters,
+                Integer.parseInt(cmdLine.getValue(kOpt).toString())).toString();
+      }
+
+      if (cmdLine.hasOption(clusteringOpt)) {
+        runClustering(input, clusters, output, measureClass, convergenceDelta, numMapTasks, m, vectorClass);
+      } else {
+        runJob(input, clusters, output, measureClass, convergenceDelta,
+                maxIterations, numMapTasks, numReduceTasks, m, vectorClass);
+      }
+
+
+    } catch (OptionException e) {
+      log.error("Exception", e);
+      CommandLineUtil.printHelp(group);
     }
 
+
   }
 
   /**
    * Run the job using supplied arguments
-   * 
-   * @param input the directory pathname for input points
-   * @param clustersIn the directory pathname for initial & computed clusters
-   * @param output the directory pathname for output points
-   * @param measureClass the classname of the DistanceMeasure
+   *
+   * @param input            the directory pathname for input points
+   * @param clustersIn       the directory pathname for initial & computed clusters
+   * @param output           the directory pathname for output points
+   * @param measureClass     the classname of the DistanceMeasure
    * @param convergenceDelta the convergence delta value
-   * @param maxIterations the maximum number of iterations
-   * @param numMapTasks the number of mapper tasks
-   * @param vectorClass
+   * @param maxIterations    the maximum number of iterations
+   * @param numMapTasks      the number of mapper tasks
+   * @param numReduceTasks   the number of reduce tasks
+   * @param m                the fuzzification factor, see http://en.wikipedia.org/wiki/Data_clustering#Fuzzy_c-means_clustering
+   * @param vectorClass     the {@link org.apache.mahout.matrix.Vector} implementation to use
    */
   public static void runJob(String input, String clustersIn, String output,
                             String measureClass, double convergenceDelta, int maxIterations,
@@ -104,7 +213,6 @@
 
     boolean converged = false;
     int iteration = 0;
-    String delta = Double.toString(convergenceDelta);
 
     // iterate until the clusters converge
     while (!converged && iteration < maxIterations) {
@@ -113,7 +221,7 @@
       // point the output to a new directory per iteration
       String clustersOut = output + File.separator + "clusters-" + iteration;
       converged = runIteration(input, clustersIn, clustersOut, measureClass,
-          delta, numMapTasks, numReduceTasks, iteration, m);
+              convergenceDelta, numMapTasks, numReduceTasks, iteration, m);
 
       // now point the input to the old output directory
       clustersIn = output + File.separator + "clusters-" + iteration;
@@ -124,25 +232,25 @@
     log.info("Clustering ");
 
     runClustering(input, clustersIn, output + File.separator + "points",
-        measureClass, delta, numMapTasks, m, vectorClass);
+            measureClass, convergenceDelta, numMapTasks, m, vectorClass);
   }
 
   /**
    * Run the job using supplied arguments
-   * 
-   * @param input the directory pathname for input points
-   * @param clustersIn the directory pathname for iniput clusters
-   * @param clustersOut the directory pathname for output clusters
-   * @param measureClass the classname of the DistanceMeasure
+   *
+   * @param input            the directory pathname for input points
+   * @param clustersIn       the directory pathname for input clusters
+   * @param clustersOut      the directory pathname for output clusters
+   * @param measureClass     the classname of the DistanceMeasure
    * @param convergenceDelta the convergence delta value
-   * @param numMapTasks the number of map tasks
-   * @param iterationNumber the iteration number that is going to run
-   * @param m
+   * @param numMapTasks      the number of map tasks
+   * @param iterationNumber  the iteration number that is going to run
+   * @param m                the fuzzification factor - see http://en.wikipedia.org/wiki/Data_clustering#Fuzzy_c-means_clustering
    * @return true if the iteration successfully runs
    */
   private static boolean runIteration(String input, String clustersIn,
-      String clustersOut, String measureClass, String convergenceDelta,
-      int numMapTasks, int numReduceTasks, int iterationNumber, float m) {
+                                      String clustersOut, String measureClass, double convergenceDelta,
+                                      int numMapTasks, int numReduceTasks, int iterationNumber, float m) {
 
     JobConf conf = new JobConf(FuzzyKMeansJob.class);
     conf.setJobName("Fuzzy K Means{" + iterationNumber + '}');
@@ -164,10 +272,10 @@
     conf.setReducerClass(FuzzyKMeansReducer.class);
     conf.setNumMapTasks(numMapTasks);
     conf.setNumReduceTasks(numReduceTasks);
-    
+
     conf.set(SoftCluster.CLUSTER_PATH_KEY, clustersIn);
     conf.set(SoftCluster.DISTANCE_MEASURE_KEY, measureClass);
-    conf.set(SoftCluster.CLUSTER_CONVERGENCE_KEY, convergenceDelta);
+    conf.set(SoftCluster.CLUSTER_CONVERGENCE_KEY, String.valueOf(convergenceDelta));
     conf.set(SoftCluster.M_KEY, String.valueOf(m));
 
     // uncomment it to run locally
@@ -185,17 +293,17 @@
 
   /**
    * Run the job using supplied arguments
-   * 
-   * @param input the directory pathname for input points
-   * @param clustersIn the directory pathname for input clusters
-   * @param output the directory pathname for output points
-   * @param measureClass the classname of the DistanceMeasure
+   *
+   * @param input            the directory pathname for input points
+   * @param clustersIn       the directory pathname for input clusters
+   * @param output           the directory pathname for output points
+   * @param measureClass     the classname of the DistanceMeasure
    * @param convergenceDelta the convergence delta value
-   * @param numMapTasks the number of map tasks
+   * @param numMapTasks      the number of map tasks
    */
   private static void runClustering(String input, String clustersIn,
-      String output, String measureClass, String convergenceDelta,
-      int numMapTasks, float m, Class<? extends Vector> vectorClass) {
+                                    String output, String measureClass, double convergenceDelta,
+                                    int numMapTasks, float m, Class<? extends Vector> vectorClass) {
 
     JobConf conf = new JobConf(FuzzyKMeansDriver.class);
     conf.setJobName("Fuzzy K Means Clustering");
@@ -220,7 +328,7 @@
     conf.setNumReduceTasks(0);
     conf.set(SoftCluster.CLUSTER_PATH_KEY, clustersIn);
     conf.set(SoftCluster.DISTANCE_MEASURE_KEY, measureClass);
-    conf.set(SoftCluster.CLUSTER_CONVERGENCE_KEY, convergenceDelta);
+    conf.set(SoftCluster.CLUSTER_CONVERGENCE_KEY, String.valueOf(convergenceDelta));
     conf.set(SoftCluster.M_KEY, String.valueOf(m));
     try {
       JobClient.runJob(conf);
@@ -231,15 +339,15 @@
 
   /**
    * Return if all of the Clusters in the filePath have converged or not
-   * 
+   *
    * @param filePath the file path to the single file containing the clusters
-   * @param conf the JobConf
-   * @param fs the FileSystem
+   * @param conf     the JobConf
+   * @param fs       the FileSystem
    * @return true if all Clusters are converged
    * @throws IOException if there was an IO error
    */
   private static boolean isConverged(String filePath, JobConf conf,
-      FileSystem fs) throws IOException {
+                                     FileSystem fs) throws IOException {
 
     Path clusterPath = new Path(filePath);
     List<Path> result = new ArrayList<Path>();
@@ -252,7 +360,7 @@
     };
 
     FileStatus[] matches = fs.listStatus(FileUtil.stat2Paths(fs.globStatus(
-        clusterPath, clusterFileFilter)), clusterFileFilter);
+            clusterPath, clusterFileFilter)), clusterFileFilter);
 
     for (FileStatus match : matches) {
       result.add(fs.makeQualified(match.getPath()));
@@ -265,8 +373,8 @@
 
       try {
         reader = new SequenceFile.Reader(fs, p, conf);
-                /*new KeyValueLineRecordReader(conf, new FileSplit(p, 0, fs
-            .getFileStatus(p).getLen(), (String[]) null));*/
+        /*new KeyValueLineRecordReader(conf, new FileSplit(p, 0, fs
+      .getFileStatus(p).getLen(), (String[]) null));*/
         Text key = new Text();
         SoftCluster value = new SoftCluster();
         while (converged && reader.next(key, value)) {

Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansUtil.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansUtil.java?rev=789843&r1=789842&r2=789843&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansUtil.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansUtil.java Tue Jun 30 17:39:24 2009
@@ -22,12 +22,10 @@
 import org.apache.hadoop.fs.FileUtil;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.fs.PathFilter;
-import org.apache.hadoop.io.Text;
 import org.apache.hadoop.io.SequenceFile;
-import org.apache.hadoop.mapred.FileSplit;
+import org.apache.hadoop.io.Writable;
 import org.apache.hadoop.mapred.JobConf;
-import org.apache.hadoop.mapred.KeyValueLineRecordReader;
-import org.apache.hadoop.mapred.RecordReader;
+import org.apache.mahout.clustering.kmeans.Cluster;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -77,15 +75,32 @@
         SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, job);
         try {
           //recordReader = new KeyValueLineRecordReader(job, new FileSplit(path, 0, fs.getFileStatus(path).getLen(), (String[]) null));
-          Text key = new Text();
-          SoftCluster value = new SoftCluster();
-          //int counter = 1;
-          while (reader.next(key, value)) {
-            //get the cluster info
-            // add the center so the centroid will be correct on output
-            // formatting
-//            cluster.addPoint(cluster.getCenter(), 1);
-            clusters.add(value);
+          Class valueClass = reader.getValueClass();
+          Writable key = null;
+          try {
+            key = (Writable) reader.getKeyClass().newInstance();
+          } catch (InstantiationException e) {//Should not be possible
+            log.error("Exception", e);
+            throw new RuntimeException(e);
+          } catch (IllegalAccessException e) {
+            log.error("Exception", e);
+            throw new RuntimeException(e);
+          }
+          if (valueClass.equals(Cluster.class)){
+            Cluster value = new Cluster();
+            while (reader.next(key, value)) {
+              // get the cluster info
+              SoftCluster theCluster = new SoftCluster(value.getCenter(), value.getId());
+              clusters.add(theCluster);
+              value = new Cluster();
+            }
+          } else if (valueClass.equals(SoftCluster.class)){
+            SoftCluster value = new SoftCluster();
+            while (reader.next(key, value)) {
+              // get the cluster info
+              clusters.add(value);
+              value = new SoftCluster();
+            }
           }
         } finally {
           if (reader != null) {

Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansDriver.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansDriver.java?rev=789843&r1=789842&r2=789843&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansDriver.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansDriver.java Tue Jun 30 17:39:24 2009
@@ -38,6 +38,7 @@
 import org.apache.mahout.matrix.SparseVector;
 import org.apache.mahout.matrix.Vector;
 import org.apache.mahout.utils.CommandLineUtil;
+import org.apache.mahout.utils.HadoopUtil;
 import org.apache.mahout.utils.SquaredEuclideanDistanceMeasure;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -146,10 +147,10 @@
         numReduceTasks = Integer.parseInt(cmdLine.getValue(numReduceTasksOpt).toString());
       }
       if (cmdLine.hasOption(overwriteOutput) == true) {
-        overwriteOutput(output);
+        HadoopUtil.overwriteOutput(output);
       }
       if (cmdLine.hasOption(kOpt)) {
-        clusters = buildRandom(input, clusters, Integer.parseInt(cmdLine.getValue(kOpt).toString())).toString();
+        clusters = RandomSeedGenerator.buildRandom(input, clusters, Integer.parseInt(cmdLine.getValue(kOpt).toString())).toString();
       }
       runJob(input, clusters, output, measureClass, convergenceDelta,
               maxIterations, numReduceTasks, vectorClass);
@@ -159,22 +160,7 @@
     }
   }
 
-  public static void overwriteOutput(String output) throws IOException {
-    JobConf conf = new JobConf(KMeansDriver.class);
-    Path outPath = new Path(output);
-    FileSystem fs = FileSystem.get(outPath.toUri(), conf);
-    boolean exists = fs.exists(outPath);
-    if (exists == true) {
-      log.warn("Deleting " + outPath);
-      fs.delete(outPath, true);
-    }
-    fs.mkdirs(outPath);
 
-  }
-
-  public static Path buildRandom(String input, String clusters, int k) throws IOException, IllegalAccessException, InstantiationException {
-    return RandomSeedGenerator.runJob(input, clusters, k);
-  }
 
   /**
    * Run the job using supplied arguments

Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/RandomSeedGenerator.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/RandomSeedGenerator.java?rev=789843&r1=789842&r2=789843&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/RandomSeedGenerator.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/RandomSeedGenerator.java Tue Jun 30 17:39:24 2009
@@ -25,7 +25,7 @@
   private transient static Log log = LogFactory.getLog(RandomSeedGenerator.class);
   public static final String K = "k";
 
-  public static Path runJob(String input, String output,
+  public static Path buildRandom(String input, String output,
                             int k ) throws IOException, IllegalAccessException, InstantiationException {
     // delete the output directory
     JobConf conf = new JobConf(RandomSeedGenerator.class);
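
For reference, a hypothetical standalone call to the renamed method (class name and paths are
placeholders; the returned Path is the SequenceFile containing the k randomly selected seeds):

    import org.apache.hadoop.fs.Path;
    import org.apache.mahout.clustering.kmeans.RandomSeedGenerator;

    public class BuildRandomExample {
      public static void main(String[] args) throws Exception {
        // Pick 3 random input vectors and write them out as the initial clusters.
        Path seeds = RandomSeedGenerator.buildRandom("testdata/points", "testdata/clusters", 3);
        System.out.println("Seed clusters written to " + seeds);
      }
    }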

Added: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/utils/HadoopUtil.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/utils/HadoopUtil.java?rev=789843&view=auto
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/utils/HadoopUtil.java (added)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/utils/HadoopUtil.java Tue Jun 30 17:39:24 2009
@@ -0,0 +1,32 @@
+package org.apache.mahout.utils;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.mahout.clustering.kmeans.KMeansDriver;
+
+import java.io.IOException;
+
+
+/**
+ * Hadoop convenience methods shared by the clustering drivers, such as deleting and
+ * recreating a job output directory before a run.
+ **/
+public class HadoopUtil {
+  private transient static Log log = LogFactory.getLog(HadoopUtil.class);
+
+  public static void overwriteOutput(String output) throws IOException {
+    JobConf conf = new JobConf(KMeansDriver.class);
+    Path outPath = new Path(output);
+    FileSystem fs = FileSystem.get(outPath.toUri(), conf);
+    boolean exists = fs.exists(outPath);
+    if (exists == true) {
+      log.warn("Deleting " + outPath);
+      fs.delete(outPath, true);
+    }
+    fs.mkdirs(outPath);
+
+  }
+}
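
A minimal, hypothetical usage of the new utility, mirroring how KMeansDriver and the test
change below now call it (the class name and the "output" path are placeholders):

    import org.apache.mahout.utils.HadoopUtil;

    public class OverwriteOutputExample {
      public static void main(String[] args) throws Exception {
        // Deletes the directory if it already exists, then recreates it empty.
        HadoopUtil.overwriteOutput("output");
      }
    }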

Modified: lucene/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/kmeans/TestKmeansClustering.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/kmeans/TestKmeansClustering.java?rev=789843&r1=789842&r2=789843&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/kmeans/TestKmeansClustering.java (original)
+++ lucene/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/kmeans/TestKmeansClustering.java Tue Jun 30 17:39:24 2009
@@ -19,31 +19,32 @@
 
 
 import junit.framework.TestCase;
+import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.SequenceFile;
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.mapred.JobConf;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.mahout.clustering.canopy.CanopyDriver;
 import org.apache.mahout.clustering.ClusteringTestUtils;
+import org.apache.mahout.clustering.canopy.CanopyDriver;
 import org.apache.mahout.matrix.DenseVector;
 import org.apache.mahout.matrix.SparseVector;
 import org.apache.mahout.matrix.Vector;
 import org.apache.mahout.utils.DistanceMeasure;
 import org.apache.mahout.utils.DummyOutputCollector;
 import org.apache.mahout.utils.EuclideanDistanceMeasure;
+import org.apache.mahout.utils.HadoopUtil;
 import org.apache.mahout.utils.ManhattanDistanceMeasure;
 
 import java.io.BufferedReader;
 import java.io.File;
-import java.io.InputStreamReader;
 import java.io.FileInputStream;
+import java.io.InputStreamReader;
+import java.nio.charset.Charset;
 import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
-import java.nio.charset.Charset;
 
 public class TestKmeansClustering extends TestCase {
 
@@ -402,7 +403,7 @@
       }
       writer.close();
       // now run the Job
-      KMeansDriver.overwriteOutput("output");
+      HadoopUtil.overwriteOutput("output");
       KMeansDriver.runJob("testdata/points", "testdata/clusters", "output",
           EuclideanDistanceMeasure.class.getName(), 0.001, 10, k + 1, SparseVector.class);
       // now compare the expected clusters with actual


