mahout-commits mailing list archives

From dlyubi...@apache.org
Subject svn commit: r1213842 - in /mahout/trunk/core/src: main/java/org/apache/mahout/math/hadoop/stochasticsvd/ main/java/org/apache/mahout/math/hadoop/stochasticsvd/qr/ test/java/org/apache/mahout/math/hadoop/stochasticsvd/
Date Tue, 13 Dec 2011 18:36:06 GMT
Author: dlyubimov
Date: Tue Dec 13 18:36:05 2011
New Revision: 1213842

URL: http://svn.apache.org/viewvc?rev=1213842&view=rev
Log:
MAHOUT-922: optional p, AB' tweaks, faster tests, style overhaul.

Added:
    mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/ABtDenseOutJob.java
    mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/qr/GramSchmidt.java
      - copied, changed from r1213676, mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/qr/GrammSchmidt.java
Removed:
    mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/BBtJob.java
    mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/qr/GrammSchmidt.java
Modified:
    mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/BtJob.java
    mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/DenseBlockWritable.java
    mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/QJob.java
    mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/SSVDCli.java
    mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/SSVDPrototype.java
    mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/SSVDSolver.java
    mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/SparseRowBlockWritable.java
    mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/UJob.java
    mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/UpperTriangular.java
    mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/VJob.java
    mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/YtYJob.java
    mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/qr/GivensThinSolver.java
    mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/qr/QRFirstStep.java
    mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/qr/QRLastStep.java
    mahout/trunk/core/src/test/java/org/apache/mahout/math/hadoop/stochasticsvd/LocalSSVDSolverDenseTest.java
    mahout/trunk/core/src/test/java/org/apache/mahout/math/hadoop/stochasticsvd/LocalSSVDSolverSparseSequentialTest.java
    mahout/trunk/core/src/test/java/org/apache/mahout/math/hadoop/stochasticsvd/SSVDTestsHelper.java

Added: mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/ABtDenseOutJob.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/ABtDenseOutJob.java?rev=1213842&view=auto
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/ABtDenseOutJob.java (added)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/ABtDenseOutJob.java Tue Dec 13 18:36:05 2011
@@ -0,0 +1,504 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.math.hadoop.stochasticsvd;
+
+import java.io.Closeable;
+import java.io.IOException;
+import java.text.NumberFormat;
+import java.util.ArrayDeque;
+import java.util.Arrays;
+import java.util.Deque;
+import java.util.Iterator;
+import java.util.LinkedList;
+import java.util.regex.Matcher;
+
+import org.apache.commons.lang.Validate;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.SequenceFile.CompressionType;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.hadoop.mapreduce.Reducer;
+import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
+import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
+import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
+import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
+import org.apache.mahout.common.IOUtils;
+import org.apache.mahout.common.Pair;
+import org.apache.mahout.common.iterator.sequencefile.PathType;
+import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterator;
+import org.apache.mahout.math.DenseVector;
+import org.apache.mahout.math.SequentialAccessSparseVector;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorWritable;
+import org.apache.mahout.math.hadoop.stochasticsvd.qr.QRFirstStep;
+
+/**
+ * Computes ABt products, then the first step of QR, which is pushed down to
+ * the reducer.
+ * 
+ */
+@SuppressWarnings("deprecation")
+public class ABtDenseOutJob {
+
+  public static final String PROP_BT_PATH = "ssvd.Bt.path";
+
+  private ABtDenseOutJob() {
+  }
+
+  /**
+   * So, here, I preload the A block into memory.
+   * <P>
+   * 
+   * A sparse matrix seems ideal for that, but there are two reasons why I am
+   * not using it:
+   * <UL>
+   * <LI>1) I don't know the full block height, so I may need to reallocate it
+   * from time to time, although that is probably not a showstopper.
+   * <LI>2) I found that RandomAccessSparseVectors seem to take much more memory
+   * than SequentialAccessSparseVectors.
+   * </UL>
+   * <P>
+   * 
+   */
+  public static class ABtMapper
+      extends
+      Mapper<Writable, VectorWritable, SplitPartitionedWritable, DenseBlockWritable> {
+
+    private SplitPartitionedWritable outKey;
+    private final Deque<Closeable> closeables = new ArrayDeque<Closeable>();
+    private SequenceFileDirIterator<IntWritable, VectorWritable> btInput;
+    private Vector[] aCols;
+    private double[][] yiCols;
+    private int aRowCount;
+    private int kp;
+    private int blockHeight;
+
+    @Override
+    protected void map(Writable key, VectorWritable value, Context context)
+      throws IOException, InterruptedException {
+
+      Vector vec = value.get();
+
+      int vecSize = vec.size();
+      if (aCols == null) {
+        aCols = new Vector[vecSize];
+      } else if (aCols.length < vecSize) {
+        aCols = Arrays.copyOf(aCols, vecSize);
+      }
+
+      if (vec.isDense()) {
+        for (int i = 0; i < vecSize; i++) {
+          extendAColIfNeeded(i, aRowCount + 1);
+          aCols[i].setQuick(aRowCount, vec.getQuick(i));
+        }
+      } else if (vec.size() > 0) {
+        for (Iterator<Vector.Element> vecIter = vec.iterateNonZero(); vecIter
+          .hasNext();) {
+          Vector.Element vecEl = vecIter.next();
+          int i = vecEl.index();
+          extendAColIfNeeded(i, aRowCount + 1);
+          aCols[i].setQuick(aRowCount, vecEl.get());
+        }
+      }
+      aRowCount++;
+    }
+
+    private void extendAColIfNeeded(int col, int rowCount) {
+      if (aCols[col] == null) {
+        aCols[col] =
+          new SequentialAccessSparseVector(rowCount < blockHeight ? blockHeight
+              : rowCount, 1);
+      } else if (aCols[col].size() < rowCount) {
+        Vector newVec =
+          new SequentialAccessSparseVector(rowCount + blockHeight,
+                                           aCols[col]
+                                             .getNumNondefaultElements() << 1);
+        newVec.viewPart(0, aCols[col].size()).assign(aCols[col]);
+        aCols[col] = newVec;
+      }
+    }
+
+    @Override
+    protected void cleanup(Context context) throws IOException,
+      InterruptedException {
+      try {
+
+        int lastRowIndex = -1;
+
+        yiCols = new double[kp][];
+
+        for (int i = 0; i < kp; i++) {
+          yiCols[i] = new double[Math.min(aRowCount, blockHeight)];
+        }
+
+        final int numPasses = (aRowCount - 1) / blockHeight + 1;
+
+        String propBtPathStr = context.getConfiguration().get(PROP_BT_PATH);
+        Validate.notNull(propBtPathStr, "Bt input is not set");
+        final Path btPath = new Path(propBtPathStr);
+        final DenseBlockWritable dbw = new DenseBlockWritable();
+
+        /*
+         * It turns out that it may be much more efficient to do a few
+         * independent passes over Bt, accumulating the entire block in memory,
+         * than to push a huge number of blocks out to the combiner. So we aim
+         * to fit the entire s x (k+p) dense block in memory, where s is the
+         * number of A rows in this split. If A is much sparser than (k+p)
+         * average elements per row, then the block may exceed the split size.
+         * If this happens, and the given blockHeight is not high enough to
+         * accommodate it (because of memory constraints), then we start
+         * splitting s into several passes. Since computation is CPU-bound
+         * anyway, this should be o.k. for super-sparse inputs (and for such
+         * inputs the projection is thicker than the original anyway, so why
+         * would one use that many k+p in the first place).
+         */
+        for (int pass = 0; pass < numPasses; pass++) {
+
+          btInput =
+            new SequenceFileDirIterator<IntWritable, VectorWritable>(btPath,
+                                                                     PathType.GLOB,
+                                                                     null,
+                                                                     null,
+                                                                     true,
+                                                                     context
+                                                                       .getConfiguration());
+          closeables.addFirst(btInput);
+
+          int aRowBegin = pass * blockHeight;
+          int bh = Math.min(blockHeight, aRowCount - aRowBegin);
+
+          /*
+           * check if we need to trim block allocation
+           */
+          if (pass > 0) {
+            if (bh != blockHeight) {
+
+              for (int i = 0; i < kp; i++)
+                yiCols[i] = null;
+              for (int i = 0; i < kp; i++)
+                yiCols[i] = new double[bh];
+            } else {
+              for (int i = 0; i < kp; i++)
+                Arrays.fill(yiCols[i], 0.0);
+            }
+          }
+
+          while (btInput.hasNext()) {
+            Pair<IntWritable, VectorWritable> btRec = btInput.next();
+            int btIndex = btRec.getFirst().get();
+            Vector btVec = btRec.getSecond().get();
+            Vector aCol;
+            if (btIndex > aCols.length || (aCol = aCols[btIndex]) == null
+                || aCol.size() == 0) {
+
+              /* 100% zero A column in the block, skip it as sparse */
+              continue;
+            }
+            int j = -1;
+            for (Iterator<Vector.Element> aColIter = aCol.iterateNonZero(); aColIter
+              .hasNext();) {
+              Vector.Element aEl = aColIter.next();
+              j = aEl.index();
+
+              /*
+               * Now we compute only the swath between aRowBegin and
+               * aRowBegin+bh (exclusive). That seems like a deficiency, but I
+               * think it will balance itself out: either A is dense, in which
+               * case we shouldn't need more than one pass and the filter
+               * conditions never kick in; or, the only situation where we
+               * can't fit a Y_i block in memory is when the A input is much
+               * sparser than k+p elements per row, in which case we'd be
+               * looking at very few elements without engaging them in any
+               * operations, so even then it should be ok.
+               */
+              if (j < aRowBegin)
+                continue;
+              else if (j >= aRowBegin + bh)
+                break;
+
+              /*
+               * assume btVec is dense
+               */
+              for (int s = 0; s < kp; s++) {
+                yiCols[s][j - aRowBegin] += aEl.get() * btVec.getQuick(s);
+              }
+
+            }
+            if (lastRowIndex < j) {
+              lastRowIndex = j;
+            }
+          }
+
+          /*
+           * so now we have stuff in yi
+           */
+          dbw.setBlock(yiCols);
+          outKey.setTaskItemOrdinal(pass);
+          context.write(outKey, dbw);
+
+          closeables.remove(btInput);
+          btInput.close();
+        }
+
+      } finally {
+        IOUtils.close(closeables);
+      }
+    }
+
+    @Override
+    protected void setup(final Context context) throws IOException,
+      InterruptedException {
+
+      int k =
+        Integer.parseInt(context.getConfiguration().get(QRFirstStep.PROP_K));
+      int p =
+        Integer.parseInt(context.getConfiguration().get(QRFirstStep.PROP_P));
+      kp = k + p;
+
+      outKey = new SplitPartitionedWritable(context);
+
+      blockHeight =
+        context.getConfiguration().getInt(BtJob.PROP_OUTER_PROD_BLOCK_HEIGHT,
+                                          -1);
+
+    }
+  }
+
+  /**
+   * QR first step pushed down to reducer.
+   * 
+   */
+  public static class QRReducer
+      extends
+      Reducer<SplitPartitionedWritable, DenseBlockWritable, SplitPartitionedWritable, VectorWritable> {
+
+    /*
+     * HACK: partition number format copied from Hadoop; this may stop working
+     * if it gets out of sync with a newer Hadoop version. Unfortunately the
+     * rules for forming output file names are not sufficiently exposed, so we
+     * need to hack them if we write the same split output from either mapper
+     * or reducer. Alternatively, we could probably replace this with our own
+     * output file naming management entirely and bypass MultipleOutputs.
+     */
+
+    private static final NumberFormat NUMBER_FORMAT = NumberFormat
+      .getInstance();
+    static {
+      NUMBER_FORMAT.setMinimumIntegerDigits(5);
+      NUMBER_FORMAT.setGroupingUsed(false);
+    }
+
+    private final Deque<Closeable> closeables = new LinkedList<Closeable>();
+
+    protected int blockHeight;
+
+    protected int accumSize;
+    protected int lastTaskId = -1;
+
+    protected OutputCollector<Writable, DenseBlockWritable> qhatCollector;
+    protected OutputCollector<Writable, VectorWritable> rhatCollector;
+    protected QRFirstStep qr;
+    protected Vector yiRow;
+
+    @Override
+    protected void setup(Context context) throws IOException,
+      InterruptedException {
+      blockHeight =
+        context.getConfiguration().getInt(BtJob.PROP_OUTER_PROD_BLOCK_HEIGHT,
+                                          -1);
+
+    }
+
+    protected void setupBlock(Context context, SplitPartitionedWritable spw)
+      throws InterruptedException, IOException {
+      IOUtils.close(closeables);
+      qhatCollector =
+        createOutputCollector(QJob.OUTPUT_QHAT,
+                              spw,
+                              context,
+                              DenseBlockWritable.class);
+      rhatCollector =
+        createOutputCollector(QJob.OUTPUT_RHAT,
+                              spw,
+                              context,
+                              VectorWritable.class);
+      qr =
+        new QRFirstStep(context.getConfiguration(),
+                        qhatCollector,
+                        rhatCollector);
+      closeables.addFirst(qr);
+      lastTaskId = spw.getTaskId();
+
+    }
+
+    @Override
+    protected void reduce(SplitPartitionedWritable key,
+                          Iterable<DenseBlockWritable> values,
+                          Context context) throws IOException,
+      InterruptedException {
+
+      if (key.getTaskId() != lastTaskId) {
+        setupBlock(context, key);
+      }
+
+      Iterator<DenseBlockWritable> iter = values.iterator();
+      DenseBlockWritable dbw = iter.next();
+      double[][] yiCols = dbw.getBlock();
+      if (iter.hasNext()) {
+        throw new IOException("Unexpected extra Y_i block in reducer input.");
+      }
+
+      long blockBase = key.getTaskItemOrdinal() * blockHeight;
+      int bh = yiCols[0].length;
+      if (yiRow == null) {
+        yiRow = new DenseVector(yiCols.length);
+      }
+
+      for (int k = 0; k < bh; k++) {
+        for (int j = 0; j < yiCols.length; j++)
+          yiRow.setQuick(j, yiCols[j][k]);
+
+        key.setTaskItemOrdinal(blockBase + k);
+        qr.collect(key, yiRow);
+      }
+
+    }
+
+    private Path getSplitFilePath(String name,
+                                  SplitPartitionedWritable spw,
+                                  Context context) throws InterruptedException,
+      IOException {
+      String uniqueFileName = FileOutputFormat.getUniqueFile(context, name, "");
+      uniqueFileName = uniqueFileName.replaceFirst("-r-", "-m-");
+      uniqueFileName =
+        uniqueFileName.replaceFirst("\\d+$", Matcher
+          .quoteReplacement(NUMBER_FORMAT.format(spw.getTaskId())));
+      return new Path(FileOutputFormat.getWorkOutputPath(context),
+                      uniqueFileName);
+    }
+
+    /**
+     * The key doesn't matter here, only the value does; the key always gets
+     * substituted by the SPW.
+     * 
+     * @param <K>
+     *          bogus
+     * @param <V>
+     * @param name
+     * @param spw
+     * @param ctx
+     * @param valueClass
+     * @return
+     * @throws IOException
+     * @throws InterruptedException
+     */
+    private <K, V> OutputCollector<K, V>
+        createOutputCollector(String name,
+                              final SplitPartitionedWritable spw,
+                              Context ctx,
+                              Class<V> valueClass) throws IOException,
+          InterruptedException {
+      Path outputPath = getSplitFilePath(name, spw, ctx);
+      final SequenceFile.Writer w =
+        SequenceFile.createWriter(FileSystem.get(ctx.getConfiguration()),
+                                  ctx.getConfiguration(),
+                                  outputPath,
+                                  SplitPartitionedWritable.class,
+                                  valueClass);
+      closeables.addFirst(w);
+      return new OutputCollector<K, V>() {
+        @Override
+        public void collect(K key, V val) throws IOException {
+          w.append(spw, val);
+        }
+      };
+    }
+
+    @Override
+    protected void cleanup(Context context) throws IOException,
+      InterruptedException {
+
+      IOUtils.close(closeables);
+    }
+
+  }
+
+  public static void run(Configuration conf,
+                         Path[] inputAPaths,
+                         Path inputBtGlob,
+                         Path outputPath,
+                         int aBlockRows,
+                         int minSplitSize,
+                         int k,
+                         int p,
+                         int outerProdBlockHeight,
+                         int numReduceTasks) throws ClassNotFoundException,
+    InterruptedException, IOException {
+
+    JobConf oldApiJob = new JobConf(conf);
+
+    Job job = new Job(oldApiJob);
+    job.setJobName("ABt-job");
+    job.setJarByClass(ABtDenseOutJob.class);
+
+    job.setInputFormatClass(SequenceFileInputFormat.class);
+    FileInputFormat.setInputPaths(job, inputAPaths);
+    if (minSplitSize > 0) {
+      FileInputFormat.setMinInputSplitSize(job, minSplitSize);
+    }
+
+    FileOutputFormat.setOutputPath(job, outputPath);
+
+    SequenceFileOutputFormat.setOutputCompressionType(job,
+                                                      CompressionType.BLOCK);
+
+    job.setMapOutputKeyClass(SplitPartitionedWritable.class);
+    job.setMapOutputValueClass(DenseBlockWritable.class);
+
+    job.setOutputKeyClass(SplitPartitionedWritable.class);
+    job.setOutputValueClass(VectorWritable.class);
+
+    job.setMapperClass(ABtMapper.class);
+    job.setReducerClass(QRReducer.class);
+
+    job.getConfiguration().setInt(QJob.PROP_AROWBLOCK_SIZE, aBlockRows);
+    job.getConfiguration().setInt(BtJob.PROP_OUTER_PROD_BLOCK_HEIGHT,
+                                  outerProdBlockHeight);
+    job.getConfiguration().setInt(QRFirstStep.PROP_K, k);
+    job.getConfiguration().setInt(QRFirstStep.PROP_P, p);
+    job.getConfiguration().set(PROP_BT_PATH, inputBtGlob.toString());
+
+    job.setNumReduceTasks(numReduceTasks);
+
+    job.submit();
+    job.waitForCompletion(false);
+
+    if (!job.isSuccessful()) {
+      throw new IOException("ABt job unsuccessful.");
+    }
+
+  }
+
+}
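
To make the new job's entry point concrete, here is a minimal invocation sketch against the run() signature above. The paths, rank and block sizes are hypothetical placeholders, not part of this commit; in practice SSVDSolver wires these arguments from the previous Bt-job output.

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.mahout.math.hadoop.stochasticsvd.ABtDenseOutJob;

    public class ABtDenseOutJobSketch {
      public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // hypothetical locations of the A input and the Bt output glob
        Path[] inputA = { new Path("/tmp/ssvd/input") };
        Path btGlob = new Path("/tmp/ssvd/Bt-job/part-*");
        Path output = new Path("/tmp/ssvd/ABt-job-1");

        int k = 40;   // decomposition rank
        int p = 15;   // oversampling
        ABtDenseOutJob.run(conf,
                           inputA,
                           btGlob,
                           output,
                           10000,    // aBlockRows (Q block height)
                           -1,       // minSplitSize (disabled)
                           k,
                           p,
                           200000,   // outerProdBlockHeight, i.e. Y_i block height
                           1);       // numReduceTasks
      }
    }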

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/BtJob.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/BtJob.java?rev=1213842&r1=1213841&r2=1213842&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/BtJob.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/BtJob.java Tue Dec 13 18:36:05 2011
@@ -144,8 +144,6 @@ public final class BtJob {
           for (int j = 0; j < kp; j++) {
             btRow.setQuick(j, mul * qRow.getQuick(j));
           }
-          // btKey.set(el.index());
-          // context.write(btKey, btValue);
           btCollector.collect((long) el.index(), btRow);
         }
       } else {
@@ -155,8 +153,6 @@ public final class BtJob {
           for (int j = 0; j < kp; j++) {
             btRow.setQuick(j, mul * qRow.getQuick(j));
           }
-          // btKey.set(i);
-          // context.write(btKey, btValue);
           btCollector.collect((long) i, btRow);
         }
       }
@@ -169,11 +165,11 @@ public final class BtJob {
 
       Path qJobPath = new Path(context.getConfiguration().get(PROP_QJOB_PATH));
 
-      // actually this is kind of dangerous
-      // becuase this routine thinks we need to create file name for
-      // our current job and this will use -m- so it's just serendipity we are
-      // calling
-      // it from the mapper too as the QJob did.
+      /*
+       * Actually this is kind of dangerous, because this routine thinks we need
+       * to create a file name for our current job and that will use -m-; it is
+       * just serendipity that we call it from the mapper too, as QJob did.
+       */
       Path qInputPath =
         new Path(qJobPath, FileOutputFormat.getUniqueFile(context,
                                                           QJob.OUTPUT_QHAT,
@@ -187,8 +183,10 @@ public final class BtJob {
                                                             .getConfiguration());
       closeables.addFirst(qhatInput);
 
-      // read all r files _in order of task ids_, i.e. partitions (aka group
-      // nums)
+      /*
+       * read all r files _in order of task ids_, i.e. partitions (aka group
+       * nums)
+       */
 
       Path rPath = new Path(qJobPath, QJob.OUTPUT_RHAT + "-*");
 
@@ -206,9 +204,11 @@ public final class BtJob {
 
       qr = new QRLastStep(qhatInput, rhatInput, blockNum);
       closeables.addFirst(qr);
-      // it's so happens that current QRLastStep's implementation
-      // preloads R sequence into memory in the constructor
-      // so it's ok to close rhat input now.
+      /*
+       * It so happens that the current QRLastStep implementation preloads the
+       * R sequence into memory in the constructor, so it is ok to close the
+       * rhat input now.
+       */
       if (!rhatInput.hasNext()) {
         closeables.remove(rhatInput);
         rhatInput.close();
@@ -240,7 +240,6 @@ public final class BtJob {
       extends
       Reducer<Writable, SparseRowBlockWritable, Writable, SparseRowBlockWritable> {
 
-    // protected final VectorWritable outValue = new VectorWritable();
     protected final SparseRowBlockWritable accum = new SparseRowBlockWritable();
     protected final Deque<Closeable> closeables = new ArrayDeque<Closeable>();
     protected int blockHeight;
@@ -322,9 +321,10 @@ public final class BtJob {
         accum.plusBlock(bw);
       }
 
-      // at this point, sum of rows should be in accum,
-      // so we just generate outer self product of it and add to
-      // BBt accumulator.
+      /*
+       * At this point the sum of rows should be in accum, so we just generate
+       * its outer self-product and add it to the BBt accumulator.
+       */
 
       for (int k = 0; k < accum.getNumRows(); k++) {
         Vector btRow = accum.getRows()[k];
@@ -406,10 +406,11 @@ public final class BtJob {
                         VectorWritable.class);
     }
 
-    // hack: we use old api multiple outputs
-    // since they are not available in the new api of
-    // either 0.20.2 or 0.20.203 but wrap it into a new api
-    // job so we can use new api interfaces.
+    /*
+     * HACK: we use the old-API MultipleOutputs since it is not available in the
+     * new API of either 0.20.2 or 0.20.203, but we wrap it into a new-API job
+     * so we can use the new-API interfaces.
+     */
 
     Job job = new Job(oldApiJob);
     job.setJobName("Bt-job");
@@ -423,13 +424,9 @@ public final class BtJob {
     }
     FileOutputFormat.setOutputPath(job, outputPath);
 
-    // MultipleOutputs.addNamedOutput(job, OUTPUT_Bt,
-    // SequenceFileOutputFormat.class,
-    // QJobKeyWritable.class,QJobValueWritable.class);
-
-    // Warn: tight hadoop integration here:
+    // WARN: tight hadoop integration here:
     job.getConfiguration().set("mapreduce.output.basename", OUTPUT_BT);
-    // FileOutputFormat.setCompressOutput(job, true);
+
     FileOutputFormat.setOutputCompressorClass(job, DefaultCodec.class);
     SequenceFileOutputFormat.setOutputCompressionType(job,
                                                       CompressionType.BLOCK);
@@ -443,10 +440,7 @@ public final class BtJob {
     job.setMapperClass(BtMapper.class);
     job.setCombinerClass(OuterProductCombiner.class);
     job.setReducerClass(OuterProductReducer.class);
-    // job.setPartitionerClass(QPartitioner.class);
 
-    // job.getConfiguration().setInt(QJob.PROP_AROWBLOCK_SIZE,aBlockRows );
-    // job.getConfiguration().setLong(PROP_OMEGA_SEED, seed);
     job.getConfiguration().setInt(QJob.PROP_K, k);
     job.getConfiguration().setInt(QJob.PROP_P, p);
     job.getConfiguration().set(PROP_QJOB_PATH, inputPathQJob.toString());
@@ -454,10 +448,6 @@ public final class BtJob {
                                       outputBBtProducts);
     job.getConfiguration().setInt(PROP_OUTER_PROD_BLOCK_HEIGHT, btBlockHeight);
 
-    // number of reduce tasks doesn't matter. we don't actually
-    // send anything to reducers. in fact, the only reason
-    // we need to configure reduce step is so that combiners can fire.
-    // so reduce here is purely symbolic.
     job.setNumReduceTasks(numReduceTasks);
 
     job.submit();

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/DenseBlockWritable.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/DenseBlockWritable.java?rev=1213842&r1=1213841&r2=1213842&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/DenseBlockWritable.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/DenseBlockWritable.java Tue Dec 13 18:36:05 2011
@@ -24,14 +24,16 @@ import java.util.Arrays;
 import org.apache.hadoop.io.Writable;
 
 /**
- * <p></p>Ad-hoc substitution for {@link org.apache.mahout.math.MatrixWritable}. Perhaps more useful for
- * situations with mostly dense data (such as Q-blocks) but reduces GC by
- * reusing the same block memory between loads and writes.</p>
+ * Ad-hoc substitution for {@link org.apache.mahout.math.MatrixWritable}.
+ * Perhaps more useful for situations with mostly dense data (such as Q-blocks)
+ * but reduces GC by reusing the same block memory between loads and writes.
+ * <p>
  * 
- * <p></p>in case of Q blocks, it doesn't even matter if they this data is dense cause
+ * in case of Q blocks, it doesn't even matter if this data is dense, because
 * we need to unpack it into dense for fast access in computations anyway and
  * even if it is not so dense the block compressor in sequence files will take
- * care of it for the serialized size.</p>
+ * care of it for the serialized size.
+ * <p>
  */
 public class DenseBlockWritable implements Writable {
   private double[][] block;

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/QJob.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/QJob.java?rev=1213842&r1=1213841&r2=1213842&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/QJob.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/QJob.java Tue Dec 13 18:36:05 2011
@@ -137,7 +137,6 @@ public final class QJob {
     @Override
     protected void map(Writable key, VectorWritable value, Context context)
       throws IOException, InterruptedException {
-      // omega.computeYRow(value.get(), yRow);
       omega.computeYRow(value.get(), yRow);
       qr.collect(key, yRow);
     }
@@ -205,8 +204,10 @@ public final class QJob {
     job.getConfiguration().setInt(PROP_K, k);
     job.getConfiguration().setInt(PROP_P, p);
 
-    // number of reduce tasks doesn't matter. we don't actually
-    // send anything to reducers.
+    /*
+     * The number of reduce tasks doesn't matter; we don't actually send
+     * anything to reducers.
+     */
 
     job.setNumReduceTasks(0 /* numReduceTasks */);
 

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/SSVDCli.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/SSVDCli.java?rev=1213842&r1=1213841&r2=1213842&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/SSVDCli.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/SSVDCli.java Tue Dec 13 18:36:05 2011
@@ -44,31 +44,35 @@ public class SSVDCli extends AbstractJob
     addInputOption();
     addOutputOption();
     addOption("rank", "k", "decomposition rank", true);
-    addOption("oversampling", "p", "oversampling", true);
-    addOption("blockHeight", "r", "Y block height (must be > (k+p))", "10000");
+    addOption("oversampling", "p", "oversampling", String.valueOf(15));
+    addOption("blockHeight", "r", "Y block height (must be > (k+p))", String.valueOf(10000));
     addOption("outerProdBlockHeight",
               "oh",
-              "block height of outer products during multiplication, increase for sparse input",
-              "10000");
-    addOption("minSplitSize", "s", "minimum split size", "-1");
-    addOption("computeU", "U", "compute U (true/false)", "true");
+              "block height of outer products during multiplication, increase for sparse inputs",
+              String.valueOf(30000));
+    addOption("abtBlockHeight", 
+              "abth",
+              "block height of Y_i in ABtJob during AB' multiplication, increase for extremely sparse inputs",
+              String.valueOf(200000));
+    addOption("minSplitSize", "s", "minimum split size", String.valueOf(-1));
+    addOption("computeU", "U", "compute U (true/false)", String.valueOf(true));
     addOption("uHalfSigma",
               "uhs",
               "Compute U as UHat=U x pow(Sigma,0.5)",
-              "false");
-    addOption("computeV", "V", "compute V (true/false)", "true");
+              String.valueOf(false));
+    addOption("computeV", "V", "compute V (true/false)", String.valueOf(true));
     addOption("vHalfSigma",
               "vhs",
               "compute V as VHat= V x pow(Sigma,0.5)",
-              "false");
+              String.valueOf(false));
     addOption("reduceTasks",
               "t",
               "number of reduce tasks (where applicable)",
-              "1");
+              String.valueOf(1));
     addOption("powerIter",
               "q",
               "number of additional power iterations (0..2 is good)",
-              "0");
+              String.valueOf(0));
     addOption(DefaultOptionCreator.overwriteOption().create());
 
     Map<String, String> pargs = parseArguments(args);
@@ -76,13 +80,11 @@ public class SSVDCli extends AbstractJob
       return -1;
     }
 
-    String input = pargs.get("--input");
-    String output = pargs.get("--output");
-    String tempDir = pargs.get("--tempDir");
     int k = Integer.parseInt(pargs.get("--rank"));
     int p = Integer.parseInt(pargs.get("--oversampling"));
     int r = Integer.parseInt(pargs.get("--blockHeight"));
     int h = Integer.parseInt(pargs.get("--outerProdBlockHeight"));
+    int abh = Integer.parseInt(pargs.get("--abtBlockHeight"));
     int q = Integer.parseInt(pargs.get("--powerIter"));
     int minSplitSize = Integer.parseInt(pargs.get("--minSplitSize"));
     boolean computeU = Boolean.parseBoolean(pargs.get("--computeU"));
@@ -100,10 +102,9 @@ public class SSVDCli extends AbstractJob
 
     SSVDSolver solver =
       new SSVDSolver(conf,
-                     new Path[] { new Path(input) },
-                     new Path(tempDir),
+                     new Path[] { getInputPath() },
+                     getTempPath(),
                      r,
-                     h,
                      k,
                      p,
                      reduceTasks);
@@ -112,6 +113,8 @@ public class SSVDCli extends AbstractJob
     solver.setComputeV(computeV);
     solver.setcUHalfSigma(cUHalfSigma);
     solver.setcVHalfSigma(cVHalfSigma);
+    solver.setOuterBlockHeight(h);
+    solver.setAbtBlockHeight(abh);
     solver.setQ(q);
     solver.setOverwrite(overwrite);
 
@@ -120,16 +123,16 @@ public class SSVDCli extends AbstractJob
     // housekeeping
     FileSystem fs = FileSystem.get(conf);
 
-    Path outPath = new Path(output);
-    fs.mkdirs(outPath);
+    fs.mkdirs(getOutputPath());
 
-    SequenceFile.Writer sigmaW =
-      SequenceFile.createWriter(fs,
+    SequenceFile.Writer sigmaW = null;
+
+    try {
+      sigmaW = SequenceFile.createWriter(fs,
                                 conf,
-                                new Path(outPath, "sigma"),
+                                getOutputPath("sigma"),
                                 NullWritable.class,
                                 VectorWritable.class);
-    try {
       Writable sValues =
         new VectorWritable(new DenseVector(Arrays.copyOf(solver
           .getSingularValues(), k), true));
@@ -143,7 +146,7 @@ public class SSVDCli extends AbstractJob
       FileStatus[] uFiles = fs.globStatus(new Path(solver.getUPath()));
       if (uFiles != null) {
         for (FileStatus uf : uFiles) {
-          fs.rename(uf.getPath(), outPath);
+          fs.rename(uf.getPath(), getOutputPath());
         }
       }
     }
@@ -151,7 +154,7 @@ public class SSVDCli extends AbstractJob
       FileStatus[] vFiles = fs.globStatus(new Path(solver.getVPath()));
       if (vFiles != null) {
         for (FileStatus vf : vFiles) {
-          fs.rename(vf.getPath(), outPath);
+          fs.rename(vf.getPath(), getOutputPath());
         }
       }
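
Since SSVDCli extends AbstractJob, the new options can be exercised through the usual Hadoop Tool plumbing. A hypothetical driver sketch with illustrative paths and values only (note that --oversampling now has a default of 15, and --abtBlockHeight tunes the Y_i block height used by the ABt job during power iterations):

    import org.apache.hadoop.util.ToolRunner;
    import org.apache.mahout.math.hadoop.stochasticsvd.SSVDCli;

    public class SSVDCliSketch {
      public static void main(String[] args) throws Exception {
        int rc = ToolRunner.run(new SSVDCli(), new String[] {
            "--input", "/tmp/ssvd/A",          // hypothetical input
            "--output", "/tmp/ssvd/out",
            "--rank", "40",
            "--powerIter", "1",                // triggers the ABt job
            "--abtBlockHeight", "200000",
            "--reduceTasks", "4"
        });
        System.exit(rc);
      }
    }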
 

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/SSVDPrototype.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/SSVDPrototype.java?rev=1213842&r1=1213841&r2=1213842&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/SSVDPrototype.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/SSVDPrototype.java Tue Dec 13 18:36:05 2011
@@ -150,15 +150,6 @@ public class SSVDPrototype {
   public static void testThinQr(int dims, int kp) throws IOException {
 
     DenseMatrix mx = new DenseMatrix(dims << 2, dims);
-    // mx.assign(new UnaryFunction() {
-    //
-    // Random m_rnd = new Random(rndSeed);
-    //
-    // @Override
-    // public double apply(double arg0) {
-    // return m_rnd.nextDouble()*1000;
-    // }
-    // });
 
     Random rnd = RandomUtils.getRandom();
     for (int i = 0; i < mx.rowSize(); i++) {
@@ -264,15 +255,6 @@ public class SSVDPrototype {
   public static void testBlockQrWithSSVD(int dims, int kp, int r, long rndSeed) throws IOException {
 
     DenseMatrix mx = new DenseMatrix(dims << 2, dims);
-    // mx.assign(new UnaryFunction() {
-    //
-    // Random m_rnd = new Random(rndSeed);
-    //
-    // @Override
-    // public double apply(double arg0) {
-    // return (m_rnd.nextDouble()-0.5)*1000;
-    // }
-    // });
 
     Random rnd = RandomUtils.getRandom();
     for (int i = 0; i < mx.rowSize(); i++) {

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/SSVDSolver.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/SSVDSolver.java?rev=1213842&r1=1213841&r2=1213842&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/SSVDSolver.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/SSVDSolver.java Tue Dec 13 18:36:05 2011
@@ -22,14 +22,11 @@ import java.io.IOException;
 import java.util.Arrays;
 import java.util.Comparator;
 import java.util.Deque;
-import java.util.LinkedList;
 import java.util.List;
 import java.util.Random;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
-import com.google.common.collect.Lists;
-import com.google.common.io.Closeables;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.fs.FileSystem;
@@ -40,12 +37,16 @@ import org.apache.hadoop.io.SequenceFile
 import org.apache.hadoop.io.Writable;
 import org.apache.mahout.common.IOUtils;
 import org.apache.mahout.common.RandomUtils;
+import org.apache.mahout.common.iterator.sequencefile.PathFilters;
 import org.apache.mahout.common.iterator.sequencefile.SequenceFileValueIterable;
 import org.apache.mahout.math.DenseVector;
 import org.apache.mahout.math.Vector;
 import org.apache.mahout.math.VectorWritable;
 import org.apache.mahout.math.ssvd.EigenSolverWrapper;
 
+import com.google.common.collect.Lists;
+import com.google.common.io.Closeables;
+
 /**
  * Stochastic SVD solver (API class).
  * <P>
@@ -100,13 +101,14 @@ public class SSVDSolver {
   private boolean computeV = true;
   private String uPath;
   private String vPath;
+  private int outerBlockHeight = 30000;
+  private int abtBlockHeight = 200000;
 
   // configured stuff
   private final Configuration conf;
   private final Path[] inputPath;
   private final Path outputPath;
   private final int ablockRows;
-  private final int outerBlockHeight;
   private final int k;
   private final int p;
   private int q;
@@ -144,14 +146,12 @@ public class SSVDSolver {
                     Path[] inputPath,
                     Path outputPath,
                     int ablockRows,
-                    int outerBlockHeight,
                     int k,
                     int p,
                     int reduceTasks) {
     this.conf = conf;
     this.inputPath = inputPath;
     this.outputPath = outputPath;
-    this.outerBlockHeight = outerBlockHeight;
     this.ablockRows = ablockRows;
     this.k = k;
     this.p = p;
@@ -252,6 +252,38 @@ public class SSVDSolver {
     this.overwrite = overwrite;
   }
 
+  public int getOuterBlockHeight() {
+    return outerBlockHeight;
+  }
+
+  /**
+   * The height of outer blocks during Q'A multiplication. Higher values
+   * produce fewer keys for the combine and shuffle-and-sort phases, somewhat
+   * improving running time, but they require larger blocks to be formed in
+   * RAM (so setting this too high can lead to OOM).
+   * 
+   * @param outerBlockHeight
+   */
+  public void setOuterBlockHeight(int outerBlockHeight) {
+    this.outerBlockHeight = outerBlockHeight;
+  }
+
+  public int getAbtBlockHeight() {
+    return abtBlockHeight;
+  }
+
+  /**
+   * The block height of Y_i during power iterations. It is probably worth
+   * setting it higher than the default of 200,000 for extremely sparse inputs
+   * when more RAM is available. The Y_i block in the ABt job occupies
+   * approximately abtBlockHeight x (k+p) x sizeof(double) bytes (as dense).
+   * 
+   * @param abtBlockHeight
+   */
+  public void setAbtBlockHeight(int abtBlockHeight) {
+    this.abtBlockHeight = abtBlockHeight;
+  }
+
   /**
    * run all SSVD jobs.
    * 
@@ -260,7 +292,7 @@ public class SSVDSolver {
    */
   public void run() throws IOException {
 
-    Deque<Closeable> closeables = new LinkedList<Closeable>();
+    Deque<Closeable> closeables = Lists.<Closeable>newLinkedList();
     try {
       Class<? extends Writable> labelType =
         sniffInputLabelType(inputPath, conf);
@@ -301,7 +333,7 @@ public class SSVDSolver {
                 k,
                 p,
                 outerBlockHeight,
-                reduceTasks > 1000 ? 1000 : reduceTasks,
+                Math.min(1000, reduceTasks),
                 labelType,
                 q <= 0);
 
@@ -309,16 +341,17 @@ public class SSVDSolver {
       for (int i = 0; i < q; i++) {
 
         qPath = new Path(outputPath, String.format("ABt-job-%d", i + 1));
-        ABtJob.run(conf,
-                   inputPath,
-                   new Path(btPath, BtJob.OUTPUT_BT + "-*"),
-                   qPath,
-                   ablockRows,
-                   minSplitSize,
-                   k,
-                   p,
-                   outerBlockHeight,
-                   reduceTasks);
+        Path btPathGlob = new Path(btPath, BtJob.OUTPUT_BT + "-*");
+        ABtDenseOutJob.run(conf,
+                           inputPath,
+                           btPathGlob,
+                           qPath,
+                           ablockRows,
+                           minSplitSize,
+                           k,
+                           p,
+                           abtBlockHeight,
+                           reduceTasks);
 
         btPath = new Path(outputPath, String.format("Bt-job-%d", i + 1));
 
@@ -330,14 +363,11 @@ public class SSVDSolver {
                   k,
                   p,
                   outerBlockHeight,
-                  reduceTasks > 1000 ? 1000 : reduceTasks,
+                  Math.min(1000, reduceTasks),
                   labelType,
                   i == q - 1);
       }
 
-      // we don't need BBt now.
-      // BBtJob.run(conf, new Path(btPath, BtJob.OUTPUT_BT + "-*"), bbtPath, 1);
-
       UpperTriangular bbt =
         loadAndSumUpperTriangularMatrices(fs, new Path(btPath, BtJob.OUTPUT_BBT
             + "-*"), conf);
@@ -367,7 +397,6 @@ public class SSVDSolver {
       }
 
       // save/redistribute UHat
-      //
       double[][] uHat = eigenWrapper.getUHat();
 
       fs.mkdirs(uHatPath);
@@ -465,9 +494,17 @@ public class SSVDSolver {
       if (fstats == null || fstats.length == 0) {
         continue;
       }
-      SequenceFile.Reader r =
-        new SequenceFile.Reader(fs, fstats[0].getPath(), conf);
+
+      FileStatus firstSeqFile;
+      if (!fstats[0].isDir()) {
+        firstSeqFile = fstats[0];
+      } else {
+        firstSeqFile = fs.listStatus(fstats[0].getPath(), PathFilters.logsCRCFilter())[0];
+      }
+
+      SequenceFile.Reader r = null;
       try {
+        r = new SequenceFile.Reader(fs, firstSeqFile.getPath(), conf);
         return r.getKeyClass().asSubclass(Writable.class);
       } finally {
         Closeables.closeQuietly(r);
@@ -476,8 +513,8 @@ public class SSVDSolver {
     throw new IOException("Unable to open input files to determine input label type.");
   }
 
-  private static final Pattern OUTPUT_FILE_PATTERN = Pattern
-    .compile("(\\w+)-(m|r)-(\\d+)(\\.\\w+)?");
+  private static final Pattern OUTPUT_FILE_PATTERN =
+    Pattern.compile("(\\w+)-(m|r)-(\\d+)(\\.\\w+)?");
 
   static final Comparator<FileStatus> PARTITION_COMPARATOR =
     new Comparator<FileStatus>() {
@@ -529,15 +566,15 @@ public class SSVDSolver {
 
     List<double[]> denseData = Lists.newArrayList();
 
-    // int m=0;
 
-    // assume it is partitioned output, so we need to read them up
-    // in order of partitions.
+    /*
+     * assume it is partitioned output, so we need to read them up in order of
+     * partitions.
+     */
     Arrays.sort(files, PARTITION_COMPARATOR);
 
     for (FileStatus fstat : files) {
-      for (VectorWritable value : new SequenceFileValueIterable<VectorWritable>(fstat
-                                                                                  .getPath(),
+      for (VectorWritable value : new SequenceFileValueIterable<VectorWritable>(fstat.getPath(),
                                                                                 true,
                                                                                 conf)) {
         Vector v = value.get();
@@ -573,14 +610,15 @@ public class SSVDSolver {
       return null;
     }
 
-    // assume it is partitioned output, so we need to read them up
-    // in order of partitions.
+    /*
+     * assume it is partitioned output, so we need to read them up in order of
+     * partitions.
+     */
     Arrays.sort(files, PARTITION_COMPARATOR);
 
     DenseVector result = null;
     for (FileStatus fstat : files) {
-      for (VectorWritable value : new SequenceFileValueIterable<VectorWritable>(fstat
-                                                                                  .getPath(),
+      for (VectorWritable value : new SequenceFileValueIterable<VectorWritable>(fstat.getPath(),
                                                                                 true,
                                                                                 conf)) {
         Vector v = value.get();
@@ -618,14 +656,15 @@ public class SSVDSolver {
       return null;
     }
 
-    // assume it is partitioned output, so we need to read them up
-    // in order of partitions.
+    /*
+     * assume it is partitioned output, so we need to read them up in order of
+     * partitions.
+     */
     Arrays.sort(files, PARTITION_COMPARATOR);
 
     UpperTriangular result = null;
     for (FileStatus fstat : files) {
-      for (VectorWritable value : new SequenceFileValueIterable<VectorWritable>(fstat
-                                                                                  .getPath(),
+      for (VectorWritable value : new SequenceFileValueIterable<VectorWritable>(fstat.getPath(),
                                                                                 true,
                                                                                 conf)) {
         Vector v = value.get();
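
With this change the two block heights move out of the SSVDSolver constructor into setters with defaults (30,000 for outer products, 200,000 for the ABt job during power iterations). A hypothetical configuration sketch of the new API surface, with placeholder paths and sizes:

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.mahout.math.hadoop.stochasticsvd.SSVDSolver;

    public class SSVDSolverSketch {
      public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        SSVDSolver solver =
          new SSVDSolver(conf,
                         new Path[] { new Path("/tmp/ssvd/A") },  // hypothetical input
                         new Path("/tmp/ssvd/work"),              // hypothetical working space (temp dir in SSVDCli)
                         10000,   // ablockRows
                         40,      // k
                         15,      // p
                         4);      // reduceTasks
        solver.setOuterBlockHeight(30000);   // Q'A outer product block height
        solver.setAbtBlockHeight(200000);    // Y_i block height during power iterations
        solver.setQ(1);                      // one additional power iteration
        solver.run();
      }
    }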

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/SparseRowBlockWritable.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/SparseRowBlockWritable.java?rev=1213842&r1=1213841&r2=1213842&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/SparseRowBlockWritable.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/SparseRowBlockWritable.java Tue Dec 13 18:36:05 2011
@@ -83,8 +83,10 @@ public class SparseRowBlockWritable impl
   }
 
   public void plusRow(int index, Vector row) {
-    // often accumulation goes in row-increasing order,
-    // so check for this to avoid binary search (another log Height multiplier).
+    /*
+     * often accumulation goes in row-increasing order, so check for this to
+     * avoid binary search (another log Height multiplier).
+     */
 
     int pos =
       numRows == 0 || rowIndices[numRows - 1] < index ? -numRows - 1 : Arrays
@@ -119,8 +121,10 @@ public class SparseRowBlockWritable impl
    *          block to add
    */
   public void plusBlock(SparseRowBlockWritable bOther) {
-    // since we maintained row indices in a sorted order, we can run
-    // sort merge to expedite this operation
+    /*
+     * Since we maintain the row indices in sorted order, we can run a sort
+     * merge to expedite this operation.
+     */
     int i = 0;
     int j = 0;
     while (i < numRows && j < bOther.numRows) {
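
The sort-merge mentioned above works because both blocks keep their row indices sorted. As a generic illustration only (not the actual SparseRowBlockWritable code), the merge pattern over two sorted index/value sequences looks like this:

    /** Generic sketch of sort-merge accumulation over two sorted index arrays (illustration only). */
    public class SortMergeSketch {
      static void sortMergeSum(int[] aIdx, double[] aVal, int[] bIdx, double[] bVal) {
        int i = 0;
        int j = 0;
        while (i < aIdx.length && j < bIdx.length) {
          if (aIdx[i] == bIdx[j]) {
            // row present in both blocks: accumulate the values
            System.out.println(aIdx[i] + " -> " + (aVal[i++] + bVal[j++]));
          } else if (aIdx[i] < bIdx[j]) {
            System.out.println(aIdx[i] + " -> " + aVal[i++]);   // row only in a
          } else {
            System.out.println(bIdx[j] + " -> " + bVal[j++]);   // row only in b
          }
        }
        while (i < aIdx.length) { System.out.println(aIdx[i] + " -> " + aVal[i++]); }
        while (j < bIdx.length) { System.out.println(bIdx[j] + " -> " + bVal[j++]); }
      }
    }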

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/UJob.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/UJob.java?rev=1213842&r1=1213841&r2=1213842&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/UJob.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/UJob.java Tue Dec 13 18:36:05 2011
@@ -65,7 +65,7 @@ public class UJob {
     FileInputFormat.setInputPaths(job, inputPathQ);
     FileOutputFormat.setOutputPath(job, outputPath);
 
-    // Warn: tight hadoop integration here:
+    // WARN: tight hadoop integration here:
     job.getConfiguration().set("mapreduce.output.basename", OUTPUT_U);
     FileOutputFormat.setCompressOutput(job, true);
     FileOutputFormat.setOutputCompressorClass(job, DefaultCodec.class);

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/UpperTriangular.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/UpperTriangular.java?rev=1213842&r1=1213841&r2=1213842&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/UpperTriangular.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/UpperTriangular.java Tue Dec 13 18:36:05 2011
@@ -121,8 +121,10 @@ public class UpperTriangular extends Abs
   }
 
   private int getL(int row, int col) {
-    // each row starts with some zero elements that we don't store.
-    // this accumulates an offset of (row+1)*row/2
+    /*
+     * Each row starts with some zero elements that we don't store; this
+     * accumulates to an offset of (row+1)*row/2.
+     */
     return col + row * numCols() - (row + 1) * row / 2;
   }
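
A quick worked example of that packed offset, for readers checking the formula: in an n x n upper-triangular matrix stored row-major without the leading zeros, r*n - r*(r+1)/2 elements precede row r, so element (row, col) with col >= row lands at col + row*n - (row+1)*row/2. With n = 4, element (2, 3) maps to 3 + 2*4 - 3 = 8: rows 0 and 1 contribute 4 + 3 = 7 stored entries, (2, 2) occupies index 7, and (2, 3) lands at index 8.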
 

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/VJob.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/VJob.java?rev=1213842&r1=1213841&r2=1213842&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/VJob.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/VJob.java Tue Dec 13 18:36:05 2011
@@ -37,11 +37,6 @@ import org.apache.mahout.math.Matrix;
 import org.apache.mahout.math.Vector;
 import org.apache.mahout.math.VectorWritable;
 
-/**
- * Computes U=Q*Uhat of SSVD
- * 
- * 
- */
 public class VJob {
   private static final String OUTPUT_V = "v";
   private static final String PROP_UHAT_PATH = "ssvd.uhat.path";

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/YtYJob.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/YtYJob.java?rev=1213842&r1=1213841&r2=1213842&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/YtYJob.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/YtYJob.java Tue Dec 13 18:36:05 2011
@@ -59,12 +59,13 @@ public class YtYJob {
     private Omega omega;
     private UpperTriangular mYtY;
 
-    // we keep yRow in a dense form here
-    // but keep an eye not to dense up while doing YtY products.
-    // I am not sure that sparse vector would create much performance
-    // benefits since we must to assume that y would be more often
-    // dense than sparse, so for bulk dense operations that would perform
-    // somewhat better than a RandomAccessSparse vector frequent updates.
+    /*
+     * We keep yRow in a dense form here, but take care not to densify things
+     * while doing YtY products. I am not sure a sparse vector would yield much
+     * of a performance benefit, since we must assume that y would more often
+     * be dense than sparse, so bulk dense operations should perform somewhat
+     * better than frequent updates of a RandomAccessSparse vector.
+     */
     private Vector yRow;
 
     @Override
@@ -109,9 +110,11 @@ public class YtYJob {
           }
         }
       } else {
-        // the disadvantage of using sparse vector (aside from the fact that we
-        // are creating some short-lived references) here is that we obviously
-        // do two times more iterations then necessary if y row is pretty dense.
+        /*
+         * The disadvantage of using a sparse vector here (aside from creating
+         * some short-lived references) is that we obviously do twice as many
+         * iterations as necessary if the y row is pretty dense.
+         */
         for (Iterator<Vector.Element> iterI = yRow.iterateNonZero(); iterI
             .hasNext();) {
           Vector.Element eli = iterI.next();
@@ -195,11 +198,12 @@ public class YtYJob {
     job.getConfiguration().setInt(PROP_K, k);
     job.getConfiguration().setInt(PROP_P, p);
 
-    // we must reduce to just one matrix which means
-    // we need only one reducer.
-    // But it's ok since each mapper outputs only one
-    // vector (a packed UpperTriangular) so even if
-    // there're thousands of mappers, one reducer should cope just fine.
+    /*
+     * We must reduce to just one matrix, which means we need only one reducer.
+     * But that is ok, since each mapper outputs only one vector (a packed
+     * UpperTriangular), so even with thousands of mappers one reducer should
+     * cope just fine.
+     */
     job.setNumReduceTasks(1);
 
     job.submit();

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/qr/GivensThinSolver.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/qr/GivensThinSolver.java?rev=1213842&r1=1213841&r2=1213842&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/qr/GivensThinSolver.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/qr/GivensThinSolver.java Tue Dec 13 18:36:05 2011
@@ -34,14 +34,8 @@ import org.apache.mahout.math.hadoop.sto
  */
 public class GivensThinSolver {
 
-  // private static final double s_epsilon = 1e-10;
-
-  // private double[][] m_rTilde;
-  // private Vector m_aRowV;
   private double[] vARow;
   private double[] vQtRow;
-  // private UpperTriangular m_rTilde;
-  // private TriangularRowView m_rTildeRowView, m_rTildeRowView2;
   private final double[][] mQt;
   private final double[][] mR;
   private int qtStartRow;
@@ -62,7 +56,6 @@ public class GivensThinSolver {
     mQt = new double[n][];
     mR = new double[n][];
     vARow = new double[n];
-    // m_aRowV = new DenseVector(m_aRow, true);
     vQtRow = new double[m];
 
     for (int i = 0; i < n; i++) {
@@ -109,7 +102,8 @@ public class GivensThinSolver {
 
   public void adjust(int newM) {
     if (newM == m) {
-      return; // no adjustment is required.
+      // no adjustment is required.
+      return; 
     }
     if (newM < n) {
       throw new IllegalArgumentException("new m can't be less than n");
@@ -153,10 +147,11 @@ public class GivensThinSolver {
       throw new IllegalStateException("thin QR solver fed more rows than initialized for");
     }
     try {
-      // moving pointers around is inefficient but
-      // for the sanity's sake i am keeping it this way so i don't
-      // have to guess how R-tilde index maps to actual block index
-
+      /*
+       * moving pointers around is inefficient, but for sanity's sake I am
+       * keeping it this way so I don't have to guess how the R-tilde index
+       * maps to the actual block index
+       */
       Arrays.fill(vQtRow, 0);
       vQtRow[m - cnt - 1] = 1;
       int height = cnt > n ? n : cnt;
@@ -175,25 +170,16 @@ public class GivensThinSolver {
         applyGivensInPlace(cs[0], cs[1], getQtRow(i - 1), getQtRow(i), 0,
             m);
       }
-      // push qt and r-tilde 1 row down
-      // just sqp the references to reduce GC churning
+      /*
+       * push Qt and R-tilde down one row; just swap the references to reduce
+       * GC churning
+       */
       pushQtDown();
       double[] swap = getQtRow(0);
       setQtRow(0, vQtRow);
       vQtRow = swap;
 
-      // triangular push -- obviously, less efficient than
-      // this is terribly inefficient. for each row we are basically
-      // moving ~ 2-4Mb of memory around.
-      // for (int i = m_n - 1; i > 0; i--) {
-      // // copy (i-1)th row into i-th row ignoring main diagonal item
-      // // which must be 0 now
-      // assert m_rTilde.getQuick(i - 1, i - 1) <= s_epsilon;
-      // for (int j = i; j < m_n; j++)
-      // m_rTilde.setQuick(i, j, m_rTilde.getQuick(i - 1, j));
-      // }
-      // for (int i = 0; i < m_n; i++)
-      // m_rTilde.setQuick(0, i, m_aRow[i]);
       pushRDown();
       swap = getRRow(0);
       setRRow(0, vARow);
@@ -230,8 +216,10 @@ public class GivensThinSolver {
     rStartRow = rStartRow == 0 ? n - 1 : rStartRow - 1;
   }
 
-  // warning: both of these return actually n+1 rows with the last one being
-  // not interesting.
+  /*
+   * warning: both of these actually return n+1 rows, with the last one not
+   * being interesting.
+   */
   public UpperTriangular getRTilde() {
     UpperTriangular packedR = new UpperTriangular(n);
     for (int i = 0; i < n; i++) {
@@ -242,9 +230,12 @@ public class GivensThinSolver {
 
   public double[][] getThinQtTilde() {
     if (qtStartRow != 0) {
-      // rotate qt rows into place
-      double[][] qt = new double[n][]; // double[~500][], once per block, not
-                                         // a big deal.
+      /*
+       * rotate qt rows into place; allocating double[~500][] once per block is
+       * not a big deal.
+       */
+      double[][] qt = new double[n][];
       System.arraycopy(mQt, qtStartRow, qt, 0, n - qtStartRow);
       System.arraycopy(mQt, 0, qt, n - qtStartRow, qtStartRow);
       return qt;
@@ -400,8 +391,10 @@ public class GivensThinSolver {
     assert qt2[0].length == r;
     double[] cs = new double[2];
 
-    // pairwise givens(a,b) so that a come off main diagonal in r1
-    // and bs come off u-th upper subdiagonal in r2.
+    /*
+     * pairwise givens(a,b) so that the a's come off the main diagonal of r1
+     * and the b's come off the u-th upper subdiagonal of r2.
+     */
     for (int v = 0; v < kp; v++) {
       for (int u = v; u < kp; u++) {
         givens(r1[u][u], r2[u - v][u], cs);

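The rotations applied above follow the usual thin-QR pattern: a Givens rotation is chosen to zero one entry and is then applied to whole rows of both Qt and R-tilde. Below is a self-contained textbook sketch; the method names mirror the ones in this file, but the solver's exact signatures and sign conventions are not claimed here.

    /* Sketch only, assuming textbook conventions. */
    final class GivensSketch {

      /* compute c, s so that [c s; -s c] applied to (a, b) gives (r, 0) */
      static void givens(double a, double b, double[] cs) {
        if (b == 0.0) {
          cs[0] = 1.0;
          cs[1] = 0.0;
          return;
        }
        double r = Math.hypot(a, b); // sqrt(a^2 + b^2) without overflow
        cs[0] = a / r;               // cosine
        cs[1] = b / r;               // sine
      }

      /* rotate two row vectors in place over columns [offset, offset + len) */
      static void applyGivensInPlace(double c, double s,
                                     double[] row1, double[] row2,
                                     int offset, int len) {
        for (int j = offset; j < offset + len; j++) {
          double v1 = row1[j];
          double v2 = row2[j];
          row1[j] = c * v1 + s * v2;
          row2[j] = -s * v1 + c * v2;
        }
      }
    }

In the mergeR-style loop at the end of the hunk above, sweeping v over the columns and zeroing r2[u - v][u] against the diagonal entries of r1 eliminates r2 one diagonal at a time, leaving the merged upper-triangular factor in r1.
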
Copied: mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/qr/GramSchmidt.java (from r1213676, mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/qr/GrammSchmidt.java)
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/qr/GramSchmidt.java?p2=mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/qr/GramSchmidt.java&p1=mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/qr/GrammSchmidt.java&r1=1213676&r2=1213842&rev=1213842&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/qr/GrammSchmidt.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/qr/GramSchmidt.java Tue Dec 13 18:36:05 2011
@@ -21,13 +21,13 @@ import org.apache.mahout.math.Vector;
 import org.apache.mahout.math.function.DoubleFunction;
 
 /**
- * Gramm Schmidt quick helper.
+ * Gram Schmidt quick helper.
  * 
  * 
  */
-public class GrammSchmidt {
+public class GramSchmidt {
 
-  private GrammSchmidt() {
+  private GramSchmidt() {
   }
 
   public static void orthonormalizeColumns(Matrix mx) {

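The renamed helper orthonormalizes matrix columns. Purely as an illustration of what orthonormalizeColumns does conceptually, here is a modified Gram-Schmidt pass over a plain double[][]; the real helper operates on Mahout's Matrix and Vector types, and its exact loop structure may differ.

    /* Sketch only: modified Gram-Schmidt on the columns of mx. */
    final class GramSchmidtSketch {

      /* assumes full column rank (no zero columns) */
      static void orthonormalizeColumns(double[][] mx) {
        int rows = mx.length;
        int cols = mx[0].length;
        for (int j = 0; j < cols; j++) {
          // normalize column j
          double norm = 0;
          for (int i = 0; i < rows; i++) {
            norm += mx[i][j] * mx[i][j];
          }
          norm = Math.sqrt(norm);
          for (int i = 0; i < rows; i++) {
            mx[i][j] /= norm;
          }
          // remove the component along column j from every later column
          for (int k = j + 1; k < cols; k++) {
            double dot = 0;
            for (int i = 0; i < rows; i++) {
              dot += mx[i][j] * mx[i][k];
            }
            for (int i = 0; i < rows; i++) {
              mx[i][k] -= dot * mx[i][j];
            }
          }
        }
      }
    }

The modified variant subtracts each normalized column from the later columns immediately, which is numerically more stable than classical Gram-Schmidt.
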
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/qr/QRFirstStep.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/qr/QRFirstStep.java?rev=1213842&r1=1213841&r2=1213842&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/qr/QRFirstStep.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/qr/QRFirstStep.java Tue Dec 13 18:36:05 2011
@@ -22,7 +22,6 @@ import java.io.IOException;
 import java.util.Arrays;
 import java.util.Deque;
 import java.util.Iterator;
-import java.util.LinkedList;
 import java.util.List;
 
 import org.apache.hadoop.conf.Configuration;
@@ -67,7 +66,7 @@ public class QRFirstStep implements Clos
   private final DenseBlockWritable value = new DenseBlockWritable();
   private final Writable tempKey = new IntWritable();
   private MultipleOutputs outputs;
-  private final Deque<Closeable> closeables = new LinkedList<Closeable>();
+  private final Deque<Closeable> closeables = Lists.<Closeable>newLinkedList();
   private SequenceFile.Writer tempQw;
   private Path tempQPath;
   private final List<UpperTriangular> rSubseq = Lists.newArrayList();
@@ -103,12 +102,12 @@ public class QRFirstStep implements Clos
     value.setBlock(qt);
     getTempQw().append(tempKey, value);
 
-    // this probably should be
-    // a sparse row matrix,
-    // but compressor should get it for disk and in memory we want it
-    // dense anyway, sparse random implementations would be
-    // a mostly a memory management disaster consisting of rehashes and GC
-    // thrashing. (IMHO)
+    /*
+     * this probably should be a sparse row matrix, but the compressor should
+     * take care of it on disk, and in memory we want it dense anyway; sparse
+     * random implementations would mostly be a memory management disaster
+     * consisting of rehashes and GC thrashing. (IMHO)
+     */
     value.setBlock(null);
     qSolver.reset();
   }
@@ -116,10 +115,11 @@ public class QRFirstStep implements Clos
   // second pass to run a modified version of computeQHatSequence.
   private void flushQBlocks() throws IOException {
     if (blockCnt == 1) {
-      // only one block, no temp file, no second pass. should be the default
-      // mode
-      // for efficiency in most cases. Sure mapper should be able to load
-      // the entire split in memory -- and we don't require even that.
+      /*
+       * only one block, no temp file, no second pass. This should be the
+       * default mode for efficiency in most cases. Surely the mapper should be
+       * able to load the entire split in memory -- and we don't even require
+       * that.
+       */
       value.setBlock(qSolver.getThinQtTilde());
       outputQHat(value);
       outputR(new VectorWritable(new DenseVector(qSolver.getRTilde().getData(),
@@ -152,8 +152,10 @@ public class QRFirstStep implements Clos
                                                 new CopyConstructorIterator<UpperTriangular>(rSubseq
                                                   .iterator())));
       if (qCnt == 1) {
-        // just merge r[0] <- r[1] so it doesn't have to repeat
-        // in subsequent computeQHat iterators
+        /*
+         * just merge r[0] <- r[1] so it doesn't have to repeat in subsequent
+         * computeQHat iterators
+         */
         GivensThinSolver.mergeR(rSubseq.get(0), rSubseq.remove(1));
       } else {
         qCnt++;
@@ -251,12 +253,12 @@ public class QRFirstStep implements Clos
 
   private SequenceFile.Writer getTempQw() throws IOException {
     if (tempQw == null) {
-      // temporary Q output
-      // hopefully will not exceed size of IO cache in which case it is only
-      // good since it
-      // is going to be maanged by kernel, not java GC. And if IO cache is not
-      // good enough,
-      // then at least it is always sequential.
+      /*
+       * the temporary Q output will hopefully not exceed the size of the IO
+       * cache, in which case it is all good, since it is going to be managed by
+       * the kernel, not the java GC. And if the IO cache is not good enough,
+       * then at least the access is always sequential.
+       */
       String taskTmpDir = System.getProperty("java.io.tmpdir");
 
       FileSystem localFs = FileSystem.getLocal(jobConf);

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/qr/QRLastStep.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/qr/QRLastStep.java?rev=1213842&r1=1213841&r2=1213842&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/qr/QRLastStep.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/qr/QRLastStep.java Tue Dec 13 18:36:05 2011
@@ -65,8 +65,10 @@ public class QRLastStep implements Close
                     int blockNum) {
     this.blockNum = blockNum;
     this.qHatInput = qHatInput;
-    // in this implementation we actually preload all Rs into memory to make R
-    // sequence modifications more efficient.
+    /*
+     * in this implementation we actually preload all Rs into memory to make R
+     * sequence modifications more efficient.
+     */
     int block = 0;
     while (rHatInput.hasNext()) {
       Vector value = rHatInput.next().get();
@@ -119,7 +121,10 @@ public class QRLastStep implements Close
       throw new NoSuchElementException();
     }
     Validate.isTrue(hasNext(), "Q input overrun");
-    int qRowIndex = r - cnt - 1; // because QHats are initially stored in
+    /*
+     * because Q blocks are initially stored in inverse order
+     */
+    int qRowIndex = r - cnt - 1; 
     for (int j = 0; j < kp; j++) {
       qRow.setQuick(j, mQt[j][qRowIndex]);
     }
@@ -135,7 +140,6 @@ public class QRLastStep implements Close
   @Override
   public void close() throws IOException {
     mQt = null;
-
     mRs.clear();
   }
 

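To make the qRowIndex = r - cnt - 1 mapping above concrete: for a stored block of r = 3 Qt rows, cnt = 0, 1, 2 yields indices 2, 1, 0, i.e. rows are consumed from the last stored row back to the first, matching the "inverse order" the comment refers to.
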
Modified: mahout/trunk/core/src/test/java/org/apache/mahout/math/hadoop/stochasticsvd/LocalSSVDSolverDenseTest.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/math/hadoop/stochasticsvd/LocalSSVDSolverDenseTest.java?rev=1213842&r1=1213841&r2=1213842&view=diff
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/math/hadoop/stochasticsvd/LocalSSVDSolverDenseTest.java (original)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/math/hadoop/stochasticsvd/LocalSSVDSolverDenseTest.java Tue Dec 13 18:36:05 2011
@@ -48,17 +48,20 @@ public class LocalSSVDSolverDenseTest ex
   private static final double s_precisionPct = 10;
 
   @Test
-  public void testSSVDSolverDense() throws IOException { 
+  public void testSSVDSolverDense() throws IOException {
     runSSVDSolver(0);
   }
-  
+
   @Test
-  public void testSSVDSolverPowerIterations1() throws IOException { 
+  public void testSSVDSolverPowerIterations1() throws IOException {
     runSSVDSolver(1);
   }
 
-  @Test
-  public void testSSVDSolverPowerIterations2() throws IOException { 
+  /*
+   * removed from active tests to save time.
+   */
+  /* @Test */
+  public void testSSVDSolverPowerIterations2() throws IOException {
     runSSVDSolver(2);
   }
 
@@ -71,9 +74,6 @@ public class LocalSSVDSolverDenseTest ex
     // conf.set("mapred.job.tracker","localhost:11011");
     // conf.set("fs.default.name","hdfs://localhost:11010/");
 
-    // Deque<Closeable> closeables = new LinkedList<Closeable>();
-    // Random rnd = RandomUtils.getRandom();
-
     File tmpDir = getTestTempDir("svdtmp");
     conf.set("hadoop.tmp.dir", tmpDir.getAbsolutePath());
 
@@ -88,7 +88,7 @@ public class LocalSSVDSolverDenseTest ex
     // make input equivalent to 2 mln non-zero elements.
     // With 100mln the precision turns out to be only better (LLN law i guess)
     // With oversampling of 100, i don't get any error at all.
-    int n = 1000;
+    int n = 100;
     int m = 2000;
     Vector singularValues =
       new DenseVector(new double[] { 10, 4, 1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1,
@@ -121,12 +121,15 @@ public class LocalSSVDSolverDenseTest ex
                      new Path[] { aPath },
                      svdOutPath,
                      ablockRows,
-                     500,
                      k,
                      p,
                      3);
-    // ssvd.setcUHalfSigma(true);
-    // ssvd.setcVHalfSigma(true);
+    /*
+     * these are only tiny test values to simulate high-load cases; in reality
+     * one needs much bigger ones.
+     */
+    ssvd.setOuterBlockHeight(500);
+    ssvd.setAbtBlockHeight(400);
     ssvd.setOverwrite(true);
     ssvd.setQ(q);
     ssvd.run();
@@ -164,30 +167,29 @@ public class LocalSSVDSolverDenseTest ex
 
     for (int i = 0; i < k; i++) {
       assertTrue(Math.abs((singularValues.getQuick(i) - stochasticSValues[i])
-            / singularValues.getQuick(i)) <= s_precisionPct / 100);
+          / singularValues.getQuick(i)) <= s_precisionPct / 100);
     }
 
     double[][] mQ =
       SSVDSolver.loadDistributedRowMatrix(fs, new Path(svdOutPath, "Bt-job/"
           + BtJob.OUTPUT_Q + "-*"), conf);
 
-    SSVDPrototypeTest
-      .assertOrthonormality(new DenseMatrix(mQ), false, s_epsilon);
+    SSVDPrototypeTest.assertOrthonormality(new DenseMatrix(mQ),
+                                           false,
+                                           s_epsilon);
 
     double[][] u =
       SSVDSolver.loadDistributedRowMatrix(fs,
                                           new Path(svdOutPath, "U/[^_]*"),
                                           conf);
 
-    SSVDPrototypeTest
-      .assertOrthonormality(new DenseMatrix(u), false, s_epsilon);
+    SSVDPrototypeTest.assertOrthonormality(new DenseMatrix(u), false, s_epsilon);
     double[][] v =
       SSVDSolver.loadDistributedRowMatrix(fs,
                                           new Path(svdOutPath, "V/[^_]*"),
                                           conf);
 
-    SSVDPrototypeTest
-      .assertOrthonormality(new DenseMatrix(v), false, s_epsilon);
+    SSVDPrototypeTest.assertOrthonormality(new DenseMatrix(v), false, s_epsilon);
   }
 
   static void dumpSv(double[] s) {

Modified: mahout/trunk/core/src/test/java/org/apache/mahout/math/hadoop/stochasticsvd/LocalSSVDSolverSparseSequentialTest.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/math/hadoop/stochasticsvd/LocalSSVDSolverSparseSequentialTest.java?rev=1213842&r1=1213841&r2=1213842&view=diff
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/math/hadoop/stochasticsvd/LocalSSVDSolverSparseSequentialTest.java (original)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/math/hadoop/stochasticsvd/LocalSSVDSolverSparseSequentialTest.java Tue Dec 13 18:36:05 2011
@@ -54,7 +54,10 @@ public class LocalSSVDSolverSparseSequen
 
   private static final double s_epsilon = 1.0E-10d;
 
-  @Test
+  /*
+   * removed from active tests to reduce test running time.
+   */
+  /* @Test */
   public void testSSVDSolverSparse() throws IOException { 
     runSSVDSolver(0);
   }
@@ -93,7 +96,7 @@ public class LocalSSVDSolverSparseSequen
     closeables.addFirst(w);
 
     int n = 100;
-    int m = 20000;
+    int m = 2000;
     double percent = 5;
 
     VectorWritable vw = new VectorWritable();
@@ -134,12 +137,19 @@ public class LocalSSVDSolverSparseSequen
                      new Path[] { aPath },
                      svdOutPath,
                      ablockRows,
-                     500,
                      k,
                      p,
                      3);
-    // ssvd.setcUHalfSigma(true);
-    // ssvd.setcVHalfSigma(true);
+    ssvd.setOuterBlockHeight(500);
+    ssvd.setAbtBlockHeight(251);
+    
+    /*
+     * removing U and V jobs from this test to reduce running time. I will keep
+     * them in the dense test, though.
+     */
+    ssvd.setComputeU(false);
+    ssvd.setComputeV(false);
+    
     ssvd.setOverwrite(true);
     ssvd.setQ(q);
     ssvd.run();
@@ -171,6 +181,12 @@ public class LocalSSVDSolverSparseSequen
     SSVDPrototypeTest
       .assertOrthonormality(new DenseMatrix(mQ), false, s_epsilon);
 
+    /*
+     * removing tests on U and V to keep this test leaner. I will keep the U, V
+     * computation and assertions in the dense tests, though.
+     */
+
+    /*
     double[][] u =
       SSVDSolver.loadDistributedRowMatrix(fs,
                                           new Path(svdOutPath, "U/[^_]*"),
@@ -185,6 +201,7 @@ public class LocalSSVDSolverSparseSequen
 
     SSVDPrototypeTest
       .assertOrthonormality(new DenseMatrix(v), false, s_epsilon);
+    */
   }
 
   static void dumpSv(double[] s) {

Modified: mahout/trunk/core/src/test/java/org/apache/mahout/math/hadoop/stochasticsvd/SSVDTestsHelper.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/math/hadoop/stochasticsvd/SSVDTestsHelper.java?rev=1213842&r1=1213841&r2=1213842&view=diff
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/math/hadoop/stochasticsvd/SSVDTestsHelper.java (original)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/math/hadoop/stochasticsvd/SSVDTestsHelper.java Tue Dec 13 18:36:05 2011
@@ -30,9 +30,10 @@ import org.apache.mahout.common.RandomUt
 import org.apache.mahout.math.DenseMatrix;
 import org.apache.mahout.math.DenseVector;
 import org.apache.mahout.math.Matrix;
+import org.apache.mahout.math.RandomAccessSparseVector;
 import org.apache.mahout.math.Vector;
 import org.apache.mahout.math.VectorWritable;
-import org.apache.mahout.math.hadoop.stochasticsvd.qr.GrammSchmidt;
+import org.apache.mahout.math.hadoop.stochasticsvd.qr.GramSchmidt;
 
 public class SSVDTestsHelper {
 
@@ -106,7 +107,7 @@ public class SSVDTestsHelper {
         result.setQuick(i, j, rnd.nextDouble() - 0.5);
       }
     }
-    GrammSchmidt.orthonormalizeColumns(result);
+    GramSchmidt.orthonormalizeColumns(result);
     SSVDPrototypeTest.assertOrthonormality(result, false, 1.0e-10);
     return result;
   }
@@ -118,11 +119,49 @@ public class SSVDTestsHelper {
     FileSystem dfs = FileSystem.getLocal(conf);
     Path outputDir=new Path("/tmp/DRM");
     dfs.mkdirs(outputDir);
-    for ( int i = 1; i <= 10; i++ ) {
-      generateDenseInput(new Path(outputDir,String.format("part-%05d",i)),dfs,
-                         new DenseVector ( new double[] {
-                             15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0.8,0.3,0.1,0.01
-                         }),1200,10000,(i-1)*1200);
+//    for ( int i = 1; i <= 10; i++ ) {
+//      generateDenseInput(new Path(outputDir,String.format("part-%05d",i)),dfs,
+//                         new DenseVector ( new double[] {
+//                             15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0.8,0.3,0.1,0.01
+//                         }),1200,10000,(i-1)*1200);
+//    }
+    
+    /*
+     * create a ~2GB sparse 4.5M x 4.5M input (similar to the wikipedia graph).
+     *
+     * In order to get to ~2GB, we need to generate ~40 non-zero items per row
+     * on average.
+     */
+    
+    outputDir = new Path("/tmp/DRM-sparse");
+    int n = 4500000;
+    int avgNZero = 40;
+    Random rnd = new Random();
+
+    SequenceFile.Writer w =
+      SequenceFile.createWriter(dfs,
+                                dfs.getConf(),
+                                new Path(outputDir, "sparse.seq"),
+                                IntWritable.class,
+                                VectorWritable.class);
+
+    try {
+
+      IntWritable iw = new IntWritable();
+      VectorWritable vw = new VectorWritable();
+      for (int i = 1; i < n; i++) {
+        RandomAccessSparseVector vector = new RandomAccessSparseVector(n);
+        double nz = Math.round(avgNZero * (rnd.nextGaussian() + 1));
+        if (nz < 0)
+          nz = 0;
+        for (int j = 1; j < nz; j++)
+          vector.set(rnd.nextInt(n), rnd.nextGaussian() * 25 + 3);
+        iw.set(i);
+        vw.set(vector);
+        w.append(iw, vw);
+      }
+    } finally {
+      w.close();
     }
     
   }

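A rough size check for the ~2GB target in the generator comment above: 4.5M rows x ~40 non-zeros per row is about 180M entries; at roughly 12 bytes per entry (a 4-byte column index plus an 8-byte double value, ignoring container overhead) that comes to about 2.2GB.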

