mahout-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From build...@apache.org
Subject svn commit: r900522 - in /websites/staging/mahout/trunk/content: ./ users/dim-reduction/ users/dim-reduction/ssvd.html
Date Sat, 08 Mar 2014 05:31:38 GMT
Author: buildbot
Date: Sat Mar  8 05:31:37 2014
New Revision: 900522

Log:
Staging update by buildbot for mahout

Added:
    websites/staging/mahout/trunk/content/users/dim-reduction/
    websites/staging/mahout/trunk/content/users/dim-reduction/ssvd.html
Modified:
    websites/staging/mahout/trunk/content/   (props changed)

Propchange: websites/staging/mahout/trunk/content/
------------------------------------------------------------------------------
--- cms:source-revision (original)
+++ cms:source-revision Sat Mar  8 05:31:37 2014
@@ -1 +1 @@
-1574904
+1575489

Added: websites/staging/mahout/trunk/content/users/dim-reduction/ssvd.html
==============================================================================
--- websites/staging/mahout/trunk/content/users/dim-reduction/ssvd.html (added)
+++ websites/staging/mahout/trunk/content/users/dim-reduction/ssvd.html Sat Mar  8 05:31:37
2014
@@ -0,0 +1,356 @@
+<!DOCTYPE html>
+<!--
+
+    Licensed to the Apache Software Foundation (ASF) under one or more
+    contributor license agreements.  See the NOTICE file distributed with
+    this work for additional information regarding copyright ownership.
+    The ASF licenses this file to You under the Apache License, Version 2.0
+    (the "License"); you may not use this file except in compliance with
+    the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+-->
+
+<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"><head><meta
http-equiv="Content-Type" content="text/html; charset=UTF-8">
+  <title>Apache Mahout: Scalable machine learning and data mining</title>
+  <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
+  <meta name="Distribution" content="Global">
+  <meta name="Robots" content="index,follow">
+  <meta name="keywords" content="apache, apache hadoop, apache lucene,
+        business data mining, cluster analysis,
+        collaborative filtering, data extraction, data filtering, data framework, data integration,
+        data matching, data mining, data mining algorithms, data mining analysis, data mining
data,
+        data mining introduction, data mining software,
+        data mining techniques, data representation, data set, datamining,
+        feature extraction, fuzzy k means, genetic algorithm, hadoop,
+        hierarchical clustering, high dimensional, introduction to data mining, kmeans,
+        knowledge discovery, learning approach, learning approaches, learning methods,
+        learning techniques, lucene, machine learning, machine translation, mahout apache,
+        mahout taste, map reduce hadoop, mining data, mining methods, naive bayes,
+        natural language processing,
+        supervised, text mining, time series data, unsupervised, web data mining">
+  <link rel="shortcut icon" type="image/x-icon" href="http://mahout.apache.org/images/favicon.ico">
+  <script type="text/javascript" src="/js/prototype.js"></script>
+  <script type="text/javascript" src="/js/effects.js"></script>
+  <script type="text/javascript" src="/js/search.js"></script>
+  <script type="text/javascript" src="/js/slides.js"></script>
+
+  <link href="/css/bootstrap.min.css" rel="stylesheet" media="screen">
+  <link href="/css/bootstrap-responsive.css" rel="stylesheet">
+  <link rel="stylesheet" href="/css/global.css" type="text/css">
+</head>
+
+<body id="home" data-twttr-rendered="true">
+  <div id="wrap">
+   <div id="header">
+    <div id="logo"><a href="/overview.html"></a></div>
+  <div id="search">
+    <script type="text/javascript">
+      function getBlank(input, stdValue) {
+      if (input.value == stdValue) {
+      input.value = '';
+      }
+      return true;
+      }
+
+      function selectProvider(form) {
+      provider = form.elements['searchProvider'].value;
+      if (provider == "any") {
+      if (Math.random() > 0.5) {
+      provider = "lucid";
+      } else {
+      provider = "sl";
+      }
+      }
+
+      if (provider == "lucid") {
+      form.action = "http://search.lucidimagination.com/p:mahout";
+      } else if (provider == "sl") {
+      form.action = "http://search-lucene.com/mahout";
+      }
+
+      days = 90; // cookie will be valid for 90 days
+      date = new Date();
+      date.setTime(date.getTime() + (days * 24 * 60 * 60 * 1000));
+      expires = "; expires=" + date.toGMTString();
+      document.cookie = "searchProvider=" + provider + expires + "; path=/";
+      return true;
+      }
+    </script>
+    <form id="quick-search" method="GET" onsubmit="return selectProvider(this)" action="http://search-lucene.com/mahout"
name="searchform">
+      <fieldset>
+        <input type="search" name="q" value="Search with Apache Solr..." class="class1
class2 hint" accesskey="q" onfocus="getBlank(this, &#39;Search with Apache Solr...&#39;)">
+        <span style="color:white">@</span>
+        <select name="searchProvider" id="searchProvider">
+          <option value="any">select provider</option>
+          <option value="lucid">Lucid Find</option>
+          <option value="sl">Search-Lucene</option>
+        </select>
+      </fieldset>
+      <script type="text/javascript">
+        if (document.cookie.length>0) {
+        cStart=document.cookie.indexOf("searchProvider=");
+        if (cStart!=-1) {
+        cStart=cStart + "searchProvider=".length;
+        cEnd=document.cookie.indexOf(";", cStart);
+        if (cEnd==-1) {
+        cEnd=document.cookie.length;
+        }
+        provider = unescape(document.cookie.substring(cStart,cEnd));
+        document.forms['searchform'].elements['searchProvider'].value = provider;
+        }
+        }
+      </script>
+    </form>
+  </div>
+
+    <div class="navbar navbar-inverse" style="position:absolute;padding-right:5px;padding-left:5px;top:84px;right:0">
+      <div class="navbar-inner">
+        <div class="container">
+          <button type="button" class="btn btn-navbar" data-toggle="collapse" data-target=".nav-collapse">
+            <span class="icon-bar"></span>
+            <span class="icon-bar"></span>
+            <span class="icon-bar"></span>
+          </button>
+          <!-- <a class="brand" href="#">Apache Community Development Project</a>
-->
+          <div class="nav-collapse collapse">
+            <ul class="nav">
+              <li><a href="/">Home</a></li>
+              <li class="dropdown"> <a href="#" class="dropdown-toggle" data-toggle="dropdown">General<b
class="caret"></b></a>
+                <ul class="dropdown-menu">
+                  <li><a href="/general/downloads.html">Downloads</a>
+                  <li><a href="/general/who-we-are.html">Who we are</a>
+                  <li><a href="/general/mailing-lists,-irc-and-archives.html">Contact</a>

+                  <li><a href="/general/books-tutorials-and-talks.html">Books,
Tutorials, Talks</a></li>
+                  <li><a href="/general/powered-by-mahout.html">Powered By Mahout</a>
+                  <li><a href="/general/professional-support.html">Professional
Support</a>
+                  <li><a href="https://cwiki.apache.org/confluence/display/MAHOUT/Mahout+Wiki">Mahout
Wiki</a>
+                  <li><a href="/users/basics/collections.html">Collection of
data sets</a>
+                  <li class="divider"></li>
+                  <li class="nav-header">Resources</li>
+                  <li><a href="/general/reference-reading.html">Reference Reading</a>
+		  <li><a href="/general/faq.html">FAQ</a>
+		  <li class="divider"></li>
+		  <li class="nav-header">Legal</li>
+		  <li><a href="http://www.apache.org/licenses/">License</a></li>
+		  <li><a href="http://www.apache.org/security/">Security</a></li>
+                  <li><a href="/general/privacy-policy.html">Privacy Policy</a>
+                </ul>
+              </li>
+              <li class="dropdown"> <a href="#" class="dropdown-toggle" data-toggle="dropdown">Developers<b
class="caret"></b></a>
+                <ul class="dropdown-menu">
+                  <li><a href="/developers/developer-resources.html">Developer
resources</a></li>
+                  <li><a href="/developers/version-control.html">Version control</a></li>
+                  <li><a href="/developers/buildingmahout.html">Build from source</a></li>
+                  <li><a href="/developers/issue-tracker.html">Issue tracker</a></li>
+      		  <li><a href="https://builds.apache.org/job/Mahout-Quality/" target="_blank">Code
quality reports</a></li>
+                  <li class="divider"></li>
+                  <li class="nav-header">Contributions</li>
+                  <li><a href="/developers/how-to-contribute.html">How to contribute</a></li>
+                  <li><a href="/developers/how-to-become-a-committer.html">How
to become a committer</a></li>
+                  <li><a href="/developers/gsoc.html">GSoC</a></li>
+                  <li class="divider"></li>
+                  <li class="nav-header">For committers</li>
+                  <li><a href="/developers/how-to-update-the-website.html">How
to update the website</a></li>
+                  <li><a href="/developers/patch-check-list.html">Patch check
list</a></li>
+                  <li><a href="/developers/how-to-release.html">How to release</a></li>
+                  <li><a href="/developers/thirdparty-dependencies.html">Third
party dependencies</a></li>
+                </ul>
+               </li>
+               <li class="dropdown"> <a href="#" class="dropdown-toggle" data-toggle="dropdown">Basics<b
class="caret"></b></a>
+                 <ul class="dropdown-menu">
+                  <li><a href="/users/basics/algorithms.html">List of algorithms</a>
+                  <li><a href="/users/basics/quickstart.html">Quickstart</a>
+                  <li><a href="/users/basics/system-requirements.html">System
requirements</a>
+                  <li class="divider"></li>
+                  <li class="nav-header">Collections</li>
+ 
+                  <li><a href="/users/basics/mahout-collections.html">Mahout
collections</a>
+                  <li class="divider"></li>
+                  <li class="nav-header">Creating vectors</li>
+                  <li><a href="/users/basics/creating-vectors.html">General vector
creation</a>
+                  <li><a href="/users/basics/creating-vectors-from-text.html">Creating
vectors from text</a>
+                  <li><a href="/users/basics/tf-idf---term-frequency-inverse-document-frequency.html">TF-IDF</a>
+                  <li class="divider"></li>
+                  <li class="nav-header">Other</li>
+                  <li><a href="/users/basics/collocations.html">Co-Locations</a>
+                  <li><a href="/users/basics/dimensional-reduction.html">Dimensional
reduction</a>
+                  <li><a href="/users/basics/principal-components-analysis.html">Principal
components analysis</a>
+                   <li><a href="/users/basics/svd---singular-value-decomposition.html">SVD</a>
+                </ul>
+                 </li>
+              <li class="dropdown"> <a href="#" class="dropdown-toggle" data-toggle="dropdown">Classification<b
class="caret"></b></a>
+                <ul class="dropdown-menu">
+                <li class="divider"></li>
+                <li class="nav-header">Design and Background</li>
+                <li><a href="/users/classification/classifyingyourdata.html">Classifying
data</a></li>
+		<li><a href="/users/classification/bayesian.html">Bayes design doc</a></li>
+                <li><a href="/users/classification/naivebayes.html">Design naive
bayes</a></li>
+                <li><a href="http://people.csail.mit.edu/jrennie/papers/icml03-nb.pdf">Design
complementary bayes</a></li>
+		<li><a href="/users/classification/class-discovery.html">Class discovery</a></li>
+                <li><a href="/users/stuff/hidden-markov-models.html">HMM</a></li>
+	        <li><a href="/users/classification/random-forests.html">Design random
forests</a></li>
+                <li><a href="/users/classification/logistic-regression.html">Design
logistic regression</a></li>
+                <li><a href="/users/stuff/partial-implementation.html">Partial
Impl</a></li>
+                <li class="divider"></li>
+                <li class="nav-header">Commandline usage</li>
+                <li><a href="/users/classification/bayesian-commandline.html">Bayes
command line</a></li>
+                <li class="divider"></li>
+                <li class="nav-header">Examples</li>
+                <li><a href="/users/classification/wikipedia-bayes-example.html">Wikipedia
Example</a></li>
+                <li><a href="/users/clustering/20newsgroups.html">20 newsgroups
example</a></li>
+                <li><a href="/users/clustering/twenty-newsgroups.html">20 newsgroups
example</a></li>
+                <li><a href="/users/classification/breiman-example.html">Breiman
example</a></li>
+                </ul></li>
+               <li class="dropdown"> <a href="#" class="dropdown-toggle" data-toggle="dropdown">Clustering<b
class="caret"></b></a>
+                <ul class="dropdown-menu">
+                <li><a href="/users/clustering/clusteringyourdata.html">Overview</a></li>
+                <li><a href="/users/clustering/cluster-dumper.html">Cluster dumper
tool</a></li>
+                <li class="divider"></li>
+                <li class="nav-header">Algorithms</li>
+                <li><a href="/users/clustering/k-means-clustering.html">Design
k-Means clustering</a></li>
+                <li><a href="/users/clustering/canopy-clustering.html">Design
Canopy clustering</a></li>
+                <li><a href="/users/clustering/fuzzy-k-means.html">Design Fuzzy
k-Means</a></li>
+                <li><a href="/users/clustering/latent-dirichlet-allocation.html">Design
LDA</a></li>
+                <li><a href="/users/clustering/spectral-clustering.html">Design
Spectral</a></li>
+                <li class="divider"></li>
+                <li class="nav-header">Commandline usage</li>
+                <li><a href="/users/clustering/k-means-commandline.html">Commandline
k-Means</a></li>
+                <li><a href="/users/clustering/canopy-commandline.html">Commandline
Canopy clustering</a></li>
+		<li><a href="/users/clustering/fuzzy-k-means-commandline.html">Commandline
Fuzzy k-Means</a></li>
+                <li><a href="/users/clustering/lda-commandline.html">Commandline
LDA</a></li>
+                <li class="divider"></li>
+                <li class="nav-header">Examples</li>
+                <li><a href="/users/clustering/clustering-of-synthetic-control-data.html">Example:
Synthetic data</a></li>
+                <li><a href="/users/clustering/clustering-seinfeld-episodes.html">Example:
Seinfeld</a></li>
+                <li class="divider"></li>
+                <li class="nav-header">Post processing</li>
+                <li><a href="/users/clustering/viewing-result.html">Viewing results,
part a</a></li>
+                <li><a href="/users/clustering/viewing-results.html">Viewing
results, part b</a></li>
+                <li><a href="/users/clustering/visualizing-sample-clusters.html">Cluster
visualisation</a></li>
+                </ul></li>
+                <li class="dropdown"> <a href="#" class="dropdown-toggle" data-toggle="dropdown">Recommendations<b
class="caret"></b></a>
+                <ul class="dropdown-menu">
+                <li><a href="/users/recommender/recommender-first-timer-faq.html">First
Timer FAQ</a></li>
+	        <li><a href="/users/recommender/recommender-documentation.html">General</a></li>
+                </ul></li>
+           </ul>
+          </div><!--/.nav-collapse -->
+        </div>
+      </div>
+    </div>
+
+</div>
+
+ <div id="sidebar">
+  <div class="downloadNow">
+    <a href="http://mahout.apache.org/general/downloads.html">
+      <img alt="Download Apache Mahout" src="/images/download.png"></a>
+  </div>
+  <div id="sidebar-wrap">
+    <ul class="sidemenu">
+      <li>Latest release version: Mahout 0.9</li>
+      <li>Latest development version: Mahout 1.0-SNAPSHOT</li>
+    </ul>
+    <h2>Twitter</h2>
+	<ul class="sidemenu">
+		<li>
+<a class="twitter-timeline" href="https://twitter.com/ApacheMahout" data-widget-id="422861673444028416">Tweets
by @ApacheMahout</a>
+<script>/* Injects Twitter's widgets.js before the first script element, using http when the page URL starts with "http:" and https otherwise; skipped if an element with id "twitter-wjs" already exists. */!function(d,s,id){var js,fjs=d.getElementsByTagName(s)[0],p=/^http:/.test(d.location)?'http':'https';if(!d.getElementById(id)){js=d.createElement(s);js.id=id;js.src=p+"://platform.twitter.com/widgets.js";fjs.parentNode.insertBefore(js,fjs);}}(document,"script","twitter-wjs");</script>
+</li>
+	</ul>
+    <h2>Apache Software Foundation</h2>
+    <ul class="sidemenu">
+      <li><a href="http://www.apache.org/foundation/how-it-works.html">How the
ASF works</a></li>
+      <li><a href="http://www.apache.org/foundation/getinvolved.html">Get Involved</a></li>
+      <li><a href="http://www.apache.org/dev/">Developer Resources</a></li>
+      <li><a href="http://www.apache.org/foundation/sponsorship.html">Sponsorship</a></li>
+      <li><a href="http://www.apache.org/foundation/thanks.html">Thanks</a></li>
+    </ul>
+    <h2>Related Projects</h2>
+    <ul class="sidemenu">
+      <li><a href="http://lucene.apache.org/">Lucene</a></li>
+      <li><a href="http://hadoop.apache.org/">Hadoop</a></li>
+    </ul>
+  </div>
+</div>
+
+  <div id="content-wrap" class="clearfix">
+   <div id="main">
+    <h2 id="stochastic-singular-value-decomposition">Stochastic Singular Value Decomposition</h2>
+<p>The Stochastic SVD method in Mahout produces reduced-rank Singular Value Decomposition
output in its strict mathematical definition: A=USV'.</p>
+<p><strong>The benefits over other methods are:</strong></p>
+<ul>
+<li>Reduced flops required compared to Krylov subspace methods.</li>
+<li>In the map-reduce world, a fixed number of MR iterations is required regardless of the rank requested.</li>
+<li>Tweak the precision/speed balance with options.</li>
+<li>A is a Distributed Row Matrix where rows may be identified by any Writable (such as a document
path). As such, it would work directly on the output of seq2sparse.</li>
+<li>As of 0.7 trunk, includes a PCA and dimensionality reduction workflow (EXPERIMENTAL! Feedback
on performance/other PCA-related issues/blogs is greatly appreciated.)</li>
+</ul>
+<p>Map-reduce characteristics:
+SSVD uses at most 3 MR sequential steps (map-only + map-reduce + 2 optional parallel map-reduce
jobs) to produce a reduced-rank approximation of the U, V and S matrices. Additionally, two more
map-reduce steps are added for each power iteration step if requested.</p>
+<p><strong>Potential drawbacks:</strong></p>
+<p>Potentially less precise (but adding even one power iteration seems to fix that
quite a bit).</p>
+<p><strong>Documentation</strong></p>
+<p>Overview and Usage</p>
+<p>Note: Please use 0.6 or later! For the PCA workflow, please use 0.7 or later.</p>
+<p><strong>Publications</strong></p>
+<p><a href="http://amath.colorado.edu/faculty/martinss/Pubs/2012_halko_dissertation.pdf">Nathan
Halko's dissertation</a> "Randomized methods for computing low-rank
+approximations of matrices" contains comprehensive definition of parallelization strategy
taken in Mahout SSVD implementation and also some precision/scalability benchmarks, esp. w.r.t.
Mahout Lanczos implementation on a typical corpus data set.</p>
+<p><strong>R simulation</strong></p>
+<p>Non-parallel SSVD simulation in R with power iterations and PCA options. Note that
this implementation is not optimal for a sequential flow solver; it is for demonstration
purposes only.</p>
+<p>However, try this R code to simulate a meaningful input:</p>
+<div class="codehilite"><pre>   tests.R
+n<span class="o">&lt;-</span><span class="m">1000</span>
+m<span class="o">&lt;-</span><span class="m">2000</span>
+k<span class="o">&lt;-</span><span class="m">10</span>
+
+qi<span class="o">&lt;-</span><span class="m">1</span>
+
+<span class="c1">#simulated input</span>
+svalsim<span class="o">&lt;-</span>diag<span class="p">(</span>k:<span
class="m">1</span><span class="p">)</span>
+
+usim<span class="o">&lt;-</span> qr.Q<span class="p">(</span>qr<span
class="p">(</span>matrix<span class="p">(</span>rnorm<span class="p">(</span>m<span
class="o">*</span>k<span class="p">,</span> mean<span class="o">=</span><span
class="m">3</span><span class="p">),</span> nrow<span class="o">=</span>m<span
class="p">,</span>ncol<span class="o">=</span>k<span class="p">)))</span>
+vsim<span class="o">&lt;-</span> qr.Q<span class="p">(</span>qr<span
class="p">(</span> matrix<span class="p">(</span>rnorm<span class="p">(</span>n<span
class="o">*</span>k<span class="p">,</span>mean<span class="o">=</span><span
class="m">5</span><span class="p">),</span> nrow<span class="o">=</span>n<span
class="p">,</span>ncol<span class="o">=</span>k<span class="p">)))</span>
+
+
+x<span class="o">&lt;-</span> usim <span class="o">%*%</span>
svalsim <span class="o">%*%</span> t<span class="p">(</span>vsim<span
class="p">)</span>
+</pre></div>
+
+
+<p>and try to compare ssvd.svd(x) and stock svd(x) performance for the same rank k,
notice the difference in the running time. Also play with power iterations (qIter) and compare
accuracies of standard svd and SSVD.</p>
+<p>Note: numerical stability of R algorithms may differ from that of Mahout's distributed
version. We haven't studied accuracy of the R simulation. For study of accuracy of Mahout's
version, please refer to Nathan's dissertation as referenced above.</p>
+   </div>
+  </div>     
+</div> 
+  <footer class="footer" align="center">
+    <div class="container">
+      <p>
+        Copyright &copy; 2014 The Apache Software Foundation, Licensed under
+        the <a href="http://www.apache.org/licenses/LICENSE-2.0">Apache License, Version
2.0</a>.
+        <br />
+        Apache and the Apache feather logos are trademarks of The Apache Software Foundation.
+      </p>
+    </div>
+  </footer>
+  
+  <script src="/js/jquery-1.9.1.min.js"></script>
+  <script src="/js/bootstrap.min.js"></script>
+  <script>
+    (function() {
+      var cx = '012254517474945470291:vhsfv7eokdc';
+      var gcse = document.createElement('script');
+      gcse.type = 'text/javascript';
+      gcse.async = true;
+      gcse.src = (document.location.protocol == 'https:' ? 'https:' : 'http:') +
+          '//www.google.com/cse/cse.js?cx=' + cx;
+      var s = document.getElementsByTagName('script')[0];
+      s.parentNode.insertBefore(gcse, s);
+    })();
+  </script>
+</body>
+</html>



Mime
View raw message