mahout-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From sro...@apache.org
Subject svn commit: r1243022 [15/38] - in /mahout/site/new_website: ./ MAHOUT/ MAHOUT/2010/ MAHOUT/2010/09/ MAHOUT/2010/09/14/ MAHOUT/2011/ MAHOUT/2011/10/ MAHOUT/2011/10/21/ MAHOUT/books-tutorials-and-talks.data/ MAHOUT/books-tutorials-talks.data/ MAHOUT/book...
Date Sat, 11 Feb 2012 10:22:31 GMT
Added: mahout/site/new_website/MAHOUT/k-means-clustering.html
URL: http://svn.apache.org/viewvc/mahout/site/new_website/MAHOUT/k-means-clustering.html?rev=1243022&view=auto
==============================================================================
--- mahout/site/new_website/MAHOUT/k-means-clustering.html (added)
+++ mahout/site/new_website/MAHOUT/k-means-clustering.html Sat Feb 11 10:22:15 2012
@@ -0,0 +1,320 @@
+
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<HTML>
+  <HEAD>
+<LINK type="text/css" rel="stylesheet" href="https://cwiki.apache.org/confluence/display/MAHOUT/$stylebase/space.css">
+<LINK type="text/css" rel="stylesheet" href="https://cwiki.apache.org/confluence/display/MAHOUT/$stylebase/master.css">
+<LINK type="text/css" rel="stylesheet" href="https://cwiki.apache.org/confluence/display/MAHOUT/$stylebase/wiki-content.css">
+<LINK type="text/css" rel="stylesheet" href="https://cwiki.apache.org/confluence/display/MAHOUT/$stylebase/abs.css">
+<LINK type="text/css" rel="stylesheet" href="https://cwiki.apache.org/confluence/display/MAHOUT/$stylebase/menu.css">
+<LINK type="text/css" rel="stylesheet" href="https://cwiki.apache.org/confluence/display/MAHOUT/$stylebase/menu-ie.css">
+<LINK type="text/css" rel="stylesheet" href="https://cwiki.apache.org/confluence/display/MAHOUT/$stylebase/tables.css">
+<LINK type="text/css" rel="stylesheet" href="https://cwiki.apache.org/confluence/display/MAHOUT/$stylebase/panels.css">
+<LINK type="text/css" rel="stylesheet" href="https://cwiki.apache.org/confluence/display/MAHOUT/$stylebase/master-ie.css">
+<LINK type="text/css" rel="stylesheet" href="https://cwiki.apache.org/confluence/display/MAHOUT/$stylebase/renderer-macros.css">
+<LINK type="text/css" rel="stylesheet" href="https://cwiki.apache.org/confluence/display/MAHOUT/$stylebase/content-types.css">
+<LINK type="text/css" rel="stylesheet" href="https://cwiki.apache.org/confluence/display/MAHOUT/$stylebase/login.css">
+<LINK type="text/css" rel="stylesheet" href="https://cwiki.apache.org/confluence/display/MAHOUT/$stylebase/information-macros.css">
+<LINK type="text/css" rel="stylesheet" href="https://cwiki.apache.org/confluence/display/MAHOUT/$stylebase/layout-macros.css">
+<LINK type="text/css" rel="stylesheet" href="https://cwiki.apache.org/confluence/display/MAHOUT/$stylebase/default-theme.css">
+    <LINK type="text/css" rel="stylesheet" href="resources/space.css">
+    <STYLE type="text/css">
+      .footer {
+        background-image:      url('https://cwiki.apache.org/confluence/images/border/border_bottom.gif');
+        background-repeat:     repeat-x;
+        background-position:   left top;
+        padding-top:           4px;
+        color:                 #666;
+      }
+    </STYLE>
+    <SCRIPT type="text/javascript" language="javascript">
+      var hide = null;
+      var show = null;
+      var children = null;
+
+      function init() {
+        /* Search form initialization */
+        var form = document.forms['search'];
+        if (form != null) {
+          form.elements['domains'].value = location.hostname;
+          form.elements['sitesearch'].value = location.hostname;
+        }
+
+        /* Children initialization */
+        hide = document.getElementById('hide');
+        show = document.getElementById('show');
+        children = document.all != null ?
+                   document.all['children'] :
+                   document.getElementById('children');
+        if (children != null) {
+          children.style.display = 'none';
+          show.style.display = 'inline';
+          hide.style.display = 'none';
+        }
+      }
+
+      function showChildren() {
+        children.style.display = 'block';
+        show.style.display = 'none';
+        hide.style.display = 'inline';
+      }
+
+      function hideChildren() {
+        children.style.display = 'none';
+        show.style.display = 'inline';
+        hide.style.display = 'none';
+      }
+    </SCRIPT>
+    <TITLE>K-Means Clustering</TITLE>
+  <META http-equiv="Content-Type" content="text/html;charset=UTF-8"></HEAD>
+  <BODY onload="init()">
+    <TABLE border="0" cellpadding="2" cellspacing="0" width="100%">
+      <TR class="topBar">
+        <TD align="left" valign="middle" class="topBarDiv" align="left" nowrap="">
+          &nbsp;<A href="mahout-wiki.html" title="Apache Mahout">Apache Mahout</A>&nbsp;&gt;&nbsp;<A href="mahout-wiki.html" title="Mahout Wiki">Mahout Wiki</A>&nbsp;&gt;&nbsp;<A href="algorithms.html" title="Algorithms">Algorithms</A>&nbsp;&gt;&nbsp;<A href="" title="K-Means Clustering">K-Means Clustering</A>
+        </TD>
+        <TD align="right" valign="middle" nowrap="">
+          <FORM name="search" action="http://www.google.com/search" method="get">
+            <INPUT type="hidden" name="ie" value="UTF-8">
+            <INPUT type="hidden" name="oe" value="UTF-8">
+            <INPUT type="hidden" name="domains" value="">
+            <INPUT type="hidden" name="sitesearch" value="">
+            <INPUT type="text" name="q" maxlength="255" value="">        
+            <INPUT type="submit" name="btnG" value="Google Search">
+          </FORM>
+        </TD>
+      </TR> 
+    </TABLE>
+
+    <DIV id="PageContent">
+      <DIV class="pageheader" style="padding: 6px 0px 0px 0px;">
+        <!-- We'll enable this once we figure out how to access (and save) the logo resource -->
+        <!--img src="/wiki/images/confluence_logo.gif" style="float: left; margin: 4px 4px 4px 10px;" border="0"-->
+        <DIV style="margin: 0px 10px 0px 10px" class="smalltext">Apache Mahout</DIV>
+        <DIV style="margin: 0px 10px 8px 10px" class="pagetitle">K-Means Clustering</DIV>
+
+        <DIV class="greynavbar" align="right" style="padding: 2px 10px; margin: 0px;">
+          <A href="https://cwiki.apache.org/confluence/pages/editpage.action?pageId=75159">
+            <IMG src="https://cwiki.apache.org/confluence/images/icons/notep_16.gif" height="16" width="16" border="0" align="absmiddle" title="Edit Page"></A>
+            <A href="https://cwiki.apache.org/confluence/pages/editpage.action?pageId=75159">Edit Page</A>
+          &nbsp;
+          <A href="https://cwiki.apache.org/confluence/pages/listpages.action?key=MAHOUT">
+            <IMG src="https://cwiki.apache.org/confluence/images/icons/browse_space.gif" height="16" width="16" border="0" align="absmiddle" title="Browse Space"></A>
+            <A href="https://cwiki.apache.org/confluence/pages/listpages.action?key=MAHOUT">Browse Space</A>
+          &nbsp;
+          <A href="https://cwiki.apache.org/confluence/pages/createpage.action?spaceKey=MAHOUT&fromPageId=75159">
+            <IMG src="https://cwiki.apache.org/confluence/images/icons/add_page_16.gif" height="16" width="16" border="0" align="absmiddle" title="Add Page"></A>
+          <A href="https://cwiki.apache.org/confluence/pages/createpage.action?spaceKey=MAHOUT&fromPageId=75159">Add Page</A>
+          &nbsp;
+          <A href="https://cwiki.apache.org/confluence/pages/createblogpost.action?spaceKey=MAHOUT&fromPageId=75159">
+            <IMG src="https://cwiki.apache.org/confluence/images/icons/add_blogentry_16.gif" height="16" width="16" border="0" align="absmiddle" title="Add News"></A>
+          <A href="https://cwiki.apache.org/confluence/pages/createblogpost.action?spaceKey=MAHOUT&fromPageId=75159">Add News</A>
+        </DIV>
+      </DIV>
+
+      <DIV class="pagecontent">
+        <DIV class="wiki-content">
+          <P>k-Means is a rather simple but well-known algorithm for grouping objects (clustering). Again, all objects need to be represented as a set of numerical features. In addition, the user has to specify the number of groups (referred to as <EM>k</EM>) he wishes to identify.<BR>
+Each object can be thought of as being represented by some feature vector in an <EM>n</EM> dimensional space, <EM>n</EM> being the number of all features used to describe the objects to cluster. The algorithm then randomly chooses <EM>k</EM> points in that vector space; these points serve as the initial centers of the clusters. Afterwards, each object is assigned to the center it is closest to. Usually the distance measure is chosen by the user and determined by the learning task.<BR>
+After that, for each cluster a new center is computed by averaging the feature vectors of all objects assigned to it. The process of assigning objects and recomputing centers is repeated until the process converges. The algorithm can be proven to converge after a finite number of iterations.<BR>
+Several tweaks concerning distance measure, initial center choice and computation of new average centers have been explored, as well as the estimation of the number of clusters <EM>k</EM>. Yet the main principle always remains the same.</P>
+
+
+
+<H2><A name="K-MeansClustering-Quickstart"></A>Quickstart</H2>
+
+<P><A href="k-means-clustering.data/quickstart-kmeans.sh">Here</A> is a short shell script outline that will get you started quickly with k-Means. This does the following:</P>
+
+<UL>
+	<LI>Get the Reuters dataset</LI>
+	<LI>Run org.apache.lucene.benchmark.utils.ExtractReuters to generate reuters-out from reuters-sgm (the downloaded archive)</LI>
+	<LI>Run seqdirectory to convert reuters-out to SequenceFile format</LI>
+	<LI>Run seq2sparse to convert SequenceFiles to sparse vector format</LI>
+	<LI>Finally, run kMeans with 20 clusters.</LI>
+</UL>
+
+
+<P>After following through the output that scrolls past, reading the code will offer you a better understanding.</P>
+
+
+<H2><A name="K-MeansClustering-Strategyforparallelization"></A>Strategy for parallelization</H2>
+
+<P>Some ideas can be found in <A href="http://code.google.com/edu/content/submissions/mapreduce-minilecture/listing.html" class="external-link" rel="nofollow">Cluster computing and MapReduce</A> lecture video series [by Google(r)]; k-Means clustering is discussed in <A href="http://www.youtube.com/watch?v=1ZDybXl212Q" class="external-link" rel="nofollow">lecture #4</A>. Slides can be found <A href="http://code.google.com/edu/content/submissions/mapreduce-minilecture/lec4-clustering.ppt" class="external-link" rel="nofollow">here</A>.</P>
+
+<P>Interestingly, Hadoop based implementation using <A href="http://en.wikipedia.org/wiki/Canopy_clustering_algorithm" class="external-link" rel="nofollow">Canopy-clustering</A> seems to be here: <A href="http://code.google.com/p/canopy-clustering/" class="external-link" rel="nofollow">http://code.google.com/p/canopy-clustering/</A> (GPL 3 licence)</P>
+
+<P>Here's another useful paper <A href="http://www2.chass.ncsu.edu/garson/PA765/cluster.htm" class="external-link" rel="nofollow">http://www2.chass.ncsu.edu/garson/PA765/cluster.htm</A>.</P>
+
+<H2><A name="K-MeansClustering-Designofimplementation"></A>Design of implementation</H2>
+
+<P>The implementation accepts two input directories: one for the data points and one for the initial clusters. The data directory contains multiple input files of SequenceFile(key, VectorWritable), while the clusters directory contains one or more SequenceFiles(Text, Cluster | Canopy) containing <EM>k</EM> initial clusters or canopies. None of the input directories are modified by the implementation, allowing experimentation with initial clustering and convergence values.</P>
+
+<P>The program iterates over the input points and clusters, outputting a new directory &quot;clusters-N&quot; containing SequenceFile(Text, Cluster) files for each iteration N. This process uses a mapper/combiner/reducer/driver as follows:</P>
+<UL>
+	<LI>KMeansMapper - reads the input clusters during its setup() method, then assigns and outputs each input point to its nearest cluster as defined by the user-supplied distance measure. Output key is: cluster identifier. Output value is: ClusterObservation.</LI>
+	<LI>KMeansCombiner - receives all key:value pairs from the mapper and produces partial sums of the input vectors for each cluster. Output key is: cluster identifier. Output value is ClusterObservation.</LI>
+	<LI>KMeansReducer - a single reducer receives all key:value pairs from all combiners and sums them to produce a new centroid for the cluster which is output. Output key is: encoded cluster identifier. Output value is: Cluster. The reducer encodes unconverged clusters with a 'Cn' cluster Id and converged clusters with 'Vn' clusterId.</LI>
+	<LI>KMeansDriver - iterates over the points and clusters until all output clusters have converged (Vn clusterIds) or until a maximum number of iterations has been reached. During iterations, a new clusters directory &quot;clusters-N&quot; is produced with the output clusters from the previous iteration used for input to the next. A final optional pass over the data using the KMeansClusterMapper clusters all points to an output directory &quot;clusteredPoints&quot; and has no combiner or reducer steps.</LI>
+</UL>
+
+
+<P>Canopy clustering can be used to compute the initial clusters for k-Means:</P>
+<BLOCKQUOTE>
+<P>// run the CanopyDriver job<BR>
+CanopyDriver.runJob(&quot;testdata&quot;, &quot;output&quot;, ManhattanDistanceMeasure.class.getName(), (float) 3.1, (float) 2.1, false);</P>
+
+<P>// now run the KMeansDriver job<BR>
+KMeansDriver.runJob(&quot;testdata&quot;, &quot;output/clusters-0&quot;, &quot;output&quot;, EuclideanDistanceMeasure.class.getName(), &quot;0.001&quot;, &quot;10&quot;, true);</P></BLOCKQUOTE>
+
+<P>In the above example, the input data points are stored in 'testdata' and the CanopyDriver is configured to output to the 'output/clusters-0' directory. Once the driver executes it will contain the canopy definition files. Upon running the KMeansDriver the output directory will have two or more new directories: 'clusters-N' containing the clusters for each iteration and 'clusteredPoints' containing the clustered data points.</P>
+
+<P>This diagram shows the exemplary dataflow of the k-Means example implementation provided by Mahout:</P>
+
+
+<MAP name="GLIFFY_MAP_75159_Example_implementation_of_k-Means_provided_with_Mahout"></MAP>
+<TABLE width="100%">
+    <TR>
+        <TD align="left">
+            <TABLE>
+                <CAPTION align="bottom">
+                    
+                        
+                        <A href="https://cwiki.apache.org/confluence/plugins/gliffy/viewlargediagram.action?name=Example%20implementation%20of%20k-Means%20provided%20with%20Mahout&ceoid=75159&key=MAHOUT&pageId=75159" target="">Full Size</A>
+                    
+                         | 
+                        <A href="https://cwiki.apache.org/confluence/plugins/gliffy/showgliffyeditor.action?name=Example%20implementation%20of%20k-Means%20provided%20with%20Mahout&ceoid=75159&key=MAHOUT&lastPage=%2Fdisplay%2FMAHOUT%2FK-Means%20Clustering&pageId=75159" target="">Edit Diagram</A>
+                                    </CAPTION>
+                <TR>
+                    <TD>
+                        <IMG style="border: none; width: 822px; height: 816px;" usemap="#GLIFFY_MAP_75159_Example_implementation_of_k-Means_provided_with_Mahout" src="k-means-clustering.data/Example%20implementation%20of%20k-Means%20provided%20with%20Mahout.png" alt="A&#32;Gliffy&#32;Diagram&#32;named&#58;&#32;Example&#32;implementation&#32;of&#32;k&#45;Means&#32;provided&#32;with&#32;Mahout">
+                    </TD>
+                </TR>
+            </TABLE>
+        </TD>
+    </TR>
+</TABLE>
+
+
+
+
+<P>This diagram doesn't consider CanopyClustering:</P>
+
+<TABLE style="background-color: #2C8FCF;  height: 38px">
+    <TR>
+        <TD style="vertical-align: middle; padding-top: 0px; padding-bottom: 0px;">
+            <DIV style="padding: 3px"><IMG src="https://cwiki.apache.org/confluence/download/resources/com.gliffy.integration.confluence:gliffy-macro-key/resources/icons/gliffylogo32x32.PNG">
+            </DIV>
+        </TD>
+        <TD style="border-right: 2px solid white; padding:3px">
+            <SPAN style="font-size:120%; font-weight: bold; color: white">Macro Error</SPAN>
+        </TD>
+        <TD style="padding: 3px;">
+            <SPAN style="color:white;">
+                Cannot find the diagram with these parameters: <BR>
+                <STRONG>name:</STRONG> k-Means Example <BR>
+                <STRONG>version:</STRONG>  <BR>
+                <STRONG>pageName:</STRONG> k-Means <BR>
+                <STRONG>pageId:</STRONG>  <BR>
+                <STRONG>spaceKey:</STRONG> MAHOUT <BR>
+            </SPAN>
+
+                    </TD>
+    </TR>
+</TABLE>
+
+
+<H2><A name="K-MeansClustering-RunningkMeansClustering"></A>Running k-Means Clustering</H2>
+
+<P>The k-Means clustering algorithm may be run using a command-line invocation on KMeansDriver.main or by making a Java call to KMeansDriver.runJob(). </P>
+
+<P>Invocation using the command line takes the form:</P>
+
+<DIV class="preformatted panel" style="border-width: 1px;"><DIV class="preformattedContent panelContent">
+<PRE>bin/mahout kmeans \
+    -i &lt;input vectors directory&gt; \
+    -c &lt;input clusters directory&gt; \
+    -o &lt;output working directory&gt; \
+    -k &lt;optional number of initial clusters to sample from input vectors&gt; \
+    -dm &lt;DistanceMeasure&gt; \
+    -x &lt;maximum number of iterations&gt; \
+    -cd &lt;optional convergence delta. Default is 0.5&gt; \
+    -ow &lt;overwrite output directory if present&gt; \
+    -cl &lt;run input vector clustering after computing Canopies&gt; \
+    -xm &lt;execution method: sequential or mapreduce&gt;
+</PRE>
+</DIV></DIV>
+
+<P>Note: if the -k argument is supplied, any clusters in the -c directory will be overwritten and -k random points will be sampled from the input vectors to become the initial cluster centers.</P>
+
+<P>Invocation using Java involves supplying the following arguments:</P>
+
+<OL>
+	<LI>input: a file path string to a directory containing the input data set, a SequenceFile(WritableComparable, VectorWritable). The sequence file <EM>key</EM> is not used.</LI>
+	<LI>clusters: a file path string to a directory containing the initial clusters, a SequenceFile(key, Cluster | Canopy). Both KMeans clusters and Canopy canopies may be used for the initial clusters.</LI>
+	<LI>output: a file path string to an empty directory which is used for all output from the algorithm.</LI>
+	<LI>distanceMeasure: the fully-qualified class name of an instance of DistanceMeasure which will be used for the clustering.</LI>
+	<LI>convergenceDelta: a double value used to determine if the algorithm has converged (clusters have not moved more than the value in the last iteration)</LI>
+	<LI>maxIter: the maximum number of iterations to run, independent of the convergence specified</LI>
+	<LI>runClustering: a boolean indicating, if true, that the clustering step is to be executed after clusters have been determined.</LI>
+	<LI>runSequential: a boolean indicating, if true, that the k-means sequential implementation is to be used to process the input data.</LI>
+</OL>
+
+
+<P>After running the algorithm, the output directory will contain:</P>
+<OL>
+	<LI>clusters-N: directories containing SequenceFiles(Text, Cluster) produced by the algorithm for each iteration. The Text <EM>key</EM> is a cluster identifier string.</LI>
+	<LI>clusteredPoints: (if --clustering enabled) a directory containing SequenceFile(IntWritable, WeightedVectorWritable). The IntWritable <EM>key</EM> is the clusterId. The WeightedVectorWritable <EM>value</EM> is a bean containing a double <EM>weight</EM> and a VectorWritable <EM>vector</EM> where the weight indicates the probability that the vector is a member of the cluster. For k-Means clustering, the weights are all 1.0 since the algorithm selects only a single, most likely cluster for each point.</LI>
+</OL>
+
+
+<H1><A name="K-MeansClustering-Examples"></A>Examples</H1>
+
+<P>The following images illustrate k-Means clustering applied to a set of randomly-generated 2-d data points. The points are generated using a normal distribution centered at a mean location and with a constant standard deviation. See the README file in the <A href="http://svn.apache.org/repos/asf/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/README.txt" class="external-link" rel="nofollow">/examples/src/main/java/org/apache/mahout/clustering/display/README.txt</A> for details on running similar examples.</P>
+
+<P>The points are generated as follows:</P>
+
+<UL>
+	<LI>500 samples m=[1.0, 1.0] sd=3.0</LI>
+	<LI>300 samples m=[1.0, 0.0] sd=0.5</LI>
+	<LI>300 samples m=[0.0, 2.0] sd=0.1</LI>
+</UL>
+
+
+<P>In the first image, the points are plotted and the 3-sigma boundaries of their generator are superimposed. </P>
+
+<P><SPAN class="image-wrap" style=""><IMG src="k-means-clustering.data/SampleData.png" style="border: 0px solid black"></SPAN></P>
+
+<P>In the second image, the resulting clusters (k=3) are shown superimposed upon the sample data. As k-Means is an iterative algorithm, the centers of the clusters in each recent iteration are shown using different colors. Bold red is the final clustering and previous iterations are shown in [orange, yellow, green, blue, violet and gray]. Although it misses a lot of the points and cannot capture the original, superimposed cluster centers, it does a decent job of clustering this data.</P>
+
+<P><SPAN class="image-wrap" style=""><IMG src="k-means-clustering.data/KMeans.png" style="border: 0px solid black"></SPAN></P>
+
+<P>The third image shows the results of running k-Means on a different data set (see <A href="dirichlet-process-clustering.html" title="Dirichlet Process Clustering">Dirichlet Process Clustering</A> for details) which is generated using asymmetrical standard deviations. K-Means does a fair job handling this data set as well.</P>
+
+<P><SPAN class="image-wrap" style=""><IMG src="k-means-clustering.data/2dKMeans.png" style="border: 0px solid black"></SPAN></P>
+        </DIV>
+
+        
+      </DIV>
+    </DIV>
+    <DIV class="footer">
+      Generated by
+      <A href="http://www.atlassian.com/confluence/">Atlassian Confluence</A> (Version: 3.4.9 Build: 2042 Feb 14, 2011)
+      <A href="http://could.it/autoexport/">Auto Export Plugin</A> (Version: 1.0.0-dkulp)
+    </DIV>
+<SCRIPT type="text/javascript">
+
+  var _gaq = _gaq || [];
+  _gaq.push(['_setAccount', 'UA-17359171-1']);
+  _gaq.push(['_setDomainName', 'none']);
+  _gaq.push(['_setAllowLinker', true]);
+  _gaq.push(['_trackPageview']);
+
+  (function() {
+    var ga = document.createElement('script'); ga.type = 'text/javascript'; ga.async = true;
+    ga.src = ('https:' == document.location.protocol ? 'https://ssl' : 'http://www') + '.google-analytics.com/ga.js';
+    var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(ga, s);
+  })();
+
+</SCRIPT>
+  </BODY>
+</HTML>
\ No newline at end of file

Added: mahout/site/new_website/MAHOUT/k-means-commandline.html
URL: http://svn.apache.org/viewvc/mahout/site/new_website/MAHOUT/k-means-commandline.html?rev=1243022&view=auto
==============================================================================
--- mahout/site/new_website/MAHOUT/k-means-commandline.html (added)
+++ mahout/site/new_website/MAHOUT/k-means-commandline.html Sat Feb 11 10:22:15 2012
@@ -0,0 +1,217 @@
+
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<HTML>
+  <HEAD>
+    <LINK type="text/css" rel="stylesheet" href="resources/space.css">
+    <STYLE type="text/css">
+      .footer {
+        background-image:      url('https://cwiki.apache.org/confluence/images/border/border_bottom.gif');
+        background-repeat:     repeat-x;
+        background-position:   left top;
+        padding-top:           4px;
+        color:                 #666;
+      }
+    </STYLE>
+    <SCRIPT type="text/javascript" language="javascript">
+      var hide = null;
+      var show = null;
+      var children = null;
+
+      function init() {
+        /* Search form initialization */
+        var form = document.forms['search'];
+        if (form != null) {
+          form.elements['domains'].value = location.hostname;
+          form.elements['sitesearch'].value = location.hostname;
+        }
+
+        /* Children initialization */
+        hide = document.getElementById('hide');
+        show = document.getElementById('show');
+        children = document.all != null ?
+                   document.all['children'] :
+                   document.getElementById('children');
+        if (children != null) {
+          children.style.display = 'none';
+          show.style.display = 'inline';
+          hide.style.display = 'none';
+        }
+      }
+
+      function showChildren() {
+        children.style.display = 'block';
+        show.style.display = 'none';
+        hide.style.display = 'inline';
+      }
+
+      function hideChildren() {
+        children.style.display = 'none';
+        show.style.display = 'inline';
+        hide.style.display = 'none';
+      }
+    </SCRIPT>
+    <TITLE>k-means-commandline</TITLE>
+  <META http-equiv="Content-Type" content="text/html;charset=UTF-8"></HEAD>
+  <BODY onload="init()">
+    <TABLE border="0" cellpadding="2" cellspacing="0" width="100%">
+      <TR class="topBar">
+        <TD align="left" valign="middle" class="topBarDiv" align="left" nowrap="">
+          &nbsp;<A href="mahout-wiki.html" title="Apache Mahout">Apache Mahout</A>&nbsp;&gt;&nbsp;<A href="mahout-wiki.html" title="Mahout Wiki">Mahout Wiki</A>&nbsp;&gt;&nbsp;<A href="quickstart.html" title="Quickstart">Quickstart</A>&nbsp;&gt;&nbsp;<A href="clusteringyourdata.html" title="ClusteringYourData">ClusteringYourData</A>&nbsp;&gt;&nbsp;<A href="" title="k-means-commandline">k-means-commandline</A>
+        </TD>
+        <TD align="right" valign="middle" nowrap="">
+          <FORM name="search" action="http://www.google.com/search" method="get">
+            <INPUT type="hidden" name="ie" value="UTF-8">
+            <INPUT type="hidden" name="oe" value="UTF-8">
+            <INPUT type="hidden" name="domains" value="">
+            <INPUT type="hidden" name="sitesearch" value="">
+            <INPUT type="text" name="q" maxlength="255" value="">        
+            <INPUT type="submit" name="btnG" value="Google Search">
+          </FORM>
+        </TD>
+      </TR> 
+    </TABLE>
+
+    <DIV id="PageContent">
+      <DIV class="pageheader" style="padding: 6px 0px 0px 0px;">
+        <!-- We'll enable this once we figure out how to access (and save) the logo resource -->
+        <!--img src="/wiki/images/confluence_logo.gif" style="float: left; margin: 4px 4px 4px 10px;" border="0"-->
+        <DIV style="margin: 0px 10px 0px 10px" class="smalltext">Apache Mahout</DIV>
+        <DIV style="margin: 0px 10px 8px 10px" class="pagetitle">k-means-commandline</DIV>
+
+        <DIV class="greynavbar" align="right" style="padding: 2px 10px; margin: 0px;">
+          <A href="https://cwiki.apache.org/confluence/pages/editpage.action?pageId=5964006">
+            <IMG src="https://cwiki.apache.org/confluence/images/icons/notep_16.gif" height="16" width="16" border="0" align="absmiddle" title="Edit Page"></A>
+            <A href="https://cwiki.apache.org/confluence/pages/editpage.action?pageId=5964006">Edit Page</A>
+          &nbsp;
+          <A href="https://cwiki.apache.org/confluence/pages/listpages.action?key=MAHOUT">
+            <IMG src="https://cwiki.apache.org/confluence/images/icons/browse_space.gif" height="16" width="16" border="0" align="absmiddle" title="Browse Space"></A>
+            <A href="https://cwiki.apache.org/confluence/pages/listpages.action?key=MAHOUT">Browse Space</A>
+          &nbsp;
+          <A href="https://cwiki.apache.org/confluence/pages/createpage.action?spaceKey=MAHOUT&fromPageId=5964006">
+            <IMG src="https://cwiki.apache.org/confluence/images/icons/add_page_16.gif" height="16" width="16" border="0" align="absmiddle" title="Add Page"></A>
+          <A href="https://cwiki.apache.org/confluence/pages/createpage.action?spaceKey=MAHOUT&fromPageId=5964006">Add Page</A>
+          &nbsp;
+          <A href="https://cwiki.apache.org/confluence/pages/createblogpost.action?spaceKey=MAHOUT&fromPageId=5964006">
+            <IMG src="https://cwiki.apache.org/confluence/images/icons/add_blogentry_16.gif" height="16" width="16" border="0" align="absmiddle" title="Add News"></A>
+          <A href="https://cwiki.apache.org/confluence/pages/createblogpost.action?spaceKey=MAHOUT&fromPageId=5964006">Add News</A>
+        </DIV>
+      </DIV>
+      <DIV class="pagesubheading" style="margin: 0px 10px 0px 10px;">
+        #editReport()
+      </DIV>
+
+      <DIV class="pagecontent">
+        <DIV class="wiki-content">
+          <H1><A name="k-means-commandline-Introduction"></A>Introduction</H1>
+
+<P>This quick start page describes how to run the kMeans clustering algorithm on a Hadoop cluster. </P>
+
+<H1><A name="k-means-commandline-Steps"></A>Steps</H1>
+
+<P>Mahout's k-Means clustering can be launched from the same command line invocation whether you are running on a single machine in stand-alone mode or on a larger Hadoop cluster. The difference is determined by the $HADOOP_HOME and $HADOOP_CONF_DIR environment variables. If both are set to an operating Hadoop cluster on the target machine then the invocation will run k-Means on that cluster. If either of the environment variables is missing then the stand-alone Hadoop configuration will be invoked instead.</P>
+
+<DIV class="code panel" style="border-width: 1px;"><DIV class="codeContent panelContent">
+<PRE class="code-java">
+./bin/mahout kmeans &lt;OPTIONS&gt;
+</PRE>
+</DIV></DIV>
+
+<UL>
+	<LI>In $MAHOUT_HOME/, build the jar containing the job (mvn install). The job will be generated in $MAHOUT_HOME/core/target/ and its name will contain the Mahout version number. For example, when using the Mahout 0.3 release, the job will be mahout-core-0.3.job</LI>
+</UL>
+
+
+
+<H2><A name="k-means-commandline-Testingitononesinglemachinew%2Focluster"></A>Testing it on one single machine w/o cluster</H2>
+
+<UL>
+	<LI>Put the data: cp &lt;PATH TO DATA&gt; testdata</LI>
+	<LI>Run the Job:
+<DIV class="code panel" style="border-width: 1px;"><DIV class="codeContent panelContent">
+<PRE class="code-java">
+./bin/mahout kmeans -i testdata -o output -c clusters -dm org.apache.mahout.common.distance.CosineDistanceMeasure -x 5 -ow -cd 1 -k 25
+</PRE>
+</DIV></DIV></LI>
+</UL>
+
+
+<H2><A name="k-means-commandline-Runningitonthecluster"></A>Running it on the cluster</H2>
+
+<UL>
+	<LI>(As needed) Start up Hadoop: $HADOOP_HOME/bin/start-all.sh</LI>
+	<LI>Put the data: $HADOOP_HOME/bin/hadoop fs -put &lt;PATH TO DATA&gt; testdata</LI>
+	<LI>Run the Job:
+<DIV class="code panel" style="border-width: 1px;"><DIV class="codeContent panelContent">
+<PRE class="code-java">
+export HADOOP_HOME=&lt;Hadoop Home Directory&gt;
+export HADOOP_CONF_DIR=$HADOOP_HOME/conf
+./bin/mahout kmeans -i testdata -o output -c clusters -dm org.apache.mahout.common.distance.CosineDistanceMeasure -x 5 -ow -cd 1 -k 25
+</PRE>
+</DIV></DIV></LI>
+	<LI>Get the data out of HDFS and have a look. Use bin/hadoop fs -lsr output to view all outputs.</LI>
+</UL>
+
+
+<H1><A name="k-means-commandline-Commandlineoptions"></A>Command line options</H1>
+<DIV class="code panel" style="border-width: 1px;"><DIV class="codeContent panelContent">
+<PRE class="code-java">
+  --input (-i) input                           Path to job input directory.     
+                                               Must be a SequenceFile of        
+                                               VectorWritable                   
+  --clusters (-c) clusters                     The input centroids, as Vectors. 
+                                               Must be a SequenceFile of        
+                                               Writable, Cluster/Canopy.  If k  
+                                               is also specified, then a random 
+                                               set of vectors will be selected  
+                                               and written out to <SPAN class="code-keyword">this</SPAN> path     
+                                               first                            
+  --output (-o) output                         The directory pathname <SPAN class="code-keyword">for</SPAN>       
+                                               output.                          
+  --distanceMeasure (-dm) distanceMeasure      The classname of the             
+                                               DistanceMeasure. Default is      
+                                               SquaredEuclidean                 
+  --convergenceDelta (-cd) convergenceDelta    The convergence delta value.     
+                                               Default is 0.5                   
+  --maxIter (-x) maxIter                       The maximum number of            
+                                               iterations.                      
+  --maxRed (-r) maxRed                         The number of reduce tasks.      
+                                               Defaults to 2                    
+  --k (-k) k                                   The k in k-Means.  If specified, 
+                                               then a random selection of k     
+                                               Vectors will be chosen as the    
+                                               Centroid and written to the      
+                                               clusters input path.             
+  --overwrite (-ow)                            If present, overwrite the output 
+                                               directory before running job     
+  --help (-h)                                  Print out help                   
+  --clustering (-cl)                           If present, run clustering after 
+                                               the iterations have taken place  
+</PRE>
+</DIV></DIV>
+        </DIV>
+
+        
+      </DIV>
+    </DIV>
+    <DIV class="footer">
+      Generated by
+      <A href="http://www.atlassian.com/confluence/">Atlassian Confluence</A> (Version: 3.2 Build: 1810 Mar 16, 2010)
+      <A href="http://could.it/autoexport/">Auto Export Plugin</A> (Version: 1.0.0-dkulp)
+    </DIV>
+<SCRIPT type="text/javascript">
+
+  var _gaq = _gaq || []; // reuse an existing _gaq command queue if one is already defined, else start an empty one
+  _gaq.push(['_setAccount', 'UA-17359171-1']); // commands are only queued here; ga.js (loaded below) is what consumes them
+  _gaq.push(['_setDomainName', 'none']);
+  _gaq.push(['_setAllowLinker', true]);
+  _gaq.push(['_trackPageview']);
+
+  (function() { // immediately-invoked so ga and s do not become globals
+    var ga = document.createElement('script'); ga.type = 'text/javascript'; ga.async = true; // async: do not block page parsing
+    ga.src = ('https:' == document.location.protocol ? 'https://ssl' : 'http://www') + '.google-analytics.com/ga.js'; // host prefix follows the page's own protocol
+    var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(ga, s); // insert the loader ahead of the first script element in the document
+  })();
+
+</SCRIPT>
+  </BODY>
+</HTML>
\ No newline at end of file

Added: mahout/site/new_website/MAHOUT/k-means.data/Example implementation of k-Means provided with Mahout
URL: http://svn.apache.org/viewvc/mahout/site/new_website/MAHOUT/k-means.data/Example%20implementation%20of%20k-Means%20provided%20with%20Mahout?rev=1243022&view=auto
==============================================================================
--- mahout/site/new_website/MAHOUT/k-means.data/Example implementation of k-Means provided with Mahout (added)
+++ mahout/site/new_website/MAHOUT/k-means.data/Example implementation of k-Means provided with Mahout Sat Feb 11 10:22:15 2012
@@ -0,0 +1 @@
+<stage keygen_seq="4"><pageObj drawingHeight="821" drawingWidth="809" istt="true" print_scale="0" print_grid="0" print_paper="LETTER" print_layout="0" stg="1" pb="0" gr="1" fill="16777215" height="5000" width="5000"><objects><object order="0" ceoid="75159" filename="k-Means in Mahout.jpg" dshad="false" gradon="false" linew="1" linec="0" fill="16777215" text-horizontal-pos="center" text-vertical-pos="middle" lock="false" fixed-aspect="true" rot="0" height="685" width="592.525" y="418" x="453" libraryid="com.gliffy.images" shp_id="2" class="GliffyImageShape"><text/><connlines/></object></objects></pageObj></stage>
\ No newline at end of file

Added: mahout/site/new_website/MAHOUT/k-means.data/k-Means Example
URL: http://svn.apache.org/viewvc/mahout/site/new_website/MAHOUT/k-means.data/k-Means%20Example?rev=1243022&view=auto
==============================================================================
--- mahout/site/new_website/MAHOUT/k-means.data/k-Means Example (added)
+++ mahout/site/new_website/MAHOUT/k-means.data/k-Means Example Sat Feb 11 10:22:15 2012
@@ -0,0 +1 @@
+<stage keygen_seq="64"><pageObj istt="true" stg="0" pb="0" gr="0" fill="16777215" height="754" width="576"><objects><object order="8" path="0,0;44.4315030580294,0;44.4315030580294,18;59.4315030580294,18;" numSegs="3" linep="0" linec="0" linew="1" endStyle="0" beginStyle="0" buff="11" height="22" width="63.4" y="119" x="220.568496941971" ln_id="12" class="TheOrthoLine"><text/></object><object order="9" path="0,0;62.4687500000001,0;62.4687500000001,-15;77.4687500000001,-15;" numSegs="3" linep="0" linec="0" linew="1" endStyle="0" beginStyle="0" buff="11" height="19" width="81.45" y="137" x="360" ln_id="13" class="TheOrthoLine"><text/></object><object order="10" path="0,0;0,33;4,33;4,48;" numSegs="3" linep="0" linec="0" linew="1" endStyle="0" beginStyle="0" buff="11" height="52" width="8" y="167" x="505" ln_id="14" class="TheOrthoLine"><text/></object><object order="11" path="0,0;-19.2625000000002,0;-19.2625000000002,4;-34.2625000000002,4;" numSegs="3" linep="0" linec="0" linew=
 "1" endStyle="0" beginStyle="0" buff="11" height="8" width="38.25" y="270" x="447.8125" ln_id="15" class="TheOrthoLine"><text/></object><object order="12" path="0,0;-74.4500000000001,0;-74.4500000000001,-4;-89.4500000000001,-4;" numSegs="3" linep="0" linec="0" linew="1" endStyle="0" beginStyle="0" buff="11" height="8" width="93.45" y="274" x="258.45" ln_id="16" class="TheOrthoLine"><text/></object><object order="15" path="0,0;0,15;87,15;87,42.75;" numSegs="3" linep="0" linec="0" linew="1" endStyle="0" beginStyle="0" buff="11" height="46.75" width="91" y="389.5" x="341" ln_id="19" class="TheOrthoLine"><text/></object><object order="16" path="0,0;0,15;22.2125,15;22.2125,32;" numSegs="3" linep="0" linec="0" linew="1" endStyle="0" beginStyle="0" buff="11" height="36" width="26.2" y="300" x="110" ln_id="21" class="TheOrthoLine"><text/></object><object order="17" path="0,0;41.1625000000001,0;41.1625000000001,-16;56.1625000000001,-16;" numSegs="3" linep="0" linec="0" linew="1" endS
 tyle="0" beginStyle="0" buff="11" height="20" width="60.15" y="373" x="207.9" ln_id="22" class="TheOrthoLine"><text/></object><object order="18" path="0,0;0,15;13,15;13,30;" numSegs="3" linep="0" linec="0" linew="1" endStyle="0" beginStyle="0" buff="11" height="34" width="17" y="414" x="132" ln_id="23" class="TheOrthoLine"><text/></object><object order="0" dsy="8" dsx="8" dshad="true" gradon="true" linew="1" linec="0" fill="8113609" text-horizontal-pos="center" text-vertical-pos="middle" fixed-aspect="false" rot="0" height="53.0369938839414" width="149.136993883941" y="119" x="146" shp_id="3" class="fc_data"><text><![CDATA[<P ALIGN="CENTER"><FONT FACE="Arial" SIZE="12" COLOR="#000000" LETTERSPACING="0" KERNING="0">Points to Cluster:</FONT></P><P ALIGN="CENTER"><FONT FACE="Arial" SIZE="12" COLOR="#000000" LETTERSPACING="0" KERNING="0">(1,1),(2,1),(1,2),(2,2),(3,3)</FONT></P>]]></text><connlines><connline type="start" ln_id="12" cpnum="3"/></connlines></object><object order="1
 " dsy="8" dsx="8" dshad="true" gradon="true" linew="1" linec="0" fill="8113609" text-horizontal-pos="center" text-vertical-pos="middle" fixed-aspect="false" rot="0" height="110" width="122.375" y="270" x="509" shp_id="4" class="rectangle"><text><![CDATA[<P ALIGN="CENTER"><FONT FACE="Arial" SIZE="12" COLOR="#000000" LETTERSPACING="0" KERNING="0">Selected 2 k-Means Centers as (1,1), (1,2) and write to Sequence File: ../testdata/clusters/part-00000</FONT></P>]]></text><connlines><connline type="start" ln_id="15" cpnum="1"/><connline type="end" ln_id="14" cpnum="2"/></connlines></object><object order="2" dsy="8" dsx="8" dshad="true" gradon="true" linew="1" linec="0" fill="8113609" text-horizontal-pos="center" text-vertical-pos="middle" fixed-aspect="false" rot="0" height="57" width="155.1" y="274" x="336" shp_id="6" class="fc_data"><text><![CDATA[<P ALIGN="CENTER"><FONT FACE="Arial" SIZE="12" COLOR="#000000" LETTERSPACING="0" KERNING="0">C0C0:[s2,0:1.0,1:1.0,]</FONT></P><P ALIGN
 ="CENTER"><FONT FACE="Arial" SIZE="12" COLOR="#000000" LETTERSPACING="0" KERNING="0">C1C1:[s2,0:1.0,1:2.0,]</FONT></P>]]></text><connlines><connline type="start" ln_id="16" cpnum="1"/><connline type="end" ln_id="15" cpnum="3"/></connlines></object><object order="3" dsy="8" dsx="8" dshad="true" gradon="true" linew="1" linec="0" fill="8113609" text-horizontal-pos="center" text-vertical-pos="middle" fixed-aspect="false" rot="0" height="93" width="80" y="137" x="320" shp_id="7" class="rectangle"><text><![CDATA[<P ALIGN="CENTER"><FONT FACE="Arial" SIZE="12" COLOR="#000000" LETTERSPACING="0" KERNING="0">Converted into Input File: ../testdata/points/input.txt</FONT></P>]]></text><connlines><connline type="end" ln_id="12" cpnum="1"/><connline type="start" ln_id="13" cpnum="3"/></connlines></object><object order="4" dsy="8" dsx="8" dshad="true" gradon="true" linew="1" linec="0" fill="8113609" text-horizontal-pos="center" text-vertical-pos="middle" fixed-aspect="false" rot="0" height=
 "90" width="135.0625" y="122" x="505" shp_id="8" class="fc_data"><text><![CDATA[<P ALIGN="CENTER"><FONT FACE="Arial" SIZE="12" COLOR="#000000" LETTERSPACING="0" KERNING="0">[s2, 0:1.0, 1:1.0, ] </FONT></P><P ALIGN="CENTER"><FONT FACE="Arial" SIZE="12" COLOR="#000000" LETTERSPACING="0" KERNING="0">[s2, 0:2.0, 1:1.0, ] </FONT></P><P ALIGN="CENTER"><FONT FACE="Arial" SIZE="12" COLOR="#000000" LETTERSPACING="0" KERNING="0">[s2, 0:1.0, 1:2.0, ] </FONT></P><P ALIGN="CENTER"><FONT FACE="Arial" SIZE="12" COLOR="#000000" LETTERSPACING="0" KERNING="0">[s2, 0:2.0, 1:2.0, ] </FONT></P><P ALIGN="CENTER"><FONT FACE="Arial" SIZE="12" COLOR="#000000" LETTERSPACING="0" KERNING="0">[s2, 0:3.0, 1:3.0, ]</FONT></P>]]></text><connlines><connline type="start" ln_id="14" cpnum="0"/><connline type="end" ln_id="13" cpnum="1"/></connlines></object><object order="5" dsy="8" dsx="8" dshad="true" gradon="true" linew="1" linec="0" fill="8113609" text-horizontal-pos="center" text-vertical-pos="middle" fix
 ed-aspect="false" rot="0" height="60" width="118" y="270" x="110" shp_id="9" class="rectangle"><text><![CDATA[<P ALIGN="CENTER"><FONT FACE="Arial" SIZE="12" COLOR="#000000" LETTERSPACING="0" KERNING="0">Run k-Means Algorithm: k = 2</FONT></P>]]></text><connlines><connline type="start" ln_id="21" cpnum="0"/><connline type="end" ln_id="16" cpnum="3"/></connlines></object><object order="6" dsy="8" dsx="8" dshad="true" gradon="true" linew="1" linec="0" fill="8113609" text-horizontal-pos="center" text-vertical-pos="middle" fixed-aspect="false" rot="0" height="58" width="153.95" y="473" x="145" shp_id="10" class="fc_data"><text><![CDATA[<P ALIGN="CENTER"><FONT FACE="Arial" SIZE="12" COLOR="#000000" LETTERSPACING="0" KERNING="0">V0V0: [s2, 0:1.5, 1:1.5, ]</FONT></P><P ALIGN="CENTER"><FONT FACE="Arial" SIZE="12" COLOR="#000000" LETTERSPACING="0" KERNING="0">V1V1: [s2, 0:4.0, 1:4.0, ]  </FONT></P>]]></text><connlines><connline type="end" ln_id="23" cpnum="2"/></connlines></object><ob
 ject order="7" dsy="8" dsx="8" dshad="true" gradon="true" linew="1" linec="0" fill="8113609" text-horizontal-pos="center" text-vertical-pos="middle" fixed-aspect="false" rot="0" height="93.5" width="273.9875" y="479" x="428" shp_id="11" class="fc_data"><text><![CDATA[<P ALIGN="CENTER"><FONT FACE="Arial" SIZE="12" COLOR="#000000" LETTERSPACING="0" KERNING="0">V0: [s2, 0:1.5, 1:1.5, ] 	[s2, 0:1.0, 1:1.0, ] </FONT></P><P ALIGN="CENTER"><FONT FACE="Arial" SIZE="12" COLOR="#000000" LETTERSPACING="0" KERNING="0">V0: [s2, 0:1.5, 1:1.5, ] 	[s2, 0:2.0, 1:1.0, ] </FONT></P><P ALIGN="CENTER"><FONT FACE="Arial" SIZE="12" COLOR="#000000" LETTERSPACING="0" KERNING="0">V0: [s2, 0:1.5, 1:1.5, ] 	[s2, 0:1.0, 1:2.0, ] </FONT></P><P ALIGN="CENTER"><FONT FACE="Arial" SIZE="12" COLOR="#000000" LETTERSPACING="0" KERNING="0">V0: [s2, 0:1.5, 1:1.5, ] 	[s2, 0:2.0, 1:2.0, ] </FONT></P><P ALIGN="CENTER"><FONT FACE="Arial" SIZE="12" COLOR="#000000" LETTERSPACING="0" KERNING="0">V1: [s2, 0:4.0, 1:4.0, ]
  	[s2, 0:3.0, 1:3.0, ]</FONT></P>]]></text><connlines><connline type="end" ln_id="19" cpnum="2"/></connlines></object><object order="13" dsy="8" dsx="8" dshad="true" gradon="true" linew="1" linec="0" fill="8113609" text-horizontal-pos="center" text-vertical-pos="middle" fixed-aspect="false" rot="0" height="82" width="151.375" y="373" x="132" shp_id="17" class="rectangle"><text><![CDATA[<P ALIGN="CENTER"><FONT FACE="Arial" SIZE="12" COLOR="#000000" LETTERSPACING="0" KERNING="0">Find k-Means Centers and write to File:</FONT></P><P ALIGN="CENTER"><FONT FACE="Arial" SIZE="12" COLOR="#000000" LETTERSPACING="0" KERNING="0">../output/clusters-2/part-00000</FONT></P>]]></text><connlines><connline type="start" ln_id="23" cpnum="0"/><connline type="end" ln_id="21" cpnum="2"/><connline type="start" ln_id="22" cpnum="3"/></connlines></object><object order="14" dsy="8" dsx="8" dshad="true" gradon="true" linew="1" linec="0" fill="8113609" text-horizontal-pos="center" text-vertical-pos="mi
 ddle" fixed-aspect="false" rot="0" height="65" width="153.875" y="357" x="341" shp_id="18" class="rectangle"><text><![CDATA[<P ALIGN="CENTER"><FONT FACE="Arial" SIZE="12" COLOR="#000000" LETTERSPACING="0" KERNING="0">Cluster points to Centers and write to File:</FONT></P><P ALIGN="CENTER"><FONT FACE="Arial" SIZE="12" COLOR="#000000" LETTERSPACING="0" KERNING="0">../output/points/part-00000</FONT></P>]]></text><connlines><connline type="start" ln_id="19" cpnum="0"/><connline type="end" ln_id="22" cpnum="1"/></connlines></object><object order="19" dshad="false" gradon="false" linew="1" linec="0" fill="16777215" text-horizontal-pos="center" text-vertical-pos="middle" fixed-aspect="false" rot="0" height="20" width="204.275" y="68" x="342" shp_id="43" class="rectangle"><text><![CDATA[<P ALIGN="CENTER"><FONT FACE="Arial" SIZE="12" COLOR="#000000" LETTERSPACING="0" KERNING="0">Steps in k-Means Alogrithm: k = 2</FONT></P>]]></text><connlines/></object></objects></pageObj></stage>
\ No newline at end of file

Added: mahout/site/new_website/MAHOUT/k-means.data/k-Means in Mahout.jpg
URL: http://svn.apache.org/viewvc/mahout/site/new_website/MAHOUT/k-means.data/k-Means%20in%20Mahout.jpg?rev=1243022&view=auto
==============================================================================
Binary file - no diff available.

Propchange: mahout/site/new_website/MAHOUT/k-means.data/k-Means in Mahout.jpg
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: mahout/site/new_website/MAHOUT/k-means.data/k-Means in Mahout.jpg.jpeg
URL: http://svn.apache.org/viewvc/mahout/site/new_website/MAHOUT/k-means.data/k-Means%20in%20Mahout.jpg.jpeg?rev=1243022&view=auto
==============================================================================
Binary file - no diff available.

Propchange: mahout/site/new_website/MAHOUT/k-means.data/k-Means in Mahout.jpg.jpeg
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: mahout/site/new_website/MAHOUT/k-means.data/k-Means in Mahout.png
URL: http://svn.apache.org/viewvc/mahout/site/new_website/MAHOUT/k-means.data/k-Means%20in%20Mahout.png?rev=1243022&view=auto
==============================================================================
Binary file - no diff available.

Propchange: mahout/site/new_website/MAHOUT/k-means.data/k-Means in Mahout.png
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: mahout/site/new_website/MAHOUT/k-means.data/k-Means in Mahout.png.jpeg
URL: http://svn.apache.org/viewvc/mahout/site/new_website/MAHOUT/k-means.data/k-Means%20in%20Mahout.png.jpeg?rev=1243022&view=auto
==============================================================================
Binary file - no diff available.

Propchange: mahout/site/new_website/MAHOUT/k-means.data/k-Means in Mahout.png.jpeg
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: mahout/site/new_website/MAHOUT/k-means.data/quickstart-kmeans.sh
URL: http://svn.apache.org/viewvc/mahout/site/new_website/MAHOUT/k-means.data/quickstart-kmeans.sh?rev=1243022&view=auto
==============================================================================
--- mahout/site/new_website/MAHOUT/k-means.data/quickstart-kmeans.sh (added)
+++ mahout/site/new_website/MAHOUT/k-means.data/quickstart-kmeans.sh Sat Feb 11 10:22:15 2012
@@ -0,0 +1,21 @@
+#!/bin/sh
+
+cd examples/bin/  # relative path: presumably started from the Mahout checkout root -- TODO confirm
+mkdir -p work
+if [ ! -e work/reuters-out ]; then  # download/unpack only when no extracted output exists yet
+  if [ ! -e work/reuters-sgm ]; then  # archive has not been unpacked yet
+    if [ ! -f work/reuters21578.tar.gz ]; then  # archive has not been downloaded yet
+      echo "Downloading Reuters-21578"
+      curl http://kdd.ics.uci.edu/databases/reuters21578/reuters21578.tar.gz  -o work/reuters21578.tar.gz
+    fi
+    mkdir -p work/reuters-sgm
+    echo "Extracting..."
+    cd work/reuters-sgm && tar xzf ../reuters21578.tar.gz && cd .. && cd ..  # unpack in place, then return to examples/bin
+  fi
+fi
+
+cd ../..  # two levels up from examples/bin so that ./bin/mahout and ./examples/... resolve
+./bin/mahout org.apache.lucene.benchmark.utils.ExtractReuters ./examples/bin/work/reuters-sgm/ ./examples/bin/work/reuters-out/  # outside the if above: runs even when reuters-out already exists
+./bin/mahout seqdirectory -i ./examples/bin/work/reuters-out/ -o ./examples/bin/work/reuters-out-seqdir -c UTF-8
+./bin/mahout seq2sparse -i ./examples/bin/work/reuters-out-seqdir/ -o ./examples/bin/work/reuters-out-seqdir-sparse
+./bin/mahout kmeans -i ./examples/bin/work/reuters-out-seqdir-sparse/tfidf/vectors/ -c ./examples/bin/work/clusters -o ./examples/bin/work/reuters-kmeans -k 20 -w  # NOTE(review): the kmeans option list elsewhere in this commit shows overwrite as -ow, not -w -- confirm against the targeted Mahout version
\ No newline at end of file

Added: mahout/site/new_website/MAHOUT/k-means.html
URL: http://svn.apache.org/viewvc/mahout/site/new_website/MAHOUT/k-means.html?rev=1243022&view=auto
==============================================================================
--- mahout/site/new_website/MAHOUT/k-means.html (added)
+++ mahout/site/new_website/MAHOUT/k-means.html Sat Feb 11 10:22:15 2012
@@ -0,0 +1,242 @@
+
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<HTML>
+  <HEAD>
+    <LINK type="text/css" rel="stylesheet" href="resources/space.css">
+    <STYLE type="text/css">
+      .footer {
+        background-image:      url('http://cwiki.apache.org/confluence/images/border/border_bottom.gif');
+        background-repeat:     repeat-x;
+        background-position:   left top;
+        padding-top:           4px;
+        color:                 #666;
+      }
+    </STYLE>
+    <SCRIPT type="text/javascript" language="javascript">
+      var hide = null;
+      var show = null;
+      var children = null;
+
+      function init() { // Page setup; invoked through the BODY onload attribute.
+        /* Search form initialization: copy this page's hostname into the form's 'domains' and 'sitesearch' fields. */
+        var form = document.forms['search'];
+        if (form != null) { // skipped when the page has no form named 'search'
+          form.elements['domains'].value = location.hostname;
+          form.elements['sitesearch'].value = location.hostname;
+        }
+
+        /* Children initialization: look up the 'hide', 'show' and 'children' elements and keep them in the script-level variables. */
+        hide = document.getElementById('hide');
+        show = document.getElementById('show');
+        children = document.all != null ? // use the document.all collection when the browser exposes it
+                   document.all['children'] :
+                   document.getElementById('children'); // otherwise fall back to getElementById
+        if (children != null) { // start collapsed: 'children' hidden, 'show' control visible, 'hide' control hidden
+          children.style.display = 'none';
+          show.style.display = 'inline'; // NOTE(review): show/hide are not null-checked here -- confirm they always accompany 'children'
+          hide.style.display = 'none';
+        }
+      }
+
+      function showChildren() { // Expand: display the element held in `children` and swap which toggle control is visible.
+        children.style.display = 'block';
+        show.style.display = 'none'; // the 'show' control is hidden while expanded
+        hide.style.display = 'inline'; // the 'hide' control becomes visible instead
+      }
+
+      function hideChildren() { // Collapse: hide the element held in `children` and swap which toggle control is visible.
+        children.style.display = 'none';
+        show.style.display = 'inline'; // the 'show' control becomes visible again
+        hide.style.display = 'none'; // the 'hide' control is hidden while collapsed
+      }
+    </SCRIPT>
+    <TITLE>k-Means</TITLE>
+  <META http-equiv="Content-Type" content="text/html;charset=UTF-8"></HEAD>
+  <BODY onload="init()">
+    <TABLE border="0" cellpadding="2" cellspacing="0" width="100%">
+      <TR class="topBar">
+        <TD align="left" valign="middle" class="topBarDiv" nowrap="">
+          &nbsp;<A href="index.html" title="Apache Lucene Mahout">Apache Lucene Mahout</A>&nbsp;&gt;&nbsp;<A href="index.html" title="index">index</A>&nbsp;&gt;&nbsp;<A href="" title="k-Means">k-Means</A>
+        </TD>
+        <TD align="right" valign="middle" nowrap="">
+          <FORM name="search" action="http://www.google.com/search" method="get">
+            <INPUT type="hidden" name="ie" value="UTF-8">
+            <INPUT type="hidden" name="oe" value="UTF-8">
+            <INPUT type="hidden" name="domains" value="">
+            <INPUT type="hidden" name="sitesearch" value="">
+            <INPUT type="text" name="q" maxlength="255" value="">        
+            <INPUT type="submit" name="btnG" value="Google Search">
+          </FORM>
+        </TD>
+      </TR> 
+    </TABLE>
+
+    <DIV id="PageContent">
+      <DIV class="pageheader" style="padding: 6px 0px 0px 0px;">
+        <!-- We'll enable this once we figure out how to access (and save) the logo resource -->
+        <!--img src="/wiki/images/confluence_logo.gif" style="float: left; margin: 4px 4px 4px 10px;" border="0"-->
+        <DIV style="margin: 0px 10px 0px 10px" class="smalltext">Apache Lucene Mahout</DIV>
+        <DIV style="margin: 0px 10px 8px 10px" class="pagetitle">k-Means</DIV>
+
+        <DIV class="greynavbar" align="right" style="padding: 2px 10px; margin: 0px;">
+          <A href="http://cwiki.apache.org/confluence/pages/editpage.action?pageId=75159">
+            <IMG src="http://cwiki.apache.org/confluence/images/icons/notep_16.gif" height="16" width="16" border="0" align="absmiddle" title="Edit Page"></A>
+            <A href="http://cwiki.apache.org/confluence/pages/editpage.action?pageId=75159">Edit Page</A>
+          &nbsp;
+          <A href="http://cwiki.apache.org/confluence/pages/listpages.action?key=MAHOUT">
+            <IMG src="http://cwiki.apache.org/confluence/images/icons/browse_space.gif" height="16" width="16" border="0" align="absmiddle" title="Browse Space"></A>
+            <A href="http://cwiki.apache.org/confluence/pages/listpages.action?key=MAHOUT">Browse Space</A>
+          &nbsp;
+          <A href="http://cwiki.apache.org/confluence/pages/createpage.action?spaceKey=MAHOUT&fromPageId=75159">
+            <IMG src="http://cwiki.apache.org/confluence/images/icons/add_page_16.gif" height="16" width="16" border="0" align="absmiddle" title="Add Page"></A>
+          <A href="http://cwiki.apache.org/confluence/pages/createpage.action?spaceKey=MAHOUT&fromPageId=75159">Add Page</A>
+          &nbsp;
+          <A href="http://cwiki.apache.org/confluence/pages/createblogpost.action?spaceKey=MAHOUT&fromPageId=75159">
+            <IMG src="http://cwiki.apache.org/confluence/images/icons/add_blogentry_16.gif" height="16" width="16" border="0" align="absmiddle" title="Add News"></A>
+          <A href="http://cwiki.apache.org/confluence/pages/createblogpost.action?spaceKey=MAHOUT&fromPageId=75159">Add News</A>
+        </DIV>
+      </DIV>
+      <DIV class="pagesubheading" style="margin: 0px 10px 0px 10px;">
+        #editReport()
+      </DIV>
+
+      <DIV class="pagecontent">
+        <DIV class="wiki-content">
+          <H1><A name="k-Means-kMeans"></A>k-Means</H1>
+
+<P>k-Means is a rather simple but well-known algorithm for grouping objects (clustering). Again, all objects need to be represented as a set of numerical features. In addition, the user has to specify the number of groups (referred to as <EM>k</EM>) he wishes to identify.<BR>
+Each object can be thought of as being represented by some feature vector in an <EM>n</EM> dimensional space, <EM>n</EM> being the number of all features used to describe the objects to cluster. The algorithm then randomly chooses <EM>k</EM> points in that vector space; these points serve as the initial centers of the clusters. Afterwards all objects are each assigned to the center they are closest to. Usually the distance measure is chosen by the user and determined by the learning task.<BR>
+After that, for each cluster a new center is computed by averaging the feature vectors of all objects assigned to it. The process of assigning objects and recomputing centers is repeated until the process converges. The algorithm can be proven to converge after a finite number of iterations.<BR>
+Several tweaks concerning distance measure, initial center choice and computation of new average centers have been explored, as well as the estimation of the number of clusters <EM>k</EM>. Yet the main principle always remains the same.</P>
+
+
+
+<H2><A name="k-Means-Quickstart"></A>Quickstart</H2>
+
+<P><A href="k-means.data/quickstart-kmeans.sh">Here</A> is a short shell script outline that will get you started quickly with k-Means. This does the following:</P>
+
+<UL>
+	<LI>Get the Reuters dataset</LI>
+	<LI>Run org.apache.lucene.benchmark.utils.ExtractReuters to generate reuters-out from reuters-sgm (the downloaded archive)</LI>
+	<LI>Run seqdirectory to convert reuters-out to SequenceFile format</LI>
+	<LI>Run seq2sparse to convert SequenceFiles to sparse vector format</LI>
+	<LI>Finally, run kMeans with 20 clusters.</LI>
+</UL>
+
+
+<P>After following through the output that scrolls past, reading the code will offer you a better understanding.</P>
+
+
+<H2><A name="k-Means-Strategyforparallelization"></A>Strategy for parallelization</H2>
+
+<P>Some ideas can be found in <A href="http://code.google.com/edu/content/submissions/mapreduce-minilecture/listing.html" class="external-link" rel="nofollow">Cluster computing and MapReduce</A> lecture video series [by Google(r)]; k-Means clustering is discussed in <A href="http://www.youtube.com/watch?v=1ZDybXl212Q" class="external-link" rel="nofollow">lecture #4</A>. Slides can be found <A href="http://code.google.com/edu/content/submissions/mapreduce-minilecture/lec4-clustering.ppt" class="external-link" rel="nofollow">here</A>.</P>
+
+<P>Interestingly, Hadoop based implementation using <A href="http://en.wikipedia.org/wiki/Canopy_clustering_algorithm" class="external-link" rel="nofollow">Canopy-clustering</A> seems to be here: <A href="http://code.google.com/p/canopy-clustering/" class="external-link" rel="nofollow">http://code.google.com/p/canopy-clustering/</A> (GPL 3 licence)</P>
+
+<P>Here's another useful paper <A href="http://www2.chass.ncsu.edu/garson/PA765/cluster.htm" class="external-link" rel="nofollow">http://www2.chass.ncsu.edu/garson/PA765/cluster.htm</A>.</P>
+
+<H2><A name="k-Means-Designofimplementation"></A>Design of implementation</H2>
+
+<P>The implementation accepts two input directories: one for the data points and one for the initial clusters. The data directory contains multiple input files of SequenceFile(key, VectorWritable), while the clusters directory contains one or more SequenceFiles(Text, Cluster | Canopy) containing <EM>k</EM> initial clusters or canopies. None of the input directories are modified by the implementation, allowing experimentation with initial clustering and convergence values.</P>
+
+<P>The program iterates over the input points and clusters, outputting a new directory &quot;clusters-N&quot; containing SequenceFile(Text, Cluster) files for each iteration N. This process uses a mapper/combiner/reducer/driver as follows:</P>
+<UL>
+	<LI>KMeansMapper - reads the input clusters during its configure() method, then assigns and outputs each input point to its nearest cluster as defined by the user-supplied distance measure. Output key is: encoded cluster. Output value is: input point.</LI>
+	<LI>KMeansCombiner - receives all key:value pairs from the mapper and produces partial sums of the input vectors for each cluster. Output key is: encoded cluster. Output value is &quot;&lt;number of points in partial sum&gt;, &lt;partial sum vector summing all such points&gt;&quot;.</LI>
+	<LI>KMeansReducer - a single reducer receives all key:value pairs from all combiners and sums them to produce a new centroid for the cluster which is output. Output key is: encoded cluster identifier (e.g. &quot;C14&quot;). Output value is: formatted cluster (e.g. &quot;C14 - [c1, c2, ..., cn, ]&quot;). The reducer encodes unconverged clusters with a 'Cn' clusterId and converged clusters with a 'Vn' clusterId.</LI>
+	<LI>KMeansDriver - iterates over the points and clusters until all output clusters have converged (Vn clusterIds) or until a maximum number of iterations has been reached. During iterations, a new clusters directory &quot;clusters-N&quot; is produced with the output clusters from the previous iteration used for input to the next. A final optional pass over the data using the KMeansClusterMapper clusters all points to an output directory &quot;clusteredPoints&quot; and has no combiner or reducer steps.</LI>
+</UL>
+
+
+<P>Canopy clustering can be used to compute the initial clusters for k-Means:</P>
+<BLOCKQUOTE>
+<P>// run the CanopyDriver job<BR>
+CanopyDriver.runJob(&quot;testdata&quot;, &quot;output&quot;, ManhattanDistanceMeasure.class.getName(), (float) 3.1, (float) 2.1, false);</P>
+
+<P>// now run the KMeansDriver job<BR>
+KMeansDriver.runJob(&quot;testdata&quot;, &quot;output/clusters-0&quot;, &quot;output&quot;, EuclideanDistanceMeasure.class.getName(), &quot;0.001&quot;, &quot;10&quot;, true);</P></BLOCKQUOTE>
+
+<P>In the above example, the input data points are stored in 'testdata' and the CanopyDriver is configured to output to the 'output/clusters-0' directory. Once the driver executes it will contain the canopy definition files. Upon running the KMeansDriver the output directory will have two or more new directories: 'clusters-N' containing the clusters for each iteration and 'clusteredPoints' containing the clustered data points.</P>
+
+<P>This diagram shows the exemplary dataflow of the k-Means example implementation provided by Mahout:</P>
+
+<TABLE width="100%">
+    <TR><TD align="left"> 
+        <TABLE>     
+            <CAPTION align="bottom">
+                                 
+                                            
+                        <A href="http://cwiki.apache.org/confluence/spaces/gliffy/viewlargediagram.action?name=Example%20implementation%20of%20k-Means%20provided%20with%20Mahout&ceoid=75159&key=MAHOUT&pageId=75159" target="">Full Size</A>
+                        
+                                  
+                         |                     
+                        <A href="http://cwiki.apache.org/confluence/plugins/gliffy/showgliffyeditor.action?name=Example%20implementation%20of%20k-Means%20provided%20with%20Mahout&ceoid=75159&key=MAHOUT&lastPage=%2Fconfluence%2Fdisplay%2FMAHOUT%2Fk-Means&pageId=75159" target="">Edit Diagram</A>
+                        
+                          </CAPTION>
+
+            <TR><TD>
+                            <IMG style="border: none" usemap="#GLIFFY_MAP_75159_Example_implementation_of_k-Means_provided_with_Mahout" src="http://cwiki.apache.org/confluence/plugins/servlet/gliffyapi/clientdiagramjpeg?cb=979845575&pk=pub&name=Example%20implementation%20of%20k-Means%20provided%20with%20Mahout&ceoid=75159&key=MAHOUT&size=L&version=0" alt="A Gliffy Diagram named: Example implementation of k-Means provided with Mahout">
+                       </TD></TR>
+        </TABLE> 
+</TD></TR>
+</TABLE>
+<MAP name="GLIFFY_MAP_75159_Example_implementation_of_k-Means_provided_with_Mahout"></MAP>
+
+<P>This diagram doesn't consider CanopyClustering:</P>
+
+<TABLE width="100%">
+    <TR><TD align="left"> 
+        <TABLE>     
+            <CAPTION align="bottom">
+                                 
+                                            
+                        <A href="http://cwiki.apache.org/confluence/spaces/gliffy/viewlargediagram.action?name=k-Means%20Example&ceoid=75159&key=MAHOUT&pageId=75159" target="">Full Size</A>
+                        
+                                  
+                         |                     
+                        <A href="http://cwiki.apache.org/confluence/plugins/gliffy/showgliffyeditor.action?name=k-Means%20Example&ceoid=75159&key=MAHOUT&lastPage=%2Fconfluence%2Fdisplay%2FMAHOUT%2Fk-Means&pageId=75159" target="">Edit Diagram</A>
+                        
+                          </CAPTION>
+
+            <TR><TD>
+                            <IMG style="border: none" usemap="#GLIFFY_MAP_75159_k-Means_Example" src="http://cwiki.apache.org/confluence/plugins/servlet/gliffyapi/clientdiagramjpeg?cb=-938933507&pk=pub&name=k-Means%20Example&ceoid=75159&key=MAHOUT&size=L&version=0" alt="A Gliffy Diagram named: k-Means Example">
+                       </TD></TR>
+        </TABLE> 
+</TD></TR>
+</TABLE>
+<MAP name="GLIFFY_MAP_75159_k-Means_Example"></MAP>
+
+<H2><A name="k-Means-RunningkMeansClustering"></A>Running k-Means Clustering</H2>
+
+<P>The k-Means clustering algorithm may be run using a command-line invocation on KMeansDriver.main or by making a Java call to KMeansDriver.runJob(). Both require several arguments:</P>
+
+<OL>
+	<LI>input: a file path string to a directory containing the input data set, a SequenceFile(WritableComparable, VectorWritable). The sequence file <EM>key</EM> is not used.</LI>
+	<LI>clustersIn: a file path string to a directory containing the initial clusters, a SequenceFile(key, Cluster | Canopy). Both KMeans clusters and Canopy canopies may be used for the initial clusters.</LI>
+	<LI>output: a file path string to an empty directory which is used for all output from the algorithm.</LI>
+	<LI>measure: the fully-qualified class name of an instance of DistanceMeasure which will be used for the clustering.</LI>
+	<LI>convergence: a double value used to determine if the algorithm has converged (clusters have not moved more than the value in the last iteration)</LI>
+	<LI>max-iterations: the maximum number of iterations to run, independent of the convergence specified</LI>
+	<LI>num-reducers: the number of reducer tasks to be launched. Each reducer will process a subset of the clusters, in the limit, one per cluster.</LI>
+	<LI>runClustering: a boolean indicating, if true, that the clustering step is to be executed after clusters have been determined.</LI>
+</OL>
+
+
+<P>After running the algorithm, the output directory will contain:</P>
+<OL>
+	<LI>clusters-N: directories containing SequenceFiles(Text, Cluster) produced by the algorithm for each iteration. The Text <EM>key</EM> is a cluster identifier string.</LI>
+	<LI>clusteredPoints: (if runClustering enabled) a directory containing SequenceFile(IntWritable, WeightedVectorWritable). The IntWritable <EM>key</EM> is the clusterId. The WeightedVectorWritable <EM>value</EM> is a bean containing a double <EM>weight</EM> and a VectorWritable <EM>vector</EM> where the weight indicates the probability that the vector is a member of the cluster. For k-Means clustering, the weights are all 1.0 since the algorithm selects only a single, most likely cluster for each point.</LI>
+</OL>
+
+        </DIV>
+
+        
+      </DIV>
+    </DIV>
+    <DIV class="footer">
+      Generated by
+      <A href="http://www.atlassian.com/confluence/">Atlassian Confluence</A> (Version: 3.2 Build: 1810 Mar 16, 2010)
+      <A href="http://could.it/autoexport/">Auto Export Plugin</A> (Version: 1.0.0-dkulp)
+    </DIV>
+  </BODY>
+</HTML>
\ No newline at end of file

Added: mahout/site/new_website/MAHOUT/latent-dirichlet-allocation.html
URL: http://svn.apache.org/viewvc/mahout/site/new_website/MAHOUT/latent-dirichlet-allocation.html?rev=1243022&view=auto
==============================================================================
--- mahout/site/new_website/MAHOUT/latent-dirichlet-allocation.html (added)
+++ mahout/site/new_website/MAHOUT/latent-dirichlet-allocation.html Sat Feb 11 10:22:15 2012
@@ -0,0 +1,205 @@
+
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<HTML>
+  <HEAD>
+<LINK type="text/css" rel="stylesheet" href="https://cwiki.apache.org/confluence/display/MAHOUT/$stylebase/space.css">
+<LINK type="text/css" rel="stylesheet" href="https://cwiki.apache.org/confluence/display/MAHOUT/$stylebase/master.css">
+<LINK type="text/css" rel="stylesheet" href="https://cwiki.apache.org/confluence/display/MAHOUT/$stylebase/wiki-content.css">
+<LINK type="text/css" rel="stylesheet" href="https://cwiki.apache.org/confluence/display/MAHOUT/$stylebase/abs.css">
+<LINK type="text/css" rel="stylesheet" href="https://cwiki.apache.org/confluence/display/MAHOUT/$stylebase/menu.css">
+<LINK type="text/css" rel="stylesheet" href="https://cwiki.apache.org/confluence/display/MAHOUT/$stylebase/menu-ie.css">
+<LINK type="text/css" rel="stylesheet" href="https://cwiki.apache.org/confluence/display/MAHOUT/$stylebase/tables.css">
+<LINK type="text/css" rel="stylesheet" href="https://cwiki.apache.org/confluence/display/MAHOUT/$stylebase/panels.css">
+<LINK type="text/css" rel="stylesheet" href="https://cwiki.apache.org/confluence/display/MAHOUT/$stylebase/master-ie.css">
+<LINK type="text/css" rel="stylesheet" href="https://cwiki.apache.org/confluence/display/MAHOUT/$stylebase/renderer-macros.css">
+<LINK type="text/css" rel="stylesheet" href="https://cwiki.apache.org/confluence/display/MAHOUT/$stylebase/content-types.css">
+<LINK type="text/css" rel="stylesheet" href="https://cwiki.apache.org/confluence/display/MAHOUT/$stylebase/login.css">
+<LINK type="text/css" rel="stylesheet" href="https://cwiki.apache.org/confluence/display/MAHOUT/$stylebase/information-macros.css">
+<LINK type="text/css" rel="stylesheet" href="https://cwiki.apache.org/confluence/display/MAHOUT/$stylebase/layout-macros.css">
+<LINK type="text/css" rel="stylesheet" href="https://cwiki.apache.org/confluence/display/MAHOUT/$stylebase/default-theme.css">
+    <LINK type="text/css" rel="stylesheet" href="resources/space.css">
+    <STYLE type="text/css">
+      .footer {
+        background-image:      url('https://cwiki.apache.org/confluence/images/border/border_bottom.gif');
+        background-repeat:     repeat-x;
+        background-position:   left top;
+        padding-top:           4px;
+        color:                 #666;
+      }
+    </STYLE>
+    <SCRIPT type="text/javascript" language="javascript">
+      var hide = null;      // element with id 'hide'; assigned in init()
+      var show = null;      // element with id 'show'; assigned in init()
+      var children = null;  // element looked up as 'children'; assigned in init()
+
+      function init() {   // page setup; wired to the BODY onload attribute
+        /* Search form: copy the current hostname into the 'domains' and 'sitesearch' fields (presumably to scope the Google search to this site) */
+        var form = document.forms['search'];
+        if (form != null) {
+          form.elements['domains'].value = location.hostname;
+          form.elements['sitesearch'].value = location.hostname;
+        }
+
+        /* Children toggle: cache the 'hide', 'show' and 'children' elements in the script-level variables */
+        hide = document.getElementById('hide');
+        show = document.getElementById('show');
+        children = document.all != null ?   // use the legacy document.all collection when it is reported as present
+                   document.all['children'] :
+                   document.getElementById('children');
+        if (children != null) {   // initial state is collapsed: 'children' hidden, 'show' visible, 'hide' hidden
+          children.style.display = 'none';
+          show.style.display = 'inline';   // NOTE(review): show/hide are not null-checked here; assumes they exist whenever 'children' does
+          hide.style.display = 'none';
+        }
+      }
+
+      function showChildren() {   // expand: display 'children' as a block and swap the visible control from 'show' to 'hide'; no null checks, so assumes init() found all three elements
+        children.style.display = 'block';
+        show.style.display = 'none';
+        hide.style.display = 'inline';
+      }
+
+      function hideChildren() {   // collapse: hide 'children' and swap the visible control from 'hide' back to 'show'; no null checks, so assumes init() found all three elements
+        children.style.display = 'none';
+        show.style.display = 'inline';
+        hide.style.display = 'none';
+      }
+    </SCRIPT>
+    <TITLE>Latent Dirichlet Allocation</TITLE>
+  <META http-equiv="Content-Type" content="text/html;charset=UTF-8"></HEAD>
+  <BODY onload="init()">
+    <TABLE border="0" cellpadding="2" cellspacing="0" width="100%">
+      <TR class="topBar">
+        <TD align="left" valign="middle" class="topBarDiv" nowrap="">
+          &nbsp;<A href="mahout-wiki.html" title="Apache Mahout">Apache Mahout</A>&nbsp;&gt;&nbsp;<A href="mahout-wiki.html" title="Mahout Wiki">Mahout Wiki</A>&nbsp;&gt;&nbsp;<A href="algorithms.html" title="Algorithms">Algorithms</A>&nbsp;&gt;&nbsp;<A href="" title="Latent Dirichlet Allocation">Latent Dirichlet Allocation</A>
+        </TD>
+        <TD align="right" valign="middle" nowrap="">
+          <FORM name="search" action="http://www.google.com/search" method="get">
+            <INPUT type="hidden" name="ie" value="UTF-8">
+            <INPUT type="hidden" name="oe" value="UTF-8">
+            <INPUT type="hidden" name="domains" value="">
+            <INPUT type="hidden" name="sitesearch" value="">
+            <INPUT type="text" name="q" maxlength="255" value="">        
+            <INPUT type="submit" name="btnG" value="Google Search">
+          </FORM>
+        </TD>
+      </TR> 
+    </TABLE>
+
+    <DIV id="PageContent">
+      <DIV class="pageheader" style="padding: 6px 0px 0px 0px;">
+        <!-- We'll enable this once we figure out how to access (and save) the logo resource -->
+        <!--img src="/wiki/images/confluence_logo.gif" style="float: left; margin: 4px 4px 4px 10px;" border="0"-->
+        <DIV style="margin: 0px 10px 0px 10px" class="smalltext">Apache Mahout</DIV>
+        <DIV style="margin: 0px 10px 8px 10px" class="pagetitle">Latent Dirichlet Allocation</DIV>
+
+        <DIV class="greynavbar" align="right" style="padding: 2px 10px; margin: 0px;">
+          <A href="https://cwiki.apache.org/confluence/pages/editpage.action?pageId=121699">
+            <IMG src="https://cwiki.apache.org/confluence/images/icons/notep_16.gif" height="16" width="16" border="0" align="absmiddle" title="Edit Page"></A>
+            <A href="https://cwiki.apache.org/confluence/pages/editpage.action?pageId=121699">Edit Page</A>
+          &nbsp;
+          <A href="https://cwiki.apache.org/confluence/pages/listpages.action?key=MAHOUT">
+            <IMG src="https://cwiki.apache.org/confluence/images/icons/browse_space.gif" height="16" width="16" border="0" align="absmiddle" title="Browse Space"></A>
+            <A href="https://cwiki.apache.org/confluence/pages/listpages.action?key=MAHOUT">Browse Space</A>
+          &nbsp;
+          <A href="https://cwiki.apache.org/confluence/pages/createpage.action?spaceKey=MAHOUT&fromPageId=121699">
+            <IMG src="https://cwiki.apache.org/confluence/images/icons/add_page_16.gif" height="16" width="16" border="0" align="absmiddle" title="Add Page"></A>
+          <A href="https://cwiki.apache.org/confluence/pages/createpage.action?spaceKey=MAHOUT&fromPageId=121699">Add Page</A>
+          &nbsp;
+          <A href="https://cwiki.apache.org/confluence/pages/createblogpost.action?spaceKey=MAHOUT&fromPageId=121699">
+            <IMG src="https://cwiki.apache.org/confluence/images/icons/add_blogentry_16.gif" height="16" width="16" border="0" align="absmiddle" title="Add News"></A>
+          <A href="https://cwiki.apache.org/confluence/pages/createblogpost.action?spaceKey=MAHOUT&fromPageId=121699">Add News</A>
+        </DIV>
+      </DIV>
+
+      <DIV class="pagecontent">
+        <DIV class="wiki-content">
+          <H1><A name="LatentDirichletAllocation-Overview"></A>Overview</H1>
+
+<P>Latent Dirichlet Allocation (Blei et al, 2003) is a powerful learning algorithm for automatically and jointly clustering words into &quot;topics&quot; and documents into mixtures of topics. It has been successfully applied to model change in scientific fields over time (Griffiths and Steyvers, 2004; Hall, et al. 2008). </P>
+
+<P>A topic model is, roughly, a hierarchical Bayesian model that associates with each document a probability distribution over &quot;topics&quot;, which are in turn distributions over words. For instance, a topic in a collection of newswire might include words about &quot;sports&quot;, such as &quot;baseball&quot;, &quot;home run&quot;, &quot;player&quot;, and a document about steroid use in baseball might include &quot;sports&quot;, &quot;drugs&quot;, and &quot;politics&quot;. Note that the labels &quot;sports&quot;, &quot;drugs&quot;, and &quot;politics&quot;, are post-hoc labels assigned by a human, and that the algorithm itself only associates words with probabilities. The task of parameter estimation in these models is to learn both what the topics are, and which documents employ them in what proportions.</P>
+
+<P>Another way to view a topic model is as a generalization of a mixture model like <A href="dirichlet-process-clustering.html" title="Dirichlet Process Clustering">Dirichlet Process Clustering</A>. Starting from a normal mixture model, in which we have a single global mixture of several distributions, we instead say that <EM>each</EM> document has its own mixture distribution over the globally shared mixture components. Operationally in Dirichlet Process Clustering, each document has its own latent variable drawn from a global mixture that specifies which model it belongs to, while in LDA each word in each document has its own parameter drawn from a document-wide mixture.</P>
+
+<P>The idea is that we use a probabilistic mixture of a number of models that we use to explain some observed data. Each observed data point is assumed to have come from one of the models in the mixture, but we don't know which.  The way we deal with that is to use a so-called latent parameter which specifies which model each data point came from.</P>
+
+<H1><A name="LatentDirichletAllocation-InvocationandUsage"></A>Invocation and Usage</H1>
+
+<P>Mahout's implementation of LDA operates on a collection of SparseVectors of word counts. These word counts should be non-negative integers, though things will-- probably --work fine if you use non-negative reals. (Note that the probabilistic model doesn't make sense if you do!) To create these vectors, it's recommended that you follow the instructions in <A href="creating-vectors-from-text.html" title="Creating Vectors from Text">Creating Vectors from Text</A>, making sure to use TF and not TFIDF as the scorer.</P>
+
+<P>Invocation takes the form:</P>
+
+<DIV class="preformatted panel" style="border-width: 1px;"><DIV class="preformattedContent panelContent">
+<PRE>bin/mahout lda \
+    -i &lt;input vectors directory&gt; \
+    -o &lt;output working directory&gt; \
+    -k &lt;numTopics&gt; \
+    -v &lt;number of words&gt; \
+    -a &lt;optional topic smoothing. Default: 50/numTopics&gt; \
+    -x &lt;optional number of iterations. Default is -1 (until convergence)&gt;
+</PRE>
+</DIV></DIV>
+
+<P>Topic smoothing should generally be about 50/K, where K is the number of topics. The number of words in the vocabulary can be an upper bound, though it shouldn't be too high (for memory concerns). </P>
+
+<P>Choosing the number of topics is more art than science, and it's recommended that you try several values.</P>
+
+<P>After running LDA you can obtain an output of the computed topics using the LDAPrintTopics utility:</P>
+
+<DIV class="preformatted panel" style="border-width: 1px;"><DIV class="preformattedContent panelContent">
+<PRE>bin/mahout ldatopics \
+    -i &lt;input vectors directory&gt; \
+    -d &lt;input dictionary file&gt; \
+    -o &lt;optional output working directory. Default is to console&gt; \
+    -dt &lt;optional dictionary type (text|sequencefile). Default is text&gt;
+</PRE>
+</DIV></DIV>
+
+
+<H1><A name="LatentDirichletAllocation-Example"></A>Example</H1>
+
+<P>An example is located in mahout/examples/bin/build-reuters.sh. The script automatically downloads the Reuters-21578 corpus, builds a Lucene index and converts the Lucene index to vectors. By uncommenting the last two lines you can then cause it to run LDA on the vectors and finally print the resultant topics to the console. </P>
+
+<P>To adapt the example yourself, you should note that Lucene has specialized support for Reuters, and that building your own index will require some adaptation. The rest should hopefully not differ too much.</P>
+
+<H1><A name="LatentDirichletAllocation-ParameterEstimation"></A>Parameter Estimation</H1>
+
+<P>We use mean field variational inference to estimate the models. Variational inference can be thought of as a generalization of <A href="expectation-maximization.html" title="Expectation Maximization">EM</A> for hierarchical Bayesian models. The E-Step takes the form of, for each document, inferring the posterior probability of each topic for each word in each document. We then take the sufficient statistics and emit them in the form of (log) pseudo-counts for each word in each topic. The M-Step is simply to sum these together and (log) normalize them so that we have a distribution over the entire vocabulary of the corpus for each topic. </P>
+
+<P>In implementation, the E-Step is implemented in the Map, and the M-Step is executed in the reduce step, with the final normalization happening as a post-processing step.</P>
+
+<H1><A name="LatentDirichletAllocation-References"></A>References</H1>
+
+<P><A href="http://www.cs.princeton.edu/~blei/papers/BleiNgJordan2003.pdf" class="external-link" rel="nofollow">David M. Blei, Andrew Y. Ng, Michael I. Jordan, John Lafferty. 2003. Latent Dirichlet Allocation. JMLR.</A></P>
+
+<P><A href="http://psiexp.ss.uci.edu/research/papers/sciencetopics.pdf" class="external-link" rel="nofollow">Thomas L. Griffiths and Mark Steyvers. 2004. Finding scientific topics. PNAS.  </A></P>
+
+<P><A href="http://www.aclweb.org/anthology-new/D/D08/D08-1038.pdf" class="external-link" rel="nofollow">David Hall, Dan Jurafsky, and Christopher D. Manning. 2008. Studying the History of Ideas Using Topic Models </A></P>
+        </DIV>
+
+        
+      </DIV>
+    </DIV>
+    <DIV class="footer">
+      Generated by
+      <A href="http://www.atlassian.com/confluence/">Atlassian Confluence</A> (Version: 3.2 Build: 1810 Mar 16, 2010)
+      <A href="http://could.it/autoexport/">Auto Export Plugin</A> (Version: 1.0.0-dkulp)
+    </DIV>
+<SCRIPT type="text/javascript">
+
+  var _gaq = _gaq || [];   // reuse an already-defined command queue if there is one, otherwise start with an empty array
+  _gaq.push(['_setAccount', 'UA-17359171-1']);
+  _gaq.push(['_setDomainName', 'none']);
+  _gaq.push(['_setAllowLinker', true]);
+  _gaq.push(['_trackPageview']);
+
+  (function() {   // build a <script> element for ga.js and insert it before the first script element in the document
+    var ga = document.createElement('script'); ga.type = 'text/javascript'; ga.async = true;
+    ga.src = ('https:' == document.location.protocol ? 'https://ssl' : 'http://www') + '.google-analytics.com/ga.js';   // host prefix is chosen to match the page's protocol
+    var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(ga, s);
+  })();
+
+</SCRIPT>
+  </BODY>
+</HTML>
\ No newline at end of file



Mime
View raw message