mahout-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From rawkintr...@apache.org
Subject mahout git commit: MAHOUT-1976 Canopy Clustering closes apache/mahout#314
Date Sun, 21 May 2017 03:36:13 GMT
Repository: mahout
Updated Branches:
  refs/heads/master 9b4eabb9c -> c29496cb1


MAHOUT-1976 Canopy Clustering closes apache/mahout#314


Project: http://git-wip-us.apache.org/repos/asf/mahout/repo
Commit: http://git-wip-us.apache.org/repos/asf/mahout/commit/c29496cb
Tree: http://git-wip-us.apache.org/repos/asf/mahout/tree/c29496cb
Diff: http://git-wip-us.apache.org/repos/asf/mahout/diff/c29496cb

Branch: refs/heads/master
Commit: c29496cb11372baddbb76acdee51530347525645
Parents: 9b4eabb
Author: rawkintrevo <trevor.d.grant@gmail.com>
Authored: Sat May 20 22:36:02 2017 -0500
Committer: rawkintrevo <trevor.d.grant@gmail.com>
Committed: Sat May 20 22:36:02 2017 -0500

----------------------------------------------------------------------
 .../standard/ClusteringSuite.scala              |  26 +++
 .../math/algorithms/ClusteringSuite.scala       |  25 +++
 .../math/algorithms/clustering/Canopy.scala     | 158 +++++++++++++++++++
 .../algorithms/clustering/ClusteringModel.scala |  45 ++++++
 .../common/distance/DistanceMetrics.scala       |  48 ++++++
 .../math/algorithms/ClusteringSuiteBase.scala   |  48 ++++++
 .../math/algorithms/ClusteringSuite.scala       |  25 +++
 website/docs/_includes/algo_navbar.html         |  12 +-
 .../algorithms/clustering/canopy/Canopy.png     | Bin 0 -> 48061 bytes
 .../algorithms/clustering/canopy/Canopy10.png   | Bin 0 -> 42414 bytes
 .../algorithms/clustering/canopy/SampleData.png | Bin 0 -> 32587 bytes
 .../docs/algorithms/clustering/canopy/index.md  | 128 +++++++++++++++
 .../algorithms/clustering/distance-metrics.md   |  82 ++++++++++
 website/docs/algorithms/clustering/index.md     |   6 +
 .../map-reduce/clustering/canopy-clustering.md  |   3 +
 15 files changed, 604 insertions(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mahout/blob/c29496cb/flink/src/test/scala/org/apache/mahout/flinkbindings/standard/ClusteringSuite.scala
----------------------------------------------------------------------
diff --git a/flink/src/test/scala/org/apache/mahout/flinkbindings/standard/ClusteringSuite.scala
b/flink/src/test/scala/org/apache/mahout/flinkbindings/standard/ClusteringSuite.scala
new file mode 100644
index 0000000..ea86c91
--- /dev/null
+++ b/flink/src/test/scala/org/apache/mahout/flinkbindings/standard/ClusteringSuite.scala
@@ -0,0 +1,26 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.flinkbindings.standard
+
+import org.apache.mahout.flinkbindings.DistributedFlinkSuite
+import org.apache.mahout.math.algorithms.ClusteringSuiteBase
+import org.scalatest.FunSuite
+
+class ClusteringSuite extends FunSuite
+  with DistributedFlinkSuite with ClusteringSuiteBase
+

http://git-wip-us.apache.org/repos/asf/mahout/blob/c29496cb/h2o/src/test/scala/org/apache/mahout/math/algorithms/ClusteringSuite.scala
----------------------------------------------------------------------
diff --git a/h2o/src/test/scala/org/apache/mahout/math/algorithms/ClusteringSuite.scala b/h2o/src/test/scala/org/apache/mahout/math/algorithms/ClusteringSuite.scala
new file mode 100644
index 0000000..61d54ac
--- /dev/null
+++ b/h2o/src/test/scala/org/apache/mahout/math/algorithms/ClusteringSuite.scala
@@ -0,0 +1,25 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.math.algorithms
+
+import org.apache.mahout.h2obindings.test.DistributedH2OSuite
+import org.scalatest.FunSuite
+
+class ClusteringSuite extends FunSuite
+  with DistributedH2OSuite with ClusteringSuiteBase
+

http://git-wip-us.apache.org/repos/asf/mahout/blob/c29496cb/math-scala/src/main/scala/org/apache/mahout/math/algorithms/clustering/Canopy.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/algorithms/clustering/Canopy.scala
b/math-scala/src/main/scala/org/apache/mahout/math/algorithms/clustering/Canopy.scala
new file mode 100644
index 0000000..96d1968
--- /dev/null
+++ b/math-scala/src/main/scala/org/apache/mahout/math/algorithms/clustering/Canopy.scala
@@ -0,0 +1,158 @@
+/**
+  * Licensed to the Apache Software Foundation (ASF) under one
+  * or more contributor license agreements. See the NOTICE file
+  * distributed with this work for additional information
+  * regarding copyright ownership. The ASF licenses this file
+  * to you under the Apache License, Version 2.0 (the
+  * "License"); you may not use this file except in compliance
+  * with the License. You may obtain a copy of the License at
+  *
+  * http://www.apache.org/licenses/LICENSE-2.0
+  *
+  * Unless required by applicable law or agreed to in writing,
+  * software distributed under the License is distributed on an
+  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  * KIND, either express or implied. See the License for the
+  * specific language governing permissions and limitations
+  * under the License.
+  */
+
+package org.apache.mahout.math.algorithms.clustering
+
+
+
+import org.apache.mahout.math.algorithms.common.distance.{DistanceMetric, DistanceMetricSelector}
+import org.apache.mahout.math._
+import org.apache.mahout.math.drm._
+import org.apache.mahout.math.drm.RLikeDrmOps._
+import org.apache.mahout.math.function.VectorFunction
+import org.apache.mahout.math.scalabindings._
+import org.apache.mahout.math.scalabindings.RLikeOps._
+import org.apache.mahout.math.{Matrix, Vector}
+
+
+class CanopyClusteringModel(canopies: Matrix, dm: Symbol) extends ClusteringModel {
+
+  val canopyCenters = canopies
+  val distanceMetric = dm
+
+  def cluster[K](input: DrmLike[K]): DrmLike[K] = {
+
+    implicit val ctx = input.context
+    implicit val ktag =  input.keyClassTag
+
+    val bcCanopies = drmBroadcast(canopyCenters)
+    val bcDM = drmBroadcast(dvec(DistanceMetricSelector.namedMetricLookup(distanceMetric)))
+
+    input.mapBlock(1) {
+      case (keys, block: Matrix) => {
+        val outputMatrix = new DenseMatrix(block.nrow, 1)
+
+        val localCanopies: Matrix = bcCanopies.value
+        for (i <- 0 until block.nrow) {
+          val distanceMetric = DistanceMetricSelector.select(bcDM.value.get(0))
+
+          val cluster = (0 until localCanopies.nrow).foldLeft(-1, 9999999999999999.9)((l,
r) => {
+            val dist = distanceMetric.distance(localCanopies(r, ::), block(i, ::))
+            if ((dist) < l._2) {
+              (r, dist)
+            }
+            else {
+              l
+            }
+          })._1
+          outputMatrix(i, ::) = dvec(cluster)
+        }
+        keys -> outputMatrix
+      }
+    }
+  }
+}
+
+
+class CanopyClustering extends ClusteringFitter {
+
+  var t1: Double = _  // loose distance
+  var t2: Double = _  // tight distance
+  var t3: Double = _
+  var t4: Double = _
+  var distanceMeasure: Symbol = _
+
+  def setStandardHyperparameters(hyperparameters: Map[Symbol, Any] = Map('foo -> None)):
Unit = {
+    t1 = hyperparameters.asInstanceOf[Map[Symbol, Double]].getOrElse('t1, 0.5)
+    t2 = hyperparameters.asInstanceOf[Map[Symbol, Double]].getOrElse('t2, 0.1)
+    t3 = hyperparameters.asInstanceOf[Map[Symbol, Double]].getOrElse('t3, t1)
+    t4 = hyperparameters.asInstanceOf[Map[Symbol, Double]].getOrElse('t4, t2)
+
+    distanceMeasure = hyperparameters.asInstanceOf[Map[Symbol, Symbol]].getOrElse('distanceMeasure,
'Cosine)
+
+  }
+
+  def fit[K](input: DrmLike[K],
+             hyperparameters: (Symbol, Any)*): CanopyClusteringModel = {
+
+    setStandardHyperparameters(hyperparameters.toMap)
+    implicit val ctx = input.context
+    implicit val ktag =  input.keyClassTag
+
+    val dmNumber = DistanceMetricSelector.namedMetricLookup(distanceMeasure)
+
+    val distanceBC = drmBroadcast(dvec(t1,t2,t3,t4, dmNumber))
+    val canopies = input.allreduceBlock(
+      {
+
+        // Assign All Points to Clusters
+        case (keys, block: Matrix) => {
+          val t1_local = distanceBC.value.get(0)
+          val t2_local = distanceBC.value.get(1)
+          val dm = distanceBC.value.get(4)
+          CanopyFn.findCenters(block, DistanceMetricSelector.select(dm), t1_local, t2_local)
+        }
+      }, {
+        // Optionally Merge Clusters that are close enough
+        case (oldM: Matrix, newM: Matrix) => {
+          val t3_local = distanceBC.value.get(2)
+          val t4_local = distanceBC.value.get(3)
+          val dm = distanceBC.value.get(4)
+          CanopyFn.findCenters(oldM, DistanceMetricSelector.select(dm), t3_local, t4_local)
+        }
+      })
+
+    val model = new CanopyClusteringModel(canopies, distanceMeasure)
+    model.summary = s"""CanopyClusteringModel\n${canopies.nrow} Clusters\n${distanceMeasure}
distance metric used for calculating distances\nCanopy centers stored in model.canopies where
row n coresponds to canopy n"""
+    model
+  }
+
+
+}
+
+object CanopyFn extends Serializable {
+  def findCenters(block: Matrix, distanceMeasure: DistanceMetric, t1: Double, t2: Double):
Matrix = {
+    val block = dense((1.0, 1.2, 1.3, 1.4), (1.1, 1.5, 2.5, 1.0), (6.0, 5.2, -5.2, 5.3),
(7.0,6.0, 5.0, 5.0), (10.0, 1.0, 20.0, -10.0))
+    var rowAssignedToCanopy = Array.fill(block.nrow) { false }
+    val clusterBuf = scala.collection.mutable.ListBuffer.empty[org.apache.mahout.math.Vector]
+    while (rowAssignedToCanopy.contains(false)) {
+      val rowIndexOfNextUncanopiedVector = rowAssignedToCanopy.indexOf(false)
+      clusterBuf += block(rowIndexOfNextUncanopiedVector, ::).cloned
+      block(rowIndexOfNextUncanopiedVector, ::) = svec(Nil, cardinality = block.ncol)
+      rowAssignedToCanopy(rowIndexOfNextUncanopiedVector) = true
+      for (i <- 0 until block.nrow) {
+        if (block(i, ::).getNumNonZeroElements > 0) { //
+          distanceMeasure.distance(block(i, ::), clusterBuf.last) match {
+            case d if d < t2 => {
+
+              rowAssignedToCanopy(i) = true
+              block(i, ::) = svec(Nil, cardinality = block.ncol)
+            }
+            case d if d < t1 => {
+
+              rowAssignedToCanopy(i) = true
+            }
+            case d => {}
+          }
+        }
+      }
+    }
+    dense(clusterBuf)
+  }
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/mahout/blob/c29496cb/math-scala/src/main/scala/org/apache/mahout/math/algorithms/clustering/ClusteringModel.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/algorithms/clustering/ClusteringModel.scala
b/math-scala/src/main/scala/org/apache/mahout/math/algorithms/clustering/ClusteringModel.scala
new file mode 100644
index 0000000..8ab1170
--- /dev/null
+++ b/math-scala/src/main/scala/org/apache/mahout/math/algorithms/clustering/ClusteringModel.scala
@@ -0,0 +1,45 @@
+/**
+  * Licensed to the Apache Software Foundation (ASF) under one
+  * or more contributor license agreements. See the NOTICE file
+  * distributed with this work for additional information
+  * regarding copyright ownership. The ASF licenses this file
+  * to you under the Apache License, Version 2.0 (the
+  * "License"); you may not use this file except in compliance
+  * with the License. You may obtain a copy of the License at
+  *
+  * http://www.apache.org/licenses/LICENSE-2.0
+  *
+  * Unless required by applicable law or agreed to in writing,
+  * software distributed under the License is distributed on an
+  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  * KIND, either express or implied. See the License for the
+  * specific language governing permissions and limitations
+  * under the License.
+  */
+
+package org.apache.mahout.math.algorithms.clustering
+
+import org.apache.mahout.math.algorithms.{UnsupervisedFitter, UnsupervisedModel}
+import org.apache.mahout.math.drm.DrmLike
+
+trait ClusteringModel  extends UnsupervisedModel {
+
+  def cluster[K](input: DrmLike[K]): DrmLike[K]
+
+}
+
+trait ClusteringFitter extends UnsupervisedFitter {
+
+  def fit[K](input: DrmLike[K],
+             hyperparameters: (Symbol, Any)*): ClusteringModel
+
+  def fitCluster[K](input: DrmLike[K],
+                      hyperparameters: (Symbol, Any)*): DrmLike[K] = {
+    model = this.fit(input, hyperparameters:_*)
+    model.cluster(input)
+
+  }
+
+  // used to store the model if `fitTransform` method called
+  var model: ClusteringModel = _
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/mahout/blob/c29496cb/math-scala/src/main/scala/org/apache/mahout/math/algorithms/common/distance/DistanceMetrics.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/algorithms/common/distance/DistanceMetrics.scala
b/math-scala/src/main/scala/org/apache/mahout/math/algorithms/common/distance/DistanceMetrics.scala
new file mode 100644
index 0000000..00495fd
--- /dev/null
+++ b/math-scala/src/main/scala/org/apache/mahout/math/algorithms/common/distance/DistanceMetrics.scala
@@ -0,0 +1,48 @@
+/**
+  * Licensed to the Apache Software Foundation (ASF) under one or more
+  * contributor license agreements.  See the NOTICE file distributed with
+  * this work for additional information regarding copyright ownership.
+  * The ASF licenses this file to You under the Apache License, Version 2.0
+  * (the "License"); you may not use this file except in compliance with
+  * the License.  You may obtain a copy of the License at
+  *
+  * http://www.apache.org/licenses/LICENSE-2.0
+  *
+  * Unless required by applicable law or agreed to in writing, software
+  * distributed under the License is distributed on an "AS IS" BASIS,
+  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  * See the License for the specific language governing permissions and
+  * limitations under the License.
+  */
+package org.apache.mahout.math.algorithms.common.distance
+
+import org.apache.mahout.math.function.Functions
+import org.apache.mahout.math.{CardinalityException, Vector}
+
+trait DistanceMetric extends Serializable {
+  def distance(v1: Vector, v2: Vector): Double
+}
+
+
+object DistanceMetricSelector extends Serializable{
+
+  val namedMetricLookup = Map('Chebyshev -> 1.0, 'Cosine -> 2.0)
+
+  def select(dm: Double): DistanceMetric = {
+    dm match {
+      case 1.0 => Chebyshev
+      case 2.0 => Cosine
+    }
+  }
+}
+
+object Chebyshev extends DistanceMetric {
+  def distance(v1: Vector, v2: Vector): Double =  {
+    if (v1.size != v2.size) throw new CardinalityException(v1.size, v2.size)
+    v1.aggregate(v2, Functions.MAX_ABS, Functions.MINUS)
+  }
+}
+
+object Cosine extends DistanceMetric {
+  def distance(v1: Vector, v2: Vector): Double = 1.0 - v1.dot(v2) / (Math.sqrt(v1.getLengthSquared)
* Math.sqrt(v2.getLengthSquared))
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/c29496cb/math-scala/src/test/scala/org/apache/mahout/math/algorithms/ClusteringSuiteBase.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/test/scala/org/apache/mahout/math/algorithms/ClusteringSuiteBase.scala
b/math-scala/src/test/scala/org/apache/mahout/math/algorithms/ClusteringSuiteBase.scala
new file mode 100644
index 0000000..70fb10f
--- /dev/null
+++ b/math-scala/src/test/scala/org/apache/mahout/math/algorithms/ClusteringSuiteBase.scala
@@ -0,0 +1,48 @@
+/**
+  * Licensed to the Apache Software Foundation (ASF) under one
+  * or more contributor license agreements. See the NOTICE file
+  * distributed with this work for additional information
+  * regarding copyright ownership. The ASF licenses this file
+  * to you under the Apache License, Version 2.0 (the
+  * "License"); you may not use this file except in compliance
+  * with the License. You may obtain a copy of the License at
+  *
+  * http://www.apache.org/licenses/LICENSE-2.0
+  *
+  * Unless required by applicable law or agreed to in writing,
+  * software distributed under the License is distributed on an
+  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  * KIND, either express or implied. See the License for the
+  * specific language governing permissions and limitations
+  * under the License.
+  */
+
+package org.apache.mahout.math.algorithms
+
+import org.apache.mahout.math.algorithms.preprocessing._
+import org.apache.mahout.math.drm.drmParallelize
+import org.apache.mahout.math.scalabindings.{dense, sparse, svec}
+import org.apache.mahout.math.scalabindings.RLikeOps._
+import org.apache.mahout.test.DistributedMahoutSuite
+import org.scalatest.{FunSuite, Matchers}
+
+import org.apache.mahout.test.DistributedMahoutSuite
+
+trait ClusteringSuiteBase extends DistributedMahoutSuite with Matchers {
+
+  this: FunSuite =>
+
+  test("canopy test") {
+    val drmA = drmParallelize(dense((1.0, 1.2, 1.3, 1.4), (1.1, 1.5, 2.5, 1.0), (6.0, 5.2,
-5.2, 5.3), (7.0,6.0, 5.0, 5.0), (10.0, 1.0, 20.0, -10.0)))
+
+    import org.apache.mahout.math.algorithms.clustering.CanopyClustering
+
+    val model = new CanopyClustering().fit(drmA, 't1 -> 6.5, 't2 -> 5.5, 'distanceMeasure
-> 'Chebyshev)
+    val myAnswer = model.cluster(drmA).collect
+
+    val correctAnswer = dense((0.0), (0.0), (1.0), (0.0), (2.0))
+
+    val epsilon = 1E-6
+    (myAnswer.norm - correctAnswer.norm) should be <= epsilon
+  }
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/mahout/blob/c29496cb/spark/src/test/scala/org/apache/mahout/math/algorithms/ClusteringSuite.scala
----------------------------------------------------------------------
diff --git a/spark/src/test/scala/org/apache/mahout/math/algorithms/ClusteringSuite.scala
b/spark/src/test/scala/org/apache/mahout/math/algorithms/ClusteringSuite.scala
new file mode 100644
index 0000000..3e5a06e
--- /dev/null
+++ b/spark/src/test/scala/org/apache/mahout/math/algorithms/ClusteringSuite.scala
@@ -0,0 +1,25 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.math.algorithms
+
+import org.apache.mahout.sparkbindings.test.DistributedSparkSuite
+import org.scalatest.FunSuite
+
+class ClusteringSuite extends FunSuite
+  with DistributedSparkSuite with ClusteringSuiteBase
+

http://git-wip-us.apache.org/repos/asf/mahout/blob/c29496cb/website/docs/_includes/algo_navbar.html
----------------------------------------------------------------------
diff --git a/website/docs/_includes/algo_navbar.html b/website/docs/_includes/algo_navbar.html
index 40a2861..38f833a 100644
--- a/website/docs/_includes/algo_navbar.html
+++ b/website/docs/_includes/algo_navbar.html
@@ -9,6 +9,14 @@
                 <li> <a href="{{ BASE_PATH }}/algorithms/linear-algebra/d-ssvd.html">Distributed
Stochastic Singular Value Decomposition</a></li>
             </ul>
         </div>
+        <a href="#clustering" class="list-group-item list-group-item-success" data-toggle="collapse"
data-parent="#AlgoMenu"><b>Clustering</b><i class="fa fa-caret-down"></i></a>
+        <div class="collapse" id="clustering">
+            <ul class="nav sidebar-nav">
+                <li> <a href="{{ BASE_PATH }}/algorithms/clustering">Clustering
Algorithms</a></li>
+                <li> <a href="{{ BASE_PATH }}/algorithms/clustering/distance-metrics.html">Distance
Metrics</a></li>
+                <li> <a href="{{ BASE_PATH }}/algorithms/clustering/canopy">Canopy
Clustering</a></li>
+            </ul>
+        </div>
         <a href="#preprocessors" class="list-group-item list-group-item-success" data-toggle="collapse"
data-parent="#AlgoMenu"><b>Preprocessors</b><i class="fa fa-caret-down"></i></a>
         <div class="collapse" id="preprocessors">
             <ul class="nav sidebar-nav">
@@ -64,8 +72,8 @@
                 <li> <a href="{{ BASE_PATH }}/algorithms/map-reduce/classification/support-vector-machines.html">Support
Vector Machines</a></li>
             </ul>
         </div>
-        <a href="#clustering" class="list-group-item list-group-item-success" data-toggle="collapse"
data-parent="#AlgoMenu"><b>Clustering</b><i class="fa fa-caret-down"></i></a>
-        <div class="collapse" id="clustering">
+        <a href="#mr-clustering" class="list-group-item list-group-item-success" data-toggle="collapse"
data-parent="#AlgoMenu"><b>Clustering</b><i class="fa fa-caret-down"></i></a>
+        <div class="collapse" id="mr-clustering">
             <ul class="nav sidebar-nav">
                 <li> <a href="{{ BASE_PATH }}/algorithms/map-reduce/clustering/canopy-clustering.html">Canopy
Clustering</a></li>
                 <li> <a href="{{ BASE_PATH }}/algorithms/map-reduce/clustering/cluster-dumper.html">Cluster
Dumper</a></li>

http://git-wip-us.apache.org/repos/asf/mahout/blob/c29496cb/website/docs/algorithms/clustering/canopy/Canopy.png
----------------------------------------------------------------------
diff --git a/website/docs/algorithms/clustering/canopy/Canopy.png b/website/docs/algorithms/clustering/canopy/Canopy.png
new file mode 100644
index 0000000..934efd7
Binary files /dev/null and b/website/docs/algorithms/clustering/canopy/Canopy.png differ

http://git-wip-us.apache.org/repos/asf/mahout/blob/c29496cb/website/docs/algorithms/clustering/canopy/Canopy10.png
----------------------------------------------------------------------
diff --git a/website/docs/algorithms/clustering/canopy/Canopy10.png b/website/docs/algorithms/clustering/canopy/Canopy10.png
new file mode 100644
index 0000000..4bb291c
Binary files /dev/null and b/website/docs/algorithms/clustering/canopy/Canopy10.png differ

http://git-wip-us.apache.org/repos/asf/mahout/blob/c29496cb/website/docs/algorithms/clustering/canopy/SampleData.png
----------------------------------------------------------------------
diff --git a/website/docs/algorithms/clustering/canopy/SampleData.png b/website/docs/algorithms/clustering/canopy/SampleData.png
new file mode 100644
index 0000000..48c4690
Binary files /dev/null and b/website/docs/algorithms/clustering/canopy/SampleData.png differ

http://git-wip-us.apache.org/repos/asf/mahout/blob/c29496cb/website/docs/algorithms/clustering/canopy/index.md
----------------------------------------------------------------------
diff --git a/website/docs/algorithms/clustering/canopy/index.md b/website/docs/algorithms/clustering/canopy/index.md
new file mode 100644
index 0000000..923c63a
--- /dev/null
+++ b/website/docs/algorithms/clustering/canopy/index.md
@@ -0,0 +1,128 @@
+---
+layout: algorithm
+title: Canopy Clustering
+theme:
+   name: retro-mahout
+---
+
+### About
+
+[Canopy Clustering](http://www.kamalnigam.com/papers/canopy-kdd00.pdf)
+ is a very simple, fast and surprisingly accurate method for grouping
+objects into clusters. All objects are represented as a point in a
+multidimensional feature space. The algorithm uses a fast approximate
+distance metric and two distance thresholds T1 > T2 for processing. The
+basic algorithm is to begin with a set of points and remove one at random.
+Create a Canopy containing this point and iterate through the remainder of
+the point set. At each point, if its distance from the first point is < T1,
+then add the point to the cluster. If, in addition, the distance is < T2,
+then remove the point from the set. This way points that are very close to
+the original will avoid all further processing. The algorithm loops until
+the initial set is empty, accumulating a set of Canopies, each containing
+one or more points. A given point may occur in more than one Canopy.
+
+Canopy Clustering is often used as an initial step in more rigorous
+clustering techniques, such as [K-Means Clustering](k-means-clustering.html)
+. By starting with an initial clustering the number of more expensive
+distance measurements can be significantly reduced by ignoring points
+outside of the initial canopies.
+
+#### Strategy for parallelization
+
+Looking at the sample Hadoop implementation in [http://code.google.com/p/canopy-clustering/](http://code.google.com/p/canopy-clustering/)
+ the processing is done in 3 steps:
+1. The data is massaged into suitable input format
+1. Each mapper performs canopy clustering on the points in its input set and
+outputs its canopies' centers
+1. The reducer clusters the canopy centers to produce the final canopy
+centers
+
+The points are then clustered into these final canopies when the `model.cluster(inputDRM)`
is called.
+
+Some ideas can be found in [Cluster computing and MapReduce](https://www.youtube.com/watch?v=yjPBkvYh-ss&list=PLEFAB97242917704A)
+ lecture video series \[by Google(r)\]; Canopy Clustering is discussed in [lecture #4](https://www.youtube.com/watch?v=1ZDybXl212Q)
+. Finally here is the [Wikipedia page](http://en.wikipedia.org/wiki/Canopy_clustering_algorithm)
+.
+
+#### Illustrations
+
+The following images illustrate Canopy clustering applied to a set of
+randomly-generated 2-d data points. The points are generated using a normal
+distribution centered at a mean location and with a constant standard
+deviation. See the README file in the [/examples/src/main/java/org/apache/mahout/clustering/display/README.txt](https://github.com/apache/mahout/blob/master/examples/src/main/java/org/apache/mahout/clustering/display/README.txt)
+ for details on running similar examples.
+
+The points are generated as follows:
+
+* 500 samples m=\[1.0, 1.0\](1.0,-1.0\.html)
+ sd=3.0
+* 300 samples m=\[1.0, 0.0\](1.0,-0.0\.html)
+ sd=0.5
+* 300 samples m=\[0.0, 2.0\](0.0,-2.0\.html)
+ sd=0.1
+
+In the first image, the points are plotted and the 3-sigma boundaries of
+their generator are superimposed. 
+
+![sample data](SampleData.png)
+
+In the second image, the resulting canopies are shown superimposed upon the
+sample data. Each canopy is represented by two circles, with radius T1 and
+radius T2.
+
+![canopy](Canopy.png)
+
+The third image uses the same values of T1 and T2 but only superimposes
+canopies covering more than 10% of the population. This is a bit better
+representation of the data but it still has lots of room for improvement.
+The advantage of Canopy clustering is that it is single-pass and fast
+enough to iterate runs using different T1, T2 parameters and display
+thresholds.
+
+![canopy](Canopy10.png)
+
+### Parameters
+
+<div class="table-striped">
+  <table class="table">
+    <tr>
+        <th>Parameter</th>
+        <th>Description</th>
+        <th>Default Value</th>
+    </tr>
+    <tr>
+        <td><code>'distanceMeasure</code></td>
+        <td>The metric used for calculating distance, see <a href="../distance-metrics.html">Distance
Metrics</a></td>
+        <td><code>'Cosine</code></td>
+    </tr>
+    <tr>
+        <td><code>'t1</code></td>
+        <td>The "loose" distance in the mapping phase</code></td>
+        <td>0.5</td>
+    </tr>
+    <tr>
+        <td><code>'t2</code></td>
+        <td>The "tight" distance in the mapping phase</code></td>
+        <td>0.1</td>
+    </tr>
+    <tr>
+        <td><code>'t3</code></td>
+        <td>The "loose" distance in the reducing phase</code></td>
+        <td><code>'t1</code></td>
+    </tr>
+    <tr>
+        <td><code>'t4</code></td>
+        <td>The "tight" distance in the reducing phase</code></td>
+        <td><code>'t2</code></td>
+    </tr>
+  </table>
+</div>
+
+### Example
+
+    val drmA = drmParallelize(dense((1.0, 1.2, 1.3, 1.4), (1.1, 1.5, 2.5, 1.0), (6.0, 5.2,
-5.2, 5.3), (7.0,6.0, 5.0, 5.0), (10.0, 1.0, 20.0, -10.0)))
+    
+    import org.apache.mahout.math.algorithms.clustering.CanopyClustering
+    
+    val model = new CanopyClustering().fit(drmA, 't1 -> 6.5, 't2 -> 5.5, 'distanceMeasure
-> 'Chebyshev)
+    model.cluster(drmA).collect

http://git-wip-us.apache.org/repos/asf/mahout/blob/c29496cb/website/docs/algorithms/clustering/distance-metrics.md
----------------------------------------------------------------------
diff --git a/website/docs/algorithms/clustering/distance-metrics.md b/website/docs/algorithms/clustering/distance-metrics.md
new file mode 100644
index 0000000..5b25a93
--- /dev/null
+++ b/website/docs/algorithms/clustering/distance-metrics.md
@@ -0,0 +1,82 @@
+---
+layout: algorithm
+title: Distance Metrics
+theme:
+   name: retro-mahout
+---
+
+### Distance Metrics Supported By Mahout
+
+<div class="table-striped">
+  <table class="table">
+    <tr>
+        <th>Name</th>
+        <th>Object</th>
+        <th>Symbol</th>
+    </tr>
+    <tr>
+        <td><a href="https://en.wikipedia.org/wiki/Chebyshev_distance">Chebyshev
Distance</a></td>
+        <td><code>org.apache.mahout.math.algorithms.common.distance.Chebyshev</code></td>
+        <td><code>'Chebyshev</code></td>
+    </tr>
+    <tr>
+        <td><a href="https://en.wikipedia.org/wiki/Cosine_similarity">Cosine
Similarity</a></td>
+        <td><code>org.apache.mahout.math.algorithms.common.distance.Cosine</code></td>
+        <td><code>'Cosine</code></td>
+    </tr>
+    </table>
+</div>
+
+
+<!--
+A beginner JIRA to port the rest of these
+[Euclidean](https://en.wikipedia.org/wiki/Euclidean_distance)
+
+[Mahalanobis](https://en.wikipedia.org/wiki/Mahalanobis_distance)
+
+[Manhattan](https://en.wiktionary.org/wiki/Manhattan_distance)
+
+[Minkowski](https://en.wikipedia.org/wiki/Minkowski_distance)
+
+[Squared Euclidian](https://en.wikipedia.org/wiki/Euclidean_distance#Squared_Euclidean_distance)
+
+[Tanimoto](https://en.wikipedia.org/wiki/Jaccard_index#Tanimoto_similarity_and_distance)
+
+Weighted Euclidean
+
+Weighted Manhattan-->
+
+### Using Distance Metrics
+
+In Mahout one can access the distant metrics directly to measure the distance between two
arbitrary vectors, or 
+can specify which distance metric to use as part of an algorithm. In the latter case the
distance metric is called 
+by `Symbol`, we never pass Distance metrics directly to an algorithm.  This design choice,
in part has to do with 
+serialization of object and keeping the engine bindings as simple as possible.  Behind the
scenes, the only thing 
+that is serialized and sent to the workers is a number which specifies what distant metric
to use- this is much more
+abstract and easier to maintain on the back end than making sure each function can be serialized
by any arbitrary engine.
+We feel from the user perspective, it may seem quirky but causes no decrease in usability.
 If a user wishes to use a 
+custom distance metric- simply add it to [math-scala/src/main/org/apache/mahout/math/common/DistanceMetrics.scala](https://github.com/apache/mahout/blob/master/math-scala/src/main/scala/org/apache/mahout/math/algorithms/common/DistanceMetrics.scala)
+and recompile. 
+
+### Examples
+
+**Meausring the distance between two vectors**
+
+    import org.apache.mahout.math.algorithms.common.distance._
+    
+    val v1 = dvec(1.0, 1.5, -1.2, 3.5)
+    val v2 = dvec(0.1, -1.4, 10.5, 3.2)
+    
+    Cosine.distance(v1, v2)
+
+**Using distance in clustering**
+
+    import org.apache.mahout.math.algorithms.clustering.CanopyClustering
+
+    val drmA = drmParallelize(dense((1.0, 1.2, 1.3, 1.4), 
+                                    (1.1, 1.5, 2.5, 1.0), 
+                                    (6.0, 5.2, -5.2, 5.3), 
+                                    (7.0,6.0, 5.0, 5.0), 
+                                    (10.0, 1.0, 20.0, -10.0)))
+                                    
+    val model = new CanopyClustering().fit(drmA, 'distanceMeasure -> 'Cosine)
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/mahout/blob/c29496cb/website/docs/algorithms/clustering/index.md
----------------------------------------------------------------------
diff --git a/website/docs/algorithms/clustering/index.md b/website/docs/algorithms/clustering/index.md
new file mode 100644
index 0000000..50030c2
--- /dev/null
+++ b/website/docs/algorithms/clustering/index.md
@@ -0,0 +1,6 @@
+---
+layout: algorithm
+title: Clustering Algorithms
+theme:
+   name: retro-mahout
+---

http://git-wip-us.apache.org/repos/asf/mahout/blob/c29496cb/website/docs/algorithms/map-reduce/clustering/canopy-clustering.md
----------------------------------------------------------------------
diff --git a/website/docs/algorithms/map-reduce/clustering/canopy-clustering.md b/website/docs/algorithms/map-reduce/clustering/canopy-clustering.md
index d538d88..ac8a47c 100644
--- a/website/docs/algorithms/map-reduce/clustering/canopy-clustering.md
+++ b/website/docs/algorithms/map-reduce/clustering/canopy-clustering.md
@@ -5,6 +5,9 @@ theme:
    name: retro-mahout
 ---
 
+
+**_This Document refers to the deprecated Map-Reduce Version of this algorithm, please see
[the new implementation]({{ BASE_PATH }}/algorithms/clustering/canopy.html)_**
+
 <a name="CanopyClustering-CanopyClustering"></a>
 # Canopy Clustering
 


Mime
View raw message