spark-commits mailing list archives

From m...@apache.org
Subject spark git commit: [SPARK-7585] [ML] [DOC] VectorIndexer user guide section
Date Thu, 21 May 2015 20:06:02 GMT
Repository: spark
Updated Branches:
  refs/heads/master 15680aeed -> 6d75ed7e5


[SPARK-7585] [ML] [DOC] VectorIndexer user guide section

Added VectorIndexer section to ML user guide.  Also added javaCategoryMaps() method and Java
unit test for it.

CC: mengxr

Author: Joseph K. Bradley <joseph@databricks.com>

Closes #6255 from jkbradley/vector-indexer-guide and squashes the following commits:

dbb8c4c [Joseph K. Bradley] simplified VectorIndexerModel.javaCategoryMaps
f692084 [Joseph K. Bradley] Added VectorIndexer section to ML user guide.  Also added javaCategoryMaps()
method and Java unit test for it.
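
For Java users, the new javaCategoryMaps() accessor exposes the Scala categoryMaps as a java.util.Map, keyed by feature index and mapping original feature values to 0-based category indices. A minimal sketch of iterating it, assuming a fitted VectorIndexerModel named `model` (obtained as in the guide example and unit test in the diff below); the loop and printed summary are illustrative only:

    import java.util.Map;

    // Assumes `model` is a fitted org.apache.spark.ml.feature.VectorIndexerModel,
    // built as in the guide example added below.
    // javaCategoryMaps() returns: feature index -> (original feature value -> category index).
    Map<Integer, Map<Double, Integer>> categoryMaps = model.javaCategoryMaps();
    for (Map.Entry<Integer, Map<Double, Integer>> entry : categoryMaps.entrySet()) {
      System.out.println("Feature " + entry.getKey() + " has "
          + entry.getValue().size() + " categories");
    }

As the Scala diff below shows, the accessor converts the nested Scala maps with JavaConverters and casts the result, since the Scala Int and Double keys and values box to the java.lang.Integer and java.lang.Double types expected on the Java side.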


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/6d75ed7e
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/6d75ed7e
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/6d75ed7e

Branch: refs/heads/master
Commit: 6d75ed7e5ccf6c58143de4608115f9a2b3ff6cf4
Parents: 15680ae
Author: Joseph K. Bradley <joseph@databricks.com>
Authored: Thu May 21 13:05:48 2015 -0700
Committer: Xiangrui Meng <meng@databricks.com>
Committed: Thu May 21 13:05:48 2015 -0700

----------------------------------------------------------------------
 docs/ml-features.md                             | 83 ++++++++++++++++++++
 .../apache/spark/ml/feature/VectorIndexer.scala | 10 +++
 .../ml/feature/JavaVectorIndexerSuite.java      |  4 +-
 3 files changed, 96 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/6d75ed7e/docs/ml-features.md
----------------------------------------------------------------------
diff --git a/docs/ml-features.md b/docs/ml-features.md
index 235029d..06f1ac1 100644
--- a/docs/ml-features.md
+++ b/docs/ml-features.md
@@ -535,5 +535,88 @@ encoded = encoder.transform(indexed)
 </div>
 </div>
 
+## VectorIndexer
+
+`VectorIndexer` helps index categorical features in datasets of `Vector`s.
+It can both automatically decide which features are categorical and convert original values to category indices.  Specifically, it does the following:
+
+1. Take an input column of type [Vector](api/scala/index.html#org.apache.spark.mllib.linalg.Vector) and a parameter `maxCategories`.
+2. Decide which features should be categorical based on the number of distinct values, where features with at most `maxCategories` distinct values are declared categorical.
+3. Compute 0-based category indices for each categorical feature.
+4. Index categorical features and transform original feature values to indices.
+
+Indexing categorical features allows algorithms such as Decision Trees and Tree Ensembles to treat categorical features appropriately, improving performance.
+
+Please refer to the [VectorIndexer API docs](api/scala/index.html#org.apache.spark.ml.feature.VectorIndexer) for more details.
+
+In the example below, we read in a dataset of labeled points and then use `VectorIndexer` to decide which features should be treated as categorical.  We transform the categorical feature values to their indices.  This transformed data could then be passed to algorithms such as `DecisionTreeRegressor` that handle categorical features.
+
+<div class="codetabs">
+<div data-lang="scala" markdown="1">
+{% highlight scala %}
+import org.apache.spark.ml.feature.VectorIndexer
+import org.apache.spark.mllib.util.MLUtils
+
+val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF()
+val indexer = new VectorIndexer()
+  .setInputCol("features")
+  .setOutputCol("indexed")
+  .setMaxCategories(10)
+val indexerModel = indexer.fit(data)
+val categoricalFeatures: Set[Int] = indexerModel.categoryMaps.keys.toSet
+println(s"Chose ${categoricalFeatures.size} categorical features: " +
+  categoricalFeatures.mkString(", "))
+
+// Create new column "indexed" with categorical values transformed to indices
+val indexedData = indexerModel.transform(data)
+{% endhighlight %}
+</div>
+
+<div data-lang="java" markdown="1">
+{% highlight java %}
+import java.util.Map;
+
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.ml.feature.VectorIndexer;
+import org.apache.spark.ml.feature.VectorIndexerModel;
+import org.apache.spark.mllib.regression.LabeledPoint;
+import org.apache.spark.mllib.util.MLUtils;
+import org.apache.spark.sql.DataFrame;
+
+JavaRDD<LabeledPoint> rdd = MLUtils.loadLibSVMFile(sc.sc(),
+  "data/mllib/sample_libsvm_data.txt").toJavaRDD();
+DataFrame data = sqlContext.createDataFrame(rdd, LabeledPoint.class);
+VectorIndexer indexer = new VectorIndexer()
+  .setInputCol("features")
+  .setOutputCol("indexed")
+  .setMaxCategories(10);
+VectorIndexerModel indexerModel = indexer.fit(data);
+Map<Integer, Map<Double, Integer>> categoryMaps = indexerModel.javaCategoryMaps();
+System.out.print("Chose " + categoryMaps.size() + " categorical features:");
+for (Integer feature : categoryMaps.keySet()) {
+  System.out.print(" " + feature);
+}
+System.out.println();
+
+// Create new column "indexed" with categorical values transformed to indices
+DataFrame indexedData = indexerModel.transform(data);
+{% endhighlight %}
+</div>
+
+<div data-lang="python" markdown="1">
+{% highlight python %}
+from pyspark.ml.feature import VectorIndexer
+from pyspark.mllib.util import MLUtils
+
+data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF()
+indexer = VectorIndexer(inputCol="features", outputCol="indexed", maxCategories=10)
+indexerModel = indexer.fit(data)
+
+# Create new column "indexed" with categorical values transformed to indices
+indexedData = indexerModel.transform(data)
+{% endhighlight %}
+</div>
+</div>
+
 # Feature Selectors
 

http://git-wip-us.apache.org/repos/asf/spark/blob/6d75ed7e/mllib/src/main/scala/org/apache/spark/ml/feature/VectorIndexer.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/VectorIndexer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/VectorIndexer.scala
index 6d1d052..e238fb3 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/VectorIndexer.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/VectorIndexer.scala
@@ -17,6 +17,11 @@
 
 package org.apache.spark.ml.feature
 
+import java.lang.{Double => JDouble, Integer => JInt}
+import java.util.{Map => JMap}
+
+import scala.collection.JavaConverters._
+
 import org.apache.spark.annotation.AlphaComponent
 import org.apache.spark.ml.{Estimator, Model}
 import org.apache.spark.ml.attribute._
@@ -248,6 +253,11 @@ class VectorIndexerModel private[ml] (
     val categoryMaps: Map[Int, Map[Double, Int]])
   extends Model[VectorIndexerModel] with VectorIndexerParams {
 
+  /** Java-friendly version of [[categoryMaps]] */
+  def javaCategoryMaps: JMap[JInt, JMap[JDouble, JInt]] = {
+    categoryMaps.mapValues(_.asJava).asJava.asInstanceOf[JMap[JInt, JMap[JDouble, JInt]]]
+  }
+
   /**
    * Pre-computed feature attributes, with some missing info.
    * In transform(), set attribute name and other info, if available.

http://git-wip-us.apache.org/repos/asf/spark/blob/6d75ed7e/mllib/src/test/java/org/apache/spark/ml/feature/JavaVectorIndexerSuite.java
----------------------------------------------------------------------
diff --git a/mllib/src/test/java/org/apache/spark/ml/feature/JavaVectorIndexerSuite.java b/mllib/src/test/java/org/apache/spark/ml/feature/JavaVectorIndexerSuite.java
index 1611001..c7ae546 100644
--- a/mllib/src/test/java/org/apache/spark/ml/feature/JavaVectorIndexerSuite.java
+++ b/mllib/src/test/java/org/apache/spark/ml/feature/JavaVectorIndexerSuite.java
@@ -19,6 +19,7 @@ package org.apache.spark.ml.feature;
 
 import java.io.Serializable;
 import java.util.List;
+import java.util.Map;
 
 import org.junit.After;
 import org.junit.Assert;
@@ -64,7 +65,8 @@ public class JavaVectorIndexerSuite implements Serializable {
       .setMaxCategories(2);
     VectorIndexerModel model = indexer.fit(data);
     Assert.assertEquals(model.numFeatures(), 2);
-    Assert.assertEquals(model.categoryMaps().size(), 1);
+    Map<Integer, Map<Double, Integer>> categoryMaps = model.javaCategoryMaps();
+    Assert.assertEquals(categoryMaps.size(), 1);
     DataFrame indexedData = model.transform(data);
   }
 }



