spark-reviews mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From mengxr <...@git.apache.org>
Subject [GitHub] spark pull request: [SPARK-2756] [mllib] Decision tree bug fixes
Date Thu, 31 Jul 2014 07:12:05 GMT
Github user mengxr commented on a diff in the pull request:

    https://github.com/apache/spark/pull/1673#discussion_r15628659
  
    --- Diff: mllib/src/test/scala/org/apache/spark/mllib/tree/DecisionTreeSuite.scala ---
    @@ -602,12 +609,78 @@ class DecisionTreeSuite extends FunSuite with LocalSparkContext {
         assert(bestSplit.featureType === Categorical)
       }
     
    +  test("stump with 1 continuous variable for binary classification, to check off-by-1 error") {
    +    val arr = new Array[LabeledPoint](4)
    +    arr(0) = new LabeledPoint(0.0, Vectors.dense(0.0))
    +    arr(1) = new LabeledPoint(1.0, Vectors.dense(1.0))
    +    arr(2) = new LabeledPoint(1.0, Vectors.dense(2.0))
    +    arr(3) = new LabeledPoint(1.0, Vectors.dense(3.0))
    +    val input = sc.parallelize(arr)
    +    val strategy = new Strategy(algo = Classification, impurity = Gini, maxDepth = 5,
    +      numClassesForClassification = 2)
    +
    +    val model = DecisionTree.train(input, strategy)
    +    validateClassifier(model, arr, 1.0)
    +    assert(model.numNodes === 3)
    +    assert(model.depth === 1)
    +  }
    +
    +  test("stump with 2 continuous variables for binary classification") {
    +    val arr = new Array[LabeledPoint](4)
    +    arr(0) = new LabeledPoint(0.0, Vectors.sparse(2, Seq((0, 0.0))))
    +    arr(1) = new LabeledPoint(1.0, Vectors.sparse(2, Seq((1, 1.0))))
    +    arr(2) = new LabeledPoint(0.0, Vectors.sparse(2, Seq((0, 0.0))))
    +    arr(3) = new LabeledPoint(1.0, Vectors.sparse(2, Seq((1, 2.0))))
    +
    +    val input = sc.parallelize(arr)
    +    val strategy = new Strategy(algo = Classification, impurity = Gini, maxDepth = 5,
    +      numClassesForClassification = 2)
    +
    +    val model = DecisionTree.train(input, strategy)
    +    validateClassifier(model, arr, 1.0)
    +    assert(model.numNodes === 3)
    +    assert(model.depth === 1)
    +    assert(model.topNode.split.get.feature === 1)
    +  }
    +
    +  test("stump with categorical variables for multiclass classification, with just enough bins") {
    +    val maxBins = math.pow(2, 3 - 1).toInt // just enough bins to allow unordered features
    +    val arr = DecisionTreeSuite.generateCategoricalDataPointsForMulticlass()
    +    val input = sc.parallelize(arr)
    +    val strategy = new Strategy(algo = Classification, impurity = Gini, maxDepth = 5,
    +      numClassesForClassification = 3, categoricalFeaturesInfo = Map(0 -> 3, 1 -> 3))
    +    assert(strategy.isMulticlassClassification)
    +
    +    val model = DecisionTree.train(input, strategy)
    +    validateClassifier(model, arr, 1.0)
    +    assert(model.numNodes === 3)
    +    assert(model.depth === 1)
    +
    +    val (splits, bins) = DecisionTree.findSplitsBins(input, strategy)
    +    val bestSplits = DecisionTree.findBestSplits(input, new Array(31), strategy, 0,
    +      Array[List[Filter]](), splits, bins, 10)
    +
    +    assert(bestSplits.length === 1)
    +    val bestSplit = bestSplits(0)._1
    +    assert(bestSplit.feature === 0)
    +    assert(bestSplit.categories.length === 1)
    +    assert(bestSplit.categories.contains(1))
    +    assert(bestSplit.featureType === Categorical)
    +    val gain = bestSplits(0)._2
    +    assert(gain.leftImpurity == 0)
    --- End diff --
    
    Use `===` instead of `==` so that a failing assertion reports the actual and expected values (same for the line below).


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and would like it, or if the feature is enabled but not working,
please contact infrastructure at infrastructure@apache.org or file a JIRA
ticket with INFRA.
---

Mime
View raw message