Mailing-List: contact reviews-help@spark.apache.org; run by ezmlm
Precedence: bulk
Reply-To: reviews@spark.apache.org
From: mengxr <git@git.apache.org>
To: reviews@spark.apache.org
Reply-To: reviews@spark.apache.org
References: <git-pr-1673-spark@git.apache.org>
In-Reply-To: <git-pr-1673-spark@git.apache.org>
Subject: [GitHub] spark pull request: [SPARK-2756] [mllib] Decision tree bug
 fixes
Content-Type: text/plain
Message-Id: <20140731071205.BC8119BB279@tyr.zones.apache.org>
Date: Thu, 31 Jul 2014 07:12:05 +0000 (UTC)

Github user mengxr commented on a diff in the pull request:

    https://github.com/apache/spark/pull/1673#discussion_r15628659
  
    --- Diff: mllib/src/test/scala/org/apache/spark/mllib/tree/DecisionTreeSuite.scala ---
    @@ -602,12 +609,78 @@ class DecisionTreeSuite extends FunSuite with LocalSparkContext {
         assert(bestSplit.featureType === Categorical)
       }
     
    +  test("stump with 1 continuous variable for binary classification, to check off-by-1 error") {
    +    val arr = new Array[LabeledPoint](4)
    +    arr(0) = new LabeledPoint(0.0, Vectors.dense(0.0))
    +    arr(1) = new LabeledPoint(1.0, Vectors.dense(1.0))
    +    arr(2) = new LabeledPoint(1.0, Vectors.dense(2.0))
    +    arr(3) = new LabeledPoint(1.0, Vectors.dense(3.0))
    +    val input = sc.parallelize(arr)
    +    val strategy = new Strategy(algo = Classification, impurity = Gini, maxDepth = 5,
    +      numClassesForClassification = 2)
    +
    +    val model = DecisionTree.train(input, strategy)
    +    validateClassifier(model, arr, 1.0)
    +    assert(model.numNodes === 3)
    +    assert(model.depth === 1)
    +  }
    +
    +  test("stump with 2 continuous variables for binary classification") {
    +    val arr = new Array[LabeledPoint](4)
    +    arr(0) = new LabeledPoint(0.0, Vectors.sparse(2, Seq((0, 0.0))))
    +    arr(1) = new LabeledPoint(1.0, Vectors.sparse(2, Seq((1, 1.0))))
    +    arr(2) = new LabeledPoint(0.0, Vectors.sparse(2, Seq((0, 0.0))))
    +    arr(3) = new LabeledPoint(1.0, Vectors.sparse(2, Seq((1, 2.0))))
    +
    +    val input = sc.parallelize(arr)
    +    val strategy = new Strategy(algo = Classification, impurity = Gini, maxDepth = 5,
    +      numClassesForClassification = 2)
    +
    +    val model = DecisionTree.train(input, strategy)
    +    validateClassifier(model, arr, 1.0)
    +    assert(model.numNodes === 3)
    +    assert(model.depth === 1)
    +    assert(model.topNode.split.get.feature === 1)
    +  }
    +
    +  test("stump with categorical variables for multiclass classification, with just enough bins") {
    +    val maxBins = math.pow(2, 3 - 1).toInt // just enough bins to allow unordered features
    +    val arr = DecisionTreeSuite.generateCategoricalDataPointsForMulticlass()
    +    val input = sc.parallelize(arr)
    +    val strategy = new Strategy(algo = Classification, impurity = Gini, maxDepth = 5,
    +      numClassesForClassification = 3, categoricalFeaturesInfo = Map(0 -> 3, 1 -> 3))
    +    assert(strategy.isMulticlassClassification)
    +
    +    val model = DecisionTree.train(input, strategy)
    +    validateClassifier(model, arr, 1.0)
    +    assert(model.numNodes === 3)
    +    assert(model.depth === 1)
    +
    +    val (splits, bins) = DecisionTree.findSplitsBins(input, strategy)
    +    val bestSplits = DecisionTree.findBestSplits(input, new Array(31), strategy, 0,
    +      Array[List[Filter]](), splits, bins, 10)
    +
    +    assert(bestSplits.length === 1)
    +    val bestSplit = bestSplits(0)._1
    +    assert(bestSplit.feature === 0)
    +    assert(bestSplit.categories.length === 1)
    +    assert(bestSplit.categories.contains(1))
    +    assert(bestSplit.featureType === Categorical)
    +    val gain = bestSplits(0)._2
    +    assert(gain.leftImpurity == 0)
    --- End diff --
    
    Use `===` instead of `==` so that a failing assertion reports the actual and expected values (same for the line below).


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and would like it, or if the feature is enabled but not working, please
contact the Infrastructure team at infrastructure@apache.org or file a JIRA
ticket with INFRA.
---