mahout-commits mailing list archives

From rawkintr...@apache.org
Subject [1/3] mahout git commit: MAHOUT-1962 Add Ftest closes apache/mahout#300
Date Sun, 21 May 2017 03:34:44 GMT
Repository: mahout
Updated Branches:
  refs/heads/master 7c0babd79 -> 9b4eabb9c


MAHOUT-1962 Add Ftest closes apache/mahout#300


Project: http://git-wip-us.apache.org/repos/asf/mahout/repo
Commit: http://git-wip-us.apache.org/repos/asf/mahout/commit/cf5a33d5
Tree: http://git-wip-us.apache.org/repos/asf/mahout/tree/cf5a33d5
Diff: http://git-wip-us.apache.org/repos/asf/mahout/diff/cf5a33d5

Branch: refs/heads/master
Commit: cf5a33d565f6602f6624bc4d34c62c048379140f
Parents: 7c0babd
Author: dustinvanstee <dustinvanstee@gmail.com>
Authored: Sat May 20 22:19:32 2017 -0500
Committer: rawkintrevo <trevor.d.grant@gmail.com>
Committed: Sat May 20 22:19:32 2017 -0500

----------------------------------------------------------------------
 .../regression/LinearRegressorModel.scala       | 71 ++++++++++++++---
 .../regression/OrdinaryLeastSquaresModel.scala  |  1 +
 .../algorithms/regression/RegressorModel.scala  |  6 ++
 .../regression/tests/FittnessTests.scala        | 83 +++++++++++++++++++-
 .../algorithms/RegressionTestsSuiteBase.scala   | 23 +++++-
 5 files changed, 166 insertions(+), 18 deletions(-)
----------------------------------------------------------------------
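
For orientation, a minimal usage sketch (not part of this patch) of how the new statistics surface through the OLS fitter. The data mirrors the cereal example in the R prototype inside FittnessTests.scala below; the fit call and the implicit Mahout distributed context (e.g. the Spark bindings) are assumptions based on the existing samsara API, so treat this as a sketch rather than a verbatim excerpt:

    import org.apache.mahout.math.algorithms.regression.OrdinaryLeastSquares
    import org.apache.mahout.math.drm.drmParallelize
    import org.apache.mahout.math.drm.RLikeDrmOps._
    import org.apache.mahout.math.scalabindings.{`::`, dense}

    // Cereal data from the R prototype: columns a, b, c, d, target.
    val drmData = drmParallelize(dense(
      (2, 2, 10.5, 10, 29.509541),
      (1, 2, 12.0, 12, 18.042851),
      (1, 1, 12.0, 13, 22.736446),
      (2, 1, 11.0, 13, 32.207582),
      (1, 2, 12.0, 11, 21.871292),
      (2, 1, 16.0,  8, 36.187559),
      (6, 2, 17.0,  1, 50.764999),
      (3, 2, 13.0,  7, 40.400208),
      (3, 3, 13.0,  4, 45.811716)), numPartitions = 2)

    val drmX = drmData(::, 0 until 4)   // features a..d
    val drmY = drmData(::, 4 until 5)   // target

    // Assuming common statistics stay enabled, MSE, R2 and the new F-test get computed.
    val model = new OrdinaryLeastSquares[Int]().fit(drmX, drmY)

    println(model.summary)                        // coefficient table plus the F-statistic line
    println((model.fScore, model.fpval))          // fields added by this commit
    println((model.degreesOfFreedom, model.trainingExamples))
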


http://git-wip-us.apache.org/repos/asf/mahout/blob/cf5a33d5/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/LinearRegressorModel.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/LinearRegressorModel.scala b/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/LinearRegressorModel.scala
index 84f50ed..7b87a1a 100644
--- a/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/LinearRegressorModel.scala
+++ b/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/LinearRegressorModel.scala
@@ -26,6 +26,8 @@ import org.apache.mahout.math.drm.RLikeDrmOps._
 import org.apache.mahout.math.scalabindings.dvec
 import org.apache.mahout.math.{Matrix, Vector => MahoutVector}
 import org.apache.mahout.math.scalabindings.RLikeOps._
+import org.apache.commons.math3.distribution._
+
 import scala.language.higherKinds
 
 trait LinearRegressorModel[K] extends RegressorModel[K] {
@@ -34,7 +36,8 @@ trait LinearRegressorModel[K] extends RegressorModel[K] {
   var se: MahoutVector = _
   var tScore: MahoutVector = _
   var pval: MahoutVector = _
-  var degreesFreedom: Int = _
+
+
 
 }
 
@@ -54,46 +57,59 @@ trait LinearRegressorFitter[K] extends RegressorFitter[K] {
     addIntercept = hyperparameters.asInstanceOf[Map[Symbol, Boolean]].getOrElse('addIntercept, true)
   }
 
+
   def calculateStandardError[M[K] <: LinearRegressorModel[K]](X: DrmLike[K],
                              drmTarget: DrmLike[K],
                              drmXtXinv: Matrix,
                              model: M[K]): M[K] = {
     import org.apache.mahout.math.function.Functions.SQRT
     import org.apache.mahout.math.scalabindings.MahoutCollections._
-    var modelOut = model
+
     val yhat = X %*% model.beta
     val residuals = drmTarget - yhat
-    val ete = (residuals.t %*% residuals).collect // 1x1
+
+    // Setting modelOut.rss
+    // Changed name from ete to rssModel. This is the residual sum of squares for the model (yhat vs y).
+    var modelOut = FittnessTests.calculateResidualSumOfSquares(model,residuals)
+
     val n = drmTarget.nrow
     val k = safeToNonNegInt(X.ncol)
     val invDegFreedomKindOf = 1.0 / (n - k)
-    val varCovarMatrix = invDegFreedomKindOf * ete(0,0) * drmXtXinv
+    val varCovarMatrix = invDegFreedomKindOf * modelOut.rss * drmXtXinv
     val se = varCovarMatrix.viewDiagonal.assign(SQRT)
     val tScore = model.beta / se
-    val tDist = new org.apache.commons.math3.distribution.TDistribution(n-k)
+    val tDist = new TDistribution(n-k)
+
+    val pval = dvec(tScore.toArray.map(t => 2 * (1.0 - tDist.cumulativeProbability(Math.abs(t))) ))
+
     // ^^ TODO bug in this calculation- fix and add test
     //degreesFreedom = k
-
-
     modelOut.se = se
     modelOut.tScore = tScore
     modelOut.pval = pval
-    modelOut.degreesFreedom = X.ncol
-    modelOut.summary = generateSummaryString(modelOut)
+    modelOut.degreesOfFreedom = safeToNonNegInt(X.ncol)
+    modelOut.trainingExamples = safeToNonNegInt(n)
+
     if (calcCommonStatistics){
       modelOut = calculateCommonStatistics(modelOut, drmTarget, residuals)
     }
+
+    // Let the statistics get calculated prior to assigning the summary
+    modelOut.summary = generateSummaryString(modelOut)
+
     modelOut
   }
 
+
   def calculateCommonStatistics[M[K] <: LinearRegressorModel[K]](model: M[K],
                                                                  drmTarget: DrmLike[K],
                                                                  residuals: DrmLike[K]): M[K] ={
     var modelOut = model
     modelOut = FittnessTests.CoefficientOfDetermination(model, drmTarget, residuals)
     modelOut = FittnessTests.MeanSquareError(model, residuals)
-    modelOut.summary += s"MSE: ${modelOut.mse}\nR2: ${modelOut.r2}\n"
+    modelOut = FittnessTests.FTest(model, drmTarget)
+
+
     modelOut
   }
 
@@ -109,6 +125,8 @@ trait LinearRegressorFitter[K] extends RegressorFitter[K] {
         (0 until X.ncol).map(i => s"X${i}\t${modelOut.beta(i)}").mkString("\n")
       if (calcCommonStatistics) { // we do this in calcStandard errors to avoid calculating residuals twice
         val residuals = drmTarget - (X %*% modelOut.beta)
+        // If rss is already set, then this will drop through to calculateCommonStatistics
+        modelOut = FittnessTests.calculateResidualSumOfSquares(modelOut,residuals)
         modelOut = calculateCommonStatistics(modelOut, drmTarget, residuals)
       }
 
@@ -123,9 +141,38 @@ trait LinearRegressorFitter[K] extends RegressorFitter[K] {
   }
 
   def generateSummaryString[M[K] <: LinearRegressorModel[K]](model: M[K]): String = {
+
+    /*  Model after R implementation ...
+    Call:
+      lm(formula = target ~ a + b + c + d, data = df1)
+
+    Residuals:
+    1       2       3       4       5       6       7       8       9
+    -4.2799  0.5059 -2.2783  4.3765 -1.3455  0.7202 -1.8063  1.2889  2.8184
+
+    Coefficients:
+      Estimate Std. Error t value Pr(>|t|)
+    (Intercept)  163.179     51.915   3.143   0.0347 *
+      a             -1.336      2.688  -0.497   0.6452
+      b            -13.158      5.394  -2.439   0.0713 .
+      c             -4.153      1.785  -2.327   0.0806 .
+      d             -5.680      1.887  -3.010   0.0395 *
+      ---
+    Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
+    */
+
     val k = model.beta.length
-    "Coef.\t\tEstimate\t\tStd. Error\t\tt-score\t\t\tPr(Beta=0)\n" +
-      (0 until k).map(i => s"X${i}\t${model.beta(i)}\t${model.se(i)}\t${model.tScore(i)}\t${model.pval(i)}").mkString("\n")
 
+    // Using formatted print here to pretty-print the columns
+    var summaryString  = "\nCoef.\t\tEstimate\t\tStd. Error\t\tt-score\t\t\tPr(Beta=0)\n"
+
+    summaryString += (0 until k).map(i => "X%-3d\t\t%+5.5f\t\t%+5.5f\t\t%+5.5f\t\t%+5.5f".format(i,model.beta(i),model.se(i),model.tScore(i),model.pval(i))).mkString("\n")
+    if(calcCommonStatistics) {
+      summaryString += "\nF-statistic: " + model.fScore + " on " + (model.degreesOfFreedom - 1) + " and " +
+        (model.trainingExamples - model.degreesOfFreedom) + " DF,  p-value: " + model.fpval + "\n"
+      summaryString += s"\nMean Squared Error: ${model.mse}"
+      summaryString += s"\nR^2: ${model.r2}"
+
+    }
+    summaryString
   }
 }
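
As a side note (not introduced by this patch), the per-coefficient statistics assigned in calculateStandardError can be sanity-checked in isolation. The helper below is a hypothetical, in-core sketch using only commons-math3; the name coefficientStats and its inputs are illustrative, not part of the Mahout API:

    import org.apache.commons.math3.distribution.TDistribution

    // For each coefficient j:
    //   sigma^2 = rss / (n - k),  se_j = sqrt(sigma^2 * (X'X)^-1_jj),
    //   t_j = beta_j / se_j,      p_j = 2 * (1 - T_{n-k}(|t_j|))
    def coefficientStats(beta: Array[Double],
                         xtxInvDiag: Array[Double],   // diagonal of (X'X)^-1
                         rss: Double,
                         n: Int): Array[(Double, Double, Double)] = {
      val k = beta.length
      val sigma2 = rss / (n - k)
      val tDist = new TDistribution(n - k)
      beta.zip(xtxInvDiag).map { case (b, d) =>
        val se = math.sqrt(sigma2 * d)
        val t  = b / se
        val p  = 2.0 * (1.0 - tDist.cumulativeProbability(math.abs(t)))
        (se, t, p)                                    // (std. error, t-score, Pr(Beta=0))
      }
    }

Fed with the model's beta, the diagonal of drmXtXinv and the cached rss, this mirrors the se, tScore and pval vectors computed above.
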

http://git-wip-us.apache.org/repos/asf/mahout/blob/cf5a33d5/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/OrdinaryLeastSquaresModel.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/OrdinaryLeastSquaresModel.scala b/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/OrdinaryLeastSquaresModel.scala
index 58eb45d..fd9924e 100644
--- a/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/OrdinaryLeastSquaresModel.scala
+++ b/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/OrdinaryLeastSquaresModel.scala
@@ -55,6 +55,7 @@ class OrdinaryLeastSquares[K] extends LinearRegressorFitter[K] {
     }
 
     var X = drmFeatures
+
     if (addIntercept) {
       X = X cbind 1
     }

http://git-wip-us.apache.org/repos/asf/mahout/blob/cf5a33d5/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/RegressorModel.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/RegressorModel.scala b/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/RegressorModel.scala
index 8a4b7d2..aa3dad4 100644
--- a/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/RegressorModel.scala
+++ b/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/RegressorModel.scala
@@ -30,6 +30,12 @@ trait RegressorModel[K] extends SupervisedModel[K] {
   // Common Applicable Tests- here only for convenience.
   var mse: Double = _
   var r2: Double = _
+  var fpval: Double = _
+  // default rss to a negative number to ensure rss gets set.
+  var rss:Double = -9999.0
+  var fScore: Double = _
+  var degreesOfFreedom: Int = _
+  var trainingExamples :Int = _
 
   /**
    * Syntactic sugar for fetching test results.  Will return the test result if it exists, otherwise None

http://git-wip-us.apache.org/repos/asf/mahout/blob/cf5a33d5/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/tests/FittnessTests.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/tests/FittnessTests.scala b/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/tests/FittnessTests.scala
index d1dd3bb..c2d634b 100644
--- a/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/tests/FittnessTests.scala
+++ b/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/tests/FittnessTests.scala
@@ -19,14 +19,19 @@
 
 package org.apache.mahout.math.algorithms.regression.tests
 
+
+
+
+
+import org.apache.commons.math3.distribution.FDistribution
 import org.apache.mahout.math.algorithms.regression.RegressorModel
 import org.apache.mahout.math.algorithms.preprocessing.MeanCenter
 import org.apache.mahout.math.drm.DrmLike
 import org.apache.mahout.math.function.Functions.SQUARE
 import org.apache.mahout.math.scalabindings.RLikeOps._
+import org.apache.mahout.math.drm.RLikeDrmOps._
 
 import scala.language.higherKinds
-import scala.reflect.ClassTag
 
 object FittnessTests {
 
@@ -41,16 +46,88 @@ object FittnessTests {
     val r2 = 1 - (sumSquareResiduals / sumSquareTotal)
     model.r2 = r2
     model.testResults += ('r2 -> r2)  // need setResult and setSummary methods in case you change this in future, also to initialize the map if none exists or update the value if it does
-    model.summary += s"\nR^2: ${r2}"
+    //model.summary += s"\nR^2: ${r2}"
     model
   }
 
   // https://en.wikipedia.org/wiki/Mean_squared_error
   def MeanSquareError[R[K] <: RegressorModel[K], K](model: R[K], residuals: DrmLike[K]): R[K] = {
+    // TODO : I think mse denom should be (row - col) ?? <-- https://en.wikipedia.org/wiki/Mean_squared_error see regression section
     val mse = residuals.assign(SQUARE).sum / residuals.nrow
     model.mse = mse
     model.testResults += ('mse -> mse)
-    model.summary += s"\nMean Squared Error: ${mse}"
+    //model.summary += s"\nMean Squared Error: ${mse}"
+    model
+  }
+
+  // Since rss is needed for multiple test statistics, use this function to cache this value
+  def calculateResidualSumOfSquares[R[K] <: RegressorModel[K], K](model: R[K], residuals: DrmLike[K]) : R[K] = {
+    // This is a check so that model.rss isn't unnecessarily computed.
+    // By default this value is set to negative, so that the first time it is guaranteed to evaluate.
+    if (model.rss < 0) {
+      val ete = (residuals.t %*% residuals).collect // 1x1
+      model.rss = ete(0, 0)
+    }
+    model
+  }
+
+
+  // https://en.wikipedia.org/wiki/F-test
+  /*
+  # R Prototype
+  # Cereal Dataframe
+  df1 <- data.frame(
+    "X0" = c(1,1,1,1,1,1,1,1,1),
+    "a"  = c(2,1,1,2,1,2,6,3,3),
+    "b" = c( 2,2,1,1,2,1,2,2,3),
+    "c" = c( 10.5,12,12, 11,12, 16,17, 13,13),
+    "d" = c( 10,12,13,13,11,8, 1, 7, 4),
+    "target" = c( 29.509541,18.042851,22.736446,32.207582,21.871292,36.187559,50.764999,40.400208,45.811716))
+
+  # Create linear regression models adding features one by one
+  lrfit0 <- lm(data=df1, formula = target ~ 1  )
+  lrfit1 <- lm(data=df1, formula = target ~ a  )
+  lrfit2 <- lm(data=df1, formula = target ~ a + b )
+  lrfit3 <- lm(data=df1, formula = target ~ a + b + c )
+  lrfit4 <- lm(data=df1, formula = target ~ a + b + c + d)
+
+  ######################################
+  # Fscore Calculation
+  ######################################
+
+  # So in the anova report using lm ...
+  # These are the residual sum of squares for each model
+  rssint <- sum(lrfit0$residuals^2)
+  rssa <- sum(lrfit1$residuals^2)
+  rssb <- sum(lrfit2$residuals^2)
+  rssc <- sum(lrfit3$residuals^2)
+  rssd <- sum(lrfit4$residuals^2)
+
+  #Ftest in overall model
+  (rssint - rssd)/4 / (rssd/4)  # g = 4, n - g - 1  = 4
+  # Compare with R
+  summary(lrfit4)
+
+   */
+  def FTest[R[K] <: RegressorModel[K], K](model: R[K], drmTarget: DrmLike[K]): R[K] = {
+
+    val targetMean: Double = drmTarget.colMeans().get(0)
+
+    // rssint is the residual sum of squares for the intercept-only model
+    val rssint: Double = ((drmTarget - targetMean  ).t %*% (drmTarget - targetMean)).zSum()
+    // K-1 is model.degreesOfFreedom-1
+    // N-K is model.trainingExamples - model.degreesOfFreedom
+
+    val fScore = ((rssint - model.rss) / (model.degreesOfFreedom-1) / ( model.rss / (model.trainingExamples - model.degreesOfFreedom)))
+    val fDist = new FDistribution(model.degreesOfFreedom-1,model.trainingExamples-model.degreesOfFreedom)
+    val fpval = 1.0 - fDist.cumulativeProbability(fScore)
+    model.fpval = fpval
+
+    model.fScore = fScore
+    model.testResults += ('fScore -> fScore)
+    //model.summary += s"\nFscore : ${fScore}"
     model
   }
+
+
 }
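
For reference, the F-statistic arithmetic in FTest can also be exercised without a DRM at all. This is a minimal sketch under the same conventions as the patch (k counts the intercept column, so the degrees of freedom are k - 1 and n - k); the helper name fTest is hypothetical:

    import org.apache.commons.math3.distribution.FDistribution

    // F-test of the fitted model against the intercept-only model:
    //   F = ((rssIntercept - rss) / (k - 1)) / (rss / (n - k))
    def fTest(rssIntercept: Double, rss: Double, k: Int, n: Int): (Double, Double) = {
      val fScore = ((rssIntercept - rss) / (k - 1)) / (rss / (n - k))
      val fDist  = new FDistribution(k - 1, n - k)
      val fpval  = 1.0 - fDist.cumulativeProbability(fScore)
      (fScore, fpval)
    }

    // On the cereal data (k = 5 including the intercept, n = 9), the test suite below
    // expects an fScore of roughly 16.3854, and R's summary reports a p-value of about 0.0095.
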

http://git-wip-us.apache.org/repos/asf/mahout/blob/cf5a33d5/math-scala/src/test/scala/org/apache/mahout/math/algorithms/RegressionTestsSuiteBase.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/test/scala/org/apache/mahout/math/algorithms/RegressionTestsSuiteBase.scala b/math-scala/src/test/scala/org/apache/mahout/math/algorithms/RegressionTestsSuiteBase.scala
index 55967e1..57dffef 100644
--- a/math-scala/src/test/scala/org/apache/mahout/math/algorithms/RegressionTestsSuiteBase.scala
+++ b/math-scala/src/test/scala/org/apache/mahout/math/algorithms/RegressionTestsSuiteBase.scala
@@ -21,7 +21,7 @@ package org.apache.mahout.math.algorithms
 
 import org.apache.mahout.math.algorithms.regression.OrdinaryLeastSquares
 import org.apache.mahout.math.algorithms.regression.tests._
-import org.apache.mahout.math.drm.drmParallelize
+import org.apache.mahout.math.drm.{CheckpointedDrm, drmParallelize}
 import org.apache.mahout.math.drm.RLikeDrmOps._
 import org.apache.mahout.math.scalabindings.{`::`, dense}
 import org.apache.mahout.test.DistributedMahoutSuite
@@ -78,11 +78,26 @@ trait RegressionTestsSuiteBase extends DistributedMahoutSuite with Matchers {
 
     val r2: Double = model.r2
     val mse: Double = model.mse
-    println("R2: " + r2)
-    println("MSE: " + mse)
     (rR2 - r2) should be < epsilon
     (rMSE - mse) should be < epsilon
 
+    Math.abs(model.beta.get(4) - 163.17933  ) should be < epsilon
+    Math.abs(model.beta.get(0) - (-1.33627) ) should be < epsilon
+    Math.abs(model.beta.get(1) - (-13.15770)) should be < epsilon
+    Math.abs(model.beta.get(2) - (-4.15265) ) should be < epsilon
+    Math.abs(model.beta.get(3) - (-5.679908)) should be < epsilon
+
+    Math.abs(model.tScore.get(0) - (-0.49715717)) should be < epsilon
+    Math.abs(model.tScore.get(1) - (-2.43932888)) should be < epsilon
+    Math.abs(model.tScore.get(2) - (-2.32654000)) should be < epsilon
+    Math.abs(model.tScore.get(3) - (-3.01022444)) should be < epsilon
+    Math.abs(model.tScore.get(4) -  3.143183937 ) should be < epsilon
+
+    model.degreesOfFreedom should equal(5)
+    model.trainingExamples should equal(9)
+
+    Math.abs((model.fScore - 16.38542361))  should be < 0.0000001
+
   }
 
   test("durbinWatsonTest test") {
@@ -106,4 +121,6 @@ trait RegressionTestsSuiteBase extends DistributedMahoutSuite with Matchers {
     (myAnswer - correctAnswer) should be < epsilon
   }
 
+
 }
+

