Date: Wed, 18 Jun 2014 17:48:25 +0000 (UTC)
From: "ASF GitHub Bot (JIRA)"
To: dev@mahout.apache.org
Reply-To: dev@mahout.apache.org
Subject: [jira] [Commented] (MAHOUT-1500) H2O integration

    [ https://issues.apache.org/jira/browse/MAHOUT-1500?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=14036018#comment-14036018 ]

ASF GitHub Bot commented on MAHOUT-1500:
----------------------------------------

Github user dlyubimov commented on a diff in the pull request:

    https://github.com/apache/mahout/pull/21#discussion_r13929215

--- Diff: h2o/src/test/scala/org/apache/mahout/math/decompositions/MathSuite.scala ---
@@ -0,0 +1,212 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.math.decompositions
+
+import org.scalatest.{Matchers, FunSuite}
+import org.apache.mahout.h2obindings.test.MahoutLocalContext
+import org.apache.mahout.math._
+import drm._
+import scalabindings._
+import RLikeOps._
+import RLikeDrmOps._
+import org.apache.mahout.h2obindings._
+import org.apache.mahout.common.RandomUtils
+import scala.math._
+
+class MathSuite extends FunSuite with Matchers with MahoutLocalContext {
+
+  test("thin distributed qr") {
+
+    val inCoreA = dense(
+      (1, 2, 3, 4),
+      (2, 3, 4, 5),
+      (3, -4, 5, 6),
+      (4, 5, 6, 7),
+      (8, 6, 7, 8)
+    )
+
+    val A = drmParallelize(inCoreA, numPartitions = 2)
+    val (drmQ, inCoreR) = dqrThin(A, checkRankDeficiency = false)
+
+    // Assert optimizer still knows Q and A are identically partitioned
+    drmQ.partitioningTag should equal(A.partitioningTag)
+
+//    drmQ.rdd.partitions.size should be(A.rdd.partitions.size)
+
+    // Should also be zippable
+//    drmQ.rdd.zip(other = A.rdd)
+
+    val inCoreQ = drmQ.collect
+
+    printf("A=\n%s\n", inCoreA)
+    printf("Q=\n%s\n", inCoreQ)
+    printf("R=\n%s\n", inCoreR)
+
+    val (qControl, rControl) = qr(inCoreA)
+    printf("qControl=\n%s\n", qControl)
+    printf("rControl=\n%s\n", rControl)
+
+    // Validate with Cholesky
+    val ch = chol(inCoreA.t %*% inCoreA)
+    printf("A'A=\n%s\n", inCoreA.t %*% inCoreA)
+    printf("L:\n%s\n", ch.getL)
+
+    val rControl2 = (ch.getL cloned).t
+    val qControl2 = ch.solveRight(inCoreA)
+    printf("qControl2=\n%s\n", qControl2)
+    printf("rControl2=\n%s\n", rControl2)
+
+    // The Householder approach seems to be a little bit more stable
+    (rControl - inCoreR).norm should be < 1E-5
+    (qControl - inCoreQ).norm should be < 1E-5
+
+    // Assert identity with the in-core Cholesky-based result -- this should be tighter.
+    (rControl2 - inCoreR).norm should be < 1E-10
+    (qControl2 - inCoreQ).norm should be < 1E-10
+
+    // Assert orthogonality:
+    // (a) Q[,j] dot Q[,j] == 1.0 for all j
+    // (b) Q[,i] dot Q[,j] == 0.0 for all i != j
+    for (col <- 0 until inCoreQ.ncol)
+      ((inCoreQ(::, col) dot inCoreQ(::, col)) - 1.0).abs should be < 1e-10
+    for (col1 <- 0 until inCoreQ.ncol - 1; col2 <- col1 + 1 until inCoreQ.ncol)
+      (inCoreQ(::, col1) dot inCoreQ(::, col2)).abs should be < 1e-10
+
+  }
+
+  test("dssvd - the naive-est - q=0") {
+    dssvdNaive(q = 0)
+  }
+
+  test("dssvd - naive - q=1") {
+    dssvdNaive(q = 1)
+  }
+
+  test("dssvd - naive - q=2") {
+    dssvdNaive(q = 2)
+  }
+
+  def dssvdNaive(q: Int) {
+    val inCoreA = dense(
+      (1, 2, 3, 4),
+      (2, 3, 4, 5),
+      (3, -4, 5, 6),
+      (4, 5, 6, 7),
+      (8, 6, 7, 8)
+    )
+    val drmA = drmParallelize(inCoreA, numPartitions = 2)
+
+    val (drmU, drmV, s) = dssvd(drmA, k = 4, q = q)
+    val (inCoreU, inCoreV) = (drmU.collect, drmV.collect)
+
+    printf("U:\n%s\n", inCoreU)
+    printf("V:\n%s\n", inCoreV)
+    printf("Sigma:\n%s\n", s)
+
+    (inCoreA - (inCoreU %*%: diagv(s)) %*% inCoreV.t).norm should be < 1E-5
+  }
+
+  test("dspca") {
+
+    val rnd = RandomUtils.getRandom
+
+    // Number of points
+    val m = 500
+    // Length of actual spectrum
+    val spectrumLen = 40
+
+    val spectrum = dvec((0 until spectrumLen).map(x => 300.0 * exp(-x) max 1e-3))
+    printf("spectrum:%s\n", spectrum)
+
+    val (u, _) = qr(new SparseRowMatrix(m, spectrumLen) :=
+        ((r, c, v) => if (rnd.nextDouble() < 0.2) 0 else rnd.nextDouble() + 5.0))
+
+    // PCA rotation matrix -- should also be orthonormal.
+    val (tr, _) = qr(Matrices.symmetricUniformView(spectrumLen, spectrumLen, rnd.nextInt) - 10.0)
+
+    val input = (u %*%: diagv(spectrum)) %*% tr.t
+    val drmInput = drmParallelize(m = input, numPartitions = 2)
+
+    // Calculate just the first 10 principal factors and reduce dimensionality.
+    // Since we assert just the validity of the s-pca, not the stochastic error, we bump the p
+    // parameter to ensure zero stochastic error and assert only the functional correctness of the
+    // method's PCA-specific additions.
+    val k = 10
+
+    var (drmPCA, _, s) = dspca(A = drmInput, k = 10, p = spectrumLen, q = 1)
+    // Un-normalized PCA data:
+    drmPCA = drmPCA %*% diagv(s)
+
+    val pca = drmPCA.checkpoint(CacheHint.NONE).collect
+
+    // Of course, once we have calculated the PCA, the spectrum is going to be different since our
+    // originally generated input was not centered. So here we just brute-solve PCA to verify.
+    val xi = input.colMeans()
+    for (r <- 0 until input.nrow) input(r, ::) -= xi
+    var (pcaControl, _, sControl) = svd(m = input)
+    pcaControl = (pcaControl %*%: diagv(sControl))(::, 0 until k)
+
+    printf("pca:\n%s\n", pca(0 until 10, 0 until 10))
+    printf("pcaControl:\n%s\n", pcaControl(0 until 10, 0 until 10))
+
+    (pca(0 until 10, 0 until 10).norm - pcaControl(0 until 10, 0 until 10).norm).abs should be < 1E-5
+
+  }
+
+  test("als") {
+
+    val rnd = RandomUtils.getRandom
+
+    // Number of points
+    val m = 500
+    val n = 500
+
+    // Length of actual spectrum
+    val spectrumLen = 40
+
+    // Create singular values with decay
+    val spectrum = dvec((0 until spectrumLen).map(x => 300.0 * exp(-x) max 1e-3))
+    printf("spectrum:%s\n", spectrum)
+
+    // Create A as an ideal input
+    val inCoreA = (qr(Matrices.symmetricUniformView(m, spectrumLen, 1234))._1 %*%: diagv(spectrum)) %*%
+        qr(Matrices.symmetricUniformView(n, spectrumLen, 2345))._1.t
+    val drmA = drmParallelize(inCoreA, numPartitions = 2)
+
+    // Decompose using ALS
+    val (drmU, drmV, rmse) = als(drmInput = drmA, k = 20).toTuple
+    val inCoreU = drmU.collect
+    val inCoreV = drmV.collect
+
+    val predict = inCoreU %*% inCoreV.t
+
+    printf("Control block:\n%s\n", inCoreA(0 until 3, 0 until 3))
+    printf("ALS factorized approximation block:\n%s\n", predict(0 until 3, 0 until 3))
+
+    val err = (inCoreA - predict).norm
+    printf("norm of residuals %f\n", err)
+    printf("train iteration rmses: %s\n", rmse)
+
+    err should be < 1e-2
+
+  }
--- End diff --

I think if all these tests are passing, this would be an incredibly cool step forward in this issue.

> H2O integration
> ---------------
>
>                 Key: MAHOUT-1500
>                 URL: https://issues.apache.org/jira/browse/MAHOUT-1500
>             Project: Mahout
>          Issue Type: Improvement
>            Reporter: Anand Avati
>             Fix For: 1.0
>
>
> Provide an H2O backend for the Mahout DSL
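As a concrete illustration of what "an H2O backend for the Mahout DSL" has to support, here is a minimal sketch of a further test one could add alongside the MathSuite above. The class name DslSketchSuite and the test name are invented for illustration only; every call in it (dense, drmParallelize, dqrThin, collect, the R-like operators) already appears in the quoted diff, and the H2O distributed context is assumed to be supplied implicitly by the same MahoutLocalContext test trait.

    package org.apache.mahout.math.decompositions

    import org.scalatest.{Matchers, FunSuite}
    import org.apache.mahout.h2obindings.test.MahoutLocalContext
    import org.apache.mahout.math._
    import drm._
    import scalabindings._
    import RLikeOps._
    import RLikeDrmOps._
    import org.apache.mahout.h2obindings._

    // Hypothetical sketch (not part of the pull request): the user-facing DSL
    // surface the H2O backend must implement, reduced to one QR round trip.
    class DslSketchSuite extends FunSuite with Matchers with MahoutLocalContext {

      test("thin distributed qr reproduces A") {

        val inCoreA = dense(
          (1, 2, 3, 4),
          (2, 3, 4, 5),
          (3, -4, 5, 6),
          (4, 5, 6, 7),
          (8, 6, 7, 8)
        )

        // MahoutLocalContext wires in the distributed (H2O) context implicitly,
        // exactly as in the MathSuite tests above.
        val A = drmParallelize(inCoreA, numPartitions = 2)

        // Distributed thin QR: Q stays distributed, R is returned in-core.
        val (drmQ, inCoreR) = dqrThin(A, checkRankDeficiency = false)
        val inCoreQ = drmQ.collect

        // Backend-independent sanity check: the factors must reproduce A.
        (inCoreQ %*% inCoreR - inCoreA).norm should be < 1e-5
      }
    }

Apart from the two h2obindings imports, nothing in the sketch names H2O, which appears to be the property the MathSuite exercises algorithm by algorithm (QR, SSVD, S-PCA, ALS).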