singa-commits mailing list archives

From wang...@apache.org
Subject [1/4] incubator-singa git commit: SINGA-80 New Blob Level and Address Level Math Operation Interface
Date Thu, 07 Apr 2016 13:12:17 GMT
Repository: incubator-singa
Updated Branches:
  refs/heads/master 369d87960 -> 8329aa0c3


SINGA-80 New Blob Level and Address Level Math Operation Interface

-------

Add gtest cases for the blob-level math functions, and fix bugs in math_blob.h.


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/247002d3
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/247002d3
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/247002d3

Branch: refs/heads/master
Commit: 247002d3027b5ac45dda735f62cc2f8df7c2dcac
Parents: 0233049
Author: jinyangturbo <pku.turbo@gmail.com>
Authored: Wed Mar 16 00:16:01 2016 -0700
Committer: Wei Wang <wangwei@comp.nus.edu.sg>
Committed: Sat Apr 2 21:33:27 2016 +0800

----------------------------------------------------------------------
 include/singa/utils/blob.h      |  15 +
 include/singa/utils/math_blob.h |  70 +++--
 src/test/test_math.cc           | 587 +++++++++++++++++++++++++++++++++++
 3 files changed, 649 insertions(+), 23 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/247002d3/include/singa/utils/blob.h
----------------------------------------------------------------------
diff --git a/include/singa/utils/blob.h b/include/singa/utils/blob.h
index 3351cff..9defeac 100644
--- a/include/singa/utils/blob.h
+++ b/include/singa/utils/blob.h
@@ -276,6 +276,21 @@ class Blob {
     ret.transpose_ = !transpose_;
     return ret;
   }
+  // Check whether two blobs have exactly the same shape, layout and content.
+  bool check_equal(Blob* other) const {
+    if (transpose() != other->transpose()) return false;
+    if (count() != other->count()) return false;
+    if (shape().size() != other->shape().size()) return false;
+    for (size_t i = 0; i < shape().size(); i++) {
+      if (shape(i) != other->shape(i)) return false;
+    }
+    const Dtype * a = cpu_data();
+    const Dtype * b = other->cpu_data();
+    for (int i = 0; i < count(); i++) {
+      if (a[i] != b[i]) return false;
+    }
+    return true;
+  }
 
  protected:
   std::shared_ptr<SyncedMemory> data_ = nullptr;
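
A note on the new helper: check_equal() compares elements bitwise with !=, which is exactly right for the copy/identity tests added below, where every value is produced by the same deterministic CPU code, but it would be too strict for results coming from GPU kernels or reordered reductions. A hypothetical tolerance-based companion (not part of this commit; the name check_near and the eps parameter are illustrative) could look like:

  // Sketch only: element-wise comparison within an absolute tolerance.
  // Requires <cmath> for std::fabs; assumes the Blob API shown above.
  template <typename Dtype>
  bool check_near(const Blob<Dtype>& a, const Blob<Dtype>& b, Dtype eps) {
    if (a.count() != b.count()) return false;
    const Dtype* pa = a.cpu_data();
    const Dtype* pb = b.cpu_data();
    for (int i = 0; i < a.count(); i++) {
      if (std::fabs(pa[i] - pb[i]) > eps) return false;
    }
    return true;
  }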

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/247002d3/include/singa/utils/math_blob.h
----------------------------------------------------------------------
diff --git a/include/singa/utils/math_blob.h b/include/singa/utils/math_blob.h
index 55ba44b..35985f1 100644
--- a/include/singa/utils/math_blob.h
+++ b/include/singa/utils/math_blob.h
@@ -32,14 +32,16 @@
 #include "singa/utils/context.h"
 
 namespace singa {
+
+#define NO_GPU LOG(FATAL) << "Not compiled with GPU";
 /**
 * \file math_blob.h is not tested thoroughly.
 * Only GEMM(), MMDot(), MVSumRow() and MVAddRow() are used now.
  */
 /************* BLAS level 1 *****************/
 /**
- * Scale each element of A with alpha, and put the result into B.
- * Bi = alpha*Ai
+ * Scale each element of A with alpha, and put the result into A.
+ * Ai = alpha*Ai
  * Use blas scale internally.
  */
 template<typename Dtype>
@@ -52,6 +54,8 @@ void Scale(Dtype alpha, Blob<Dtype> * B) {
 #ifdef USE_GPU
     gpu_scale(context->cublas_handle(device), B->count(), alpha,
         B->mutable_gpu_data());
+#else
+    NO_GPU;
 #endif
   }
 }
@@ -70,7 +74,9 @@ void AXPY(Dtype alpha, const Blob<Dtype> & A, Blob<Dtype> * B) {
 #ifdef USE_GPU
     gpu_axpy(context->cublas_handle(device), A.count(), alpha, A.gpu_data(),
         B->mutable_gpu_data());
-#endif  // USE_GPU
+#else
+    NO_GPU;
+#endif
   }
 }
 
@@ -111,6 +117,8 @@ void GEMV(Dtype alpha, Dtype beta, const Blob<Dtype>& A,
 #ifdef USE_GPU
     gpu_gemv(context->cublas_handle(device), A.gpu_data(), B.gpu_data(), m, n,
         alpha, beta, TranA, C->mutable_gpu_data());
+#else
+    NO_GPU;
 #endif  // USE_GPU
   }
 }
@@ -176,6 +184,8 @@ void GEMM(Dtype alpha, Dtype beta, const Blob<Dtype>& A, const Blob<Dtype>& B,
 #ifdef USE_GPU
     gpu_gemm(context->cublas_handle(device), A.gpu_data(), B.gpu_data(),
         m, n, k, alpha, beta, TranA, TranB, C->mutable_gpu_data());
+#else
+    NO_GPU;
 #endif  // USE_GPU
   }
 }
@@ -216,9 +226,10 @@ Dtype VVDot(const Blob<Dtype> & A, const Blob<Dtype> & B) {
     res = cpu_dot(A.cpu_data(), B.cpu_data(), n);
   } else {
 #ifdef USE_GPU
-    // gpu part
     res = gpu_dot(context->cublas_handle(device), A.gpu_data(), B.gpu_data(),
         n);
+#else
+    NO_GPU;
 #endif  // USE_GPU
   }
   return res;
@@ -242,12 +253,14 @@ void OuterProduct(const Blob<Dtype>& A, const Blob<Dtype>& B, Blob<Dtype> * C) {
   auto context = Singleton<Context>::Instance();
   int device = context->device_id(std::this_thread::get_id());
   if (device < 0) {
-    cpu_gemm(A.cpu_data(), B.cpu_data(), m, n, 1, 1, 0, false, false,
-        C->mutable_cpu_data());
+    cpu_gemm(A.cpu_data(), B.cpu_data(), m, n, 1, Dtype(1), Dtype(0), false,
+        false, C->mutable_cpu_data());
   } else {
 #ifdef USE_GPU
     gpu_gemm(context->cublas_handle(device), A.gpu_data(), B.gpu_data(),
         m, n, 1, 1, 0, false, false, C->mutable_gpu_data());
+#else
+    NO_GPU;
 #endif  // USE_GPU
   }
 }
@@ -268,7 +281,7 @@ void Map(const Blob<Dtype> & A, Blob<Dtype> * B) {
 #ifdef USE_GPU
     gpu_e_f<Op>(A.count(), A.gpu_data(), B->mutable_gpu_data());
 #else
-    LOG(ERROR) << "Not implemented";
+    NO_GPU;
 #endif  // USE_GPU
   }
 }
@@ -310,7 +323,7 @@ void Map(Dtype alpha, const Blob<Dtype>& A, Blob<Dtype>* B) {
 #ifdef USE_GPU
     gpu_e_f<Op>(A.count(), A.gpu_data(), alpha, B->mutable_gpu_data());
 #else
-    LOG(FATAL) << "Not implemented";
+    NO_GPU;
 #endif  // USE_GPU
   }
 }
@@ -328,9 +341,8 @@ void Map(Dtype alpha, const Blob<Dtype>& A, const Blob<Dtype>& B,
     cpu_e_f<Op>(A.count(), alpha, A.cpu_data(), B->cpu_data(),
         C->mutable_cpu_data());
   } else {
-#ifdef USE_GPU
-    LOG(ERROR) << "Not implemented";
-#endif  // USE_GPU
+    // TODO(wangwei) implement gpu version.
+    NO_GPU;
   }
 }
 
@@ -353,7 +365,7 @@ void Copy(const Blob<Dtype>& A, Blob<Dtype>* B) {
   CUDA_CHECK(cudaMemcpy(static_cast<Dtype*>(B->mutable_gpu_data()),
              A.gpu_data(), sizeof(Dtype) * A.count(), cudaMemcpyDefault));
 #else
-  LOG(FATAL) << "Not implemented";
+  NO_GPU;
 #endif
   }
 }
@@ -365,7 +377,7 @@ void Copy(const Blob<Dtype>& A, Blob<Dtype>* B) {
  */
 template<typename Dtype>
 void Add(Dtype alpha,  const Blob<Dtype> & A, Blob<Dtype> * B) {
-  Map<singa::op::Add<Dtype>>(alpha, A, B);
+  Map<singa::op::Add<Dtype>, Dtype>(alpha, A, B);
 }
 
 /**
@@ -385,7 +397,7 @@ void Add(const Blob<Dtype> & A, const Blob<Dtype> & B,
  */
 template<typename Dtype>
 void Sub(Dtype alpha, const Blob<Dtype> & A, Blob<Dtype>* B) {
-  Map<singa::op::Sub<Dtype>>(alpha, A, B);
+  Map<singa::op::Sub<Dtype>, Dtype>(alpha, A, B);
 }
 
 /**
@@ -406,7 +418,7 @@ void Sub(const Blob<Dtype> & A, const Blob<Dtype> & B,
 template<typename Dtype>
 void Mult(const Blob<Dtype> & A, const Blob<Dtype> & B,
     Blob<Dtype> * C) {
-  Map<singa::op::Mult<Dtype>>(A, B, C);
+  Map<singa::op::Mult<Dtype>, Dtype>(A, B, C);
   // TODO(wangwei) use MKL's vector func
 }
 
@@ -417,7 +429,7 @@ void Mult(const Blob<Dtype> & A, const Blob<Dtype> & B,
 template<typename Dtype>
 void Div(const Blob<Dtype> & A, const Blob<Dtype> & B,
     Blob<Dtype> * C) {
-  Map<singa::op::Div<Dtype>>(A, B, C);
+  Map<singa::op::Div<Dtype>, Dtype>(A, B, C);
   // TODO(wangwei) use MKL's vector func
 }
 /**
@@ -481,6 +493,8 @@ void MVAddCol(Dtype alpha, Dtype beta, const Blob<Dtype> & A, Blob<Dtype> * B) {
 #ifdef USE_GPU
       singa_gpu_add_vec_row(A.gpu_data(), B->gpu_data(), B->mutable_gpu_data(),
           m, n, n);
+#else
+      NO_GPU;
 #endif  // USE_GPU
     }
   }
@@ -520,6 +534,8 @@ void MVAddRow(Dtype alpha, Dtype beta, const Blob<Dtype> & A, Blob<Dtype> * B) {
 #ifdef USE_GPU
       singa_gpu_add_vec_row(A.gpu_data(), B->gpu_data(), B->mutable_gpu_data(),
           m, n, n);
+#else
+      NO_GPU;
 #endif  // USE_GPU
     }
   }
@@ -574,7 +590,8 @@ void MVSumCol(Dtype alpha, Dtype beta, const Blob<Dtype> & A, Blob<Dtype> * B) {
   } else {
 #ifdef USE_GPU
     singa_gpu_sum_col(A.gpu_data(), B->mutable_gpu_data(), m, n, n);
-    // gpu part (TODO check transpose case)
+#else
+    NO_GPU;
 #endif  // USE_GPU
   }
 }
@@ -599,7 +616,8 @@ void MVSumRow(Dtype alpha, Dtype beta, const Blob<Dtype> & A, Blob<Dtype> * B) {
   } else {
 #ifdef USE_GPU
     singa_gpu_sum_row(A.gpu_data(), B->mutable_gpu_data(), m, n, n);
-    // gpu part (TODO check transpose case)
+#else
+    NO_GPU;
 #endif  // USE_GPU
   }
 }
@@ -619,8 +637,9 @@ void Reduce2D(const Blob<Dtype> & A, Blob<Dtype> * B) {
     cpu_reduce_f<Op>(A.cpu_data(), m, n, B->mutable_cpu_data());
   } else {
 #ifdef USE_GPU
-    // gpu part
     gpu_reduce_f<Op>(A.gpu_data(), m, n, B->mutable_gpu_data());
+#else
+    NO_GPU;
 #endif  // USE_GPU
   }
 }
@@ -640,6 +659,8 @@ void Expand2D(const Blob<Dtype> & A, Blob<Dtype> * B) {
   } else {
 #ifdef USE_GPU
     gpu_expand_f<Op>(A.gpu_data(), m, n, B->mutable_gpu_data());
+#else
+    NO_GPU;
 #endif  // USE_GPU
   }
 }
@@ -659,6 +680,8 @@ Dtype Asum(const Blob<Dtype>& A) {
 #ifdef USE_GPU
     ret = gpu_asum(context->cublas_handle(device), A.count(), A.gpu_data(), 1)
       / A.count();
+#else
+    NO_GPU;
 #endif
   }
   return ret;
@@ -679,7 +702,7 @@ void SampleUniform(Dtype low, Dtype high, Blob<Dtype>* A) {
     gpu_sample_uniform(context->curand_generator(thread), A->count(), low, high,
         A->mutable_gpu_data());
 #else
-    LOG(FATAL) << "Not implemented";
+    NO_GPU;
 #endif
   }
 }
@@ -696,6 +719,8 @@ void SampleGaussian(Dtype mean, Dtype std, Blob<Dtype>* A) {
 #ifdef USE_GPU
     gpu_sample_gaussian(context->curand_generator(thread), A->count(),
         mean, std, A->mutable_gpu_data());
+#else
+    NO_GPU;
 #endif
   }
 }
@@ -712,8 +737,7 @@ void Softmax(int nb_rows, const Blob<Dtype>& A, Blob<Dtype>* B) {
     cpu_softmax(nb_rows, A.count() / nb_rows, A.cpu_data(),
       B->mutable_cpu_data());
   } else {
-#ifdef USE_GPU
-#endif  // USE_GPU
+    NO_GPU;
   }
 }
 
@@ -727,7 +751,7 @@ void Zero(Blob<Dtype>* B) {
 #ifdef USE_GPU
     cudaMemset(B->mutable_gpu_data(), 0, B->count() * sizeof(float));
 #else
-    LOG(FATAL) << "Not implemented";
+    NO_GPU;
 #endif  // USE_GPU
   }
 }
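
Every hunk above applies the same dispatch idiom: read the calling thread's device id from the Context singleton, run the cpu_* routine when the id is negative, and otherwise take the GPU branch, which now fails fast through NO_GPU instead of compiling to an empty or merely logging branch when USE_GPU is absent. A condensed sketch of the idiom, based on the Scale() hunk above (the cpu_scale call and its argument order are assumed from the unchanged part of the file, which the hunk does not show):

  #define NO_GPU LOG(FATAL) << "Not compiled with GPU";

  template <typename Dtype>
  void Scale(Dtype alpha, Blob<Dtype>* B) {
    auto context = Singleton<Context>::Instance();
    // A negative device id marks the calling thread as a CPU worker.
    int device = context->device_id(std::this_thread::get_id());
    if (device < 0) {
      cpu_scale(B->count(), alpha, B->mutable_cpu_data());  // assumed signature
    } else {
  #ifdef USE_GPU
      gpu_scale(context->cublas_handle(device), B->count(), alpha,
                B->mutable_gpu_data());
  #else
      NO_GPU;  // abort loudly rather than silently skip the operation
  #endif
    }
  }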

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/247002d3/src/test/test_math.cc
----------------------------------------------------------------------
diff --git a/src/test/test_math.cc b/src/test/test_math.cc
index 2627b2e..6bb6001 100644
--- a/src/test/test_math.cc
+++ b/src/test/test_math.cc
@@ -20,6 +20,8 @@
 *************************************************************/
 #include <thread>
 #include "gtest/gtest.h"
+#include "singa/utils/blob.h"
+#include "singa/utils/math_blob.h"
 #include "singa/utils/math_addr.h"
 #include "singa/utils/math_kernel.h"
 #include "singa/utils/singa_op.h"
@@ -34,6 +36,591 @@
 using namespace singa;
 using namespace std;
 
+TEST(MathBlobTest, TestScale) {
+  Blob<float> *A = new Blob<float>(10);
+  Blob<float> *B = new Blob<float>(10);
+  A->SetValue(2);
+  B->SetValue(6);
+  Scale<float>(3.0, A);
+  ASSERT_EQ(A->check_equal(B), true);
+}
+
+TEST(MathBlobTest, TestAXPY) {
+  Blob<float> * A = new Blob<float>(10);
+  Blob<float> * B = new Blob<float>(10);
+  Blob<float> * C = new Blob<float>(10);
+  Blob<float> * D = new Blob<float>(10);
+  A->SetValue(2);
+  B->SetValue(3);
+  C->SetValue(7);
+  D->SetValue(2);
+  AXPY<float>(2.0, *A, B);
+  ASSERT_EQ(B->check_equal(C), true);
+  ASSERT_EQ(A->check_equal(D), true);
+}
+
+TEST(MathBlobTest, TestGEMV) {
+  float A[5][5] = {};
+  float AT[5][5] = {};
+  float B[5] = {};
+  float Res[5] = {};
+  for(int i = 0; i < 5; i++) {
+    for(int j = 0; j < 5; j++) {
+      A[i][j] = i * j + i - j;
+      AT[j][i] = i * j + i - j;
+    }
+    B[i] = 5*i + 3;
+    Res[i] = i;
+  }
+
+  Blob<float> * BlobA = new Blob<float>(5, 5);
+  Blob<float> * BlobAT = new Blob<float>(5, 5);
+  Blob<float> * BlobB = new Blob<float>(5);
+  Blob<float> * BlobAB = new Blob<float>(5);
+  Blob<float> * BlobATB = new Blob<float>(5);
+  Blob<float> * BlobRes = new Blob<float>(5);
+
+  BlobA->set_cpu_data(A[0]);
+  BlobAT->set_cpu_data(AT[0]);
+  BlobAT->set_transpose(true);
+  BlobB->set_cpu_data(B);
+  BlobAB->set_cpu_data(Res);
+  BlobATB->set_cpu_data(Res);
+
+  for (int i = 0; i < 5; i++) {
+    for(int j = 0; j < 5; j++) {
+      Res[i] += 2*A[i][j] * B[j];
+    }
+  }
+
+  BlobRes->set_cpu_data(Res);
+
+  GEMV<float>(2, 1, *BlobA, *BlobB, BlobAB);
+  GEMV<float>(2, 1, *BlobAT, *BlobB, BlobATB);
+
+  ASSERT_EQ(BlobAB->check_equal(BlobRes), true);
+  ASSERT_EQ(BlobATB->check_equal(BlobRes), true);
+}
+
+TEST(MathBlobTest, TestMVDot) {
+  float A[5][5] = {};
+  float AT[5][5] = {};
+  float B[5] = {};
+  float Res[5] = {};
+  for(int i = 0; i < 5; i++) {
+    for(int j = 0; j < 5; j++) {
+      A[i][j] = i * j + i - j;
+      AT[j][i] = i * j + i - j;
+    }
+    B[i] = 5*i -2;
+    Res[i] = 0;
+  }
+
+  Blob<float> * BlobA = new Blob<float>(5, 5);
+  Blob<float> * BlobAT = new Blob<float>(5, 5);
+  Blob<float> * BlobB = new Blob<float>(5);
+  Blob<float> * BlobAB = new Blob<float>(5);
+  Blob<float> * BlobATB = new Blob<float>(5);
+  Blob<float> * BlobRes = new Blob<float>(5);
+
+  BlobA->set_cpu_data(A[0]);
+  BlobAT->set_cpu_data(AT[0]);
+  BlobAT->set_transpose(true);
+  BlobB->set_cpu_data(B);
+  BlobAB->set_cpu_data(Res);
+  BlobATB->set_cpu_data(Res);
+
+  for (int i = 0; i < 5; i++) {
+    for(int j = 0; j < 5; j++) {
+      Res[i] += A[i][j] * B[j];
+    }
+  }
+
+  BlobRes->set_cpu_data(Res);
+
+  MVDot<float>(*BlobA, *BlobB, BlobAB);
+  MVDot<float>(*BlobAT, *BlobB, BlobATB);
+
+  const float * addrRes = BlobAB->cpu_data();
+  for (int i = 0; i < 5; i++) {
+    ASSERT_EQ(addrRes[i], Res[i]);
+  }
+  ASSERT_EQ(BlobAB->check_equal(BlobRes), true);
+  ASSERT_EQ(BlobATB->check_equal(BlobRes), true);
+}
+
+TEST(MathBlobTest, TestGEMM) {
+  float A[5][5] = {};
+  float AT[5][5] = {};
+  float B[5][5]= {};
+  float BT[5][5]= {};
+  float Res[5][5]= {};
+  for(int i = 0; i < 5; i++) {
+    for(int j = 0; j < 5; j++) {
+      A[i][j] = i * j + i - j;
+      AT[j][i] = i * j + i - j;
+      B[i][j] = - i * j + i * i - j * j;
+      BT[j][i] = - i * j + i * i - j * j;
+      Res[i][j] = i * j + i * i + j * j;
+    }
+  }
+
+  Blob<float> * BlobA = new Blob<float>(5, 5);
+  BlobA->set_cpu_data(A[0]);
+  Blob<float> * BlobAT = new Blob<float>(5, 5);
+  BlobAT->set_cpu_data(AT[0]);
+  BlobAT->set_transpose(true);
+  Blob<float> * BlobB = new Blob<float>(5, 5);
+  BlobB->set_cpu_data(B[0]);
+  Blob<float> * BlobBT = new Blob<float>(5, 5);
+  BlobBT->set_cpu_data(BT[0]);
+  BlobBT->set_transpose(true);
+  Blob<float> * BlobAB = new Blob<float>(5, 5);
+  BlobAB->set_cpu_data(Res[0]);
+  Blob<float> * BlobABT = new Blob<float>(5, 5);
+  BlobABT->set_cpu_data(Res[0]);
+  Blob<float> * BlobATB = new Blob<float>(5, 5);
+  BlobATB->set_cpu_data(Res[0]);
+  Blob<float> * BlobATBT = new Blob<float>(5, 5);
+  BlobATBT->set_cpu_data(Res[0]);
+
+  for (int i = 0; i < 5; i++) {
+    for (int j = 0; j < 5; j++) {
+      Res[i][j] *= 2;
+      for (int k = 0; k < 5; k++) {
+        Res[i][j] += 3 * A[i][k]*B[k][j];
+      }
+    }
+  }
+
+  Blob<float> * BlobRes = new Blob<float>(5, 5);
+  BlobRes->set_cpu_data(Res[0]);
+
+  GEMM<float>(3, 2, *BlobA, *BlobB, BlobAB);
+  GEMM<float>(3, 2, *BlobA, *BlobBT, BlobABT);
+  GEMM<float>(3, 2, *BlobAT, *BlobB, BlobATB);
+  GEMM<float>(3, 2, *BlobAT, *BlobBT, BlobATBT);
+
+  ASSERT_EQ(BlobAB->check_equal(BlobRes), true);
+  ASSERT_EQ(BlobATB->check_equal(BlobRes), true);
+  ASSERT_EQ(BlobABT->check_equal(BlobRes), true);
+  ASSERT_EQ(BlobATBT->check_equal(BlobRes), true);
+}
+
+TEST(MathBlobTest, TestMMDot) {
+  float A[5][5] = {};
+  float AT[5][5] = {};
+  float B[5][5]= {};
+  float BT[5][5]= {};
+  float Res[5][5]= {};
+  for(int i = 0; i < 5; i++) {
+    for(int j = 0; j < 5; j++) {
+      A[i][j] = i * j + i - j;
+      AT[j][i] = i * j + i - j;
+      B[i][j] = - i * j + i * i - j * j;
+      BT[j][i] = - i * j + i * i - j * j;
+      Res[i][j] = i * j + i * i + j * j;
+    }
+  }
+
+  Blob<float> * BlobA = new Blob<float>(5, 5);
+  BlobA->set_cpu_data(A[0]);
+  Blob<float> * BlobAT = new Blob<float>(5, 5);
+  BlobAT->set_cpu_data(AT[0]);
+  BlobAT->set_transpose(true);
+  Blob<float> * BlobB = new Blob<float>(5, 5);
+  BlobB->set_cpu_data(B[0]);
+  Blob<float> * BlobBT = new Blob<float>(5, 5);
+  BlobBT->set_cpu_data(BT[0]);
+  BlobBT->set_transpose(true);
+  Blob<float> * BlobAB = new Blob<float>(5, 5);
+  BlobAB->set_cpu_data(Res[0]);
+  Blob<float> * BlobABT = new Blob<float>(5, 5);
+  BlobABT->set_cpu_data(Res[0]);
+  Blob<float> * BlobATB = new Blob<float>(5, 5);
+  BlobATB->set_cpu_data(Res[0]);
+  Blob<float> * BlobATBT = new Blob<float>(5, 5);
+  BlobATBT->set_cpu_data(Res[0]);
+
+  for (int i = 0; i < 5; i++) {
+    for (int j = 0; j < 5; j++) {
+      Res[i][j] = 0;
+      for (int k = 0; k < 5; k++) {
+        Res[i][j] += A[i][k]*B[k][j];
+      }
+    }
+  }
+
+  Blob<float> * BlobRes = new Blob<float>(5, 5);
+  BlobRes->set_cpu_data(Res[0]);
+
+  MMDot<float>(*BlobA, *BlobB, BlobAB);
+  MMDot<float>(*BlobA, *BlobBT, BlobABT);
+  MMDot<float>(*BlobAT, *BlobB, BlobATB);
+  MMDot<float>(*BlobAT, *BlobBT, BlobATBT);
+
+  ASSERT_EQ(BlobAB->check_equal(BlobRes), true);
+  ASSERT_EQ(BlobATB->check_equal(BlobRes), true);
+  ASSERT_EQ(BlobABT->check_equal(BlobRes), true);
+  ASSERT_EQ(BlobATBT->check_equal(BlobRes), true);
+}
+
+TEST(MathBlobTest, TestVVDot) {
+  float A[10] = {};
+  float B[10] = {};
+  float prod = 0;
+  for (int i = 0; i < 10; i++) {
+    A[i] = i * i - 5* (i%2);
+    B[i] = 2* i * i - 3* (i%4);
+    prod += A[i] * B[i];
+  }
+
+  Blob<float> * BlobA = new Blob<float>(10);
+  BlobA->set_cpu_data(A);
+  Blob<float> * BlobB = new Blob<float>(10);
+  BlobB->set_cpu_data(B);
+  float blobprod = VVDot<float>(*BlobA, *BlobB);
+  ASSERT_EQ(blobprod, prod);
+}
+
+TEST(MathBlobTest, TestOuterProduct) {
+  float A[10] = {};
+  float B[10] = {};
+  float AB[10][10] = {};
+  for (int i = 0; i < 10; i++) {
+    A[i] = i * i - 5* (i%2);
+    B[i] = 2* i * i - 3* (i%4);
+  }
+  for(int i = 0; i < 10; i++) {
+    for(int j = 0; j < 10; j++) {
+      AB[i][j] = A[i]*B[j];
+    }
+  }
+  Blob<float> * BlobA = new Blob<float>(10);
+  BlobA->set_cpu_data(A);
+  Blob<float> * BlobB = new Blob<float>(10);
+  BlobB->set_cpu_data(B);
+  Blob<float> * BlobAB = new Blob<float>(10, 10);
+  // BlobAB->SetValue(3);
+  Blob<float> * BlobRes = new Blob<float>(10, 10);
+  BlobRes->set_cpu_data(AB[0]);
+  OuterProduct<float>(*BlobA, *BlobB, BlobAB);
+
+  ASSERT_EQ(BlobAB->check_equal(BlobRes), true);
+}
+
+TEST(MathBlobTest, TestMapAB) {
+  float A[10] = {};
+  float Res[10] = {};
+  for (int i = 0; i < 10; i++) {
+    A[i] = i * i - 5* (i%2);
+    Res[i] = A[i] * A[i];
+  }
+  Blob<float> * BlobA = new Blob<float>(10);
+  BlobA->set_cpu_data(A);
+  Blob<float> * BlobB = new Blob<float>(10);
+  Blob<float> * BlobRes = new Blob<float>(10);
+  BlobRes->set_cpu_data(Res);
+  Map<singa::op::Square<float>, float>(*BlobA, BlobB);
+  ASSERT_EQ(BlobB->check_equal(BlobRes), true);
+}
+
+TEST(MathBlobTest, TestMapABC) {
+  float A[10] = {};
+  float B[10] = {};
+  float Res[10] = {};
+  for (int i = 0; i < 10; i++) {
+    A[i] = i * i - 5* (i%2);
+    B[i] = 2* i * i - 3* (i%4);
+    Res[i] = A[i] * B[i];
+  }
+  Blob<float> * BlobA = new Blob<float>(10);
+  BlobA->set_cpu_data(A);
+  Blob<float> * BlobB = new Blob<float>(10);
+  BlobB->set_cpu_data(B);
+  Blob<float> * BlobC = new Blob<float>(10);
+  Blob<float> * BlobRes = new Blob<float>(10);
+  BlobRes->set_cpu_data(Res);
+  Map<singa::op::Mult<float>, float>(*BlobA, *BlobB, BlobC);
+  ASSERT_EQ(BlobC->check_equal(BlobRes), true);
+}
+
+TEST(MathBlobTest, TestCopy) {
+  Blob<float> *BlobA = new Blob<float>(10);
+  Blob<float> *BlobB = new Blob<float>(10);
+  float A[10] = {};
+  for (int i = 0; i < 10; i++) {
+    A[i] = i * i - 5* (i%2);
+  }
+  BlobA->set_cpu_data(A);
+  Copy<float>(*BlobA, BlobB);
+  ASSERT_EQ(BlobA->check_equal(BlobB), true);
+}
+
+TEST(MathBlobTest, TestAdd) {
+  Blob<float> *A = new Blob<float>(10);
+  Blob<float> *B = new Blob<float>(10);
+  Blob<float> *C = new Blob<float>(10);
+  Blob<float> *D = new Blob<float>(10);
+  A->SetValue(5);
+  B->SetValue(6);
+  D->SetValue(11);
+  Add<float>(*A, *B, C);
+  ASSERT_EQ(C->check_equal(D), true);
+}
+
+TEST(MathBlobTest, TestSub) {
+  Blob<float> *A = new Blob<float>(10);
+  Blob<float> *B = new Blob<float>(10);
+  Blob<float> *C = new Blob<float>(10);
+  Blob<float> *D = new Blob<float>(10);
+  A->SetValue(5);
+  B->SetValue(6);
+  D->SetValue(-1);
+  Sub<float>(*A, *B, C);
+  ASSERT_EQ(C->check_equal(D), true);
+}
+
+TEST(MathBlobTest, TestMVAddCol) {
+  Blob<float> *BlobA = new Blob<float>(10);
+  Blob<float> *BlobB = new Blob<float>(10, 10);
+  Blob<float> *BlobBT = new Blob<float>(10, 10);
+  Blob<float> *BlobRes = new Blob<float>(10, 10);
+  Blob<float> *BlobResT = new Blob<float>(10, 10);
+
+  float A[10] = {};
+  float B[10][10] = {};
+  float BT[10][10] = {};
+  for(int i = 0; i < 10; i++) {
+    A[i] = 5*i -2;
+    for(int j = 0; j < 10; j++) {
+      B[i][j] = i * j + i - j;
+      BT[j][i] = i * j + i - j;
+    }
+  }
+
+  BlobA->set_cpu_data(A);
+  BlobB->set_cpu_data(B[0]);
+  BlobBT->set_cpu_data(BT[0]);
+  BlobBT->set_transpose(true);
+
+  for(int i = 0; i < 10; i++) {
+    for(int j = 0; j < 10; j++) {
+      B[i][j] = 2.0 * A[i] + 3.0 * B[i][j];
+      BT[j][i] = 2.0 * A[i] + 3.0 * BT[j][i];
+    }
+  }
+
+  BlobRes->set_cpu_data(B[0]);
+  BlobResT->set_cpu_data(BT[0]);
+  BlobResT->set_transpose(true);
+
+  MVAddCol<float>(2.0, 3.0, *BlobA, BlobB);
+  MVAddCol<float>(2.0, 3.0, *BlobA, BlobBT);
+
+  ASSERT_EQ(BlobB->check_equal(BlobRes), true);
+  ASSERT_EQ(BlobBT->check_equal(BlobResT), true);
+}
+
+TEST(MathBlobTest, TestMVAddRow) {
+  Blob<float> *BlobA = new Blob<float>(10);
+  Blob<float> *BlobB = new Blob<float>(10, 10);
+  Blob<float> *BlobBT = new Blob<float>(10, 10);
+  Blob<float> *BlobRes = new Blob<float>(10, 10);
+  Blob<float> *BlobResT = new Blob<float>(10, 10);
+
+  float A[10] = {};
+  float B[10][10] = {};
+  float BT[10][10] = {};
+  for(int i = 0; i < 10; i++) {
+    A[i] = 5*i -2;
+    for(int j = 0; j < 10; j++) {
+      B[i][j] = i * j + i - j;
+      BT[j][i] = i * j + i - j;
+    }
+  }
+
+  BlobA->set_cpu_data(A);
+  BlobB->set_cpu_data(B[0]);
+  BlobBT->set_cpu_data(BT[0]);
+  BlobBT->set_transpose(true);
+
+  for(int i = 0; i < 10; i++) {
+    for(int j = 0; j < 10; j++) {
+      B[j][i] = 2.0 * A[i] + 3.0 * B[j][i];
+      BT[i][j] = 2.0 * A[i] + 3.0 * BT[i][j];
+    }
+  }
+
+  BlobRes->set_cpu_data(B[0]);
+  BlobResT->set_cpu_data(BT[0]);
+  BlobResT->set_transpose(true);
+
+  MVAddRow<float>(2.0, 3.0, *BlobA, BlobB);
+  MVAddRow<float>(2.0, 3.0, *BlobA, BlobBT);
+
+  ASSERT_EQ(BlobB->check_equal(BlobRes), true);
+  ASSERT_EQ(BlobBT->check_equal(BlobResT), true);
+}
+
+TEST(MathBlobTest, TestRepmatCol) {
+  Blob<float> *BlobA = new Blob<float>(10);
+  Blob<float> *BlobB = new Blob<float>(10, 10);
+  Blob<float> *BlobBT = new Blob<float>(10, 10);
+  Blob<float> *BlobRes = new Blob<float>(10, 10);
+  Blob<float> *BlobResT = new Blob<float>(10, 10);
+
+  float A[10] = {};
+  float B[10][10] = {};
+  float BT[10][10] = {};
+  for(int i = 0; i < 10; i++) {
+    A[i] = 5*i -2;
+    for(int j = 0; j < 10; j++) {
+      B[i][j] = A[i];
+      BT[j][i] = A[i];
+    }
+  }
+
+  BlobA->set_cpu_data(A);
+  BlobBT->set_transpose(true);
+
+  BlobRes->set_cpu_data(B[0]);
+  BlobResT->set_cpu_data(BT[0]);
+  BlobResT->set_transpose(true);
+
+  RepmatCol<float>(*BlobA, BlobB);
+  RepmatCol<float>(*BlobA, BlobBT);
+
+  ASSERT_EQ(BlobB->check_equal(BlobRes), true);
+  ASSERT_EQ(BlobBT->check_equal(BlobResT), true);
+}
+
+TEST(MathBlobTest, TestRepmatRow) {
+  Blob<float> *BlobA = new Blob<float>(10);
+  Blob<float> *BlobB = new Blob<float>(10, 10);
+  Blob<float> *BlobBT = new Blob<float>(10, 10);
+  Blob<float> *BlobRes = new Blob<float>(10, 10);
+  Blob<float> *BlobResT = new Blob<float>(10, 10);
+
+  float A[10] = {};
+  float B[10][10] = {};
+  float BT[10][10] = {};
+  for(int i = 0; i < 10; i++) {
+    A[i] = 5*i -2;
+    for(int j = 0; j < 10; j++) {
+      B[j][i] = A[i];
+      BT[i][j] = A[i];
+    }
+  }
+
+  BlobA->set_cpu_data(A);
+  BlobBT->set_transpose(true);
+
+  BlobRes->set_cpu_data(B[0]);
+  BlobResT->set_cpu_data(BT[0]);
+  BlobResT->set_transpose(true);
+
+  RepmatRow<float>(*BlobA, BlobB);
+  RepmatRow<float>(*BlobA, BlobBT);
+
+  ASSERT_EQ(BlobB->check_equal(BlobRes), true);
+  ASSERT_EQ(BlobBT->check_equal(BlobResT), true);
+}
+
+TEST(MathBlobTest, TestMVSumCol) {
+  Blob<float> *BlobA = new Blob<float>(10);
+  Blob<float> *BlobACopy = new Blob<float>(10);
+  Blob<float> *BlobB = new Blob<float>(10, 10);
+  Blob<float> *BlobBT = new Blob<float>(10, 10);
+  Blob<float> *BlobRes = new Blob<float>(10);
+
+  float A[10] = {};
+  float B[10][10] = {};
+  float BT[10][10] = {};
+  for(int i = 0; i < 10; i++) {
+    A[i] = 5*i -2;
+    for(int j = 0; j < 10; j++) {
+      B[i][j] = i * j + i - j;
+      BT[j][i] = i * j + i - j;
+    }
+  }
+
+  BlobA->set_cpu_data(A);
+  BlobACopy->set_cpu_data(A);
+  BlobB->set_cpu_data(B[0]);
+  BlobBT->set_cpu_data(BT[0]);
+  BlobBT->set_transpose(true);
+
+  for(int i = 0; i < 10; i++) {
+    A[i] *= 2.0;
+    for(int j = 0; j < 10; j++) {
+      A[i] += 3.0 * B[i][j];
+    }
+  }
+  BlobRes->set_cpu_data(A);
+
+  MVSumCol<float>(2.0, 3.0, *BlobB, BlobA);
+  MVSumCol<float>(2.0, 3.0, *BlobBT, BlobACopy);
+
+  ASSERT_EQ(BlobA->check_equal(BlobRes), true);
+  ASSERT_EQ(BlobACopy->check_equal(BlobRes), true);
+}
+
+TEST(MathBlobTest, TestMVSumRow) {
+  Blob<float> *BlobA = new Blob<float>(10);
+  Blob<float> *BlobACopy = new Blob<float>(10);
+  Blob<float> *BlobB = new Blob<float>(10, 10);
+  Blob<float> *BlobBT = new Blob<float>(10, 10);
+  Blob<float> *BlobRes = new Blob<float>(10);
+
+  float A[10] = {};
+  float B[10][10] = {};
+  float BT[10][10] = {};
+  for(int i = 0; i < 10; i++) {
+    A[i] = 5*i -2;
+    for(int j = 0; j < 10; j++) {
+      B[j][i] = i * j + i - j;
+      BT[i][j] = i * j + i - j;
+    }
+  }
+
+  BlobA->set_cpu_data(A);
+  BlobACopy->set_cpu_data(A);
+  BlobB->set_cpu_data(B[0]);
+  BlobBT->set_cpu_data(BT[0]);
+  BlobBT->set_transpose(true);
+
+  for(int i = 0; i < 10; i++) {
+    A[i] *= 2.0;
+    for(int j = 0; j < 10; j++) {
+      A[i] += 3.0 * B[j][i];
+    }
+  }
+  BlobRes->set_cpu_data(A);
+
+  MVSumRow<float>(2.0, 3.0, *BlobB, BlobA);
+  MVSumRow<float>(2.0, 3.0, *BlobBT, BlobACopy);
+
+  ASSERT_EQ(BlobA->check_equal(BlobRes), true);
+  ASSERT_EQ(BlobACopy->check_equal(BlobRes), true);
+}
+
+TEST(MathBlobTest, TestASum) {
+  float A[10] = {};
+  for(int i = 0; i < 10; i++) {
+    A[i] = ((i % 3) -1) * i;
+  }
+
+  Blob<float> *BlobA = new Blob<float>(10);
+  BlobA->set_cpu_data(A);
+
+  float BlobRes = Asum<float>(*BlobA);
+  float res = cblas_sasum(10, A, 1) / 10;
+
+  ASSERT_EQ(BlobRes, res);
+}
+
 TEST(MathTest, TestGemmCPU) {
   float A[3][2] = {};
   float B[3][2] = {};
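
All of the new tests assert bitwise float equality (ASSERT_EQ and check_equal), which holds here because the inputs are small integer-valued floats and the CPU BLAS path is deterministic. If the same tests are later pointed at the GPU implementations, gtest's tolerance-based assertions are the usual substitute. A sketch, reusing the BlobAB/BlobRes names from the GEMM test above (got/want are illustrative locals):

  // Sketch only: tolerance-based form of the equality checks above.
  const float* got = BlobAB->cpu_data();
  const float* want = BlobRes->cpu_data();
  for (int i = 0; i < BlobAB->count(); i++) {
    EXPECT_NEAR(got[i], want[i], 1e-5);       // absolute tolerance
    // or: EXPECT_FLOAT_EQ(got[i], want[i]);  // equality within 4 ULPs
  }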

