singa-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From wan...@apache.org
Subject [42/60] incubator-singa git commit: SINGA-176 - Add loss and metric base classes
Date Fri, 03 Jun 2016 07:48:47 GMT
SINGA-176 - Add loss and metric base classes

Add loss and metric base classes, and implement the MSE as a subclass
of Loss and the Accuracy as a subclass of Metric.

Add math functions to support the metric/loss classes.

Draft test files for MSE and Accuracy.


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/d6800791
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/d6800791
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/d6800791

Branch: refs/heads/dev
Commit: d680079165496da2787064d04daf283f5b3e7bba
Parents: 72923b1
Author: wangwei <wangwei.cs@gmail.com>
Authored: Sun May 22 23:12:30 2016 +0800
Committer: Wei Wang <wangwei@comp.nus.edu.sg>
Committed: Thu May 26 14:09:53 2016 +0800

----------------------------------------------------------------------
 include/singa/core/tensor.h        |  21 +-
 include/singa/model/loss.h         |  61 +++++
 include/singa/model/metric.h       |  57 +++++
 src/core/tensor/math_kernel.cu     | 421 ++++++++++++++++++++++++++++++++
 src/core/tensor/math_kernel.h      |  82 +++++++
 src/core/tensor/tensor.cc          |  96 +++++++-
 src/core/tensor/tensor_math.h      |  66 +++--
 src/core/tensor/tensor_math_cpp.h  |  54 ++++
 src/core/tensor/tensor_math_cuda.h |  34 ++-
 src/model/loss/mse.h               |  66 +++++
 src/model/metric/accuracy.h        |  82 +++++++
 src/proto/layer.proto              |  13 +-
 test/singa/test_accuracy.cc        |  35 +++
 test/singa/test_mse.cc             |  88 +++++++
 test/singa/test_tensor.cc          |   8 +-
 test/singa/test_tensor_math.cc     |   8 +-
 16 files changed, 1146 insertions(+), 46 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/d6800791/include/singa/core/tensor.h
----------------------------------------------------------------------
diff --git a/include/singa/core/tensor.h b/include/singa/core/tensor.h
index 359f1ee..e560071 100644
--- a/include/singa/core/tensor.h
+++ b/include/singa/core/tensor.h
@@ -121,7 +121,7 @@ class Tensor {
   }
 
   /// Reset the tensor shape, it may reallocate blob, if MemSize() changes.
-  void ReShape(const Shape& shape);
+  void Reshape(const Shape& shape);
 
   /// Reset the shape, device, and data type as given tensor.
   /// If blob size changes, then reallocate a new blob. The previous blob would
@@ -138,6 +138,10 @@ class Tensor {
   /// Equivalent to ToDevice(host_dev).
   void ToHost();
 
+  /// Set each element of the tensor to be x
+  template<typename SType>
+  void SetValue(SType x);
+
   /// For init the tensor values, copy 'num' elements.
   template<typename DType>
   void CopyDataFromHostPtr(const DType* src, size_t num);
@@ -223,8 +227,23 @@ Tensor ReLU(const Tensor& t);
 Tensor Sigmoid(const Tensor& t);
 Tensor Sign(const Tensor& t);
 Tensor Sqrt(const Tensor& t);
+Tensor Square(const Tensor& t);
 Tensor Tanh(const Tensor& t);
 
+
+template<typename SType>
+SType Sum(const Tensor& t);
+/// Sum elements in the Tensor, currently only supports vectors and matrices.
+/// if 'axis' is 0, sum all rows into a single row
+/// if 'axis' is 1, sum all columns into a single column
+/// TODO(wangwei) support arbitrary Tensor like numpy.sum
+Tensor Sum(const Tensor& t, int axis);
+
+/// Average elements in the Tensor, currently only supports vectors and matrices.
+/// if 'axis' is 0, average all rows into a single row
+/// if 'axis' is 1, average all columns into a single column
+/// TODO(wangwei) support arbitrary Tensor like numpy.average
+Tensor Average(const Tensor&t, int axis);
 /// Regarding the internal data as 2d, with shape_[0]*...*shape_[axis] rows,
 /// and shape_[axis+1]*...*shape_[nDim()] columns.
 /// and do softmax along each row.

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/d6800791/include/singa/model/loss.h
----------------------------------------------------------------------
diff --git a/include/singa/model/loss.h b/include/singa/model/loss.h
new file mode 100644
index 0000000..6c79e7b
--- /dev/null
+++ b/include/singa/model/loss.h
@@ -0,0 +1,61 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef SINGA_MODEL_LOSS_H_
+#define SINGA_MODEL_LOSS_H_
+#include "singa/proto/layer.pb.h"
+#include "singa/core/tensor.h"
+namespace singa {
+
+/// The base loss class, which declares the APIs for computing the objective
+/// score (loss) for a pair of prediction (from the model) and the target (i.e.
+/// the ground truth). It also computes the gradients of the objective w.r.t.
+/// the prediction. It has similar APIs as Layer.
+template <typename T = Tensor>
+class Loss {
+ public:
+  Loss() = default;
+  void Setup(const string& conf) {
+    LossConf loss;
+    loss.ParseFromString(conf);
+    Setup(loss);
+  }
+
+  /// Set meta fields from user configurations.
+  virtual void Setup(const LossConf& conf) {}
+
+  /// Compute the loss values for each sample/instance given the prediction
+  /// and the target.
+  virtual Tensor Forward(const Tensor& prediction, const T& target) = 0;
+
+  /// Average loss values for all samples in the mini-batch
+  /// It calls Forward() internally. The calling pattern should be
+  /// [Evaluate|Forward] Backward.
+  float Evaluate(const Tensor& prediction, const T& target) {
+    const Tensor& loss = Forward(prediction, target);
+    return Sum<float>(loss) / (1.0f * loss.Size());
+  }
+
+  /// Compute the gradients of the loss values w.r.t. the prediction.
+  virtual Tensor Backward() = 0;
+};
+}  // namespace singa
+
+#endif  // SINGA_MODEL_LOSS_H_
+
+

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/d6800791/include/singa/model/metric.h
----------------------------------------------------------------------
diff --git a/include/singa/model/metric.h b/include/singa/model/metric.h
new file mode 100644
index 0000000..6519028
--- /dev/null
+++ b/include/singa/model/metric.h
@@ -0,0 +1,57 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef SINGA_MODEL_METRIC_H_
+#define SINGA_MODEL_METRIC_H_
+#include "singa/core/tensor.h"
+#include "singa/proto/layer.pb.h"
+namespace singa {
+
+/// The base metric class, which declares the APIs for computing the performance
+/// evaluation metrics given the prediction of the model and the ground truth,
+/// i.e., the target.
+/// The target type is a template argument.  For data samples with a single
+/// label, T could be a 1-d tensor (or vector<int>); if each data sample has
+/// multiple labels, T could be vector<vector<int>>, one vector per sample.
+template <typename T = Tensor>
+class Metric {
+ public:
+  // TODO(wangwei) call Setup using a default MetricConf.
+  Metric() = default;
+  void Setup(const string& conf) {
+    MetricConf metric;
+    metric.ParseFromString(conf);
+    Setup(metric);
+  }
+
+  /// Set meta fields from user configurations.
+  virtual void Setup(const MetricConf& conf) {}
+
+  /// Compute the metric for each data sample
+  virtual Tensor Forward(const Tensor& prediction, const T& target) = 0;
+
+  /// Compute the metric value averaged over all samples (in a batch)
+  float Evaluate(const Tensor& prediction, const T& target) {
+    const Tensor& metric = Forward(prediction, target);
+    return Sum<float>(metric) / (1.0f * metric.Size());
+  }
+};
+
+}  // namespace singa
+
+#endif  // SINGA_MODEL_METRIC_H_

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/d6800791/src/core/tensor/math_kernel.cu
----------------------------------------------------------------------
diff --git a/src/core/tensor/math_kernel.cu b/src/core/tensor/math_kernel.cu
new file mode 100644
index 0000000..585d65d
--- /dev/null
+++ b/src/core/tensor/math_kernel.cu
@@ -0,0 +1,421 @@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+
+#ifdef USE_CUDA
+#include <cmath>
+#include <algorithm>
+#include "./math_kernel.h"
+
+#define CU2DBLOCK_X 32
+#define CU2DBLOCK_Y 32
+
+#define CU1DBLOCK 1024
+#define CU1DBLOCKF 1024.0
+
+// Cuda Kernel Functions
+namespace cuda {
+__global__ void kernel_softmax_loss(const float *prob, const int *label,
+                                    float *loss, int n, int dim) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int num_threads = blockDim.x * gridDim.x;
+  for (; index < n; index += num_threads) {
+    float prob_of_truth = prob[index * dim + label[index]];
+    loss[index] -= log(max(prob_of_truth, FLT_MIN));
+  }
+}
+
+__global__ void kernel_softmax_gradient(float *grad, const int *label, int n,
+                                        int dim, float scale) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int num_threads = blockDim.x * gridDim.x;
+  for (; index < n; index += num_threads) {
+    int pos = index * dim + label[index];
+    grad[pos] = (grad[pos] - 1.0f) * scale;
+  }
+}
+
+__global__ void kernel_sum_vec(float *data, float *sum, int n) {
+  int THREADS = blockDim.x;
+
+  __shared__ float aux[CU1DBLOCK];
+  int steps = (n - 1) / THREADS + 1;
+  aux[threadIdx.x] = data[threadIdx.x];
+
+  for (int i = 1; i < steps; ++i) {
+    if (threadIdx.x + i * THREADS < n) {
+      aux[threadIdx.x] += data[threadIdx.x + i * THREADS];
+    }
+  }
+
+  int total_threads = THREADS;
+  __syncthreads();
+
+  while (total_threads > 1) {
+    int half_point = ((1 + total_threads) >> 1);
+    if (threadIdx.x < half_point) {
+      if (threadIdx.x + half_point < total_threads) {
+        aux[threadIdx.x] += aux[threadIdx.x + half_point];
+      }
+    }
+    __syncthreads();
+    total_threads = ((total_threads + 1) >> 1);
+  }
+
+  __syncthreads();
+  *sum = aux[0];
+}
+
+__global__ void kernel_sum_col(const float *src_mat_data, float *dst_vec_data,
+                               int rows, int cols, int stride) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int num_threads = blockDim.x * gridDim.x;
+  for (; index < rows; index += num_threads) {
+    dst_vec_data[index] = 0.0f;
+    for (int k = 0; k < cols; k++) {
+      dst_vec_data[index] += src_mat_data[index * stride + k];
+    }
+  }
+}
+
+__global__ void kernel_sum_row(const float *src_mat_data, float *dst_vec_data,
+                               int rows, int cols, int stride) {
+  int j = blockIdx.x;
+  int THREADS = blockDim.x;
+  if (j >= cols) {
+    return;
+  }
+
+  __shared__ float aux[CU1DBLOCK];
+  int steps = (rows - 1) / THREADS + 1;
+  aux[threadIdx.x] = src_mat_data[j + threadIdx.x * stride];
+  for (int i = 1; i < steps; ++i) {
+    if (threadIdx.x + i * THREADS < rows) {
+      aux[threadIdx.x] +=
+          src_mat_data[j + (threadIdx.x + i * THREADS) * stride];
+    }
+  }
+
+  int total_threads = THREADS;
+  __syncthreads();
+  while (total_threads > 1) {
+    int half_point = ((1 + total_threads) >> 1);
+    if (threadIdx.x < half_point) {
+      if (threadIdx.x + half_point < total_threads) {
+        aux[threadIdx.x] += aux[threadIdx.x + half_point];
+      }
+    }
+    __syncthreads();
+    total_threads = ((total_threads + 1) >> 1);
+  }
+
+  __syncthreads();
+  dst_vec_data[j] = aux[0];
+}
+
+__global__ void kernel_add_vec_row(const float *src_vec_data,
+                                   const float *src_mat_data,
+                                   float *des_mat_data, int rows, int cols,
+                                   int stride) {
+  int i = blockIdx.x * blockDim.x + threadIdx.x;
+  int j = blockIdx.y * blockDim.y + threadIdx.y;
+  int num_threads_x = blockDim.x * gridDim.x;
+  int num_threads_y = blockDim.y * gridDim.y;
+  int index = 0;
+  for (; i < cols && j < rows; i += num_threads_x, j += num_threads_y) {
+    index = j * stride + i;
+    des_mat_data[index] = src_mat_data[index] + src_vec_data[i];
+  }
+}
+
+__global__ void kernel_exp(const float *src_data, float *des_data, int n) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int num_threads = blockDim.x * gridDim.x;
+  for (; index < n; index += num_threads) {
+    des_data[index] = exp(src_data[index]);
+  }
+}
+
+__global__ void kernel_log(const float *src_data, float *des_data, int n) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int num_threads = blockDim.x * gridDim.x;
+  for (; index < n; index += num_threads) {
+    des_data[index] = log(src_data[index]);
+  }
+}
+
+__global__ void kernel_sigmoid(const float *src_data, float *des_data, int n) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int num_threads = blockDim.x * gridDim.x;
+  for (; index < n; index += num_threads) {
+    des_data[index] = 1.0f / (1.0f + expf(-src_data[index]));
+  }
+}
+
+__global__ void kernel_sigmoid_grad(const float *src_data, float *des_data,
+                                    int n) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int num_threads = blockDim.x * gridDim.x;
+  for (; index < n; index += num_threads) {
+    des_data[index] = src_data[index] * (1.0f - src_data[index]);
+  }
+}
+
+__global__ void kernel_relu(const float *src_data, float *des_data, int n) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int num_threads = blockDim.x * gridDim.x;
+  for (; index < n; index += num_threads) {
+    des_data[index] = max(src_data[index], 0.0f);
+  }
+}
+
+__global__ void kernel_relu_grad(const float *src_data, float *des_data,
+                                 int n) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int num_threads = blockDim.x * gridDim.x;
+  for (; index < n; index += num_threads) {
+    des_data[index] = src_data[index] > 0.0f ? 1.0f : 0.0f;
+  }
+}
+
+__global__ void kernel_tanh(const float *src_data, float *des_data, int n) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int num_threads = blockDim.x * gridDim.x;
+  for (; index < n; index += num_threads) {
+    des_data[index] = tanhf(src_data[index]);
+  }
+}
+
+__global__ void kernel_tanh_grad(const float *src_data, float *des_data,
+                                 int n) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int num_threads = blockDim.x * gridDim.x;
+  for (; index < n; index += num_threads) {
+    des_data[index] = (1.0f - src_data[index] * src_data[index]);
+  }
+}
+
+__global__ void kernel_softplus(const float *src_data, float *des_data, int n) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int num_threads = blockDim.x * gridDim.x;
+  for (; index < n; index += num_threads) {
+    des_data[index] = logf(1 + expf(src_data[index]));
+  }
+}
+
+__global__ void kernel_softplus_grad(const float *src_data, float *des_data,
+                                     int n) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int num_threads = blockDim.x * gridDim.x;
+  for (; index < n; index += num_threads) {
+    des_data[index] = 1.0f / (1.0f + expf(-src_data[index]));
+  }
+}
+
+__global__ void kernel_square(const float *src_data, float *des_data, int n) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int num_threads = blockDim.x * gridDim.x;
+  for (; index < n; index += num_threads) {
+    des_data[index] = src_data[index] * src_data[index];
+  }
+}
+
+__global__ void kernel_square_grad(const float *src_data, float *des_data,
+                                   int n) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int num_threads = blockDim.x * gridDim.x;
+  for (; index < n; index += num_threads) {
+    des_data[index] = 2 * sqrt(src_data[index]);
+  }
+}
+
+__global__ void kernel_sqrt(const float *src_data, float *des_data, int n) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int num_threads = blockDim.x * gridDim.x;
+  for (; index < n; index += num_threads) {
+    des_data[index] = sqrt(src_data[index]);
+  }
+}
+
+__global__ void kernel_pow(const float *src_data_a, const float *src_data_b,
+                           float *des_data, int n) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int num_threads = blockDim.x * gridDim.x;
+  for (; index < n; index += num_threads) {
+    des_data[index] = pow(src_data_a[index], src_data_b[index]);
+  }
+}
+
+__global__ void kernel_mult(const float *src_data_a, const float *src_data_b,
+                            float *des_data, int n) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int num_threads = blockDim.x * gridDim.x;
+  for (; index < n; index += num_threads) {
+    des_data[index] = src_data_a[index] * src_data_b[index];
+  }
+}
+
+__global__ void kernel_div(const float *src_data_a, const float *src_data_b,
+                           float *des_data, int n) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int num_threads = blockDim.x * gridDim.x;
+  for (; index < n; index += num_threads) {
+    des_data[index] = src_data_a[index] / src_data_b[index];
+  }
+}
+
+__global__ static void kernel_set_value(float *data, float value, int n) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int num_threads = blockDim.x * gridDim.x;
+  for (; index < n; index += num_threads) {
+    data[index] = value;
+  }
+}
+
+__global__ void kernel_threshold(const float *src_data, float *des_data,
+                                 float alpha, int n) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int num_threads = blockDim.x * gridDim.x;
+  for (; index < n; index += num_threads) {
+    des_data[index] = src_data[index] < alpha ? 1.0f : 0.0f;
+  }
+}
+
+/*
+void softmaxloss_forward(int n, int dim, const float *prob,
+    const int *label, float *loss) {
+  kernel_softmax_loss<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(prob, label, loss, n,
+      dim);
+}
+
+void softmaxloss_backward(int n, int dim, float scale,
+    const int *label, float *grad) {
+  kernel_softmax_gradient<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(grad, label, n,
+      dim, scale);
+}
+*/
+void sum(int n, const float *in, float *out) {
+  int threads_per_block = n > CU1DBLOCK ? CU1DBLOCK : n;
+  //  here, we only need one block
+  int num_blocks = 1;
+
+  kernel_sum_vec<<<num_blocks, threads_per_block>>>(in, out, n);
+}
+
+void sum_row(int rows, int cols, int stride, const float *in, float *out) {
+  int threads_per_block = rows > CU1DBLOCK ? CU1DBLOCK : rows;
+  int num_blocks = cols;
+
+  kernel_sum_row<<<num_blocks, threads_per_block>>>(in, out, rows, cols,
+                                                    stride);
+}
+
+void sum_col(int rows, int cols, int stride, const float *in, float *out) {
+  int threads_per_block = cols > CU1DBLOCK ? CU1DBLOCK : cols;
+  int num_blocks = rows;
+
+  kernel_sum_col<<<num_blocks, threads_per_block>>>(src_mat_data, dst_vec_data,
+                                                    rows, cols, stride);
+}
+void add_row(int rows, int cols, int stride, const float *in_row,
+             const float *in_mat, float *out) {
+  dim3 threads_per_block(CU2DBLOCK_X, CU2DBLOCK_Y);
+  dim3 num_blocks(
+      cols / threads_per_block.x + (cols % threads_per_block.x == 0 ? 0 : 1),
+      rows / threads_per_block.y + (rows % threads_per_block.y == 0 ? 0 : 1));
+  kernel_add_vec_row<<<num_blocks, threads_per_block>>>(in_row, in_mat, out,
+                                                        rows, cols, stride);
+}
+
+void exp(int n, const float *in, float *out) {
+  kernel_exp<<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>>(in, out, n);
+}
+
+void log(int n, const float *in, float *out) {
+  kernel_log<<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>>(in, out, n);
+}
+
+void sigmoid(int n, const float *in, float *out) {
+  kernel_sigmoid<<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>>(in, out, n);
+}
+
+void sigmoid_grad(int n, const float *in, float *out) {
+  kernel_sigmoid_grad<<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>>(in, out, n);
+}
+
+void relu(int n, const float *in, float *out) {
+  kernel_relu<<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>>(in, out, n);
+}
+
+void relu_grad(int n, const float *in, float *out) {
+  kernel_relu_grad<<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>>(in, out, n);
+}
+
+void tanh(int n, const float *in, float *out) {
+  kernel_tanh<<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>>(in, out, n);
+}
+
+void tanh_grad(int n, const float *in, float *out) {
+  kernel_tanh_grad<<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>>(in, out, n);
+}
+
+void softplus(int n, const float *in, float *out) {
+  kernel_softplus<<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>>(in, out, n);
+}
+
+void softplus_grad(int n, const float *in, float *out) {
+  kernel_softplus_grad<<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>>(in, out, n);
+}
+
+void square(int n, const float *in, float *out) {
+  kernel_square<<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>>(in, out, n);
+}
+
+void square_grad(int n, const float *in, float *out) {
+  kernel_square_grad<<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>>(in, out, n);
+}
+
+void sqrt(int n, const float *in, float *out) {
+  kernel_sqrt<<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>>(in, out, n);
+}
+
+void pow(int n, const float *a, const float *b, float *out) {
+  kernel_pow<<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>>(a, b, out, n);
+}
+
+void mult(int n, const float *a, const float *b, float *out) {
+  kernel_mult<<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>>(a, b, out, n);
+}
+
+void div(int n, const float *a, const float *b, float *out) {
+  kernel_div<<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>>(a, b, out, n);
+}
+
+void set_value(int n, float v, float *out) {
+  kernel_set_value<<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>>(out, v, n);
+}
+
+void threshold(int n, float alpha, const float *in, float *out) {
+  kernel_threshold<<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>>(in, out, alpha, n);
+}
+}  // namespace cuda
+}  // namespace singa
+
+#endif  // USE_CUDA

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/d6800791/src/core/tensor/math_kernel.h
----------------------------------------------------------------------
diff --git a/src/core/tensor/math_kernel.h b/src/core/tensor/math_kernel.h
new file mode 100644
index 0000000..7629ac8
--- /dev/null
+++ b/src/core/tensor/math_kernel.h
@@ -0,0 +1,82 @@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+#ifndef SRC_CORE_TENSOR__MATH_KERNEL_H_
+#define SRC_CORE_TENSOR__MATH_KERNEL_H_
+
+namespace singa {
+
+/*
+  void softmaxloss_forward(int n, int dim, const float *prob,
+      const int *label, float *loss);
+
+  void softmaxloss_backward(int n, int dim, float scale,
+      const int *label, float *grad);
+*/
+// TODO(wangwei) make all function templates.
+namespace cuda {
+void sum(int n, const float *in, float *out);
+
+void sum_row(int rows, int cols, int stride, const float *in, float *out);
+
+void sum_col(int rows, int cols, int stride, const float *in, float *out);
+
+void add_row(int rows, int cols, int stride, const float *in_row,
+  const float *in_mat, float *out);
+
+void exp(int n, const float *in, float *out);
+
+void log(int n, const float *in, float *out);
+
+void sigmoid(int n, const float *in, float *out);
+
+void sigmoid_grad(int n, const float *in, float *out);
+
+void relu(int n, const float *in, float *out);
+
+void relu_grad(int n, const float *in, float *out);
+
+void tanh(int n, const float *in, float *out);
+
+void tanh_grad(int n, const float *in, float *out);
+
+void softplus(int n, const float *in, float *out);
+
+void softplus_grad(int n, const float *in, float *out);
+
+void square(int n, const float *in, float *out);
+
+void square_grad(int n, const float *in, float *out);
+
+void sqrt(int n, const float *in, float *out);
+
+void pow(int n, const float *a, const float *b, float *out);
+
+void mult(int n, const float *a, const float *b, float *out);
+
+void div(int n, const float *a, const float *b, float *out);
+
+void set_value(int n, float v, float *out);
+
+void threshold(int n, float alpha, const float *in, float *out);
+}  // cuda
+}  // namespace singa
+
+#endif  // SRC_CORE_TENSOR__MATH_KERNEL_H_

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/d6800791/src/core/tensor/tensor.cc
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor.cc b/src/core/tensor/tensor.cc
index 185b1f9..052f3ff 100644
--- a/src/core/tensor/tensor.cc
+++ b/src/core/tensor/tensor.cc
@@ -77,7 +77,7 @@ void Tensor::ResetLike(const Tensor& t) {
   }
 }
 
-void Tensor::ReShape(const Shape& shape) {
+void Tensor::Reshape(const Shape& shape) {
   if (shape_ != shape) {
     if (blob_ != nullptr && blob_->DecRefCount() == 0) device_->FreeBlob(blob_);
     blob_ = device_->NewBlob(Product(shape) * SizeOf(data_type_));
@@ -119,6 +119,7 @@ void Tensor::CopyDataFromHostPtr(const DType* src, size_t num) {
   }
 }
 template void Tensor::CopyDataFromHostPtr(const float* src, size_t num);
+template void Tensor::CopyDataFromHostPtr(const int* src, size_t num);
 
 void Tensor::CopyData(const Tensor& src) {
   CHECK_EQ(Size(), src.Size());
@@ -279,6 +280,20 @@ void CopyDataToFrom(Tensor* dst, const Tensor& src, size_t num,
     }                                                          \
   } while (0)
 
+
+template <typename SType>
+void Tensor::SetValue(SType x) {
+  CHECK_EQ(sizeof(SType), SizeOf(data_type_));
+  auto size = Size();
+  auto ptr = blob_;
+  TYPE_LANG_SWITCH(data_type_, DType, device_->lang(), Lang, {
+    device_->Exec(
+        [size, x, ptr](Context* ctx) { Set<DType, Lang>(size, x, ptr, ctx); },
+        {}, {ptr});
+  });
+}
+
+
 #define EltwiseUnaryTensorFn(fn, t, ret)                               \
   do {                                                                 \
     TYPE_LANG_SWITCH(t.data_type(), DType, t.device()->lang(), Lang, { \
@@ -305,8 +320,87 @@ GenUnaryTensorFunction(ReLU);
 GenUnaryTensorFunction(Sigmoid);
 GenUnaryTensorFunction(Sign);
 GenUnaryTensorFunction(Sqrt);
+GenUnaryTensorFunction(Square);
 GenUnaryTensorFunction(Tanh);
 
+// TODO(wangwei) consider matrix transpose.
+Tensor SumRows(const Tensor& t) {
+  int ndim = t.shape().size();
+  CHECK_EQ(ndim, 2) << "Cannot do SumRows for Tensor with ndim = " << ndim;
+  size_t nrow = t.shape().at(0), ncol = t.shape().at(1);
+  Tensor ret(Shape{ncol}, t.device(), t.data_type());
+  TYPE_LANG_SWITCH(t.data_type(), DType, t.device()->lang(), Lang, {
+    ret.device()->Exec(
+        [nrow, ncol, t, ret](Context* ctx) {
+          SumRows<DType, Lang>(nrow, ncol, t.blob(), ret.blob(), ctx);
+        },
+        {t.blob()}, {ret.blob()});
+  });
+  return ret;
+}
+
+// TODO(wangwei) consider matrix transpose.
+Tensor SumColumns(const Tensor& t) {
+  int ndim = t.shape().size();
+  CHECK_EQ(ndim, 2) << "Cannot do SumColumns for Tensor with ndim = " << ndim;
+  CHECK(!t.transpose());  // TODO(wangwei) enable transpose
+  size_t nrow = t.shape().at(0), ncol = t.shape().at(1);
+  Tensor ret(Shape{nrow}, t.device(), t.data_type());
+  TYPE_LANG_SWITCH(t.data_type(), DType, t.device()->lang(), Lang, {
+    ret.device()->Exec(
+        [nrow, ncol, t, ret](Context* ctx) {
+          SumColumns<DType, Lang>(nrow, ncol, t.blob(), ret.blob(), ctx);
+        },
+        {t.blob()}, {ret.blob()});
+  });
+  return ret;
+}
+
+// TODO(wangwei) consider async exec
+template<>
+float Sum<float>(const Tensor& t)  {
+  float s = 0.0f;
+  TYPE_LANG_SWITCH(t.data_type(), DType, t.device()->lang(), Lang, {
+      t.device()->Exec(
+        [t, &s](Context* ctx) {
+        Sum<DType, Lang>(t.Size(), t.blob(), &s, ctx);
+        },
+        {t.blob()}, {});
+      });
+  return s;
+}
+
+Tensor Sum(const Tensor& t, int axis) {
+  if (axis == 0) {
+    return SumRows(t);
+  } else {
+    CHECK_EQ(axis, 1) << "Not support Sum over axis = " << axis;
+    return SumColumns(t);
+  }
+}
+
+Tensor Average(const Tensor& t, int axis) {
+  // operator/ only has implementation for float scalar type, hence it is
+  // necessary to cast the denominator to a float.
+  // TODO(wangwei) implement function for cast scalar type involved in Tensor
+  // functions. E.g.,
+  // template<S, D>
+  // D CastTo(S x) {
+  //   return D(x);
+  // }
+  // for special types, e.g., fp16:
+  // template<>
+  // fp16 CastTo(float x) {
+  //    ....
+  // }
+  if (axis == 0) {
+    return Sum(t, 0) / (1.0f * t.shape().at(0));
+  } else {
+    CHECK_EQ(axis, 1);
+    return Sum(t, 1) / (1.0f * t.shape().at(1));
+  }
+}
+
 Tensor Softmax(const Tensor& t, int axis) {
   Tensor ret(t.shape(), t.device(), t.data_type());
   Softmax(t, &ret, axis);

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/d6800791/src/core/tensor/tensor_math.h
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor_math.h b/src/core/tensor/tensor_math.h
index 53e979b..d55e15a 100644
--- a/src/core/tensor/tensor_math.h
+++ b/src/core/tensor/tensor_math.h
@@ -50,6 +50,10 @@ void Abs(int count, const Blob* input, Blob* ret, Context* ctx) {
   LOG(FATAL) << "Not Implemented";
 }
 
+template <typename DType, typename Lang>
+void Set(int count, DType x, Blob* ret, Context* ctx) {
+  LOG(FATAL) << "Not Implemented";
+}
 /// sum all elements of input into ret
 template <typename DType, typename Lang>
 void Sum(int count, const Blob* input, DType* ret, Context* ctx) {
@@ -80,6 +84,12 @@ void Sqrt(int count, const Blob* input, Blob* ret, Context* ctx) {
   LOG(FATAL) << "Not Implemented";
 }
 
+/// Element-wise operation, ret[i]=square(input[i])
+template <typename DType, typename Lang>
+void Square(int count, const Blob* input, Blob* ret, Context* ctx) {
+  LOG(FATAL) << "Not Implemented";
+}
+
 /// Element-wise operation, ret[i]=tanh([input[i])
 template <typename DType, typename Lang>
 void Tanh(int count, const Blob* input, Blob* ret, Context* ctx) {
@@ -102,6 +112,35 @@ void Softmax(int nrow, int ncol, const Blob* input, Blob* ret, Context* ctx) {
   LOG(FATAL) << "Not Implemented";
 }
 
+// TODO(wangwei) unify SumRow and SumCol.
+/// Sum the rows of the input matrix into a vector
+template <typename DType, typename Lang>
+void SumRows(int nrow, int ncol, const Blob* input, Blob* ret, Context* ctx) {
+  LOG(FATAL) << "Not Implemented";
+}
+
+/// Sum the columns of the input matrix into a vector
+template <typename DType, typename Lang>
+void SumColumns(int nrow, int ncol, const Blob* input, Blob* ret, Context* ctx) {
+  LOG(FATAL) << "Not Implemented";
+}
+
+// TODO(wangwei) unify AddRow and AddCol.
+/// Add the vector v to every row of A as the row of ret
+template <typename DType, typename Lang>
+void AddRow(int nrow, int ncol, const Blob* A, const Blob* v, Blob* ret,
+            Context* ctx) {
+  LOG(FATAL) << "Not Implemented";
+}
+
+/// Add the vector v to every column of A as the column of ret
+template <typename DType, typename Lang>
+void AddCol(int nrow, int ncol, const Blob* A, const Blob* v, Blob* ret,
+            Context* ctx) {
+  LOG(FATAL) << "Not Implemented";
+}
+
+
 /// Element-wise operation, do v^x for every v from the input tensor
 template <typename DType, typename Lang>
 void Pow(int count, const Blob* input, DType x, Blob* ret, Context* ctx) {
@@ -177,33 +216,6 @@ void Outer(int m, int n, const Blob* lhs, const Blob* rhs, Blob* ret,
   LOG(FATAL) << "Not Implemented";
 }
 
-// TODO(wangwei) unify SumRow and SumCol.
-/// Sum the rows of the input matrix into a vector
-template <typename DType, typename Lang>
-void SumRow(int nrow, int ncol, const Blob* input, Blob* ret, Context* ctx) {
-  LOG(FATAL) << "Not Implemented";
-}
-/// Sum the rows of the input matrix into a vector
-template <typename DType, typename Lang>
-void SumCol(int nrow, int ncol, const Blob* input, Blob* ret, Context* ctx) {
-  LOG(FATAL) << "Not Implemented";
-}
-
-// TODO(wangwei) unify AddRow and AddCol.
-/// Add the vector v to every row of A as the row of ret
-template <typename DType, typename Lang>
-void AddRow(int nrow, int ncol, const Blob* A, const Blob* v, Blob* ret,
-            Context* ctx) {
-  LOG(FATAL) << "Not Implemented";
-}
-
-/// Add the vector v to every column of A as the column of ret
-template <typename DType, typename Lang>
-void AddCol(int nrow, int ncol, const Blob* A, const Blob* v, Blob* ret,
-            Context* ctx) {
-  LOG(FATAL) << "Not Implemented";
-}
-
 // ===== BLAS functions, ref to http://docs.nvidia.com/cuda/cublas
 // ===== Level 1
 /// return the index of the element with the max value.

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/d6800791/src/core/tensor/tensor_math_cpp.h
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor_math_cpp.h b/src/core/tensor/tensor_math_cpp.h
index b58e3bd..c584b69 100644
--- a/src/core/tensor/tensor_math_cpp.h
+++ b/src/core/tensor/tensor_math_cpp.h
@@ -26,6 +26,16 @@
 
 namespace singa {
 template <>
+void Square<float, lang::Cpp>(int count, const Blob* input,
+                           Blob* ret, Context* ctx) {
+  float* dptr = static_cast<float*>(ret->mutable_data());
+  const float* in = static_cast<const float*>(input->data());
+  for (int i = 0; i < count; i++) {
+    dptr[i] = in[i] * in[i];
+  }
+}
+
+template <>
 void Add<float, lang::Cpp>(int count, const Blob* lhs, const Blob* rhs,
                            Blob* ret, Context* ctx) {
   // CHECK_EQ(ctx->stream, nullptr);
@@ -36,6 +46,50 @@ void Add<float, lang::Cpp>(int count, const Blob* lhs, const Blob* rhs,
     dptr[i] = lptr[i] + rptr[i];
   }
 }
+
+
+// sum all elements of input into ret
+// TODO(wangwei) optimize using omp
+template <>
+void Sum<float, lang::Cpp>(int count, const Blob* input, float* ret,
+    Context* ctx) {
+  float s = 0.f;
+  const float* in = static_cast<const float*>(input->data());
+  for (int i = 0; i < count; i++) {
+    s += in[i];
+  }
+  *ret = s;
+}
+
+// TODO(wangwei) optimize using omp
+template <>
+void SumRows<float, lang::Cpp>(int nrow, int ncol, const Blob* input, Blob* ret,
+    Context* ctx) {
+  float* dptr = static_cast<float*>(ret->mutable_data());
+  const float* in = static_cast<const float*>(input->data());
+  memset(dptr, 0, ncol * sizeof(float));
+  for (int r = 0; r < nrow; r++) {
+    for (int c = 0; c < ncol; c++) {
+      dptr[c] += in[r * ncol + c];
+    }
+  }
+}
+
+// Sum the columns of the input matrix into a vector
+// TODO(wangwei) optimize using omp
+template <>
+void SumColumns<float, lang::Cpp>(int nrow, int ncol, const Blob* input, Blob* ret,
+    Context* ctx) {
+  float* dptr = static_cast<float*>(ret->mutable_data());
+  const float* in = static_cast<const float*>(input->data());
+  memset(dptr, 0, nrow * sizeof(float));
+  for (int r = 0; r < nrow; r++) {
+    for (int c = 0; c < ncol; c++) {
+      dptr[r] += in[r * ncol + c];
+    }
+  }
+}
+
 template <>
 void EltwiseMult<float, lang::Cpp>(int count, const Blob* input, float x,
                                    Blob* ret, Context* ctx) {

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/d6800791/src/core/tensor/tensor_math_cuda.h
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor_math_cuda.h b/src/core/tensor/tensor_math_cuda.h
index 40f9210..2e497d2 100644
--- a/src/core/tensor/tensor_math_cuda.h
+++ b/src/core/tensor/tensor_math_cuda.h
@@ -18,14 +18,14 @@
 
 #ifndef  SINGA_CORE_TENSOR_TENSOR_MATH_CUDA_H_
 #define  SINGA_CORE_TENSOR_TENSOR_MATH_CUDA_H_
-#include "./tensor_math.h"
 #include "singa_config.h"
+#ifdef USE_CUDA
+#include "./tensor_math.h"
+#include "./math_kernel.h"
 #include "singa/core/common.h"
 
-
 namespace singa {
 
-#ifdef USE_CUDA
 template<>
 void Add<float, lang::Cuda>(int count, const Blob* lhs, const Blob* rhs,
                         Blob* ret, Context* ctx) {
@@ -38,9 +38,35 @@ void Add<float, lang::Cuda>(int count, const Blob* lhs, const Blob* rhs,
   cublasSaxpy(ctx->cublas_handle, 1.0f, rptr, 1, ptr, 1);
   */
 }
+// sum all elements of input into ret
+// TODO(wangwei) optimize using stream
+template <>
+void Sum<float, lang::Cuda>(int count, const Blob* input, float* ret,
+                            Context* ctx) {
+  const float* in = static_cast<const float*>(input->data());
+  cuda::sum(count, in, ret);
+}
+
+// TODO(wangwei) optimize using stream
+template <>
+void SumRows<float, lang::Cuda>(int nrow, int ncol, const Blob* input,
+                                Blob* ret, Context* ctx) {
+  float* dptr = static_cast<float*>(ret->mutable_data());
+  const float* in = static_cast<const float*>(input->data());
+  cuda::sum_row(nrow, ncol, ncol, in, dptr);
+}
 
-#endif
+// Sum the columns of the input matrix into a vector
+// TODO(wangwei) optimize using stream
+template <>
+void SumColumns<float, lang::Cuda>(int nrow, int ncol, const Blob* input,
+                                   Blob* ret, Context* ctx) {
+  float* dptr = static_cast<float*>(ret->mutable_data());
+  const float* in = static_cast<const float*>(input->data());
+  cuda::sum_col(nrow, ncol, ncol, in, dptr);
+}
 }
 
 
+#endif  // USE_CUDA
 #endif  // SINGA_CORE_TENSOR_TENSOR_MATH_CUDA_H_

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/d6800791/src/model/loss/mse.h
----------------------------------------------------------------------
diff --git a/src/model/loss/mse.h b/src/model/loss/mse.h
new file mode 100644
index 0000000..5799f13
--- /dev/null
+++ b/src/model/loss/mse.h
@@ -0,0 +1,66 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef SINGA_MODEL_LOSS_MSE_H_
+#define SINGA_MODEL_LOSS_MSE_H_
+#include <stack>
+#include "singa/model/loss.h"
+
+namespace singa {
+
+/// MSE is for mean squared error or squared euclidean distance.
+class MSE : public Loss<Tensor> {
+ public:
+  /// Compute the loss values for each sample/instance given the prediction
+  /// and the target, which is 0.5 * ||prediction-target||^2
+  /// Users can call Average(const Tensor&) to get the average
+  /// loss value over all samples in the batch.
+  Tensor Forward(const Tensor& prediction, const Tensor& target) override;
+
+  /// Compute the gradients of the loss values w.r.t. the prediction,
+  /// which is (prediction-target)/batchsize
+  Tensor Backward() override;
+
+ private:
+  // to buffer intermediate data, i.e., prediction-target
+  std::stack<Tensor> buf_;
+};
+
+Tensor MSE::Forward(const Tensor& prediction, const Tensor& target) {
+  CHECK(buf_.empty()) << "Do not call Forward more than once without calling"
+                      << " Backward. The calling pattern is [Forward|Evaluate] Backward";
+  Tensor t = prediction - target;
+  size_t batchsize = 1;
+  if (t.nDim() > 1) batchsize = t.shape().at(0);
+  size_t dim = t.Size() / batchsize;
+  t.Reshape(Shape{batchsize, dim});
+  buf_.push(t);
+  // TODO(wangwei) use CastTo for operator/
+  // loss per sample is 0.5 * ||prediction-target||^2; operator/ is
+  // implemented for a float scalar denominator.
+  return Sum(Square(t), 1) / 2.0f;
+}
+
+Tensor MSE::Backward() {
+  const Tensor& ret = buf_.top();
+  buf_.pop();
+  return ret / (1.0f * ret.shape().at(0));
+}
+}  // namespace singa
+
+#endif  // SINGA_MODEL_LOSS_MSE_H_
+
+

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/d6800791/src/model/metric/accuracy.h
----------------------------------------------------------------------
diff --git a/src/model/metric/accuracy.h b/src/model/metric/accuracy.h
new file mode 100644
index 0000000..05c1643
--- /dev/null
+++ b/src/model/metric/accuracy.h
@@ -0,0 +1,82 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef SINGA_MODEL_METRIC_ACCURACY_H_
+#define SINGA_MODEL_METRIC_ACCURACY_H_
+#include "singa/model/metric.h"
+namespace singa {
+
+/// Compute the accuracy of the prediction, which is matched against the
+/// ground truth labels.
+/// TODO(wangwei) consider multi-label cases.
+class Accuracy : public Metric<Tensor> {
+ public:
+  /// Set meta fields from user configurations.
+  void Setup(const MetricConf& conf) override { top_k_ = conf.top_k(); }
+
+  /// Check the prediction against the target (ground truth) for each data
+  /// sample. The returned Tensor has a float value for each sample, 0 for wrong
+  /// and 1 for correct. Users can call Sum(const Tensor&) / Tensor::Size() to
+  /// get the accuracy.
+  Tensor Forward(const Tensor& prediction, const Tensor& target);
+
+ private:
+  /// \copydoc Match(const Tensor&, const Tensor&);
+  Tensor Match(const Tensor& prediction, const vector<int>& target);
+  /// If the ground truth label is in the top k predicted labels, then the
+  /// prediction is correct.
+  size_t top_k_ = 1;
+};
+
+Tensor Accuracy::Match(const Tensor& prediction, const vector<int>& target) {
+  size_t batchsize = target.size();
+  size_t nb_classes = prediction.Size() / batchsize;
+  // each row of prediction is the prob distribution for one sample
+  CHECK_EQ(prediction.shape().at(0), batchsize);
+  const float* prob = prediction.data<const float*>();
+  // value-initialize to 0 so samples with no top-k match score 0
+  float* score = new float[batchsize]();
+  for (size_t b = 0; b < batchsize; b++) {
+    vector<std::pair<float, int>> prob_class;
+    for (size_t c = 0; c < nb_classes; c++) {
+      prob_class.push_back(std::make_pair(prob[b * nb_classes + c], c));
+    }
+    std::partial_sort(prob_class.begin(), prob_class.begin() + top_k_,
+                      prob_class.end(), std::greater<std::pair<float, int>>());
+
+    for (size_t k = 0; k < top_k_; k++)
+      if (prob_class.at(k).second == target.at(b)) score[b] = 1;
+  }
+  Tensor ret(Shape{batchsize});
+  ret.CopyDataFromHostPtr(score, batchsize);
+  delete [] score;
+  return ret;
+}
+
+// TODO(wangwei) consider multi-label cases, where target is of shape
+// nb_samples * nb_classes
+Tensor Accuracy::Forward(const Tensor& prediction, const Tensor& target) {
+  vector<int> target_vec;
+  // TODO(wangwei) copy target to host.
+  const int* target_value = target.data<const int*>();
+  for (size_t i = 0; i < target.Size(); i++)
+    target_vec.push_back(target_value[i]);
+  return Match(prediction, target_vec);
+}
+
+}  // namespace singa
+
+#endif  // SINGA_MODEL_METRIC_ACCURACY_H_

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/d6800791/src/proto/layer.proto
----------------------------------------------------------------------
diff --git a/src/proto/layer.proto b/src/proto/layer.proto
index 3d130ea..51225ee 100644
--- a/src/proto/layer.proto
+++ b/src/proto/layer.proto
@@ -157,7 +157,7 @@ message LayerConf {
   // for their implementation. These layers include an Engine type and
   // engine parameter for selecting the implementation.
   // The default for the engine is set by the ENGINE switch at compile-time.
-  optional AccuracyConf accuracy_conf = 102;
+  //optional AccuracyConf accuracy_conf = 102;
   optional ArgMaxConf argmax_conf = 103;
   optional ConcatConf concat_conf = 104;
   optional ContrastiveLossConf contrastive_loss_conf = 105;
@@ -177,6 +177,8 @@ message LayerConf {
   optional InnerProductConf inner_product_conf = 117;
   optional LogConf log_conf = 134;
   optional LRNConf lrn_conf = 118;
+  // Used in SINGA
+  optional MetricConf metric_conf = 200;
   // optional MemoryDataConf memory_data_conf = 119;
   optional MVNConf mvn_conf = 120;
   optional PoolingConf pooling_conf = 121;
@@ -230,10 +232,7 @@ message LossConf {
   optional bool normalize = 2 [default = true];
 }
 
-// Messages that store hyper-parameters used by individual layer types follow, in
-// alphabetical order.
-
-message AccuracyConf {
+message MetricConf {
   // When computing accuracy, count as correct by comparing the true label to
   // the top k scoring classes.  By default, only compare to the top scoring
   // class (i.e. argmax).
@@ -249,6 +248,10 @@ message AccuracyConf {
   // If specified, ignore instances with the given label.
   optional int32 ignore_label = 3;
 }
+// Messages that store hyper-parameters used by individual layer types follow, in
+// alphabetical order.
+
+
 
 message ArgMaxConf {
   // If true produce pairs (argmax, maxval)

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/d6800791/test/singa/test_accuracy.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_accuracy.cc b/test/singa/test_accuracy.cc
new file mode 100644
index 0000000..dc7719b
--- /dev/null
+++ b/test/singa/test_accuracy.cc
@@ -0,0 +1,35 @@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+
+#include "gtest/gtest.h"
+#include "../src/model/metric/accuracy.h"
+
+TEST(Accuracy, Compute) {
+  singa::Accuracy acc;
+  singa::Tensor p(singa::Shape{2, 3});
+  singa::Tensor t(singa::Shape{2}, singa::kInt);
+  const float pdat[6] = {0.1, 0.3, 0.6, 0.3, 0.2, 0.5};
+  const int tdat[2] = {1, 2};  // one wrong, one correct
+  p.CopyDataFromHostPtr(pdat, sizeof(pdat) / sizeof(float));
+  t.CopyDataFromHostPtr(tdat, sizeof(tdat) / sizeof(int));
+  float a = acc.Evaluate(p, t);
+  EXPECT_FLOAT_EQ(a, 0.5f);
+}

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/d6800791/test/singa/test_mse.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_mse.cc b/test/singa/test_mse.cc
new file mode 100644
index 0000000..9056176
--- /dev/null
+++ b/test/singa/test_mse.cc
@@ -0,0 +1,88 @@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+
+#include "gtest/gtest.h"
+#include "singa/core/tensor.h"
+#include "singa/core/device.h"
+#include "../src/model/loss/mse.h"
+
+using singa::Tensor;
+class TestMSE : public ::testing::Test {
+ protected:
+  virtual void SetUp() {
+    p.Reshape(singa::Shape{2, 3});
+    t.Reshape(singa::Shape{2, 3});
+    p.CopyDataFromHostPtr(pdat, sizeof(pdat) / sizeof(float));
+    t.CopyDataFromHostPtr(tdat, sizeof(tdat) / sizeof(float));
+  }
+  const float pdat[6] = {0.1, 1.1, 2.1, 0.3, 2.2, 1.8};
+  const float tdat[6] = {0.1, 1.1, 2.0, 0.3, 2.2, 1.8};
+
+  singa::Tensor p, t;
+};
+
+TEST_F(TestMSE, CppForward) {
+  singa::MSE mse;
+  const Tensor& loss = mse.Forward(p, t);
+  auto ldat = loss.data<const float*>();
+
+  EXPECT_FLOAT_EQ(ldat[0], 0.005);
+  EXPECT_FLOAT_EQ(ldat[1], 0);
+}
+
+TEST_F(TestMSE, CudaForward) {
+  singa::MSE mse;
+  singa::CudaGPU dev;
+  p.ToDevice(&dev);
+  t.ToDevice(&dev);
+  Tensor loss = mse.Forward(p, t);
+
+  loss.ToHost();
+  auto ldat = loss.data<const float*>();
+
+  for (size_t i = 0; i < loss.Size(); i++)
+    EXPECT_FLOAT_EQ(ldat[i], 0.5 * (pdat[i] - tdat[i]) * (pdat[i] - tdat[i]));
+}
+
+TEST_F(TestMSE, CppBackward) {
+  singa::MSE mse;
+  mse.Forward(p, t);
+  const Tensor& grad = mse.Backward();
+
+  auto gdat = grad.data<const float*>();
+
+  for (size_t i = 0; i < grad.Size(); i++)
+    EXPECT_FLOAT_EQ(gdat[i], pdat[i] - tdat[i]);
+}
+
+TEST_F(TestMSE, CudaBackward) {
+  singa::MSE mse;
+  singa::CudaGPU dev;
+  p.ToDevice(&dev);
+  t.ToDevice(&dev);
+  mse.Forward(p, t);
+  Tensor grad = mse.Backward();
+  grad.ToHost();
+  auto gdat = grad.data<const float*>();
+
+  for (size_t i = 0; i < grad.Size(); i++)
+    EXPECT_FLOAT_EQ(gdat[i], pdat[i] - tdat[i]);
+}

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/d6800791/test/singa/test_tensor.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_tensor.cc b/test/singa/test_tensor.cc
index b3f0c6b..f9acdb0 100644
--- a/test/singa/test_tensor.cc
+++ b/test/singa/test_tensor.cc
@@ -35,18 +35,18 @@ TEST(TensorTest, TestConstructor) {
 
 TEST(TensorClass, Reshape) {
   Tensor t;
-  t.ReShape(Shape{2,3});
+  t.Reshape(Shape{2,3});
   EXPECT_TRUE((Shape{2,3} == t.shape()));
 
-  t.ReShape(Shape{3,3, 4});
+  t.Reshape(Shape{3,3, 4});
   EXPECT_TRUE((Shape{3,3, 4} == t.shape()));
 
-  t.ReShape(Shape{12});
+  t.Reshape(Shape{12});
   EXPECT_TRUE((Shape{12} == t.shape()));
 
   Tensor o;
   EXPECT_TRUE(o.shape() != t.shape());
-  o.ReShape(Shape{3, 3});
+  o.Reshape(Shape{3, 3});
   EXPECT_TRUE(o.shape() != t.shape());
 }
 

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/d6800791/test/singa/test_tensor_math.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_tensor_math.cc b/test/singa/test_tensor_math.cc
index eee18ec..fb7e3e8 100644
--- a/test/singa/test_tensor_math.cc
+++ b/test/singa/test_tensor_math.cc
@@ -9,10 +9,10 @@ class TestTensorMath : public ::testing::Test {
   virtual void SetUp() {
     const float dat1[] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f};
     const float dat2[] = {1.1f, 2.1f, 3.1f, 4.1f, 5.1f, 6.1f};
-    a.ReShape(singa::Shape{6});
-    b.ReShape(singa::Shape{6});
-    c.ReShape(singa::Shape{6, 1});
-    d.ReShape(singa::Shape{3, 2});
+    a.Reshape(singa::Shape{6});
+    b.Reshape(singa::Shape{6});
+    c.Reshape(singa::Shape{6, 1});
+    d.Reshape(singa::Shape{3, 2});
 
     a.CopyDataFromHostPtr<float>(dat1, 6);
     b.CopyDataFromHostPtr<float>(dat2, 6);


Mime
View raw message