From: wangwei@apache.org
To: commits@singa.incubator.apache.org
Date: Tue, 02 Aug 2016 04:09:17 -0000
Subject: [1/2] incubator-singa git commit: Singa-228 Add Cpp Version of Convolution and Pooling layer

Repository: incubator-singa
Updated Branches:
  refs/heads/dev 464dcda63 -> f07e3545c


Singa-228 Add Cpp Version of Convolution and Pooling layer

Implement the C++ version of the convolution and pooling layers, reusing some functions from the previous version of SINGA. The pooling layer supports only max and average pooling (no stochastic pooling). All tests pass.
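The spatial output size in both new layers follows the usual formula out = (in + 2*pad - kernel) / stride + 1, and the convolution Forward/Backward unroll each image into a col buffer of shape (channels * kernel_h * kernel_w) x (conv_height * conv_width). Below is a minimal standalone sketch of these shape computations (plain C++; the names are illustrative only, not the SINGA API):

#include <cstdio>

int main() {
  // The 3x3 input, 3x3 kernel, stride 2, pad 1 case used in test_convolution.cc.
  const int height = 3, width = 3, channels = 1;
  const int kernel_h = 3, kernel_w = 3, pad_h = 1, pad_w = 1, stride_h = 2, stride_w = 2;

  const int conv_height = (height + 2 * pad_h - kernel_h) / stride_h + 1;  // = 2
  const int conv_width  = (width + 2 * pad_w - kernel_w) / stride_w + 1;   // = 2

  // Dimensions of the im2col buffer: one column per output position,
  // one row per (channel, kernel row, kernel col) triple.
  const int col_height = channels * kernel_h * kernel_w;  // = 9
  const int col_width  = conv_height * conv_width;        // = 4

  std::printf("output %dx%d, col buffer %dx%d\n", conv_height, conv_width, col_height, col_width);
  return 0;
}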
Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/ed981497
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/ed981497
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/ed981497

Branch: refs/heads/dev
Commit: ed9814976f63ed9ce3e1d721e6426be28c20fc77
Parents: 4e7f3c1
Author: Xiangrui
Authored: Thu Jul 28 10:14:27 2016 +0800
Committer: XiangruiCAI
Committed: Thu Jul 28 10:27:08 2016 +0800

----------------------------------------------------------------------
 src/model/layer/convolution.cc       | 126 ++++++++++++++++-
 src/model/layer/convolution.h        |  20 ++-
 src/model/layer/pooling.cc           | 219 +++++++++++++++++++++++++++++-
 src/model/layer/pooling.h            |  26 +++-
 test/singa/test_convolution.cc       | 204 ++++++++++++++++++++++++++++
 test/singa/test_cudnn_convolution.cc |  14 +-
 test/singa/test_pooling.cc           | 141 +++++++++++++++++++
 7 files changed, 725 insertions(+), 25 deletions(-)
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/ed981497/src/model/layer/convolution.cc
----------------------------------------------------------------------
diff --git a/src/model/layer/convolution.cc b/src/model/layer/convolution.cc
index a43014d..1bf6b39 100644
--- a/src/model/layer/convolution.cc
+++ b/src/model/layer/convolution.cc
@@ -24,7 +24,7 @@
 namespace singa {
 using std::vector;
 RegisterLayerClass(Convolution);
-void Convolution::Setup(const Shape& in_sample, const LayerConf &conf) {
+void Convolution::Setup(const Shape &in_sample, const LayerConf &conf) {
   Layer::Setup(in_sample, conf);
   ConvolutionConf conv_conf = conf.convolution_conf();
   // kernel_size, pad, and stride are repeated fields.
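As a cross-check on the Forward hunk that follows, the values asserted further down in test_convolution.cc (3, 7, -3, 12 per image) can be reproduced with a naive direct convolution. This is an illustrative plain-C++ sketch only, not the SINGA implementation, which goes through Im2col and a matrix multiply instead:

#include <cstdio>

int main() {
  // One 3x3 input image, one 3x3 filter, pad 1, stride 2, bias 1
  // (the configuration used in TEST(Convolution, Forward)).
  const int H = 3, W = 3, K = 3, pad = 1, stride = 2;
  const float x[H][W] = {{1, 2, 3}, {4, 5, 6}, {7, 8, 9}};
  const float wgt[K][K] = {{1, 1, 0}, {0, 0, -1}, {0, 1, 0}};
  const float bias = 1.0f;

  const int OH = (H + 2 * pad - K) / stride + 1;  // = 2
  const int OW = (W + 2 * pad - K) / stride + 1;  // = 2
  for (int oh = 0; oh < OH; ++oh) {
    for (int ow = 0; ow < OW; ++ow) {
      float sum = bias;
      for (int kh = 0; kh < K; ++kh) {
        for (int kw = 0; kw < K; ++kw) {
          int h = oh * stride - pad + kh;
          int w = ow * stride - pad + kw;
          if (h >= 0 && h < H && w >= 0 && w < W) sum += x[h][w] * wgt[kh][kw];
        }
      }
      std::printf("%.0f ", sum);  // prints: 3 7 -3 12
    }
  }
  std::printf("\n");
  return 0;
}
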
@@ -97,24 +97,136 @@ void Convolution::Setup(const Shape& in_sample, const LayerConf &conf) { /// \copydoc Layer::Forward(int flag, const Tensor&) const Tensor Convolution::Forward(int flag, const Tensor &input) { - Tensor output; - // will be used in cpp version later + CHECK(buf_.empty()); + CHECK_EQ(input.device()->lang(), kCpp); + CHECK_EQ(input.nDim(), 4u); + if (flag & kTrain) buf_.push(input); + size_t batchsize = input.shape(0); + size_t imagesize = input.Size() / batchsize; + DataType dtype = input.data_type(); + auto dev = input.device(); + Shape shape{batchsize, num_filters_, conv_height_, conv_width_}; + Tensor output(shape, dev, dtype); Tensor col_data(Shape{col_height_, col_width_}); - Tensor col_grad(Shape{col_height_, col_width_}); + float *data_col = new float[col_height_ * col_width_]; + auto in_data = input.data(); + for (size_t b = 0; b < batchsize; b++) { + Im2col(in_data + b * imagesize, channels_, height_, width_, kernel_h_, + kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, data_col); + col_data.CopyDataFromHostPtr(data_col, col_height_ * col_width_); + Tensor each = Mult(weight_, col_data); + if (bias_term_) { + AddColumn(bias_, &each); + } + CopyDataToFrom(&output, each, each.Size(), b * each.Size()); + } + delete[] data_col; return output; } /// \copydoc Layer::Backward(int, const Tensor&, const Tensor&); const std::pair> Convolution::Backward( int flag, const Tensor &grad) { + CHECK_EQ(grad.device()->lang(), kCpp); + CHECK_EQ(grad.nDim(), 4u); + CHECK(!buf_.empty()); + Tensor src_data = buf_.top(); + buf_.pop(); vector param_grad; - Tensor input_grad; - - return std::make_pair(input_grad, param_grad); + Tensor dx; + Tensor db, dw; + dx.ResetLike(src_data); + db.ResetLike(bias_); + dw.ResetLike(weight_); + dw.SetValue(0.0f); + size_t batchsize = grad.shape(0); + size_t imagesize = src_data.Size() / batchsize; + if (bias_term_) { + Tensor tmp1 = + Reshape(grad, Shape{batchsize * num_filters_, + grad.Size() / (batchsize * num_filters_)}); + Tensor tmp2(Shape{batchsize * num_filters_}); + SumColumns(tmp1, &tmp2); + Tensor tmp3 = Reshape(tmp2, Shape{batchsize, num_filters_}); + SumRows(tmp3, &db); + } + auto in_data = src_data.data(); + Tensor col_data(Shape{col_height_, col_width_}); + float *data_col = new float[col_height_ * col_width_]; + float *dx_b = new float[imagesize]; + for (size_t b = 0; b < batchsize; b++) { + Im2col(in_data + b * imagesize, channels_, height_, width_, kernel_h_, + kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, data_col); + col_data.CopyDataFromHostPtr(data_col, col_height_ * col_width_); + Tensor grad_b(Shape{num_filters_, conv_height_ * conv_width_}); + CopyDataToFrom(&grad_b, grad, grad_b.Size(), 0, b * grad_b.Size()); + dw += Mult(grad_b, col_data.T()); + Tensor dcol_b = Mult(weight_.T(), grad_b); + auto dcol_data = dcol_b.data(); + Col2im(dcol_data, channels_, height_, width_, kernel_h_, kernel_w_, pad_h_, + pad_w_, stride_h_, stride_w_, dx_b); + dx.CopyDataFromHostPtr(dx_b, imagesize, b * imagesize); + } + param_grad.push_back(dw); + param_grad.push_back(db); + delete[] data_col; + delete[] dx_b; + return std::make_pair(dx, param_grad); } void Convolution::ToDevice(std::shared_ptr device) { Layer::ToDevice(device); weight_.ToDevice(device); bias_.ToDevice(device); } + +void Convolution::Im2col(const float *data_im, const int channels, + const int height, const int width, const int kernel_h, + const int kernel_w, const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + float *data_col) { + int height_col = 
(height + 2 * pad_h - kernel_h) / stride_h + 1; + int width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1; + int channels_col = channels * kernel_h * kernel_w; + for (int c = 0; c < channels_col; ++c) { + int w_offset = c % kernel_w; + int h_offset = (c / kernel_w) % kernel_h; + int c_im = c / kernel_h / kernel_w; + for (int h = 0; h < height_col; ++h) { + for (int w = 0; w < width_col; ++w) { + int h_pad = h * stride_h - pad_h + h_offset; + int w_pad = w * stride_w - pad_w + w_offset; + if (h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width) + data_col[(c * height_col + h) * width_col + w] = + data_im[(c_im * height + h_pad) * width + w_pad]; + else + data_col[(c * height_col + h) * width_col + w] = 0; + } + } + } +} + +void Convolution::Col2im(const float *data_col, const int channels, + const int height, const int width, const int patch_h, + const int patch_w, const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + float *data_im) { + memset(data_im, 0, height * width * channels * sizeof(float)); + int height_col = (height + 2 * pad_h - patch_h) / stride_h + 1; + int width_col = (width + 2 * pad_w - patch_w) / stride_w + 1; + int channels_col = channels * patch_h * patch_w; + for (int c = 0; c < channels_col; ++c) { + int w_offset = c % patch_w; + int h_offset = (c / patch_w) % patch_h; + int c_im = c / patch_h / patch_w; + for (int h = 0; h < height_col; ++h) { + for (int w = 0; w < width_col; ++w) { + int h_pad = h * stride_h - pad_h + h_offset; + int w_pad = w * stride_w - pad_w + w_offset; + if (h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width) + data_im[(c_im * height + h_pad) * width + w_pad] += + data_col[(c * height_col + h) * width_col + w]; + } + } + } +} } // namespace singa http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/ed981497/src/model/layer/convolution.h ---------------------------------------------------------------------- diff --git a/src/model/layer/convolution.h b/src/model/layer/convolution.h index b3d0c42..1383a66 100644 --- a/src/model/layer/convolution.h +++ b/src/model/layer/convolution.h @@ -17,10 +17,10 @@ */ #ifndef SRC_MODEL_LAYER_CONVOLUTION_H_ #define SRC_MODEL_LAYER_CONVOLUTION_H_ +#include #include #include #include -#include #include "singa/model/layer.h" namespace singa { @@ -38,14 +38,24 @@ class Convolution : public Layer { // void SetupParam(const Tensor &input); /// \copydoc Layer::Forward(int flag, const Tensor&) - const Tensor Forward(int flag, const Tensor &input) override; + const Tensor Forward(int flag, const Tensor& input) override; /// \copydoc Layer::Backward(int, const Tensor&, const Tensor&); const std::pair> Backward(int flag, - const Tensor &grad) override; + const Tensor& grad) override; void ToDevice(std::shared_ptr device) override; + void Im2col(const float* data_im, const int channels, const int height, + const int width, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, + const int stride_w, float* data_col); + + void Col2im(const float* data_col, const int channels, const int height, + const int width, const int patch_h, const int patch_w, + const int pad_h, const int pad_w, const int stride_h, + const int stride_w, float* data_im); + const std::vector param_values() override { return std::vector{weight_, bias_}; } @@ -61,8 +71,8 @@ class Convolution : public Layer { size_t height() const { return height_; } size_t width() const { return width_; } bool bias_term() const { return bias_term_; } - const Tensor &weight() const 
{ return weight_; } - const Tensor &bias() const { return bias_; } + const Tensor& weight() const { return weight_; } + const Tensor& bias() const { return bias_; } void set_weight(Tensor w) { weight_.ResetLike(w); http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/ed981497/src/model/layer/pooling.cc ---------------------------------------------------------------------- diff --git a/src/model/layer/pooling.cc b/src/model/layer/pooling.cc index 9968c2c..943f9b2 100644 --- a/src/model/layer/pooling.cc +++ b/src/model/layer/pooling.cc @@ -64,24 +64,231 @@ void Pooling::Setup(const Shape& in_sample, const LayerConf& conf) { pooled_height_ = 1; if (stride_h_ > 0) pooled_height_ = - static_cast((height_ + 2 * pad_h_ - kernel_h_) / stride_h_) + 1; + static_cast((height_ + 2 * pad_h_ - kernel_h_) / stride_h_) + 1; pooled_width_ = - static_cast((width_ + 2 * pad_w_ - kernel_w_) / stride_w_) + 1; + static_cast((width_ + 2 * pad_w_ - kernel_w_) / stride_w_) + 1; out_sample_shape_ = vector{channels_, pooled_height_, pooled_width_}; } const Tensor Pooling::Forward(int flag, const Tensor& input) { - Tensor out; + CHECK(buf_.empty()); + CHECK_EQ(input.device()->lang(), kCpp); + CHECK_EQ(input.nDim(), 4u); + size_t batchsize = input.shape(0); + DataType dtype = input.data_type(); + auto dev = input.device(); + Shape shape{batchsize, channels_, pooled_height_, pooled_width_}; + Tensor output(shape, dev, dtype); + float* outptr = new float[output.Size()]; + auto inptr = input.data(); + if (pool_ == PoolingConf_PoolMethod_MAX) { + Tensor mask; + mask.ResetLike(output); + float* maskptr = new float[mask.Size()]; + ForwardMaxPooling(inptr, batchsize, channels_, height_, width_, kernel_h_, + kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, outptr, + maskptr); + mask.CopyDataFromHostPtr(maskptr, mask.Size()); + if (flag & kTrain) buf_.push(mask); + delete[] maskptr; + } else if (pool_ == PoolingConf_PoolMethod_AVE) + ForwardAvgPooling(inptr, batchsize, channels_, height_, width_, kernel_h_, + kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, outptr); + else + LOG(FATAL) << "Unknow pooling method"; - return out; + output.CopyDataFromHostPtr(outptr, output.Size()); + delete[] outptr; + return output; } const std::pair> Pooling::Backward(int flag, const Tensor& grad) { + CHECK_EQ(grad.device()->lang(), kCpp); + CHECK_EQ(grad.nDim(), 4u); vector param_grad; - Tensor input_grad; + CHECK(!buf_.empty()); + Tensor mask = buf_.top(); + buf_.pop(); + size_t batchsize = grad.shape(0); + Shape shape{batchsize, channels_, height_, width_}; + auto dev = grad.device(); + DataType dtype = grad.data_type(); + Tensor dx(shape, dev, dtype); + auto gradptr = grad.data(); + auto maskptr = mask.data(); + float* dxptr = new float[dx.Size()]; + if (pool_ == PoolingConf_PoolMethod_MAX) + BackwardMaxPooling(gradptr, maskptr, batchsize, channels_, height_, width_, + kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, + stride_w_, dxptr); + else if (pool_ == PoolingConf_PoolMethod_AVE) + BackwardAvgPooling(gradptr, batchsize, channels_, height_, width_, + kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, + stride_w_, dxptr); + else + LOG(FATAL) << "Unknow pooling method"; - return std::make_pair(input_grad, param_grad); + dx.CopyDataFromHostPtr(dxptr, dx.Size()); + delete[] dxptr; + return std::make_pair(dx, param_grad); } +void Pooling::ForwardMaxPooling(const float* bottom, const int num, + const int channels, const int height, + const int width, const int kernel_h, + const int kernel_w, const int pad_h, + const int pad_w, const int 
stride_h, + const int stride_w, float* top, float* mask) { + int top_height = (height + pad_h * 2 - kernel_h) / stride_h + 1; + int top_width = (width + pad_w * 2 - kernel_w) / stride_w + 1; + int top_count = num * top_height * top_width * channels; + for (int i = 0; i < top_count; i++) { + mask[i] = -1; + top[i] = -FLT_MAX; + } + const int bottom_offset = height * width; + const int top_offset = top_height * top_width; + // The main loop + for (int n = 0; n < num; ++n) { + for (int c = 0; c < channels; ++c) { + for (int ph = 0; ph < top_height; ++ph) { + for (int pw = 0; pw < top_width; ++pw) { + int hstart = ph * stride_h - pad_h; + int wstart = pw * stride_w - pad_w; + int hend = std::min(hstart + kernel_h, height); + int wend = std::min(wstart + kernel_w, width); + hstart = std::max(hstart, 0); + wstart = std::max(wstart, 0); + const int top_index = ph * top_width + pw; + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + const int index = h * width + w; + if (bottom[index] > top[top_index]) { + top[top_index] = bottom[index]; + mask[top_index] = index; + } + } + } + } + } + // compute offset + bottom += bottom_offset; + top += top_offset; + mask += top_offset; + } + } +} + +void Pooling::BackwardMaxPooling(const float* top, const float* mask, + const int num, const int channels, + const int height, const int width, + const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + float* bottom) { + int top_height = (height + pad_h * 2 - kernel_h) / stride_h + 1; + int top_width = (width + pad_w * 2 - kernel_w) / stride_w + 1; + const int top_offset = top_height * top_width; + const int bottom_offset = height * width; + memset(bottom, 0, sizeof(float) * num * channels * bottom_offset); + for (int n = 0; n < num; ++n) { + for (int c = 0; c < channels; ++c) { + for (int ph = 0; ph < top_height; ++ph) { + for (int pw = 0; pw < top_width; ++pw) { + const int top_idx = ph * top_width + pw; + const int bottom_idx = static_cast(mask[top_idx]); + bottom[bottom_idx] += top[top_idx]; + } + } + top += top_offset; + mask += top_offset; + bottom += bottom_offset; + } + } +} + +void Pooling::ForwardAvgPooling(const float* bottom, const int num, + const int channels, const int height, + const int width, const int kernel_h, + const int kernel_w, const int pad_h, + const int pad_w, const int stride_h, + const int stride_w, float* top) { + int top_height = (height + pad_h * 2 - kernel_h) / stride_h + 1; + int top_width = (width + pad_w * 2 - kernel_w) / stride_w + 1; + int top_count = num * top_height * top_width * channels; + for (int i = 0; i < top_count; i++) { + top[i] = 0; + } + const int bottom_offset = height * width; + const int top_offset = top_height * top_width; + // The main loop + for (int n = 0; n < num; ++n) { + for (int c = 0; c < channels; ++c) { + for (int ph = 0; ph < top_height; ++ph) { + for (int pw = 0; pw < top_width; ++pw) { + int hstart = ph * stride_h - pad_h; + int wstart = pw * stride_w - pad_w; + int hend = std::min(hstart + kernel_h, height + pad_h); + int wend = std::min(wstart + kernel_w, width + pad_w); + int pool_size = (hend - hstart) * (wend - wstart); + hstart = std::max(hstart, 0); + wstart = std::max(wstart, 0); + hend = std::min(hend, height); + wend = std::min(wend, width); + const int top_index = ph * top_width + pw; + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + const int index = h * width + w; + top[top_index] += bottom[index]; + } + } + 
top[top_index] /= pool_size; + } + } + // compute offset + bottom += bottom_offset; + top += top_offset; + } + } +} + +void Pooling::BackwardAvgPooling(const float* top, const int num, + const int channels, const int height, + const int width, const int kernel_h, + const int kernel_w, const int pad_h, + const int pad_w, const int stride_h, + const int stride_w, float* bottom) { + int top_height = (height + pad_h * 2 - kernel_h) / stride_h + 1; + int top_width = (width + pad_w * 2 - kernel_w) / stride_w + 1; + const int top_offset = top_height * top_width; + const int bottom_offset = height * width; + memset(bottom, 0, sizeof(float) * num * channels * bottom_offset); + for (int n = 0; n < num; ++n) { + for (int c = 0; c < channels; ++c) { + for (int ph = 0; ph < top_height; ++ph) { + for (int pw = 0; pw < top_width; ++pw) { + int hstart = ph * stride_h - pad_h; + int wstart = pw * stride_w - pad_w; + int hend = std::min(hstart + kernel_h, height + pad_h); + int wend = std::min(wstart + kernel_w, width + pad_w); + int pool_size = (hend - hstart) * (wend - wstart); + hstart = std::max(hstart, 0); + wstart = std::max(wstart, 0); + hend = std::min(hend, height); + wend = std::min(wend, width); + const int top_index = ph * top_width + pw; + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + const int index = h * width + w; + bottom[index] += top[top_index] / pool_size; + } + } + } + } + top += top_offset; + bottom += bottom_offset; + } + } +} } // namespace singa http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/ed981497/src/model/layer/pooling.h ---------------------------------------------------------------------- diff --git a/src/model/layer/pooling.h b/src/model/layer/pooling.h index 26a1d07..6df292a 100644 --- a/src/model/layer/pooling.h +++ b/src/model/layer/pooling.h @@ -17,10 +17,11 @@ */ #ifndef SRC_MODEL_LAYER_POOLING_H_ #define SRC_MODEL_LAYER_POOLING_H_ +#include +#include #include #include #include -#include #include "singa/model/layer.h" namespace singa { @@ -42,6 +43,29 @@ class Pooling : public Layer { const std::pair> Backward(int flag, const Tensor& grad) override; + void ForwardMaxPooling(const float* bottom, const int num, const int channels, + const int height, const int width, const int kernel_h, + const int kernel_w, const int pad_h, const int pad_w, + const int stride_h, const int stride_w, float* top, + float* mask); + + void BackwardMaxPooling(const float* top, const float* mask, const int num, + const int channels, const int height, const int width, + const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, + const int stride_w, float* bottom); + + void ForwardAvgPooling(const float* bottom, const int num, const int channels, + const int height, const int width, const int kernel_h, + const int kernel_w, const int pad_h, const int pad_w, + const int stride_h, const int stride_w, float* top); + + void BackwardAvgPooling(const float* top, const int num, const int channels, + const int height, const int width, const int kernel_h, + const int kernel_w, const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + float* bottom); + size_t kernel_w() const { return kernel_w_; } size_t kernel_h() const { return kernel_h_; } size_t pad_w() const { return pad_w_; } http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/ed981497/test/singa/test_convolution.cc ---------------------------------------------------------------------- diff --git a/test/singa/test_convolution.cc 
b/test/singa/test_convolution.cc new file mode 100644 index 0000000..b5f3605 --- /dev/null +++ b/test/singa/test_convolution.cc @@ -0,0 +1,204 @@ +/************************************************************ +* +* Licensed to the Apache Software Foundation (ASF) under one +* or more contributor license agreements. See the NOTICE file +* distributed with this work for additional information +* regarding copyright ownership. The ASF licenses this file +* to you under the Apache License, Version 2.0 (the +* "License"); you may not use this file except in compliance +* with the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an +* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +* KIND, either express or implied. See the License for the +* specific language governing permissions and limitations +* under the License. +* +*************************************************************/ +#include "../src/model/layer/convolution.h" + +#include "gtest/gtest.h" + +using singa::Convolution; +using singa::Shape; +TEST(Convolution, Setup) { + Convolution conv; + EXPECT_EQ("Convolution", conv.layer_type()); + + singa::LayerConf conf; + singa::ConvolutionConf *convconf = conf.mutable_convolution_conf(); + convconf->set_kernel_h(2); + convconf->set_kernel_w(2); + convconf->set_pad_h(1); + convconf->set_pad_w(1); + convconf->set_stride_h(1); + convconf->set_stride_w(1); + convconf->set_num_output(2); + convconf->set_bias_term(true); + conv.Setup(Shape{1, 3, 3}, conf); + + EXPECT_EQ(2u, conv.kernel_h()); + EXPECT_EQ(2u, conv.kernel_w()); + EXPECT_EQ(1u, conv.pad_h()); + EXPECT_EQ(1u, conv.pad_w()); + EXPECT_EQ(1u, conv.stride_h()); + EXPECT_EQ(1u, conv.stride_w()); + EXPECT_EQ(2u, conv.num_filters()); + EXPECT_EQ(true, conv.bias_term()); + EXPECT_EQ(1u, conv.channels()); + EXPECT_EQ(3u, conv.height()); + EXPECT_EQ(3u, conv.width()); +} + +TEST(Convolution, Forward) { + const size_t batchsize = 2, c = 1, h = 3, w = 3; + const float x[batchsize * c * h * w] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, + 7.0f, 8.0f, 9.0f, 1.0f, 2.0f, 3.0f, + 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f}; + singa::Tensor in(singa::Shape{batchsize, c, h, w}); + in.CopyDataFromHostPtr(x, batchsize * c * h * w); + + // Set weight and bias manually + const size_t num_filters = 1; + const size_t col_height = 1 * 3 * 3; // channels * kernel_w * kernel_h + const float we[num_filters * col_height] = {1.0f, 1.0f, 0.0f, 0.0f, 0.0f, + -1.0f, 0.0f, 1.0f, 0.0f}; + singa::Tensor weight(singa::Shape{num_filters, col_height}); + weight.CopyDataFromHostPtr(we, num_filters * col_height); + const float b[num_filters] = {1.0f}; + singa::Tensor bias(singa::Shape{num_filters}); + bias.CopyDataFromHostPtr(b, num_filters); + Convolution conv; + conv.set_weight(weight); + conv.set_bias(bias); + + singa::LayerConf conf; + singa::ConvolutionConf *convconf = conf.mutable_convolution_conf(); + convconf->set_kernel_h(3); + convconf->set_kernel_w(3); + convconf->set_pad_h(1); + convconf->set_pad_w(1); + convconf->set_stride_h(2); + convconf->set_stride_w(2); + convconf->set_num_output(1); + convconf->set_bias_term(true); + conv.Setup(Shape{1, 3, 3}, conf); + + // Parameter "flag" does not influence convolution + singa::Tensor out1 = conv.Forward(singa::kTrain, in); + const float *outptr1 = out1.data(); + // Input: 3*3; kernel: 3*3; stride: 2*2; padding: 1*1. 
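+  // Per image the output map is 2*2 (= (3 + 2*1 - 3)/2 + 1 per dimension),
+  // so the two images give 8 values. For outptr1[0] the only non-zero in-bounds
+  // products are x(0,1)*w(1,2) = -2 and x(1,0)*w(2,1) = 4; adding the bias 1 gives 3.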
+ EXPECT_EQ(8u, out1.Size()); + + EXPECT_EQ(3.0f, outptr1[0]); + EXPECT_EQ(7.0f, outptr1[1]); + EXPECT_EQ(-3.0f, outptr1[2]); + EXPECT_EQ(12.0f, outptr1[3]); + EXPECT_EQ(3.0f, outptr1[4]); + EXPECT_EQ(7.0f, outptr1[5]); + EXPECT_EQ(-3.0f, outptr1[6]); + EXPECT_EQ(12.0f, outptr1[7]); +} + +TEST(Convolution, Backward) { + // src_data + const size_t batchsize = 2, c = 1, src_h = 3, src_w = 3; + const float x[batchsize * c * src_h * src_w] = { + 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, + 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f}; + singa::Tensor in(singa::Shape{batchsize, c, src_h, src_w}); + in.CopyDataFromHostPtr(x, batchsize * c * src_h * src_w); + + // Set weight_ and bias_ manually + const size_t num_filters = 1; + const size_t col_height = 1 * 3 * 3; // channels * kernel_w * kernel_h + const float we[num_filters * col_height] = {1.0f, 1.0f, 0.0f, 0.0f, 0.0f, + -1.0f, 0.0f, 1.0f, 0.0f}; + singa::Tensor weight(singa::Shape{num_filters, col_height}); + weight.CopyDataFromHostPtr(we, num_filters * col_height); + const float b[num_filters] = {1.0f}; + singa::Tensor bias(singa::Shape{num_filters}); + bias.CopyDataFromHostPtr(b, num_filters); + Convolution conv; + conv.set_weight(weight); + conv.set_bias(bias); + + singa::LayerConf conf; + singa::ConvolutionConf *convconf = conf.mutable_convolution_conf(); + convconf->set_kernel_h(3); + convconf->set_kernel_w(3); + convconf->set_pad_h(1); + convconf->set_pad_w(1); + convconf->set_stride_h(2); + convconf->set_stride_w(2); + convconf->set_num_output(1); + convconf->set_bias_term(true); + convconf->set_workspace_byte_limit(256); + convconf->set_prefer("fastest"); + conv.Setup(Shape{1, 3, 3}, conf); + + singa::Tensor out1 = conv.Forward(singa::kTrain, in); + + // grad + const size_t grad_h = 2, grad_w = 2; + const float dy[batchsize * num_filters * grad_h * grad_w] = { + 0.1f, 0.2f, 0.3f, 0.4f, 0.1f, 0.2f, 0.3f, 0.4f}; + singa::Tensor grad(singa::Shape{batchsize, num_filters, grad_h, grad_w}); + grad.CopyDataFromHostPtr(dy, batchsize * num_filters * grad_h * grad_w); + + const auto ret = conv.Backward(singa::kTrain, grad); + singa::Tensor in_grad = ret.first; + const float *dx = in_grad.data(); + const float *wptr = we; + EXPECT_EQ(18u, in_grad.Size()); + EXPECT_EQ(dy[0] * wptr[4], dx[0]); + EXPECT_EQ(dy[0] * wptr[5] + dy[1] * wptr[3], dx[1]); + EXPECT_EQ(dy[1] * wptr[4], dx[2]); + EXPECT_EQ(dy[0] * wptr[7] + dy[2] * wptr[1], dx[3]); + EXPECT_EQ( + dy[0] * wptr[8] + dy[1] * wptr[6] + dy[2] * wptr[2] + dy[3] * wptr[0], + dx[4]); + EXPECT_EQ(dy[1] * wptr[7] + dy[3] * wptr[1], dx[5]); + EXPECT_EQ(dy[2] * wptr[4], dx[6]); + EXPECT_EQ(dy[2] * wptr[5] + dy[3] * wptr[3], dx[7]); + EXPECT_EQ(dy[3] * wptr[4], dx[8]); + EXPECT_EQ(dy[4] * wptr[4], dx[9]); + EXPECT_EQ(dy[4] * wptr[5] + dy[1] * wptr[3], dx[10]); + EXPECT_EQ(dy[5] * wptr[4], dx[11]); + EXPECT_EQ(dy[4] * wptr[7] + dy[2] * wptr[1], dx[12]); + EXPECT_EQ( + dy[4] * wptr[8] + dy[5] * wptr[6] + dy[6] * wptr[2] + dy[7] * wptr[0], + dx[13]); + EXPECT_EQ(dy[5] * wptr[7] + dy[7] * wptr[1], dx[14]); + EXPECT_EQ(dy[6] * wptr[4], dx[15]); + EXPECT_EQ(dy[6] * wptr[5] + dy[7] * wptr[3], dx[16]); + EXPECT_EQ(dy[7] * wptr[4], dx[17]); + + singa::Tensor dw = ret.second[0]; + singa::Tensor db = ret.second[1]; + const float *dbptr = db.data(); + EXPECT_FLOAT_EQ(dy[0] + dy[1] + dy[2] + dy[3] + dy[4] + dy[5] + dy[6] + dy[7], + dbptr[0]); + + const float *dwptr = dw.data(); + EXPECT_EQ(9u, dw.Size()); + EXPECT_FLOAT_EQ(dy[3] * x[4] + dy[7] * x[13], dwptr[0]); + EXPECT_FLOAT_EQ(dy[3] * x[5] + 
dy[7] * x[14] + dy[2] * x[3] + dy[6] * x[12], + dwptr[1]); + EXPECT_FLOAT_EQ(dy[2] * x[4] + dy[6] * x[13], dwptr[2]); + EXPECT_FLOAT_EQ(dy[1] * x[1] + dy[5] * x[10] + dy[3] * x[7] + dy[7] * x[16], + dwptr[3]); + EXPECT_FLOAT_EQ(dy[0] * x[0] + dy[4] * x[9] + dy[1] * x[2] + dy[5] * x[11] + + dy[2] * x[6] + dy[6] * x[15] + dy[3] * x[8] + + dy[7] * x[17], + dwptr[4]); + EXPECT_FLOAT_EQ(dy[0] * x[1] + dy[4] * x[10] + dy[2] * x[7] + dy[6] * x[16], + dwptr[5]); + EXPECT_FLOAT_EQ(dy[1] * x[4] + dy[5] * x[13], dwptr[6]); + EXPECT_FLOAT_EQ(dy[0] * x[3] + dy[4] * x[12] + dy[1] * x[5] + dy[5] * x[14], + dwptr[7]); + EXPECT_FLOAT_EQ(dy[0] * x[4] + dy[4] * x[13], dwptr[8]); +} http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/ed981497/test/singa/test_cudnn_convolution.cc ---------------------------------------------------------------------- diff --git a/test/singa/test_cudnn_convolution.cc b/test/singa/test_cudnn_convolution.cc index a13016b..66c62f6 100644 --- a/test/singa/test_cudnn_convolution.cc +++ b/test/singa/test_cudnn_convolution.cc @@ -69,10 +69,11 @@ TEST(CudnnConvolution, Forward) { // Set weight and bias manually const size_t num_filters = 1; - const float we[num_filters * batchsize * h * w] = { + const size_t col_height = 1 * 3 * 3; // channels * kernel_w * kernel_h + const float we[num_filters * col_height] = { 1.0f, 1.0f, 0.0f, 0.0f, 0.0f, -1.0f, 0.0f, 1.0f, 0.0f}; - singa::Tensor weight(singa::Shape{num_filters, batchsize * h * w}, cuda); - weight.CopyDataFromHostPtr(we, batchsize * h * w); + singa::Tensor weight(singa::Shape{num_filters, col_height}, cuda); + weight.CopyDataFromHostPtr(we, col_height); const float b[num_filters] = {1.0f}; singa::Tensor bias(singa::Shape{num_filters}, cuda); bias.CopyDataFromHostPtr(b, num_filters); @@ -119,11 +120,12 @@ TEST(CudnnConvolution, Backward) { // Set weight_ and bias_ manually const size_t num_filters = 1; - const float we[num_filters * batchsize * src_h * src_w] = { + const size_t col_height = 1 * 3 * 3; // channels * kernel_w * kernel_h + const float we[num_filters * col_height] = { 1.0f, 1.0f, 0.0f, 0.0f, 0.0f, -1.0f, 0.0f, 1.0f, 0.0f}; - singa::Tensor weight(singa::Shape{num_filters, batchsize * src_h * src_w}, + singa::Tensor weight(singa::Shape{num_filters, col_height}, cuda); - weight.CopyDataFromHostPtr(we, batchsize * src_h * src_w); + weight.CopyDataFromHostPtr(we, col_height); const float b[num_filters] = {1.0f}; singa::Tensor bias(singa::Shape{num_filters}, cuda); bias.CopyDataFromHostPtr(b, num_filters); http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/ed981497/test/singa/test_pooling.cc ---------------------------------------------------------------------- diff --git a/test/singa/test_pooling.cc b/test/singa/test_pooling.cc new file mode 100644 index 0000000..3089a90 --- /dev/null +++ b/test/singa/test_pooling.cc @@ -0,0 +1,141 @@ +/************************************************************ +* +* Licensed to the Apache Software Foundation (ASF) under one +* or more contributor license agreements. See the NOTICE file +* distributed with this work for additional information +* regarding copyright ownership. The ASF licenses this file +* to you under the Apache License, Version 2.0 (the +* "License"); you may not use this file except in compliance +* with the License. 
You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an +* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +* KIND, either express or implied. See the License for the +* specific language governing permissions and limitations +* under the License. +* +*************************************************************/ +#include "../src/model/layer/pooling.h" + +#include "gtest/gtest.h" + +using singa::Pooling; +using singa::Shape; +TEST(Pooling, Setup) { + Pooling pool; + EXPECT_EQ("Pooling", pool.layer_type()); + + singa::LayerConf conf; + singa::PoolingConf *poolconf = conf.mutable_pooling_conf(); + poolconf->set_pool(singa::PoolingConf_PoolMethod_MAX); + poolconf->set_kernel_h(1); + poolconf->set_kernel_w(2); + poolconf->set_pad_h(1); + poolconf->set_pad_w(0); + poolconf->set_stride_h(2); + poolconf->set_stride_w(1); + pool.Setup(Shape{1, 3, 3}, conf); + + EXPECT_EQ(singa::PoolingConf_PoolMethod_MAX, pool.pool_method()); + EXPECT_EQ(1u, pool.kernel_h()); + EXPECT_EQ(2u, pool.kernel_w()); + EXPECT_EQ(1u, pool.pad_h()); + EXPECT_EQ(0u, pool.pad_w()); + EXPECT_EQ(2u, pool.stride_h()); + EXPECT_EQ(1u, pool.stride_w()); + EXPECT_EQ(1u, pool.channels()); + EXPECT_EQ(3u, pool.height()); + EXPECT_EQ(3u, pool.width()); +} + +TEST(Pooling, Forward) { + const size_t batchsize = 2, c = 1, h = 3, w = 3; + const float x[batchsize * c * h * w] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, + 7.0f, 8.0f, 9.0f, 1.0f, 2.0f, 3.0f, + 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f}; + singa::Tensor in(singa::Shape{batchsize, c, h, w}); + in.CopyDataFromHostPtr(x, batchsize * c * h * w); + + Pooling pool; + singa::LayerConf conf; + singa::PoolingConf *poolconf = conf.mutable_pooling_conf(); + poolconf->set_pool(singa::PoolingConf_PoolMethod_MAX); + poolconf->set_kernel_h(2); + poolconf->set_kernel_w(2); + poolconf->set_pad_h(0); + poolconf->set_pad_w(0); + poolconf->set_stride_h(1); + poolconf->set_stride_w(1); + pool.Setup(Shape{1, 3, 3}, conf); + + // Parameter "flag" does not influence pooling + singa::Tensor out1 = pool.Forward(singa::kTrain, in); + const float *outptr1 = out1.data(); + // Input: 3*3; kernel: 2*2; stride: 1*1; no padding. 
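+  // Each image yields a 2*2 map of window maxima
+  // (max{1,2,4,5}=5, max{2,3,5,6}=6, max{4,5,7,8}=8, max{5,6,8,9}=9),
+  // so the two images give 8 values.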
+ EXPECT_EQ(8u, out1.Size()); + EXPECT_EQ(5.0f, outptr1[0]); + EXPECT_EQ(6.0f, outptr1[1]); + EXPECT_EQ(8.0f, outptr1[2]); + EXPECT_EQ(9.0f, outptr1[3]); + EXPECT_EQ(5.0f, outptr1[4]); + EXPECT_EQ(6.0f, outptr1[5]); + EXPECT_EQ(8.0f, outptr1[6]); + EXPECT_EQ(9.0f, outptr1[7]); +} + +TEST(Pooling, Backward) { + // src_data + const size_t batchsize = 2, c = 1, src_h = 3, src_w = 3; + const float x[batchsize * c * src_h * src_w] = { + 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, + 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f}; + singa::Tensor in(singa::Shape{batchsize, c, src_h, src_w}); + in.CopyDataFromHostPtr(x, batchsize * c * src_h * src_w); + + Pooling pool; + singa::LayerConf conf; + singa::PoolingConf *poolconf = conf.mutable_pooling_conf(); + poolconf->set_pool(singa::PoolingConf_PoolMethod_MAX); + poolconf->set_kernel_h(2); + poolconf->set_kernel_w(2); + poolconf->set_pad_h(0); + poolconf->set_pad_w(0); + poolconf->set_stride_h(1); + poolconf->set_stride_w(1); + pool.Setup(Shape{1, 3, 3}, conf); + + singa::Tensor out1 = pool.Forward(singa::kTrain, in); + + // grad + const size_t grad_h = 2, grad_w = 2; + const float dy[batchsize * c * grad_h * grad_w] = {0.1f, 0.2f, 0.3f, 0.4f, + 0.1f, 0.2f, 0.3f, 0.4f}; + singa::Tensor grad(singa::Shape{batchsize, c, grad_h, grad_w}); + grad.CopyDataFromHostPtr(dy, batchsize * c * grad_h * grad_w); + + const auto ret = pool.Backward(singa::kTrain, grad); + singa::Tensor in_grad = ret.first; + const float *dx = in_grad.data(); + EXPECT_EQ(18u, in_grad.Size()); + EXPECT_EQ(0.0f, dx[0]); + EXPECT_EQ(0.0f, dx[1]); + EXPECT_EQ(0.0f, dx[2]); + EXPECT_EQ(0.0f, dx[3]); + EXPECT_EQ(0.1f, dx[4]); + EXPECT_EQ(0.2f, dx[5]); + EXPECT_EQ(0.0f, dx[6]); + EXPECT_EQ(0.3f, dx[7]); + EXPECT_EQ(0.4f, dx[8]); + EXPECT_EQ(0.0f, dx[9]); + EXPECT_EQ(0.0f, dx[10]); + EXPECT_EQ(0.0f, dx[11]); + EXPECT_EQ(0.0f, dx[12]); + EXPECT_EQ(0.1f, dx[13]); + EXPECT_EQ(0.2f, dx[14]); + EXPECT_EQ(0.0f, dx[15]); + EXPECT_EQ(0.3f, dx[16]); + EXPECT_EQ(0.4f, dx[17]); +}
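The max-pooling path exercised by TEST(Pooling, Backward) relies on the mask recorded during Forward: each output element remembers the flat index of its window maximum, and Backward routes the gradient back to exactly that position. A minimal self-contained sketch of this idea (single image, single channel, plain float buffers, int mask for clarity; illustrative only, not the SINGA API) that reproduces the dx values asserted above:

#include <cstdio>
#include <vector>

int main() {
  // 3x3 input, 2x2 kernel, stride 1, no padding -> 2x2 pooled output
  // (the configuration used in TEST(Pooling, Backward)).
  const int H = 3, W = 3, K = 2, S = 1;
  const int PH = (H - K) / S + 1, PW = (W - K) / S + 1;
  std::vector<float> x = {1, 2, 3, 4, 5, 6, 7, 8, 9};
  std::vector<float> y(PH * PW, -1e30f);
  std::vector<int> mask(PH * PW, -1);

  // Forward: record the flat input index of each window's maximum.
  for (int ph = 0; ph < PH; ++ph)
    for (int pw = 0; pw < PW; ++pw)
      for (int h = ph * S; h < ph * S + K; ++h)
        for (int w = pw * S; w < pw * S + K; ++w)
          if (x[h * W + w] > y[ph * PW + pw]) {
            y[ph * PW + pw] = x[h * W + w];
            mask[ph * PW + pw] = h * W + w;
          }

  // Backward: route each output gradient to the recorded argmax position.
  const std::vector<float> dy = {0.1f, 0.2f, 0.3f, 0.4f};
  std::vector<float> dx(H * W, 0.0f);
  for (int i = 0; i < PH * PW; ++i) dx[mask[i]] += dy[i];

  for (float v : dx) std::printf("%.1f ", v);
  std::printf("\n");  // 0.0 0.0 0.0 0.0 0.1 0.2 0.0 0.3 0.4
  return 0;
}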