singa-commits mailing list archives

From wan...@apache.org
Subject [02/19] incubator-singa git commit: SINGA-100 Implement layers using CUDNN for GPU training
Date Wed, 16 Dec 2015 12:11:34 GMT
SINGA-100 Implement layers using CUDNN for GPU training

Fix compilation errors in the CPU code and the cuDNN code.

Temporary commit: trying to run cuDNN for GPU training and mshadow for CPU training.
1. Finish the CudnnSoftmaxLossLayer.
2. Finish the DropoutLayer.
3. Add the ActivationLayer, which currently leads to link errors.
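
The cudnn layers touched by this commit all follow the same call pattern: create a handle and tensor descriptors in InitCudnn(), describe the blobs as NCHW float tensors, and wrap every cudnn call in CHECK_CUDNN. Below is a minimal standalone sketch of that pattern around cudnnSoftmaxForward; the fixed sizes, the raw cudaMalloc buffers and the local CHECK_CUDNN macro are illustrative stand-ins, not code from the commit.

    // sketch.cu -- illustrative only; compile with nvcc and link against cudnn
    #include <cudnn.h>
    #include <cuda_runtime.h>
    #include <cstdio>
    #include <cstdlib>

    #define CHECK_CUDNN(x)                                               \
      do {                                                               \
        cudnnStatus_t s = (x);                                           \
        if (s != CUDNN_STATUS_SUCCESS) {                                 \
          std::fprintf(stderr, "cudnn error %d at line %d\n",            \
                       static_cast<int>(s), __LINE__);                   \
          std::exit(1);                                                  \
        }                                                                \
      } while (0)

    int main() {
      const int n = 2, c = 10, h = 1, w = 1;  // assumed: batch of 10-way scores
      cudnnHandle_t handle;
      cudnnTensorDescriptor_t src_desc, my_desc;
      CHECK_CUDNN(cudnnCreate(&handle));
      CHECK_CUDNN(cudnnCreateTensorDescriptor(&src_desc));
      CHECK_CUDNN(cudnnCreateTensorDescriptor(&my_desc));
      // describe the source and output blobs, as the layers' InitCudnn() does
      CHECK_CUDNN(cudnnSetTensor4dDescriptor(src_desc, CUDNN_TENSOR_NCHW,
                                             CUDNN_DATA_FLOAT, n, c, h, w));
      CHECK_CUDNN(cudnnSetTensor4dDescriptor(my_desc, CUDNN_TENSOR_NCHW,
                                             CUDNN_DATA_FLOAT, n, c, h, w));

      float *src = nullptr, *dst = nullptr;
      cudaMalloc(reinterpret_cast<void**>(&src), n * c * sizeof(float));
      cudaMalloc(reinterpret_cast<void**>(&dst), n * c * sizeof(float));

      const float alpha = 1.0f, beta = 0.0f;  // y = alpha * softmax(x) + beta * y
      CHECK_CUDNN(cudnnSoftmaxForward(handle, CUDNN_SOFTMAX_ACCURATE,
                                      CUDNN_SOFTMAX_MODE_CHANNEL,
                                      &alpha, src_desc, src,
                                      &beta, my_desc, dst));

      cudaFree(src);
      cudaFree(dst);
      CHECK_CUDNN(cudnnDestroyTensorDescriptor(src_desc));
      CHECK_CUDNN(cudnnDestroyTensorDescriptor(my_desc));
      CHECK_CUDNN(cudnnDestroy(handle));
      return 0;
    }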


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/49293a68
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/49293a68
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/49293a68

Branch: refs/heads/master
Commit: 49293a6812cbd2e74c3130af17d619a0e3bf0e89
Parents: af1bf50
Author: Wei Wang <wangwei@comp.nus.edu.sg>
Authored: Tue Nov 24 15:20:39 2015 +0800
Committer: Wei Wang <wangwei@comp.nus.edu.sg>
Committed: Fri Dec 11 11:01:29 2015 +0800

----------------------------------------------------------------------
 Makefile.am                                     | 11 ++--
 Makefile.gpu                                    | 17 ++---
 include/mshadow/tensor_random.h                 |  4 +-
 include/singa/neuralnet/input_layer.h           |  1 +
 include/singa/neuralnet/loss_layer.h            | 17 +++++
 include/singa/neuralnet/neuron_layer.h          | 13 ++--
 include/singa/utils/blob.h                      |  4 +-
 include/singa/utils/math_addr.h                 | 26 ++++++++
 include/singa/utils/math_blob.h                 | 67 ++++++++++++++++++--
 include/singa/worker.h                          |  4 +-
 src/driver.cc                                   | 30 ++-------
 src/neuralnet/neuron_layer/argsort.cc           | 56 ----------------
 src/neuralnet/neuron_layer/convolution.cc       |  8 +--
 src/neuralnet/neuron_layer/cudnn_activation.cu  | 18 +++---
 src/neuralnet/neuron_layer/cudnn_convolution.cu | 24 +++----
 src/neuralnet/neuron_layer/cudnn_lrn.cu         | 56 +++++++---------
 src/neuralnet/neuron_layer/cudnn_pooling.cu     | 15 +++--
 src/neuralnet/neuron_layer/cudnn_softmax.cu     | 16 ++---
 src/neuralnet/neuron_layer/dropout.cc           | 19 +++---
 src/neuralnet/neuron_layer/pooling.cc           | 65 +++++++++++++------
 src/neuralnet/neuron_layer/softmax.cc           |  8 +--
 src/neuralnet/output_layer/argsort.cc           | 56 ++++++++++++++++
 src/proto/job.proto                             | 53 ++++++++++++++--
 src/utils/blob.cc                               | 25 +-------
 24 files changed, 369 insertions(+), 244 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/49293a68/Makefile.am
----------------------------------------------------------------------
diff --git a/Makefile.am b/Makefile.am
index 835f66d..bbb3497 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -68,11 +68,11 @@ SINGA_SRCS := src/driver.cc \
               src/neuralnet/input_layer/record.cc \
               src/neuralnet/input_layer/deprecated.cc \
               src/neuralnet/input_layer/store.cc \
+              src/neuralnet/output_layer/argsort.cc \
               src/neuralnet/output_layer/csv.cc \
               src/neuralnet/output_layer/record.cc \
               src/neuralnet/loss_layer/euclidean.cc \
               src/neuralnet/loss_layer/softmax.cc \
-              src/neuralnet/neuron_layer/argsort.cc \
               src/neuralnet/neuron_layer/convolution.cc \
               src/neuralnet/neuron_layer/dummy.cc \
               src/neuralnet/neuron_layer/dropout.cc \
@@ -118,8 +118,11 @@ SINGA_HDRS := include/singa.h \
               include/worker.h \
               include/stub.h \
               include/neuralnet/layer.h \
-              include/neuralnet/output_layer/csv.h \
-              include/neuralnet/output_layer/record.h \
+              include/neuralnet/output_layer.h \
+              include/neuralnet/input_layer.h \
+              include/neuralnet/loss_layer.h \
+              include/neuralnet/neuron_layer.h \
+              include/neuralnet/connection_layer.h \
               include/neuralnet/neuralnet.h \
               include/singa/comm/msg.h \
               include/singa/comm/socket.h \
@@ -127,12 +130,12 @@ SINGA_HDRS := include/singa.h \
               include/singa/io/kvfile.h \
               include/singa/io/kvfile_store.h \
               include/singa/io/textfile_store.h \
+              include/mshadow/cxxnet_op.h \
               include/mshadow/tensor_expr.h \
               include/mshadow/tensor_container.h \
               include/mshadow/tensor_expr_ext.h \
               include/mshadow/tensor.h \
               include/mshadow/tensor_io.h \
-              include/mshadow/cxxnet_op.h \
               include/mshadow/tensor_base.h \
               include/mshadow/tensor_random.h
 

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/49293a68/Makefile.gpu
----------------------------------------------------------------------
diff --git a/Makefile.gpu b/Makefile.gpu
index 35b81b9..9cbbc42 100644
--- a/Makefile.gpu
+++ b/Makefile.gpu
@@ -24,12 +24,13 @@ HOME_DIR := /home/wangwei/local
 
 CUDA_DIR := /usr/local/cuda
 
-#CUDA_DIR :=
+# must config the cudnn folder if using cudnn
+CUDNN_DIR := /home/wangwei/local/cuda
 
 # Lib folder for system and external libs. You may need to change it.
-LIBRARY_DIRS := $(CUDA_DIR)/lib64 $(CUDA_DIR)/lib $(HOME_DIR)/lib64 $(HOME_DIR)/lib
+LIBRARY_DIRS := $(HOME_DIR)/lib64 $(HOME_DIR)/lib $(HOME_DIR)/local/lib $(CUDA_DIR)/lib64 $(CUDA_DIR)/lib $(CUDNN_DIR)/lib64
 # Header folder for system and external libs. You may need to change it.
-INCLUDE_DIRS := $(CUDA_DIR)/include $(HOME_DIR)/include ./include
+INCLUDE_DIRS := $(HOME_DIR)/include ./include $(HOME_DIR)/local/include/zookeeper $(CUDA_DIR)/include $(CUDNN_DIR)/include
 # g++ location, should support c++11, tested with 4.8.1
 CXX := g++
 CUCXX := nvcc
@@ -38,7 +39,7 @@ CUCXX := nvcc
 LIBRARIES := glog protobuf openblas zmq czmq zookeeper_mt
 
 ifneq ($(CUDA_DIR),)
-	LIBRARIES := $(LIBRARIES) cublas cudart curand
+	LIBRARIES := $(LIBRARIES) cublas cudart curand cudnn
 endif
 
 LDFLAGS := $(foreach librarydir, $(LIBRARY_DIRS), -L$(librarydir))\
@@ -48,9 +49,9 @@ BUILD_DIR := .libs
 MSHADOW_FLAGS :=-DMSHADOW_USE_CUDA=0 -DMSHADOW_USE_CBLAS=1 -DMSHADOW_USE_MKL=0
 ZK_FLAGS :=-DTHREADED -fpermissive
 CXXFLAGS := -O2 -msse3 -Wall -pthread -fPIC -std=c++11 -Wno-unknown-pragmas \
-	$(MSHADOW_FLAGS) -DCPU_ONLY=1 $(ZK_FLAGS)\
+	$(MSHADOW_FLAGS) $(ZK_FLAGS)\
 	-funroll-loops $(foreach includedir, $(INCLUDE_DIRS), -I$(includedir))
-CUCXXFLAGS := $(MSHADOW_FLAGS) -DUSE_GPU -std=c++11 -G $(CUDA_ARCH) \
+CUCXXFLAGS := -DUSE_CUDNN $(MSHADOW_FLAGS) -std=c++11 $(CUDA_ARCH) \
 	$(foreach includedir, $(INCLUDE_DIRS), -I$(includedir))
 
 #Add device compile option
@@ -84,7 +85,7 @@ TEST_CUDA_SRCS :=$(shell find src/test/ -maxdepth 1 -name "*.cu")
 TEST_CUDA_OBJS := $(sort $(addprefix $(BUILD_DIR)/, $(TEST_CUDA_SRCS:.cu=.o)))
 -include $(TEST_CUDA_OBJS:%.o=%.P)
 
-SINGA_CUDA_SRCS := $(shell find src/ \( -path "src/test" \) -prune -o \( -name "*.cu" -type f \) -print )
+SINGA_CUDA_SRCS :=$(shell find src/ -name "*.cu")
 SINGA_CUDA_OBJS := $(sort $(addprefix $(BUILD_DIR)/, $(SINGA_CUDA_SRCS:.cu=.o)))
 -include $(SINGA_CUDA_OBJS:%.o=%.P)
 
@@ -140,7 +141,7 @@ proto: $(PROTO_OBJS)
 $(PROTO_SRCS): $(PROTOS)
 	protoc --proto_path=src/proto --cpp_out=src/proto $(PROTOS)
 	mkdir -p include/proto/
-	cp src/proto/*.pb.h include/proto/
+	cp src/proto/*.pb.h include/singa/proto/
 	mkdir -p tool/pb2/
 	touch tool/pb2/__init__.py
 	protoc --proto_path=src/proto --python_out=tool/pb2/ $(PROTOS)

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/49293a68/include/mshadow/tensor_random.h
----------------------------------------------------------------------
diff --git a/include/mshadow/tensor_random.h b/include/mshadow/tensor_random.h
index 59ef082..ae2836a 100644
--- a/include/mshadow/tensor_random.h
+++ b/include/mshadow/tensor_random.h
@@ -249,8 +249,8 @@ namespace mshadow {
         std::mt19937 gen_;
     }; // class Random<cpu>
 
-#ifdef __CUDACC__
-
+#if MSHADOW_USE_CUDA
+// __CUDACC__
     /*! \brief GPU random number generator */
     template<>
     class Random<gpu> {

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/49293a68/include/singa/neuralnet/input_layer.h
----------------------------------------------------------------------
diff --git a/include/singa/neuralnet/input_layer.h b/include/singa/neuralnet/input_layer.h
index 72593d5..2825d65 100644
--- a/include/singa/neuralnet/input_layer.h
+++ b/include/singa/neuralnet/input_layer.h
@@ -26,6 +26,7 @@
 #include <vector>
 #include <thread>
 #include "singa/io/store.h"
+#include "singa/io/kvfile.h"
 #include "singa/neuralnet/layer.h"
 
 namespace singa {

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/49293a68/include/singa/neuralnet/loss_layer.h
----------------------------------------------------------------------
diff --git a/include/singa/neuralnet/loss_layer.h b/include/singa/neuralnet/loss_layer.h
index e7fcd74..50e6c24 100644
--- a/include/singa/neuralnet/loss_layer.h
+++ b/include/singa/neuralnet/loss_layer.h
@@ -49,8 +49,25 @@ class SoftmaxLossLayer : public LossLayer, public SoftmaxLayer {
 
  private:
   float scale_;
+  int topk_, dim_;
+};
+
+#ifdef USE_CUDNN
+class CudnnSoftmaxLossLayer : public LossLayer, public CudnnSoftmaxLayer {
+ public:
+  void Setup(const LayerProto& conf, const vector<Layer*>& srclayers) override;
+  void ComputeFeature(int flag, const vector<Layer*>& srclayers) override;
+  void ComputeGradient(int flag, const vector<Layer*>& srclayers) override;
+  const std::string ToString(bool debug, int flag) override;
+
+ private:
   int topk_;
+  int counter_;
+  float loss_, accuracy_;
+
+  CudnnSoftmaxLayer *softmax_;
 };
+#endif
 }  // namespace singa
 
 #endif  // SINGA_NEURALNET_LOSS_LAYER_H_
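
The CudnnSoftmaxLossLayer declared above keeps running totals in loss_, accuracy_ and counter_ and reports them from ToString(); its .cu implementation is not part of this diff. The helper below is therefore only a hypothetical sketch of the CPU-side bookkeeping such a layer typically does after the cudnn softmax forward pass: accumulate the negative log-likelihood and the top-k hit rate over a mini-batch. All names (probs, labels, batchsize, dim) are assumptions made for illustration.

    #include <algorithm>
    #include <cmath>
    #include <utility>
    #include <vector>

    // Hypothetical helper: given softmax probabilities (batchsize x dim,
    // row-major) and integer labels, accumulate cross-entropy loss, top-k
    // accuracy and the number of instances seen so far.
    void AccumulateSoftmaxLoss(const float* probs, const int* labels,
                               int batchsize, int dim, int topk,
                               float* loss, float* accuracy, int* counter) {
      for (int n = 0; n < batchsize; ++n) {
        const float* row = probs + n * dim;
        const int label = labels[n];
        *loss += -std::log(std::max(row[label], 1e-20f));  // avoid log(0)
        // rank classes by probability and test whether the true label is
        // among the top-k predictions
        std::vector<std::pair<float, int>> scored;
        for (int j = 0; j < dim; ++j) scored.emplace_back(row[j], j);
        std::partial_sort(scored.begin(), scored.begin() + topk, scored.end(),
                          std::greater<std::pair<float, int>>());
        for (int j = 0; j < topk; ++j)
          if (scored[j].second == label) { *accuracy += 1.0f; break; }
      }
      *counter += batchsize;
    }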

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/49293a68/include/singa/neuralnet/neuron_layer.h
----------------------------------------------------------------------
diff --git a/include/singa/neuralnet/neuron_layer.h b/include/singa/neuralnet/neuron_layer.h
index 2bf9682..097b209 100644
--- a/include/singa/neuralnet/neuron_layer.h
+++ b/include/singa/neuralnet/neuron_layer.h
@@ -249,6 +249,7 @@ class STanhLayer : public NeuronLayer {
 
 /*************** Layers implemented using cudnn v3 ***************/
 #ifdef USE_CUDNN
+#include <cudnn.h>
 #define CHECK_CUDNN(x) CHECK_EQ(x, CUDNN_STATUS_SUCCESS)
 
 class CudnnLayer : virtual public NeuronLayer {
@@ -289,9 +290,9 @@ class CudnnActivationLayer : public ActivationLayer, public CudnnLayer {
 /**
  * Convolution layer implemeneted using cudnn (v3 version backward functions).
  */
-class CuDNNConvLayer : public ConvolutionLayer, public CudnnLayer {
+class CudnnConvLayer : public ConvolutionLayer, public CudnnLayer {
  public:
-  ~CuDNNConvLayer();
+  ~CudnnConvLayer();
   void InitCudnn() override;
   void ComputeFeature(int flag, const vector<Layer*>& srclayers) override;
   void ComputeGradient(int flag, const vector<Layer*>& srclayers) override;
@@ -306,7 +307,7 @@ class CuDNNConvLayer : public ConvolutionLayer, public CudnnLayer {
   size_t workspace_byte_limit_, workspace_count_;
 };
 
-class CudnnLRNLayer : public LRNLayer {
+class CudnnLRNLayer : public LRNLayer, public CudnnLayer {
  public:
   ~CudnnLRNLayer();
   void InitCudnn() override;
@@ -320,9 +321,9 @@ class CudnnLRNLayer : public LRNLayer {
 /**
  * Pooling layer implemented using cudnn.
  */
-class CuDNNPoolLayer : public PoolingLayer, public CudnnLayer {
+class CudnnPoolLayer : public PoolingLayer, public CudnnLayer {
  public:
-  ~CuDNNPoolLayer();
+  ~CudnnPoolLayer();
   void InitCudnn() override;
   void ComputeFeature(int flag, const vector<Layer*>& srclayers) override;
   void ComputeGradient(int flag, const vector<Layer*>& srclayers) override;
@@ -407,4 +408,4 @@ class RBMHidLayer: public RBMLayer {
 };
 
 }  // namespace singa
-#define SINGA_NEURALNET_NEURON_LAYER_H_
+#endif  // SINGA_NEURALNET_NEURON_LAYER_H_

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/49293a68/include/singa/utils/blob.h
----------------------------------------------------------------------
diff --git a/include/singa/utils/blob.h b/include/singa/utils/blob.h
index 87a97d5..e7bf995 100644
--- a/include/singa/utils/blob.h
+++ b/include/singa/utils/blob.h
@@ -255,14 +255,14 @@ class Blob {
   /**
    * @return the size of the k-th dimension.
    */
-  inline const int shape(int k) const {
+  inline int shape(int k) const {
     CHECK_LT(k, shape_.size());
     return shape_.at(k);
   }
   inline int count() const {
     return count_;
   }
-  inline const int version() const {
+  inline int version() const {
     return version_;
   }
   inline void set_version(int v) {

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/49293a68/include/singa/utils/math_addr.h
----------------------------------------------------------------------
diff --git a/include/singa/utils/math_addr.h b/include/singa/utils/math_addr.h
index 9b91e70..b21ea45 100644
--- a/include/singa/utils/math_addr.h
+++ b/include/singa/utils/math_addr.h
@@ -124,6 +124,20 @@ void cpu_expand_f(const Dtype * A, const int m, const int n, Dtype * B) {
 }
 // expand each element in A into a row of B
 
+template<typename Dtype>
+void cpu_sample_uniform(int n, Dtype low, Dtype high, Dtype* A);
+
+template<>
+inline void cpu_sample_uniform<float>(int n, float low, float high, float* A) {
+
+}
+template<typename Dtype>
+void cpu_sample_gaussian(int n, Dtype mean, Dtype std, Dtype* A);
+
+template<>
+inline void cpu_sample_gaussian<float>(int n, float mean, float std, float* A) {
+
+}
 #ifdef USE_GPU
 template<typename Dtype>
 void gpu_gemm(const Dtype * A, const Dtype * B, const int m, const int n,
@@ -213,6 +227,18 @@ void gpu_expand_f(const Dtype * A, const int m, const int n, Dtype * B) {
     Op::CudaMap(A[i], n, B+i*n);
   }
 }
+
+
+template<typename Dtype>
+void gpu_sample_uniform(int n, Dtype low, Dtype high, Dtype* A) {
+
+}
+
+template<typename Dtype>
+void gpu_sample_gaussian(int n, Dtype mean, Dtype std, Dtype* A) {
+
+}
+
 // expand each element in A into a row of B
 #endif  // USE_GPU
 

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/49293a68/include/singa/utils/math_blob.h
----------------------------------------------------------------------
diff --git a/include/singa/utils/math_blob.h b/include/singa/utils/math_blob.h
index 6428117..ca75205 100644
--- a/include/singa/utils/math_blob.h
+++ b/include/singa/utils/math_blob.h
@@ -24,9 +24,12 @@
 
 #include <vector>
 #include <algorithm>
+#include <thread>
 #include "singa/utils/blob.h"
 #include "singa/utils/singa_op.h"
 #include "singa/utils/math_addr.h"
+#include "singa/utils/singleton.h"
+#include "singa/utils/context.h"
 
 
 namespace singa {
@@ -355,10 +358,21 @@ void Sub(XPU xpu, const Blob<Dtype> & A, const Blob<Dtype> & B,
  * Map(XPU, const Blob<Dtype>&, const Blob<Dtype>&, Blob<Dtype>*).
  */
 template<typename Dtype>
-void Mult(XPU xpu, const Blob<Dtype> & A, const Blob<Dtype> & B,
+void Mult(const Blob<Dtype> & A, const Blob<Dtype> & B,
     Blob<Dtype> * C) {
-  Map<singa::op::Mult<Dtype>>(xpu, A, B, C);
+  //Map<singa::op::Mult<Dtype>>(xpu, A, B, C);
   // TODO(wangwei) use MKL's vector func
+  auto context = Singleton<Context>::Instance();
+  int device = context->device_id(std::this_thread::get_id());
+  if (device == -1)
+    cpu_e_f<op::Mult<Dtype>>(C->count(), A.cpu_data(), B.cpu_data(),
+        C->mutable_cpu_data());
+  else {
+#ifdef USE_GPU
+  gpu_e_f<op::Mult<Dtype>>(C->count(), A.gpu_data(), B.gpu_data(),
+        C->mutable_gpu_data());
+#endif
+  }
 }
 
 /**
@@ -473,7 +487,7 @@ void RepmatRow(XPU xpu, const Blob<Dtype> & A, Blob<Dtype> * B) {
 }
 
 /**
- * Sum all columns of matrix A to a column vector B, 
+ * Sum all columns of matrix A to a column vector B,
  * i.e., Bi = \sum_j {alpha*Aij}+beta*Bi
  * Loose shape checking, A.count() % B.count() == 0.
  * # columns of A = A.count() / B.count().
@@ -498,7 +512,7 @@ void MVSumCol(XPU xpu, Dtype alpha, Dtype beta, const Blob<Dtype> & A,
 }
 
 /**
- * Sum all rows of matrix A to a row vector B, 
+ * Sum all rows of matrix A to a row vector B,
  * i.e., Bj = \sum_i {alpha*Aij}+beta*Bj
  * Loose shape checking, A.count() % B.count() == 0.
  * # rows of A = A.count() / B.count().
@@ -574,6 +588,51 @@ Dtype Asum(XPU xpu, const Blob<Dtype>& A) {
 #endif
 }
 
+/*************Random Sample***************/
+template<typename Dtype>
+void SampleUniform(Dtype low, Dtype high, Blob<Dtype>* A);
+
+template<>
+inline void SampleUniform<float>(float low, float high, Blob<float> *A) {
+  auto context = Singleton<Context>::Instance();
+  int device = context->device_id(std::this_thread::get_id());
+  if (device == -1)
+    cpu_sample_uniform(A->count(), low, high, A->mutable_cpu_data());
+  else {
+#ifdef USE_GPU
+    gpu_sample_uniform(A->count(), low, high, A->mutable_gpu_data());
+#endif
+  }
+}
+
+template<typename Dtype>
+void SampleGaussian(Dtype mean, Dtype std, Blob<Dtype>* A);
+
+template<>
+inline void SampleGaussian<float>(float low, float high, Blob<float> *A) {
+  auto context = Singleton<Context>::Instance();
+  int device = context->device_id(std::this_thread::get_id());
+  if (device == -1)
+    cpu_sample_gaussian(A->count(), low, high, A->mutable_cpu_data());
+  else {
+#ifdef USE_GPU
+    gpu_sample_gaussian(A->count(), low, high, A->mutable_gpu_data());
+#endif
+  }
+}
+
+/************** Other functions ****************/
+template<typename Dtype>
+void Softmax(int nb_rows, const Blob<Dtype>& A, Blob<Dtype>* B) {
+  CHECK_GT(nb_rows, 0);
+  CHECK_EQ(A.count() % nb_rows, 0);
+  CHECK_EQ(A.count(), B->count());
+
+#ifdef USE_GPU
+  cpu_softmax(nb_rows, A.count() / nb_rows, A.cpu_data(),
+      B->mutable_cpu_data());
+#endif  // USE_GPU
+}
 }  // end of namespace singa
 
 #endif  // SINGA_UTILS_MATH_BLOB_H_
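
The math_blob.h changes above replace the XPU template parameter with a runtime dispatch: each wrapper asks the per-thread Context for the device id bound to the calling thread and routes to the cpu_* or gpu_* kernel, with a device id of -1 meaning "run on the CPU". The snippet below is a simplified standalone illustration of that dispatch shape; DeviceOf() and cpu_mult() are stand-ins for Singleton<Context>::Instance()->device_id() and cpu_e_f/gpu_e_f, not the real implementation.

    #include <mutex>
    #include <thread>
    #include <unordered_map>

    // Stand-in for the Context singleton: maps a worker thread to its device.
    static std::unordered_map<std::thread::id, int> g_device_of_thread;
    static std::mutex g_device_mutex;

    int DeviceOf(std::thread::id tid) {
      std::lock_guard<std::mutex> lock(g_device_mutex);
      auto it = g_device_of_thread.find(tid);
      return it == g_device_of_thread.end() ? -1 : it->second;  // -1 => CPU
    }

    void cpu_mult(int n, const float* a, const float* b, float* c) {
      for (int i = 0; i < n; ++i) c[i] = a[i] * b[i];
    }

    // Element-wise multiply with the same runtime dispatch as Mult() above:
    // look up the calling thread's device and branch to the matching kernel.
    void Mult(int n, const float* a, const float* b, float* c) {
      const int device = DeviceOf(std::this_thread::get_id());
      if (device == -1) {
        cpu_mult(n, a, b, c);  // thread is bound to the CPU
      } else {
        // a GPU-bound thread would launch the CUDA kernel here (omitted)
      }
    }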

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/49293a68/include/singa/worker.h
----------------------------------------------------------------------
diff --git a/include/singa/worker.h b/include/singa/worker.h
index 53f2118..8738c27 100644
--- a/include/singa/worker.h
+++ b/include/singa/worker.h
@@ -28,8 +28,8 @@
 #include "singa/comm/socket.h"
 #include "singa/neuralnet/neuralnet.h"
 #include "singa/proto/job.pb.h"
-#include "singa/neuralnet/connection_layer/bridge.h"
-#include "singa/neuralnet/neuron_layer/rbm.h"
+#include "singa/neuralnet/connection_layer.h"
+#include "singa/neuralnet/neuron_layer.h"
 
 namespace singa {
 

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/49293a68/src/driver.cc
----------------------------------------------------------------------
diff --git a/src/driver.cc b/src/driver.cc
index 4ffd9b1..8a48c30 100644
--- a/src/driver.cc
+++ b/src/driver.cc
@@ -33,31 +33,11 @@
 #include "singa/stub.h"
 #include "singa/worker.h"
 
-#include "singa/neuralnet/connection_layer/bridge.h"
-#include "singa/neuralnet/connection_layer/concate.h"
-#include "singa/neuralnet/connection_layer/slice.h"
-#include "singa/neuralnet/connection_layer/split.h"
-#include "singa/neuralnet/input_layer/deprecated.h"
-#include "singa/neuralnet/input_layer/csv.h"
-#include "singa/neuralnet/input_layer/image_preprocess.h"
-#include "singa/neuralnet/input_layer/prefetch.h"
-#include "singa/neuralnet/input_layer/record.h"
-#include "singa/neuralnet/input_layer/store.h"
-#include "singa/neuralnet/loss_layer/euclidean.h"
-#include "singa/neuralnet/loss_layer/softmax.h"
-#include "singa/neuralnet/neuron_layer/argsort.h"
-#include "singa/neuralnet/neuron_layer/convolution.h"
-#include "singa/neuralnet/neuron_layer/dropout.h"
-#include "singa/neuralnet/neuron_layer/inner_product.h"
-#include "singa/neuralnet/neuron_layer/lrn.h"
-#include "singa/neuralnet/neuron_layer/pooling.h"
-#include "singa/neuralnet/neuron_layer/rbm.h"
-#include "singa/neuralnet/neuron_layer/relu.h"
-#include "singa/neuralnet/neuron_layer/sigmoid.h"
-#include "singa/neuralnet/neuron_layer/stanh.h"
-#include "singa/neuralnet/neuron_layer/softmax.h"
-#include "singa/neuralnet/output_layer/record.h"
-#include "singa/neuralnet/output_layer/csv.h"
+#include "singa/neuralnet/connection_layer.h"
+#include "singa/neuralnet/input_layer.h"
+#include "singa/neuralnet/loss_layer.h"
+#include "singa/neuralnet/neuron_layer.h"
+#include "singa/neuralnet/output_layer.h"
 
 extern "C" void openblas_set_num_threads(int num);
 

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/49293a68/src/neuralnet/neuron_layer/argsort.cc
----------------------------------------------------------------------
diff --git a/src/neuralnet/neuron_layer/argsort.cc b/src/neuralnet/neuron_layer/argsort.cc
deleted file mode 100644
index d1775c0..0000000
--- a/src/neuralnet/neuron_layer/argsort.cc
+++ /dev/null
@@ -1,56 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#include "singa/neuralnet/output_layer.h"
-#include <algorithm>
-
-namespace singa {
-
-void ArgSortLayer::Setup(const LayerProto& proto,
-    const vector<Layer*>& srclayers) {
-  CHECK_EQ(srclayers.size(), 1);
-  NeuronLayer::Setup(proto, srclayers);
-  batchsize_ = srclayers[0]->data(this).shape()[0];
-  dim_ = srclayers[0]->data(this).count() / batchsize_;
-  topk_ = proto.argsort_conf().topk();
-  data_.Reshape(vector<int>{batchsize_, topk_});
-}
-
-void ArgSortLayer::ComputeFeature(int flag,
-    const vector<Layer*>& srclayers) {
-  // TODO(wangwei) check flag to ensure it is not called in training phase
-  const float* srcptr = srclayers.at(0)->data(this).cpu_data();
-  float* ptr = data_.mutable_cpu_data();
-  for (int n = 0; n < batchsize_; n++) {
-    vector<std::pair<float, int> > vec;
-    for (int j = 0; j < dim_; ++j)
-      vec.push_back(std::make_pair(srcptr[j], j));
-    std::partial_sort(vec.begin(), vec.begin() + topk_, vec.end(),
-                      std::greater<std::pair<float, int> >());
-
-    for (int j = 0; j < topk_; ++j)
-      ptr[j] = static_cast<float> (vec.at(j).second);
-    ptr += topk_;
-    srcptr += dim_;
-  }
-}
-
-}  // namespace singa

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/49293a68/src/neuralnet/neuron_layer/convolution.cc
----------------------------------------------------------------------
diff --git a/src/neuralnet/neuron_layer/convolution.cc b/src/neuralnet/neuron_layer/convolution.cc
index 4247bee..2b4c6d9 100644
--- a/src/neuralnet/neuron_layer/convolution.cc
+++ b/src/neuralnet/neuron_layer/convolution.cc
@@ -106,7 +106,7 @@ void ConvolutionLayer::ComputeFeature(int flag,
   auto weight = Tensor2(weight_->mutable_data());
   auto bias = Tensor1(bias_->mutable_data());
   for (int n = 0; n < batchsize_; n++) {
-    if (pad_ > 0)
+    if (pad_x_ > 0)
       col = expr::unpack_patch2col(pad(src[n], pad_x_), kernel_x_, stride_x_);
     else
       col = expr::unpack_patch2col(src[n], kernel_x_, stride_x_);
@@ -131,11 +131,11 @@ void ConvolutionLayer::ComputeGradient(int flag,
   gbias = expr::sumall_except_dim<1>(grad);
   gweight = 0.0f;
   Shape<3> padshp(gsrc.shape.SubShape());
-  padshp[0] += 2 * pad_;
-  padshp[1] += 2 * pad_;
+  padshp[0] += 2 * pad_y_;
+  padshp[1] += 2 * pad_x_;
   Shape<2> imgshp = Shape2(height_, width_);
   for (int n = 0; n < batchsize_; n++) {
-    if (pad_ > 0)
+    if (pad_x_ > 0)
       col = expr::unpack_patch2col(pad(src[n], pad_x_), kernel_x_, stride_x_);
     else
       col = expr::unpack_patch2col(src[n], kernel_x_, stride_x_);

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/49293a68/src/neuralnet/neuron_layer/cudnn_activation.cu
----------------------------------------------------------------------
diff --git a/src/neuralnet/neuron_layer/cudnn_activation.cu b/src/neuralnet/neuron_layer/cudnn_activation.cu
index f77a8a8..e8a7b41 100644
--- a/src/neuralnet/neuron_layer/cudnn_activation.cu
+++ b/src/neuralnet/neuron_layer/cudnn_activation.cu
@@ -27,14 +27,14 @@ void CudnnActivationLayer::InitCudnn() {
   CudnnLayer::InitCudnn();
 
   // TODO(wangwei) make the mode case insensitive
-  if (layer_conf_.activation_conf().mode() == "sigmoid")
+  if (layer_conf_.activation_conf().type() == SIGMOID)
     mode_ = CUDNN_ACTIVATION_SIGMOID;
-  else if (layer_conf_.activation_conf().mode() == "tanh")
+  else if (layer_conf_.activation_conf().type() == TANH)
     mode_ = CUDNN_ACTIVATION_TANH;
-  else if (layer_conf_.activation_conf().mode() == "relu")
+  else if (layer_conf_.activation_conf().type() == RELU)
     mode_ = CUDNN_ACTIVATION_RELU;
   else {
-    LOG(FATAL) << "Unkown activation: " << layer_conf_.activation_conf().mode();
+    LOG(FATAL) << "Unkown activation: " << layer_conf_.activation_conf().type();
   }
 
   const auto& shape = data_.shape();
@@ -48,7 +48,7 @@ void CudnnActivationLayer::InitCudnn() {
   stride[i] = 1;
   for (--i; i >= 0; i--) {
     sdim[i] = shape[i];
-    stride[i] = shape[i + 1] * stride[i + 1]
+    stride[i] = shape[i + 1] * stride[i + 1];
   }
   CHECK_CUDNN(cudnnSetTensorNdDescriptor(src_desc_,
         CUDNN_DATA_FLOAT,
@@ -75,7 +75,7 @@ void CudnnActivationLayer::ComputeFeature(int flag,
         mode_,
         &alpha,
         src_desc_,
-        srclayers[0].data(this)->gpu_data(),
+        srclayers[0]->data(this).gpu_data(),
         &beta,
         my_desc_,
         data_.mutable_gpu_data()));
@@ -88,13 +88,13 @@ void CudnnActivationLayer::ComputeGradient(int flag,
         mode_,
         &alpha,
         my_desc_,
-        data_.gpu_data()
+        data_.gpu_data(),
         my_desc_,
         grad_.gpu_data(),
         src_desc_,
-        srclayers[0].data(this)->gpu_data(),
+        srclayers[0]->data(this).gpu_data(),
         &beta,
         src_desc_,
-        srclayers[0].mutable_grad(this)->mutable_gpu_data()));
+        srclayers[0]->mutable_grad(this)->mutable_gpu_data()));
 }
 }  // namespace singa

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/49293a68/src/neuralnet/neuron_layer/cudnn_convolution.cu
----------------------------------------------------------------------
diff --git a/src/neuralnet/neuron_layer/cudnn_convolution.cu b/src/neuralnet/neuron_layer/cudnn_convolution.cu
index debe4c3..13e9f65 100644
--- a/src/neuralnet/neuron_layer/cudnn_convolution.cu
+++ b/src/neuralnet/neuron_layer/cudnn_convolution.cu
@@ -70,7 +70,7 @@ void CudnnConvLayer::InitCudnn() {
         channels_,
         height_,
         width_));
-  CHECK_CUDNN(cudnnSetTensor4dDescriptor(data_desc_,
+  CHECK_CUDNN(cudnnSetTensor4dDescriptor(my_desc_,
         CUDNN_TENSOR_NCHW,
         CUDNN_DATA_FLOAT,
         batchsize_,
@@ -126,7 +126,7 @@ void CudnnConvLayer::InitCudnn() {
         filter_desc_,
         bp_filter_alg_,
         &bp_filter_byte));
-  workspace_count_ = std::max(std::max(fp_bypte, bp_data_byte), bp_filter_byte)
+  workspace_count_ = std::max(std::max(fp_byte, bp_data_byte), bp_filter_byte)
     / sizeof(float) + 1;
 }
 
@@ -134,30 +134,30 @@ void CudnnConvLayer::ComputeFeature(int flag, const vector<Layer*>& srclayers) {
   if (!has_init_cudnn_)
     InitCudnn();
   float alpha = 1.f, beta = 0.f;
-  Blob<float> workspace(vector<int>{workspace_count_});
-  CUDNN_CHECK(cudnnConvolutionForward(handle_,
+  Blob<float> workspace(vector<int>{static_cast<int>(workspace_count_)});
+  CHECK_CUDNN(cudnnConvolutionForward(handle_,
         &alpha,
         src_desc_,
         srclayers[0]->data(this).gpu_data(),
         filter_desc_,
         weight_->data().gpu_data(),
-        conv_descs_,
+        conv_desc_,
         fp_alg_,
         workspace.mutable_gpu_data(),
         workspace_count_ * sizeof(float),
         &beta,
-        data_desc_,
+        my_desc_,
         data_.mutable_gpu_data()));
 
   if (bias_) {
     beta = 1.f;
-    CUDNN_CHECK(cudnnAddTensor(handle_,
+    CHECK_CUDNN(cudnnAddTensor(handle_,
           CUDNN_ADD_SAME_C,
           &alpha,
           bias_desc_,
           bias_->data().gpu_data(),
           &beta,
-          data_desc_,
+          my_desc_,
           data_.mutable_gpu_data()));
   }
 }
@@ -165,11 +165,11 @@ void CudnnConvLayer::ComputeFeature(int flag, const vector<Layer*>& srclayers) {
 void CudnnConvLayer::ComputeGradient(int flag, const vector<Layer*>& srclayers)
 {
   float alpha = 1.f, beta = 0.f;
-  Blob<float> workspace(vector<int>{workspace_count_});
+  Blob<float> workspace(vector<int>{static_cast<int>(workspace_count_)});
   if (bias_) {
     CHECK_CUDNN(cudnnConvolutionBackwardBias(handle_,
           &alpha,
-          data_desc_,
+          my_desc_,
           grad_.gpu_data(),
           &beta,
           bias_desc_,
@@ -178,7 +178,7 @@ void CudnnConvLayer::ComputeGradient(int flag, const vector<Layer*>& srclayers)
   CHECK_CUDNN(cudnnConvolutionBackwardFilter_v3(handle_,
         &alpha,
         src_desc_,
-        srclayers[0]->data(this)->gpu_data(),
+        srclayers[0]->data(this).gpu_data(),
         my_desc_,
         grad_.gpu_data(),
         conv_desc_,
@@ -191,7 +191,7 @@ void CudnnConvLayer::ComputeGradient(int flag, const vector<Layer*>& srclayers)
   CHECK_CUDNN(cudnnConvolutionBackwardData_v3(handle_,
         &alpha,
         filter_desc_,
-        weight_->data()->gpu_data(),
+        weight_->data().gpu_data(),
         my_desc_,
         grad_.gpu_data(),
         conv_desc_,

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/49293a68/src/neuralnet/neuron_layer/cudnn_lrn.cu
----------------------------------------------------------------------
diff --git a/src/neuralnet/neuron_layer/cudnn_lrn.cu b/src/neuralnet/neuron_layer/cudnn_lrn.cu
index 7cb111a..4a2b695 100644
--- a/src/neuralnet/neuron_layer/cudnn_lrn.cu
+++ b/src/neuralnet/neuron_layer/cudnn_lrn.cu
@@ -23,72 +23,64 @@
 
 namespace singa {
 CudnnLRNLayer::~CudnnLRNLayer() {
-  if (!init_cudnn_) {
+  if (has_init_cudnn_) {
     cudnnDestroyLRNDescriptor(norm_desc_);
   }
 }
 
-void CudnnLRNLayer::Setup(const LayerProto& proto,
-    const vector<Layer*>& srclayers) {
-  LRNLayer::Setup(proto, srclayers);
-  mode_ = CUDNN_LRN_CROSS_CHANNEL_DIM1;
-}
-
 void CudnnLRNLayer::InitCudnn() {
-  CudnnLayer::InitCudnn(srclayers);
-  CHECK_EQ(cudnnCreateLRNDescriptor(&norm_desc_), CUDNN_STATUS_SUCCESS);
-  CHECK_EQ(cudnnSetLRNDescriptor(norm_desc_,
+  mode_ = CUDNN_LRN_CROSS_CHANNEL_DIM1;
+  CudnnLayer::InitCudnn();
+  CHECK_CUDNN(cudnnCreateLRNDescriptor(&norm_desc_));
+  CHECK_CUDNN(cudnnSetLRNDescriptor(norm_desc_,
         lsize_,
         alpha_,
         beta_,
-        knorm_), CUDNN_STATUS_SUCCESS);
-  CHECK_EQ(cudnnCreateTensorDescriptor(&src_desc_), CUDNN_STATUS_SUCCESS);
-  CHECK_EQ(cudnnSetTensor4dDescriptor(src_desc_,
+        knorm_));
+  CHECK_CUDNN(cudnnCreateTensorDescriptor(&src_desc_));
+  CHECK_CUDNN(cudnnSetTensor4dDescriptor(src_desc_,
       CUDNN_TENSOR_NCHW,
       CUDNN_DATA_FLOAT,
       batchsize_,
       channels_,
       height_,
-      width_), CUDNN_STATUS_SUCCESS);
-  CHECK_EQ(cudnnCreateTensorDescriptor(&my_desc_), CUDNN_STATUS_SUCCESS);
-  CHECK_EQ(cudnnSetTensor4dDescriptor(my_desc_,
+      width_));
+  CHECK_CUDNN(cudnnCreateTensorDescriptor(&my_desc_));
+  CHECK_CUDNN(cudnnSetTensor4dDescriptor(my_desc_,
       CUDNN_TENSOR_NCHW,
       CUDNN_DATA_FLOAT,
       batchsize_,
       channels_,
       height_,
-      width_), CUDNN_STATUS_SUCCESS);
+      width_));
 }
-void ComputeFeature(int flag, const vector<Layer*>& srclayers) {
-  if (init_cudnn_) {
+void CudnnLRNLayer::ComputeFeature(int flag, const vector<Layer*>& srclayers) {
+  if (!has_init_cudnn_)
     InitCudnn();
-    init_cudnn_ = false;
-  }
-  CHECK_EQ(cudnnLRNCrossChannelForward(handle_,
+  CHECK_CUDNN(cudnnLRNCrossChannelForward(handle_,
       norm_desc_,
       mode_,
-      &alpha,
+      &alpha_,
       src_desc_,
       srclayers[0]->data(this).gpu_data(),
-      &beta,
+      &beta_,
       my_desc_,
-      data_.mutable_gpu_data()), CUDNN_STATUS_SUCCESS);
+      data_.mutable_gpu_data()));
 }
-void ComputeGradient(int flag, const vector<Layer*>& srclayers) {
-  CHECK_EQ(cudnnLRNCrossChannelBackward(handle_,
+void CudnnLRNLayer::ComputeGradient(int flag, const vector<Layer*>& srclayers) {
+  CHECK_CUDNN(cudnnLRNCrossChannelBackward(handle_,
         norm_desc_,
         mode_,
-        &alpha,
+        &alpha_,
         my_desc_, // ???
         data_.gpu_data(),
         my_desc_,
-        grad_.gpu_data()
+        grad_.gpu_data(),
         src_desc_,
         srclayers[0]->data(this).gpu_data(),
-        &beta,
+        &beta_,
         src_desc_,
-        srclayers[0]->mutable_grad(this)->mutable_gpu_data()),
-      CUDNN_STATUS_SUCCESS);
+        srclayers[0]->mutable_grad(this)->mutable_gpu_data()));
 }
 
 

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/49293a68/src/neuralnet/neuron_layer/cudnn_pooling.cu
----------------------------------------------------------------------
diff --git a/src/neuralnet/neuron_layer/cudnn_pooling.cu b/src/neuralnet/neuron_layer/cudnn_pooling.cu
index 619998f..ffdfb3b 100644
--- a/src/neuralnet/neuron_layer/cudnn_pooling.cu
+++ b/src/neuralnet/neuron_layer/cudnn_pooling.cu
@@ -23,13 +23,13 @@
 
 namespace singa {
 
-CuDNNPoolLayer::~CuDNNPoolLayer() {
+CudnnPoolLayer::~CudnnPoolLayer() {
   if (has_init_cudnn_) {
-    CHECK_EQ(cudnnDestroyPoolingDescriptor(pool_desc_), CUDNN_STATUS_SUCCESS);
+    CHECK_CUDNN(cudnnDestroyPoolingDescriptor(pool_desc_));
   }
 }
 
-void CuDNNPoolLayer::InitCudnn() {
+void CudnnPoolLayer::InitCudnn() {
   CudnnLayer::InitCudnn();
   CHECK_CUDNN(cudnnCreatePoolingDescriptor(&pool_desc_));
   CHECK_CUDNN(cudnnSetTensor4dDescriptor(src_desc_,
@@ -57,9 +57,10 @@ void CuDNNPoolLayer::InitCudnn() {
         pad_x_,
         stride_y_,
         stride_x_));
+
 }
 
-void CuDNNPoolLayer::ComputeFeature(int flag, const vector<Layer*>& srclayers) {
+void CudnnPoolLayer::ComputeFeature(int flag, const vector<Layer*>& srclayers) {
   if (!has_init_cudnn_)
     InitCudnn();
   float alpha = 1.0f, beta = 0.0f;
@@ -71,11 +72,11 @@ void CuDNNPoolLayer::ComputeFeature(int flag, const vector<Layer*>& srclayers) {
         src_desc_,
         srclayers[0]->data(this).gpu_data(),
         &beta,
-        data_desc_,
+        my_desc_,
         data_.mutable_gpu_data()));
 }
 
-void CuDNNPoolLayer::ComputeGradient(int flag, const vector<Layer*>& srclayers)
+void CudnnPoolLayer::ComputeGradient(int flag, const vector<Layer*>& srclayers)
 {
   float alpha = 1.0f, beta = 0.0f;
   CHECK_CUDNN(cudnnPoolingBackward(handle_,
@@ -89,7 +90,7 @@ void CuDNNPoolLayer::ComputeGradient(int flag, const vector<Layer*>& srclayers)
         srclayers[0]->data(this).gpu_data(),
         &beta,
         src_desc_,
-        srclayers[0]->mutable_grad(this).mutable_gpu_data()));
+        srclayers[0]->mutable_grad(this)->mutable_gpu_data()));
 }
 }  /* singa */
 

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/49293a68/src/neuralnet/neuron_layer/cudnn_softmax.cu
----------------------------------------------------------------------
diff --git a/src/neuralnet/neuron_layer/cudnn_softmax.cu b/src/neuralnet/neuron_layer/cudnn_softmax.cu
index f7e8abe..7fade3e 100644
--- a/src/neuralnet/neuron_layer/cudnn_softmax.cu
+++ b/src/neuralnet/neuron_layer/cudnn_softmax.cu
@@ -32,7 +32,7 @@ void CudnnSoftmaxLayer::InitCudnn() {
         num_softmax_per_instance_,
         count_per_softmax_,
         1));
-  CHECK_EQ(cudnnSetTensor4dDescriptor(my_desc_,
+  CHECK_CUDNN(cudnnSetTensor4dDescriptor(my_desc_,
         CUDNN_TENSOR_NCHW,
         CUDNN_DATA_FLOAT,
         batchsize_,
@@ -45,10 +45,10 @@ void CudnnSoftmaxLayer::ComputeFeature(int flag,
     const vector<Layer*>& srclayers) {
   if (!has_init_cudnn_)
     InitCudnn();
-  float alpha = 1.0f, beta = 0.0f;
-  CHECK_CUDNN(CudnnSoftmaxForward(handle_,
+  const float alpha = 1.0f, beta = 0.0f;
+  CHECK_CUDNN(cudnnSoftmaxForward(handle_,
         CUDNN_SOFTMAX_ACCURATE,
-        CUDNN_SOFTMAX_CHANNEL,
+        CUDNN_SOFTMAX_MODE_CHANNEL,
         &alpha,
         src_desc_,
         srclayers[0]->data(this).gpu_data(),
@@ -59,10 +59,10 @@ void CudnnSoftmaxLayer::ComputeFeature(int flag,
 
 void CudnnSoftmaxLayer::ComputeGradient(int flag,
     const vector<Layer*>& srclayers) {
-  float alpha = 1.f, beta = 0.f;
-  CHECK_CUDNN(CudnnSoftmaxForward(handle_,
+  const float alpha = 1.f, beta = 0.f;
+  CHECK_CUDNN(cudnnSoftmaxBackward(handle_,
         CUDNN_SOFTMAX_ACCURATE,
-        CUDNN_SOFTMAX_CHANNEL,
+        CUDNN_SOFTMAX_MODE_CHANNEL,
         &alpha,
         my_desc_,
         data_.gpu_data(),
@@ -70,6 +70,6 @@ void CudnnSoftmaxLayer::ComputeGradient(int flag,
         grad_.gpu_data(),
         &beta,
         src_desc_,
-        srclayers[0]->mutable_grad(this).mutable_gpu_data()));
+        srclayers[0]->mutable_grad(this)->mutable_gpu_data()));
 }
 }  /* singa */

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/49293a68/src/neuralnet/neuron_layer/dropout.cc
----------------------------------------------------------------------
diff --git a/src/neuralnet/neuron_layer/dropout.cc b/src/neuralnet/neuron_layer/dropout.cc
index 6158a6c..ad8c10f 100644
--- a/src/neuralnet/neuron_layer/dropout.cc
+++ b/src/neuralnet/neuron_layer/dropout.cc
@@ -23,6 +23,8 @@
 
 #include <glog/logging.h>
 #include "singa/utils/singleton.h"
+#include "singa/utils/singa_op.h"
+#include "singa/utils/math_blob.h"
 
 namespace singa {
 using std::vector;
@@ -42,20 +44,17 @@ void DropoutLayer::ComputeFeature(int flag, const vector<Layer*>& srclayers) {
     data_.CopyFrom(srclayers[0]->data(this));
     return;
   }
+
   float pkeep = 1 - pdrop_;
-  auto mask = Tensor1(&mask_);
-  mask = expr::F<op::threshold>(TSingleton<Random<cpu>>::Instance() \
-                      ->uniform(mask.shape), pkeep) * (1.0f/pkeep);
-  auto data = Tensor1(&data_);
-  auto src = Tensor1(srclayers[0]->mutable_data(this));
-  data = src * mask;
+  Blob<float> rand(data_.count());
+  SampleUniform(0.0f, 1.0f, &rand);
+  // Threashold(pkeep, rand, &mask_);
+  // Scale(1.0f / pkeep, &mask_);
+  Mult(srclayers[0]->data(this), mask_, &data_);
 }
 
 void DropoutLayer::ComputeGradient(int flag, const vector<Layer*>& srclayers)  {
-  auto mask = Tensor1(&mask_);
-  auto grad = Tensor1(&grad_);
-  auto gsrc = Tensor1(srclayers[0]->mutable_grad(this));
-  gsrc = grad * mask;
+  Mult(grad_, mask_, srclayers[0]->mutable_grad(this));
 }
 
 }  // namespace singa
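
For reference, once the commented-out Threshold/Scale steps above are filled in, the dropout forward pass amounts to sampling a keep-mask scaled by 1/pkeep and multiplying it into the input (inverted dropout), so no rescaling is needed at test time. The snippet below is a self-contained sketch of that computation using std::mt19937 in place of SampleUniform; it illustrates the intended math only and is not the layer code itself.

    #include <random>
    #include <vector>

    // Inverted dropout: zero each unit with probability pdrop and rescale the
    // survivors by 1/(1 - pdrop) so the expected activation stays unchanged.
    std::vector<float> DropoutForward(const std::vector<float>& src,
                                      float pdrop, unsigned seed) {
      const float pkeep = 1.0f - pdrop;
      std::mt19937 gen(seed);
      std::uniform_real_distribution<float> uniform(0.0f, 1.0f);
      std::vector<float> out(src.size());
      for (size_t i = 0; i < src.size(); ++i) {
        const float mask = uniform(gen) < pkeep ? 1.0f / pkeep : 0.0f;
        out[i] = src[i] * mask;
      }
      return out;
    }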

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/49293a68/src/neuralnet/neuron_layer/pooling.cc
----------------------------------------------------------------------
diff --git a/src/neuralnet/neuron_layer/pooling.cc b/src/neuralnet/neuron_layer/pooling.cc
index fd475d8..2e246fc 100644
--- a/src/neuralnet/neuron_layer/pooling.cc
+++ b/src/neuralnet/neuron_layer/pooling.cc
@@ -34,9 +34,31 @@ void PoolingLayer::Setup(const LayerProto& conf,
   Layer::Setup(conf, srclayers);
   CHECK_EQ(srclayers.size(), 1);
   PoolingProto pool_conf = conf.pooling_conf();
-  kernel_ = pool_conf.kernel();
-  stride_ = pool_conf.stride();
-  CHECK_LT(pad_, kernel_);
+  if (pool_conf.has_kernel()) {
+    kernel_x_ = kernel_y_ = pool_conf.kernel();
+  } else {
+    CHECK(pool_conf.has_kernel_x());
+    CHECK(pool_conf.has_kernel_y());
+    kernel_x_ = pool_conf.kernel_x();
+    kernel_y_ = pool_conf.kernel_y();
+  }
+  if (pool_conf.has_pad()) {
+    pad_x_ = pad_y_ = pool_conf.pad();
+  } else {
+    CHECK(pool_conf.has_pad_x());
+    CHECK(pool_conf.has_pad_y());
+    pad_x_ = pool_conf.pad_x();
+    pad_y_ = pool_conf.pad_y();
+  }
+  if (pool_conf.has_stride()) {
+    stride_x_ = stride_y_ = pool_conf.stride();
+  } else {
+    CHECK(pool_conf.has_stride_x());
+    CHECK(pool_conf.has_stride_y());
+    stride_x_ = pool_conf.stride_x();
+    stride_y_ = pool_conf.stride_y();
+  }
+
   pool_ = conf.pooling_conf().pool();
   CHECK(pool_ == PoolingProto_PoolMethod_AVG
         || pool_ == PoolingProto_PoolMethod_MAX)
@@ -51,8 +73,10 @@ void PoolingLayer::Setup(const LayerProto& conf,
   else
     channels_ = 1;
   batchsize_ = srcshape[0];
-  pooled_height_ = static_cast<int>((height_ - kernel_) / stride_) + 1;
-  pooled_width_ = static_cast<int>((width_ - kernel_) / stride_) + 1;
+  pooled_height_ = static_cast<int>(
+      (height_ + 2 * pad_y_- kernel_y_) / stride_y_) + 1;
+  pooled_width_ = static_cast<int>(
+      (width_ + 2* pad_x_ - kernel_x_) / stride_x_) + 1;
   data_.Reshape(vector<int>{batchsize_, channels_, pooled_height_,
                             pooled_width_});
   grad_.ReshapeLike(data_);
@@ -62,10 +86,10 @@ void PoolingLayer::ComputeFeature(int flag, const vector<Layer*>& srclayers) {
   auto src = Tensor4(srclayers[0]->mutable_data(this));
   auto data = Tensor4(&data_);
   if (pool_ == PoolingProto_PoolMethod_MAX)
-    data = expr::pool<red::maximum>(src, kernel_, stride_);
+    data = expr::pool<red::maximum>(src, kernel_x_, stride_x_);
   else if (pool_ == PoolingProto_PoolMethod_AVG)
-    data = expr::pool<red::sum>(src, kernel_, stride_)
-      * (1.0f / (kernel_ * kernel_));
+    data = expr::pool<red::sum>(src, kernel_x_, stride_x_)
+      * (1.0f / (kernel_x_ * kernel_x_));
 }
 
 /*
@@ -78,10 +102,10 @@ void PoolingLayer::ComputeGradient(int flag, const vector<Layer*>& srclayers) {
   auto data = Tensor4(&data_);
   auto grad = Tensor4(&grad_);
   if (pool_ == PoolingProto_PoolMethod_MAX)
-    gsrc = expr::unpool<red::maximum>(src, data, grad, kernel_, stride_);
+    gsrc = expr::unpool<red::maximum>(src, data, grad, kernel_x_, stride_x_);
   else if (pool_ == PoolingProto_PoolMethod_AVG)
-    gsrc = expr::unpool<red::sum>(src, data, grad, kernel_, stride_)
-           * (1.0f / (kernel_ * kernel_));
+    gsrc = expr::unpool<red::sum>(src, data, grad, kernel_x_, stride_x_)
+           * (1.0f / (kernel_x_ * kernel_x_));
 }
 
 /***************** Implementation of CPoolingLayer ***************/
@@ -95,12 +119,13 @@ void CPoolingLayer::Setup(const LayerProto& conf,
 void CPoolingLayer::ComputeFeature(int flag, const vector<Layer*>& srclayers) {
   if (pool_ == PoolingProto_PoolMethod_MAX)
     ForwardMaxPooling(srclayers[0]->mutable_data(this)->mutable_cpu_data(),
-        batchsize_, channels_, height_, width_, kernel_, kernel_, pad_, pad_,
-        stride_, stride_, data_.mutable_cpu_data(), mask_.mutable_cpu_data());
+        batchsize_, channels_, height_, width_, kernel_y_, kernel_x_,
+        pad_y_, pad_y_, stride_y_, stride_x_,
+        data_.mutable_cpu_data(), mask_.mutable_cpu_data());
   else if (pool_ == PoolingProto_PoolMethod_AVG)
     ForwardAvgPooling(srclayers[0]->mutable_data(this)->mutable_cpu_data(),
-        batchsize_, channels_, height_, width_, kernel_, kernel_, pad_, pad_,
-        stride_, stride_, data_.mutable_cpu_data());
+        batchsize_, channels_, height_, width_, kernel_y_, kernel_x_,
+        pad_y_, pad_x_, stride_y_, stride_y_, data_.mutable_cpu_data());
   else
     LOG(FATAL) << "unknow pooling method";
 }
@@ -108,12 +133,14 @@ void CPoolingLayer::ComputeFeature(int flag, const vector<Layer*>& srclayers) {
 void CPoolingLayer::ComputeGradient(int flag, const vector<Layer*>& srclayers) {
   if (pool_ == PoolingProto_PoolMethod_MAX)
     BackwardMaxPooling(grad_.cpu_data(), mask_.cpu_data(), batchsize_,
-        channels_, height_, width_, kernel_, kernel_, pad_, pad_,
-        stride_, stride_, srclayers[0]->mutable_grad(this)->mutable_cpu_data());
+        channels_, height_, width_, kernel_y_, kernel_x_, pad_y_, pad_x_,
+        stride_y_, stride_y_,
+        srclayers[0]->mutable_grad(this)->mutable_cpu_data());
   else if (pool_ == PoolingProto_PoolMethod_AVG)
     BackwardAvgPooling(grad_.cpu_data(), batchsize_,
-        channels_, height_, width_, kernel_, kernel_, pad_, pad_,
-        stride_, stride_, srclayers[0]->mutable_grad(this)->mutable_cpu_data());
+        channels_, height_, width_, kernel_y_, kernel_x_, pad_y_, pad_x_,
+        stride_y_, stride_x_,
+        srclayers[0]->mutable_grad(this)->mutable_cpu_data());
   else
     LOG(FATAL) << "unknow pooling method";
 }
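
The reworked PoolingLayer::Setup above derives the output size per axis from the kernel, pad and stride with the usual floor formula, pooled = (in + 2*pad - kernel) / stride + 1. A tiny helper plus a worked example follows (the 28x28 input is chosen only for illustration):

    #include <cassert>
    #include <cstdio>

    // pooled = floor((in + 2*pad - kernel) / stride) + 1, as computed in Setup()
    int PooledSize(int in, int pad, int kernel, int stride) {
      assert(stride > 0 && kernel <= in + 2 * pad);
      return (in + 2 * pad - kernel) / stride + 1;
    }

    int main() {
      // e.g. a 28x28 input, 3x3 kernel, pad 1, stride 2 -> 14x14 output
      std::printf("%d x %d\n", PooledSize(28, 1, 3, 2), PooledSize(28, 1, 3, 2));
      return 0;
    }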

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/49293a68/src/neuralnet/neuron_layer/softmax.cc
----------------------------------------------------------------------
diff --git a/src/neuralnet/neuron_layer/softmax.cc b/src/neuralnet/neuron_layer/softmax.cc
index 83bbc5a..44c64f8 100644
--- a/src/neuralnet/neuron_layer/softmax.cc
+++ b/src/neuralnet/neuron_layer/softmax.cc
@@ -39,8 +39,8 @@ void SoftmaxLayer::Setup(const LayerProto& proto,
   NeuronLayer::Setup(proto, srclayers);
   const auto& srcdata = srclayers[0]->data(this);
   batchsize_ = data_.shape()[0];
-  num_softmax_per_instance_ = proto.softmax_conf().softmax_dim();
-  count_per_softmax_ = .count() / batchsize_ / num_softmax_per_instance_;
+  num_softmax_per_instance_ = proto.softmax_conf().num_softmax_per_instance();
+  count_per_softmax_ = data_.count() / batchsize_ / num_softmax_per_instance_;
   data_.Reshape(vector<int>{batchsize_, num_softmax_per_instance_,
       count_per_softmax_});
   grad_.ReshapeLike(data_);
@@ -48,8 +48,8 @@ void SoftmaxLayer::Setup(const LayerProto& proto,
 
 void SoftmaxLayer::ComputeFeature(int flag,
     const vector<Layer*>& srclayers) {
-  int dim = data_.count() / batchsize;
-  Shape<2> s = Shape2(batchsize, dim);
+  int dim = data_.count() / batchsize_;
+  Shape<2> s = Shape2(batchsize_, dim);
   Tensor<cpu, 2> prob(data_.mutable_cpu_data(), s);
   Tensor<cpu, 2> src(srclayers[0]->mutable_data(this)->mutable_cpu_data(), s);
   Softmax(prob, src);

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/49293a68/src/neuralnet/output_layer/argsort.cc
----------------------------------------------------------------------
diff --git a/src/neuralnet/output_layer/argsort.cc b/src/neuralnet/output_layer/argsort.cc
new file mode 100644
index 0000000..63969b3
--- /dev/null
+++ b/src/neuralnet/output_layer/argsort.cc
@@ -0,0 +1,56 @@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+
+#include "singa/neuralnet/output_layer.h"
+#include <algorithm>
+
+namespace singa {
+
+void ArgSortLayer::Setup(const LayerProto& proto,
+    const vector<Layer*>& srclayers) {
+  CHECK_EQ(srclayers.size(), 1);
+  OutputLayer::Setup(proto, srclayers);
+  batchsize_ = srclayers[0]->data(this).shape()[0];
+  dim_ = srclayers[0]->data(this).count() / batchsize_;
+  topk_ = proto.argsort_conf().topk();
+  data_.Reshape(vector<int>{batchsize_, topk_});
+}
+
+void ArgSortLayer::ComputeFeature(int flag,
+    const vector<Layer*>& srclayers) {
+  // TODO(wangwei) check flag to ensure it is not called in training phase
+  const float* srcptr = srclayers.at(0)->data(this).cpu_data();
+  float* ptr = data_.mutable_cpu_data();
+  for (int n = 0; n < batchsize_; n++) {
+    vector<std::pair<float, int> > vec;
+    for (int j = 0; j < dim_; ++j)
+      vec.push_back(std::make_pair(srcptr[j], j));
+    std::partial_sort(vec.begin(), vec.begin() + topk_, vec.end(),
+                      std::greater<std::pair<float, int> >());
+
+    for (int j = 0; j < topk_; ++j)
+      ptr[j] = static_cast<float> (vec.at(j).second);
+    ptr += topk_;
+    srcptr += dim_;
+  }
+}
+
+}  // namespace singa

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/49293a68/src/proto/job.proto
----------------------------------------------------------------------
diff --git a/src/proto/job.proto b/src/proto/job.proto
index ca5e546..9d64c4b 100644
--- a/src/proto/job.proto
+++ b/src/proto/job.proto
@@ -188,6 +188,8 @@ message LayerProto {
   optional string user_type =21;
 
   // proto for the specific layer
+  // configuration for activation layer
+  optional ActivationProto activation_conf = 54;
   // configuration for argsort layer
   optional ArgSortProto argsort_conf = 52;
   // configuration for convolution layer
@@ -216,11 +218,16 @@ message LayerProto {
   optional RGBImageProto rgbimage_conf = 39;
   // configuration for data layer
   optional DataProto sharddata_conf = 32;
+  // configuration for slice layer
+  optional SliceProto slice_conf = 41;
+  // configuration for softmax layer
+  optional SoftmaxProto softmax_conf = 53;
   // configuration for softmax loss layer
   optional SoftmaxLossProto softmaxloss_conf = 40;
   // configuration for store input layers
   optional StoreProto store_conf = 51;
 
+
   // overrides the partition dimension for neural net
   optional int32 partition_dim = 60 [default = -1];
   // names of parameters shared from other layers
@@ -298,6 +305,17 @@ message ParamGenProto {
   extensions 101 to 200;
 }
 
+enum ActivationType {
+  RELU = 1;
+  SIGMOID = 2;
+  TANH = 3;
+  STANH = 4;
+}
+
+message ActivationProto {
+  optional ActivationType type = 1 [default = RELU];
+}
+
 message RGBImageProto {
   // scale factor for each pixel
   optional float scale = 1 [default = 1.0];
@@ -327,6 +345,7 @@ message StoreProto {
   optional int32 random_skip = 11 [default = 0];
   optional bool has_label = 12 [default = true];
 }
+
 message SoftmaxLossProto {
   // computing accuracy against topk results
   optional int32 topk = 1 [default = 1];
@@ -341,17 +360,26 @@ message ArgSortProto {
 
 message ConvolutionProto {
   // The number of outputs for the layer
-  required int32 num_filters = 1;
+  optional int32 num_filters = 1;
   // the kernel height/width
-  required int32 kernel = 2;
+  optional int32 kernel = 2;
 
   // The padding height/width
   optional int32 pad = 30 [default = 0];
   // the stride
   optional int32 stride = 31 [default = 1];
 
+  optional int32 kernel_x = 41;
+  optional int32 kernel_y = 42;
+
+  optional int32 pad_x = 44 [default = 0];
+  optional int32 pad_y = 45 [default = 0];
+
+  optional int32 stride_x = 47 [default = 1];
+  optional int32 stride_y = 48 [default = 1];
+
   // cudnn workspace size in MB
-  optional int32 workspace_byte_limit = 33 [default = 512];
+  optional int32 workspace_byte_limit = 50 [default = 512];
 }
 
 message DataProto {
@@ -384,9 +412,9 @@ message MnistProto {
 
 message DummyProto {
   // shape of data and grad blobs
-  optional bool input = 1 [default = false]; 
-  optional bool output = 2 [default = false]; 
-  repeated int32 shape = 3; 
+  optional bool input = 1 [default = false];
+  optional bool output = 2 [default = false];
+  repeated int32 shape = 3;
 }
 
 // Message that stores parameters used by DropoutLayer
@@ -424,7 +452,7 @@ message LRNProto {
 
 message PoolingProto {
   // The kernel size (square)
-  required int32 kernel= 1;
+  optional int32 kernel= 1;
   enum PoolMethod {
     MAX = 0;
     AVG = 1;
@@ -435,8 +463,19 @@ message PoolingProto {
   optional uint32 pad = 31 [default = 0];
   // The stride
   optional uint32 stride = 32 [default = 1];
+
+
+  optional int32 kernel_x = 41;
+  optional int32 kernel_y = 42;
+
+  optional int32 pad_x = 44 [default = 0];
+  optional int32 pad_y = 45 [default = 0];
+
+  optional int32 stride_x = 47 [default = 1];
+  optional int32 stride_y = 48 [default = 1];
 }
 
+
 message ReLUProto {
   // Ref. Maas, A. L., Hannun, A. Y., & Ng, A. Y. (2013).
   // Rectifier nonlinearities improve neural network acoustic models.

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/49293a68/src/utils/blob.cc
----------------------------------------------------------------------
diff --git a/src/utils/blob.cc b/src/utils/blob.cc
index 94b1fcb..0dc797e 100644
--- a/src/utils/blob.cc
+++ b/src/utils/blob.cc
@@ -77,28 +77,7 @@ private:\
   classname& operator=(const classname&)
 
 #ifndef CPU_ONLY
-// CUDA: various checks for different function calls.
-#define CUDA_CHECK(condition) \
-  /* Code block avoids redefinition of cudaError_t error */ \
-  do { \
-    cudaError_t error = condition; \
-    CHECK_EQ(error, cudaSuccess) << " " << cudaGetErrorString(error); \
-  } while (0)
-
-#define CUBLAS_CHECK(condition) \
-  do { \
-    cublasStatus_t status = condition; \
-    CHECK_EQ(status, CUBLAS_STATUS_SUCCESS) << " " \
-      << caffe::cublasGetErrorString(status); \
-  } while (0)
-
-#define CURAND_CHECK(condition) \
-  do { \
-    curandStatus_t status = condition; \
-    CHECK_EQ(status, CURAND_STATUS_SUCCESS) << " " \
-      << caffe::curandGetErrorString(status); \
-  } while (0)
-
+#include "singa/utils/cuda_utils.h"
 #endif  // CPU_ONLY
 
 namespace singa {
@@ -187,7 +166,7 @@ void SyncedMemory::to_gpu() {
   switch (head_) {
   case UNINITIALIZED:
     CUDA_CHECK(cudaMalloc(&gpu_ptr_, size_));
-    CUDA_CHECK(cudaMemset(gpu_ptr_, 0, N));
+    CUDA_CHECK(cudaMemset(gpu_ptr_, 0, size_));
     head_ = HEAD_AT_GPU;
     break;
   case HEAD_AT_CPU:

