singa-commits mailing list archives

From wan...@apache.org
Subject [06/19] incubator-singa git commit: SINGA-100 Implement layers using CUDNN for GPU training
Date Wed, 16 Dec 2015 12:11:38 GMT
SINGA-100 Implement layers using CUDNN for GPU training

Fix the wrong-accuracy error caused by the inner-product layer.
MMDot accepts a two-dimensional Blob, but the inner-product layer passes a 4-D Blob.
The error is fixed by treating the Blob as having shape (shape[0], count()/shape[0]).
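As a rough illustration of that reshaping (a self-contained C++ sketch, not the SINGA implementation; the MatMul helper and all shapes below are made up for the example), the 4-D feature data is simply viewed as a (shape[0], count()/shape[0]) row-major matrix before the matrix product:

  #include <vector>

  // Row-major matrix product: C(m x k) = A(m x n) * B(n x k).
  void MatMul(const std::vector<float>& A, const std::vector<float>& B,
              std::vector<float>* C, int m, int n, int k) {
    C->assign(m * k, 0.0f);
    for (int i = 0; i < m; ++i)
      for (int j = 0; j < k; ++j)
        for (int p = 0; p < n; ++p)
          (*C)[i * k + j] += A[i * n + p] * B[p * k + j];
  }

  int main() {
    const int n = 2, c = 3, h = 4, w = 4;   // 4-D blob shape (batch, channel, height, width)
    std::vector<float> feature(n * c * h * w, 1.0f);
    const int batch = n;                    // shape[0]
    const int dim = c * h * w;              // count() / shape[0]
    const int num_output = 10;
    std::vector<float> weight(dim * num_output, 0.01f);
    std::vector<float> out;
    // No data movement: the 4-D blob is treated as a (batch x dim) matrix.
    MatMul(feature, weight, &out, batch, dim, num_output);
    return 0;
  }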

The cifar10 example reaches the same accuracy for GPU and CPU training (running 1000 iterations).
GPU training with CUDNN is 10x faster than CPU training.

TODO: test multiple GPU cards and run cifar10 for 70000 steps.

Temporary commit; there is a bug in cudnn_softmaxloss.cc that causes a segmentation fault.

Clean the code with cpplint.py.


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/eb97097a
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/eb97097a
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/eb97097a

Branch: refs/heads/master
Commit: eb97097aad2ce6aee19aecea243074f5c67c1cc4
Parents: f31ba64
Author: Wei Wang <wangwei@comp.nus.edu.sg>
Authored: Fri Dec 4 14:07:05 2015 +0800
Committer: Wei Wang <wangwei@comp.nus.edu.sg>
Committed: Fri Dec 11 11:48:23 2015 +0800

----------------------------------------------------------------------
 examples/cifar10/cudnn.conf                     | 287 +++++++++++++++++++
 include/singa/utils/blob.h                      |   2 +-
 include/singa/utils/math_addr.h                 |  16 +-
 include/singa/utils/math_blob.h                 |  46 +--
 include/singa/utils/math_kernel.h               |   8 +-
 src/neuralnet/input_layer/deprecated.cc         |  41 ++-
 src/neuralnet/input_layer/image_preprocess.cc   |  16 +-
 src/neuralnet/input_layer/store.cc              |  12 +-
 src/neuralnet/loss_layer/cudnn_softmaxloss.cc   |  80 ++++++
 src/neuralnet/loss_layer/cudnn_softmaxloss.cu   |  82 ------
 src/neuralnet/loss_layer/euclidean.cc           |   3 +-
 src/neuralnet/loss_layer/softmax.cc             |   4 +-
 src/neuralnet/neuron_layer/activation.cc        |   8 +-
 src/neuralnet/neuron_layer/convolution.cc       |   3 +-
 src/neuralnet/neuron_layer/cudnn_activation.cc  |   7 +-
 src/neuralnet/neuron_layer/cudnn_convolution.cc |   9 +-
 src/neuralnet/neuron_layer/cudnn_lrn.cc         |  14 +-
 src/neuralnet/neuron_layer/cudnn_pooling.cc     |   7 +-
 src/neuralnet/neuron_layer/cudnn_softmax.cc     |   2 +-
 src/neuralnet/neuron_layer/dropout.cc           |   3 +-
 src/neuralnet/neuron_layer/inner_product.cc     |   3 +-
 src/neuralnet/neuron_layer/lrn.cc               |   4 +-
 src/neuralnet/neuron_layer/pooling.cc           |   4 +-
 src/neuralnet/neuron_layer/rbm.cc               |   3 +-
 src/neuralnet/neuron_layer/relu.cc              |   4 +-
 src/neuralnet/neuron_layer/sigmoid.cc           |   4 +-
 src/neuralnet/output_layer/argsort.cc           |   2 +-
 src/utils/math_kernel.cu                        |  48 ++--
 src/worker.cc                                   |  11 +-
 29 files changed, 524 insertions(+), 209 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/eb97097a/examples/cifar10/cudnn.conf
----------------------------------------------------------------------
diff --git a/examples/cifar10/cudnn.conf b/examples/cifar10/cudnn.conf
new file mode 100644
index 0000000..4a29b7f
--- /dev/null
+++ b/examples/cifar10/cudnn.conf
@@ -0,0 +1,287 @@
+name: "cifar10-convnet"
+train_steps: 1000
+test_steps: 0
+test_freq: 0
+#validate_steps: 100
+#validate_freq: 300
+disp_freq: 30
+gpu: 0
+gpu: 1
+#checkpoint_path: "examples/cifar10/checkpoint/step1000-worker0"
+train_one_batch {
+  alg: kBP
+}
+updater{
+  type: kSGD
+  weight_decay:0.004
+  momentum:0.9
+  learning_rate {
+    type: kFixedStep
+    fixedstep_conf:{
+      step:0
+      step:60000
+      step:65000
+      step_lr:0.001
+      step_lr:0.0001
+      step_lr:0.00001
+    }
+  }
+}
+neuralnet {
+  layer{
+    name: "data"
+    type: kRecordInput
+    store_conf {
+      backend: "kvfile"
+      path: "examples/cifar10/train_data.bin"
+      mean_file: "examples/cifar10/image_mean.bin"
+      batchsize: 100
+      #random_skip: 5000
+      shape: 3
+      shape: 32
+      shape: 32
+    }
+    include: kTrain
+  }
+#  layer{
+#    name: "data"
+#    type: kRecordInput
+#    store_conf {
+#      backend: "kvfile"
+#      path: "examples/cifar10/val_data.bin"
+#      mean_file: "examples/cifar10/image_mean.bin"
+#      batchsize: 64
+#      random_skip: 5000
+#      shape: 3
+#      shape: 32
+#      shape: 32
+#    }
+#    include: kVal
+#  }
+  layer{
+    name: "data"
+    type: kRecordInput
+    store_conf {
+      backend: "kvfile"
+      path: "examples/cifar10/test_data.bin"
+      mean_file: "examples/cifar10/image_mean.bin"
+      batchsize: 100
+      shape: 3
+      shape: 32
+      shape: 32
+    }
+    include: kTest
+  }
+
+  layer {
+    name: "conv1"
+    type: kCudnnConv
+    srclayers: "data"
+    convolution_conf {
+      num_filters: 32
+      kernel: 5
+      stride: 1
+      pad:2
+    }
+    param {
+      name: "w1"
+      init {
+        type:kGaussian
+        std:0.0001
+      }
+    }
+    param {
+      name: "b1"
+      lr_scale:2.0
+      init {
+        type: kConstant
+        value:0
+      }
+    }
+  }
+
+  layer {
+    name: "pool1"
+    type: kCudnnPool
+    srclayers: "conv1"
+    pooling_conf {
+      pool: MAX
+      kernel: 3
+      stride: 2
+    }
+  }
+  layer {
+    name: "relu1"
+    type: kCudnnActivation
+    activation_conf {
+      type: RELU
+    }
+    srclayers:"pool1"
+  }
+  layer {
+    name: "norm1"
+    type: kCudnnLRN
+    lrn_conf {
+      local_size: 3
+      alpha: 5e-05
+      beta: 0.75
+    }
+    srclayers:"relu1"
+  }
+  layer {
+    name: "conv2"
+    type: kCudnnConv
+    srclayers: "norm1"
+    convolution_conf {
+      num_filters: 32
+      kernel: 5
+      stride: 1
+      pad:2
+    }
+    param {
+      name: "w2"
+      init {
+        type:kGaussian
+        std:0.01
+      }
+    }
+    param {
+      name: "b2"
+      lr_scale:2.0
+      init {
+        type: kConstant
+        value:0
+      }
+    }
+  }
+  layer {
+    name: "relu2"
+    type: kCudnnActivation
+    activation_conf {
+      type: RELU
+    }
+    srclayers:"conv2"
+  }
+  layer {
+    name: "pool2"
+    type: kCudnnPool
+    srclayers: "relu2"
+    pooling_conf {
+      pool: AVG
+      kernel: 3
+      stride: 2
+    }
+  }
+  layer {
+    name: "norm2"
+    type: kCudnnLRN
+    lrn_conf {
+      local_size: 3
+      alpha: 5e-05
+      beta: 0.75
+    }
+    srclayers:"pool2"
+  }
+  layer {
+    name: "conv3"
+    type: kCudnnConv
+    srclayers: "norm2"
+    convolution_conf {
+      num_filters: 64
+      kernel: 5
+      stride: 1
+      pad:2
+    }
+    param {
+      name: "w3"
+      init {
+        type:kGaussian
+        std:0.01
+      }
+    }
+    param {
+      name: "b3"
+      init {
+        type: kConstant
+        value:0
+      }
+    }
+  }
+  layer {
+    name: "relu3"
+    type: kCudnnActivation
+    activation_conf {
+      type: RELU
+    }
+    srclayers:"conv3"
+  }
+  layer {
+    name: "pool3"
+    type: kCudnnPool
+    srclayers: "relu3"
+    pooling_conf {
+      pool: AVG
+      kernel: 3
+      stride: 2
+    }
+  }
+  layer {
+    name: "ip1"
+    type: kInnerProduct
+    srclayers:"pool3"
+    innerproduct_conf {
+      num_output: 10
+    }
+    param {
+      name: "w4"
+      wd_scale:250
+      init {
+        type:kGaussian
+        std:0.01
+      }
+    }
+    param {
+      name: "b4"
+      lr_scale:2.0
+      wd_scale:0
+      init {
+        type: kConstant
+        value:0
+      }
+    }
+  }
+#  layer {
+#   name : "softmax"
+#   type: kSoftmax
+#   srclayers: "ip1"
+#  }
+#
+#  layer {
+#   name : "argsort"
+#   type: kArgSort
+#   srclayers: "softmax"
+#  }
+  layer{
+    name: "loss"
+    type: kSoftmaxLoss
+    srclayers:"ip1"
+    srclayers: "data"
+  }
+# uncomment "softmax", "argsort", "output" layer and comment "loss" layer
+# to extract features from argsort
+#  layer {
+#    name : "output"
+#    type: kCSVOutput
+#    srclayers: "argsort"
+#    store_conf {
+#      path: "examples/cifar10/out.csv"
+#    }
+#  }
+}
+cluster {
+  nworker_groups: 1
+  nserver_groups: 1
+  nworkers_per_group: 2
+  nworkers_per_procs: 2
+  workspace: "examples/cifar10"
+}

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/eb97097a/include/singa/utils/blob.h
----------------------------------------------------------------------
diff --git a/include/singa/utils/blob.h b/include/singa/utils/blob.h
index bb63208..97b59e0 100644
--- a/include/singa/utils/blob.h
+++ b/include/singa/utils/blob.h
@@ -1,4 +1,4 @@
-/************************************************************
+/**************************************************************
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/eb97097a/include/singa/utils/math_addr.h
----------------------------------------------------------------------
diff --git a/include/singa/utils/math_addr.h b/include/singa/utils/math_addr.h
index 7ab884e..d641251 100644
--- a/include/singa/utils/math_addr.h
+++ b/include/singa/utils/math_addr.h
@@ -21,18 +21,16 @@
 
 #ifndef SINGA_UTILS_MATH_ADDR_H_
 #define SINGA_UTILS_MATH_ADDR_H_
+
 extern "C" {
-    #include <cblas.h>
+#include <cblas.h>
 }
 #ifdef USE_GPU
 #include <cuda_runtime.h>
-#endif
-#include "singa/utils/singa_op.h"
-#ifdef USE_GPU
-#include "cuda_utils.h"
 #include <cublas_v2.h>
 #endif
 
+#include "singa/utils/singa_op.h"
 
 namespace singa {
 template<typename Dtype>
@@ -140,14 +138,14 @@ void cpu_softmax(int nb_rows, int nb_cols, const Dtype* A, Dtype* B) {
   for (int i = 0; i < nb_rows; i++) {
     const Dtype* dptr = A + i * nb_cols;
     Dtype mmax = dptr[0];
-    for (int x = 1; x < nb_cols; ++x )
+    for (int x = 1; x < nb_cols; ++x)
       if (mmax < dptr[x]) mmax = dptr[x];
     Dtype sum = 0.0f;
-    for(int x = 0; x < nb_cols; ++x ) {
-      dptr[x] = std::exp(dptr[x] - mmax );
+    for (int x = 0; x < nb_cols; ++x) {
+      dptr[x] = std::exp(dptr[x] - mmax);
       sum += dptr[x];
     }
-    for(int x = 0; x < nb_cols; ++x ) {
+    for (int x = 0; x < nb_cols; ++x) {
       dptr[x] /= sum;
     }
   }

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/eb97097a/include/singa/utils/math_blob.h
----------------------------------------------------------------------
diff --git a/include/singa/utils/math_blob.h b/include/singa/utils/math_blob.h
index 629839a..cf989fa 100644
--- a/include/singa/utils/math_blob.h
+++ b/include/singa/utils/math_blob.h
@@ -31,10 +31,11 @@
 #include "singa/utils/singleton.h"
 #include "singa/utils/context.h"
 
-
 namespace singa {
-enum XPU {cpu, gpu, any};
-
+/**
+ * \file math_blob.h is not thoroughly tested.
+ * Only GEMM(), MMDot(), MVSumRow() and MVAddRow() are used now.
+ */
 /************* BLAS level 1 *****************/
 /**
  * Scale each element of A with alpha, and put the result into B.
@@ -45,9 +46,9 @@ template<typename Dtype>
 void Scale(Dtype alpha, Blob<Dtype> * B) {
   auto context = Singleton<Context>::Instance();
   int device = context->device_id(std::this_thread::get_id());
-  if (device == -1)
+  if (device == -1) {
     cpu_scale(B->count(), alpha, B->mutable_cpu_data());
-  else {
+  } else {
 #ifdef USE_GPU
     gpu_scale(context->cublas_handle(device), B->count(), alpha,
         B->mutable_gpu_data());
@@ -146,24 +147,24 @@ void MVDot(const Blob<Dtype>& A, const Blob<Dtype>& B,
  * @param[in, out] C, matrix
  */
 template <typename Dtype>
-void GEMM( Dtype alpha, Dtype beta, const Blob<Dtype>& A,
-    const Blob<Dtype> & B, Blob<Dtype> * C) {
+void GEMM(Dtype alpha, Dtype beta, const Blob<Dtype>& A, const Blob<Dtype>& B,
+    Blob<Dtype> * C) {
   CHECK_GE(A.shape().size(), 2);
   CHECK_GE(B.shape().size(), 2);
   CHECK_GE(C->shape().size(), 2);
   int a1, a2, b1, b2, m, n;
   CHECK(!C->transpose());
   a1 = A.transpose() ? A.count() / A.shape(0) : A.shape(0);
-  a2 = A.transpose() ? A.shape(0) : A.count() / A.shape(0);
+  a2 = A.count() / a1;
   b1 = B.transpose() ? B.count() /B.shape(0) : B.shape(0);
-  b2 = B.transpose() ? B.shape(0) : B.count() / B.shape(0);
+  b2 = B.count() / b1;
   m = C->shape(0);
-  n = C->count() / C->shape(0);
+  n = C->count() / m;
   CHECK_EQ(a2, b1);
   CHECK_EQ(a1, m);
   CHECK_EQ(b2, n);
 
-  int k = A.transpose() ? A.shape(0) : A.shape(1);
+  int k = a2;
   bool TranA = A.transpose();
   bool TranB = B.transpose();
   auto context = Singleton<Context>::Instance();
@@ -173,8 +174,8 @@ void GEMM( Dtype alpha, Dtype beta, const Blob<Dtype>& A,
         C->mutable_cpu_data());
   } else {
 #ifdef USE_GPU
-    gpu_gemm(context->cublas_handle(device), A.gpu_data(), B.gpu_data(), m, n, k,
-        alpha, beta, TranA, TranB, C->mutable_gpu_data());
+    gpu_gemm(context->cublas_handle(device), A.gpu_data(), B.gpu_data(),
+        m, n, k, alpha, beta, TranA, TranB, C->mutable_gpu_data());
 #endif  // USE_GPU
   }
 }
@@ -216,7 +217,8 @@ Dtype VVDot(const Blob<Dtype> & A, const Blob<Dtype> & B) {
   } else {
 #ifdef USE_GPU
     // gpu part
-    res = gpu_dot(context->cublas_handle(device), A.gpu_data(), B.gpu_data(), n);
+    res = gpu_dot(context->cublas_handle(device), A.gpu_data(), B.gpu_data(),
+        n);
 #endif  // USE_GPU
   }
   return res;
@@ -244,8 +246,8 @@ void OuterProduct(const Blob<Dtype>& A, const Blob<Dtype>& B, Blob<Dtype> * C) {
         C->mutable_cpu_data());
   } else {
 #ifdef USE_GPU
-    gpu_gemm(context->cublas_handle(device), A.gpu_data(), B.gpu_data(), m, n, 1, 1, 0,
-        false, false, C->mutable_gpu_data());
+    gpu_gemm(context->cublas_handle(device), A.gpu_data(), B.gpu_data(),
+        m, n, 1, 1, 0, false, false, C->mutable_gpu_data());
 #endif  // USE_GPU
   }
 }
@@ -389,7 +391,7 @@ void Sub(const Blob<Dtype> & A, const Blob<Dtype> & B,
 
 /**
  * C = A * B, implemented using
- * Map(XPU, const Blob<Dtype>&, const Blob<Dtype>&, Blob<Dtype>*).
+ * Map(const Blob<Dtype>&, const Blob<Dtype>&, Blob<Dtype>*).
  */
 template<typename Dtype>
 void Mult(const Blob<Dtype> & A, const Blob<Dtype> & B,
@@ -400,7 +402,7 @@ void Mult(const Blob<Dtype> & A, const Blob<Dtype> & B,
 
 /**
  * C = A / B, implemented using
- * Map(XPU, const Blob<Dtype>&, const Blob<Dtype>&, Blob<Dtype>*).
+ * Map(const Blob<Dtype>&, const Blob<Dtype>&, Blob<Dtype>*).
  */
 template<typename Dtype>
 void Div(const Blob<Dtype> & A, const Blob<Dtype> & B,
@@ -561,7 +563,7 @@ void MVSumCol(Dtype alpha, Dtype beta, const Blob<Dtype> & A, Blob<Dtype> * B) {
         A.transpose(), false, B->mutable_cpu_data());
   } else {
 #ifdef USE_GPU
-    singa_gpu_sum_by_col(A.gpu_data(), B->gpu_data(), m, n, n);
+    singa_gpu_sum_by_col(A.gpu_data(), B->mutable_gpu_data(), m, n, n);
     // gpu part (TODO check transpose case)
 #endif  // USE_GPU
   }
@@ -586,7 +588,7 @@ void MVSumRow(Dtype alpha, Dtype beta, const Blob<Dtype> & A, Blob<Dtype> * B) {
       false, B->mutable_cpu_data());
   } else {
 #ifdef USE_GPU
-    singa_gpu_sum_by_row(A.gpu_data(), B->gpu_data(), m, n, n);
+    singa_gpu_sum_by_row(A.gpu_data(), B->mutable_gpu_data(), m, n, n);
     // gpu part (TODO check transpose case)
 #endif  // USE_GPU
   }
@@ -645,7 +647,7 @@ Dtype Asum(const Blob<Dtype>& A) {
     ret = cpu_asum(A.count(), A.cpu_data(), 1) / A.count();
   } else {
 #ifdef USE_GPU
-    ret = gpu_asum(context->cublas_handle(device), A.count(), A.cpu_data(), 1)
+    ret = gpu_asum(context->cublas_handle(device), A.count(), A.gpu_data(), 1)
       / A.count();
 #endif
   }
@@ -665,7 +667,7 @@ void SampleUniform(Dtype low, Dtype high, Blob<Dtype>* A) {
   } else {
 #ifdef USE_GPU
     gpu_sample_uniform(context->curand_generator(thread), A->count(), low, high,
-		A->mutable_gpu_data());
+        A->mutable_gpu_data());
 #endif
   }
 }

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/eb97097a/include/singa/utils/math_kernel.h
----------------------------------------------------------------------
diff --git a/include/singa/utils/math_kernel.h b/include/singa/utils/math_kernel.h
index 8cfa562..59bc3bf 100644
--- a/include/singa/utils/math_kernel.h
+++ b/include/singa/utils/math_kernel.h
@@ -24,11 +24,11 @@
 namespace singa {
 
 extern "C" {
-  void singa_gpu_softmax_loss(const float *prob, const int *label,
-	float *loss, int n, int dim);
+  void singa_gpu_softmaxloss_forward(int n, int dim, const float *prob,
+      const int *label, float *loss);
 
-  void singa_gpu_softmax_gradient(float *grad, const int *label ,
-    int n, int dim, float scale);
+  void singa_gpu_softmaxloss_backward(int n, int dim, float scale,
+      const int *label, float *grad);
 
   void singa_gpu_sum_vec(float *data, float *sum , int n);
 

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/eb97097a/src/neuralnet/input_layer/deprecated.cc
----------------------------------------------------------------------
diff --git a/src/neuralnet/input_layer/deprecated.cc b/src/neuralnet/input_layer/deprecated.cc
index 1760d4b..0f98279 100644
--- a/src/neuralnet/input_layer/deprecated.cc
+++ b/src/neuralnet/input_layer/deprecated.cc
@@ -20,6 +20,8 @@
 *************************************************************/
 
 #include "singa/neuralnet/input_layer.h"
+#include "singa/utils/context.h"
+#include "singa/utils/singleton.h"
 #include "mshadow/tensor.h"
 namespace singa {
 
@@ -57,7 +59,10 @@ void ShardDataLayer::ComputeFeature(int flag, const vector<Layer*>& srclayers) {
     shard_ = new DataShard(layer_conf_.sharddata_conf().path(),
                            DataShard::kRead);
   if (random_skip_) {
-    int nskip = rand() % random_skip_;
+  std::uniform_int_distribution<int> distribution(0, random_skip_);
+  auto generator =
+    Singleton<Context>::Instance()->generator(std::this_thread::get_id());
+    int nskip = distribution(generator);
     LOG(INFO) << "Random Skip " << nskip << " records, there are "
               << shard_->Count() << " records in total";
     string key;
@@ -123,25 +128,29 @@ void LMDBDataLayer::ComputeFeature(int flag, const vector<Layer*>& srclayers) {
   if (mdb_cursor_ == nullptr)
     OpenLMDB(layer_conf_.lmdbdata_conf().path());
   if (random_skip_) {
-    int nskip = rand() % random_skip_;
+    std::uniform_int_distribution<int> distribution(0, random_skip_);
+    auto generator =
+      Singleton<Context>::Instance()->generator(std::this_thread::get_id());
+    int nskip = distribution(generator);
+
     int n = 0;
     CHECK_EQ(mdb_cursor_get(mdb_cursor_, &mdb_key_,
-             &mdb_value_, MDB_FIRST), MDB_SUCCESS);
+          &mdb_value_, MDB_FIRST), MDB_SUCCESS);
     while (mdb_cursor_get(mdb_cursor_, &mdb_key_,
-           &mdb_value_, MDB_NEXT) == MDB_SUCCESS)
+          &mdb_value_, MDB_NEXT) == MDB_SUCCESS)
       n++;
     LOG(INFO) << "Random Skip " << nskip << " records of total "
-              << n << "records";
+      << n << "records";
     // We have reached the end. Restart from the first.
     CHECK_EQ(mdb_cursor_get(mdb_cursor_, &mdb_key_,
-             &mdb_value_, MDB_FIRST), MDB_SUCCESS);
+          &mdb_value_, MDB_FIRST), MDB_SUCCESS);
     for (int i = 0; i < nskip; i++) {
       if (mdb_cursor_get(mdb_cursor_, &mdb_key_,
-          &mdb_value_, MDB_NEXT) != MDB_SUCCESS) {
+            &mdb_value_, MDB_NEXT) != MDB_SUCCESS) {
         // We have reached the end. Restart from the first.
         DLOG(INFO) << "Restarting data prefetching from start.";
         CHECK_EQ(mdb_cursor_get(mdb_cursor_, &mdb_key_,
-                 &mdb_value_, MDB_FIRST), MDB_SUCCESS);
+              &mdb_value_, MDB_FIRST), MDB_SUCCESS);
       }
     }
     random_skip_ = 0;
@@ -254,10 +263,16 @@ void RGBImageLayer::ParseRecords(int flag, const vector<Record>& records,
     AllocSpace(croped_image);
   int rid = 0;
   const float* meandptr = mean_.cpu_data();
+
+  std::uniform_int_distribution<int> distribution(0, r.shape(0) - cropsize_);
+  auto generator =
+    Singleton<Context>::Instance()->generator(std::this_thread::get_id());
   for (const Record& record : records) {
     auto image = images[rid];
     bool do_crop = cropsize_> 0 && ((flag & kTrain) == kTrain);
-    bool do_mirror = mirror_ && rand() % 2 && ((flag & kTrain) == kTrain);
+    bool do_mirror = mirror_
+                    && (distribution(generator) % 2)
+                    && ((flag & kTrain) == kTrain);
     float* dptr = nullptr;
     if (do_crop || do_mirror)
       dptr = raw_image.dptr;
@@ -274,8 +289,8 @@ void RGBImageLayer::ParseRecords(int flag, const vector<Record>& records,
     for (int i = 0; i < mean_.count(); i++)
       dptr[i] -= meandptr[i];
     if (do_crop) {
-      int hoff = rand() % (r.shape(1) - cropsize_);
-      int woff = rand() % (r.shape(2) - cropsize_);
+      int hoff = distribution(generator);
+      int woff = distribution(generator);
       Shape<2> cropshape = Shape2(cropsize_, cropsize_);
       if (do_mirror) {
         croped_image = expr::crop(raw_image, cropshape, hoff, woff);
@@ -355,6 +370,4 @@ void LabelLayer::ParseRecords(int flag, const vector<Record>& records,
   }
   CHECK_EQ(rid, blob->shape()[0]);
 }
-
-
-} // namespace singa
+}  // namespace singa

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/eb97097a/src/neuralnet/input_layer/image_preprocess.cc
----------------------------------------------------------------------
diff --git a/src/neuralnet/input_layer/image_preprocess.cc b/src/neuralnet/input_layer/image_preprocess.cc
index 576b096..251f456 100644
--- a/src/neuralnet/input_layer/image_preprocess.cc
+++ b/src/neuralnet/input_layer/image_preprocess.cc
@@ -21,6 +21,9 @@
 
 #include "singa/neuralnet/input_layer.h"
 #include "singa/utils/image_transform.h"
+#include "singa/utils/context.h"
+#include "singa/utils/singleton.h"
+
 namespace singa {
 
 using std::vector;
@@ -52,13 +55,20 @@ void ImagePreprocessLayer::ComputeFeature(int flag,
   float* dptr = data_.mutable_cpu_data();
   int srcimage_size = channel * height * width;
   int image_size = channel * data_.shape()[2] * data_.shape()[3];
+  std::uniform_int_distribution<int> rand1(0, srcdata.shape()[1] - cropsize_);
+  std::uniform_int_distribution<int> rand2(0, srcdata.shape()[2] - cropsize_);
+  auto generator =
+    Singleton<Context>::Instance()->generator(std::this_thread::get_id());
+
   for (int k = 0; k < batchsize; k++) {
     int h_offset = 0, w_offset = 0;
     if (cropsize_> 0 && ((flag & kTrain) == kTrain)) {
-      h_offset = rand() % (srcdata.shape()[1] - cropsize_);
-      w_offset = rand() % (srcdata.shape()[2] - cropsize_);
+      h_offset = rand1(generator);
+      w_offset = rand2(generator);
     }
-    bool do_mirror = mirror_ && rand() % 2 && ((flag & kTrain) == kTrain);
+    bool do_mirror = mirror_
+                    && (rand1(generator) % 2)
+                    && ((flag & kTrain) == kTrain);
     ImageTransform(srcdptr + k * srcimage_size, nullptr, do_mirror, cropsize_,
         cropsize_, h_offset, w_offset, srcdata.shape()[1], height, width,
         scale_, dptr + image_size);

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/eb97097a/src/neuralnet/input_layer/store.cc
----------------------------------------------------------------------
diff --git a/src/neuralnet/input_layer/store.cc b/src/neuralnet/input_layer/store.cc
index b1b296e..dbb1874 100644
--- a/src/neuralnet/input_layer/store.cc
+++ b/src/neuralnet/input_layer/store.cc
@@ -20,6 +20,9 @@
 *************************************************************/
 
 #include "singa/neuralnet/input_layer.h"
+#include "singa/utils/context.h"
+#include "singa/utils/singleton.h"
+
 namespace singa {
 
 StoreInputLayer::~StoreInputLayer() {
@@ -35,8 +38,13 @@ void StoreInputLayer::Setup(const LayerProto& conf,
   if (conf.partition_dim() == 0) {
     batchsize_ /= conf.num_partitions();
   }
-  if (conf.store_conf().random_skip() > 0)
-    random_skip_ = rand() % conf.store_conf().random_skip();
+  if (conf.store_conf().random_skip() > 0) {
+    std::uniform_int_distribution<int>
+      distribution(0, conf.store_conf().random_skip());
+    auto generator =
+      Singleton<Context>::Instance()->generator(std::this_thread::get_id());
+    random_skip_ = distribution(generator);
+  }
 }
 
 void StoreInputLayer::ComputeFeature(int flag,

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/eb97097a/src/neuralnet/loss_layer/cudnn_softmaxloss.cc
----------------------------------------------------------------------
diff --git a/src/neuralnet/loss_layer/cudnn_softmaxloss.cc b/src/neuralnet/loss_layer/cudnn_softmaxloss.cc
new file mode 100644
index 0000000..b18a751
--- /dev/null
+++ b/src/neuralnet/loss_layer/cudnn_softmaxloss.cc
@@ -0,0 +1,80 @@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+
+#include "singa/neuralnet/loss_layer.h"
+#include "singa/utils/blob.h"
+#include "singa/utils/math_kernel.h"
+
+namespace singa {
+void CudnnSoftmaxLossLayer::Setup(const LayerProto& conf,
+    const vector<Layer*>& srclayers) {
+  CudnnSoftmaxLayer::Setup(conf, vector<Layer*> {srclayers.at(0)});
+  topk_ = conf.softmaxloss_conf().topk();
+  loss_ = accuracy_ = 0.0f;
+  counter_ = 0;
+}
+void CudnnSoftmaxLossLayer::ComputeFeature(int flag,
+    const vector<Layer*>& srclayers) {
+  CudnnSoftmaxLayer::ComputeFeature(flag, srclayers);
+  // compute loss
+  float *prob = data_.mutable_gpu_data();
+  Blob<int> label(batchsize_);
+  int *labelptr = label.mutable_cpu_data();
+
+  // aux_data: vector<int>, convert vector to int array.
+  for (int i = 0; i < batchsize_; ++i) {
+    labelptr[i] = srclayers[1]->aux_data(this)[i];
+  }
+
+  Blob<float> loss(batchsize_);
+
+  singa_gpu_softmaxloss_forward(batchsize_, dim_, prob, label.gpu_data(),
+      loss.mutable_gpu_data());
+
+  counter_++;
+}
+
+void CudnnSoftmaxLossLayer::ComputeGradient(int flag,
+    const vector<Layer*>& srclayers) {
+  Blob<float>* gsrcblob = srclayers[0]->mutable_grad(this);
+  gsrcblob->CopyFrom(data_);
+  float* gsrcptr = gsrcblob->mutable_gpu_data();
+
+  Blob<int> label(batchsize_);
+  int *labelptr = label.mutable_cpu_data();
+
+  // aux_data: vector<int>, convert vector to int array.
+  for (int i = 0; i < batchsize_; ++i) {
+    labelptr[i] = srclayers[1]->aux_data(this)[i];
+  }
+
+  singa_gpu_softmaxloss_backward(batchsize_, dim_, scale_, label.gpu_data(),
+      gsrcptr);
+}
+
+const std::string CudnnSoftmaxLossLayer::ToString(bool debug, int flag) {
+  string disp = "Loss = " + std::to_string(loss_ / counter_)
+    + ", accuracy = " + std::to_string(accuracy_ / counter_);
+  counter_ = 0;
+  loss_ = accuracy_ = 0;
+  return disp;
+}
+}  // namespace singa

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/eb97097a/src/neuralnet/loss_layer/cudnn_softmaxloss.cu
----------------------------------------------------------------------
diff --git a/src/neuralnet/loss_layer/cudnn_softmaxloss.cu b/src/neuralnet/loss_layer/cudnn_softmaxloss.cu
deleted file mode 100644
index 53420d3..0000000
--- a/src/neuralnet/loss_layer/cudnn_softmaxloss.cu
+++ /dev/null
@@ -1,82 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#include "singa/neuralnet/loss_layer.h"
-#include "singa/utils/blob.h"
-#include "singa/utils/math_kernel.h"
-
-namespace singa {
-void CudnnSoftmaxLossLayer::Setup(const LayerProto& conf,
-    const vector<Layer*>& srclayers) {
-  CudnnSoftmaxLayer::Setup(conf, srclayers);
-  topk_ = conf.softmaxloss_conf().topk();
-  loss_ = accuracy_ = 0.0f;
-  counter_ = 0;
-}
-void CudnnSoftmaxLossLayer::ComputeFeature(int flag,
-    const vector<Layer*>& srclayers) {
-  CudnnSoftmaxLayer::ComputeFeature(flag, srclayers);
-  // compute loss
-  float *prob = data_.mutable_gpu_data();
-  Blob<int> label(batchsize_);
-  int *labelptr = label.mutable_cpu_data();
-
-  //aux_data: vector<int>, convert vector to int array.
-  for(int i = 0; i < batchsize_; ++i) {
-	labelptr[i] = srclayers[1]->aux_data(this)[i];
-  }
-
-  Blob<float> loss(batchsize_);
-
-  singa_gpu_softmax_loss(prob , label.mutable_gpu_data() , loss.mutable_gpu_data(),
-	  batchsize_, dim_);
-
-  counter_++;
-  // TODO add loss and accuracy
-}
-
-void CudnnSoftmaxLossLayer::ComputeGradient(int flag,
-    const vector<Layer*>& srclayers) {
- // compute gradient
-  Blob<float>* gsrcblob = srclayers[0]->mutable_grad(this);
-  gsrcblob->CopyFrom(data_);
-  float* gsrcptr = gsrcblob->mutable_gpu_data();
-
-  Blob<int> label(batchsize_);
-  int *labelptr = label.mutable_cpu_data();
-
-  //aux_data: vector<int>, convert vector to int array.
-  for(int i = 0; i < batchsize_; ++i) { 
-	labelptr[i] = srclayers[1]->aux_data(this)[i];
-  }
-
-  singa_gpu_softmax_gradient(gsrcptr, label.mutable_gpu_data(), batchsize_, dim_, scale_);
-
-}
-
-const std::string CudnnSoftmaxLossLayer::ToString(bool debug, int flag) {
-  string disp = "Loss = " + std::to_string(loss_ / counter_)
-    + ", accuracy = " + std::to_string(accuracy_ / counter_);
-  counter_ = 0;
-  loss_ = accuracy_ = 0;
-  return disp;
-}
-}  // namespace singa

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/eb97097a/src/neuralnet/loss_layer/euclidean.cc
----------------------------------------------------------------------
diff --git a/src/neuralnet/loss_layer/euclidean.cc b/src/neuralnet/loss_layer/euclidean.cc
index 71e5bae..b6aa12a 100644
--- a/src/neuralnet/loss_layer/euclidean.cc
+++ b/src/neuralnet/loss_layer/euclidean.cc
@@ -19,9 +19,8 @@
 *
 *************************************************************/
 
-#include "singa/neuralnet/loss_layer.h"
-
 #include <glog/logging.h>
+#include "singa/neuralnet/loss_layer.h"
 #include "mshadow/tensor.h"
 
 namespace singa {

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/eb97097a/src/neuralnet/loss_layer/softmax.cc
----------------------------------------------------------------------
diff --git a/src/neuralnet/loss_layer/softmax.cc b/src/neuralnet/loss_layer/softmax.cc
index 8c100ef..0f3d5bf 100644
--- a/src/neuralnet/loss_layer/softmax.cc
+++ b/src/neuralnet/loss_layer/softmax.cc
@@ -19,10 +19,10 @@
 *
 *************************************************************/
 
-#include "singa/neuralnet/loss_layer.h"
 
-#include <algorithm>
 #include <glog/logging.h>
+#include <algorithm>
+#include "singa/neuralnet/loss_layer.h"
 #include "mshadow/tensor.h"
 
 namespace singa {

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/eb97097a/src/neuralnet/neuron_layer/activation.cc
----------------------------------------------------------------------
diff --git a/src/neuralnet/neuron_layer/activation.cc b/src/neuralnet/neuron_layer/activation.cc
index 6f62646..492e453 100644
--- a/src/neuralnet/neuron_layer/activation.cc
+++ b/src/neuralnet/neuron_layer/activation.cc
@@ -30,8 +30,8 @@ void ActivationLayer::Setup(const LayerProto& conf,
   data_.ReshapeLike(srclayers[0]->data(this));
   grad_.ReshapeLike(*(srclayers[0]->mutable_grad(this)));
 }
-void ActivationLayer::ComputeFeature(int flag, const vector<Layer*>& srclayers)
-{
+void
+ActivationLayer::ComputeFeature(int flag, const vector<Layer*>& srclayers) {
   switch (layer_conf_.activation_conf().type()) {
     case RELU:
       Map<op::Relu<float>, float>(srclayers[0]->data(this), &data_);
@@ -52,8 +52,8 @@ void ActivationLayer::ComputeFeature(int flag, const vector<Layer*>& srclayers)
         layer_conf_.activation_conf().type();
   }
 }
-void ActivationLayer::ComputeGradient(int flag, const vector<Layer*>& srclayers)
-{
+void
+ActivationLayer::ComputeGradient(int flag, const vector<Layer*>& srclayers) {
   Blob<float> * gsrc = srclayers[0]->mutable_grad(this);
   switch (layer_conf_.activation_conf().type()) {
     case RELU:

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/eb97097a/src/neuralnet/neuron_layer/convolution.cc
----------------------------------------------------------------------
diff --git a/src/neuralnet/neuron_layer/convolution.cc b/src/neuralnet/neuron_layer/convolution.cc
index edfabb6..bde2f08 100644
--- a/src/neuralnet/neuron_layer/convolution.cc
+++ b/src/neuralnet/neuron_layer/convolution.cc
@@ -19,9 +19,8 @@
 *
 *************************************************************/
 
-#include "singa/neuralnet/neuron_layer.h"
-
 #include <glog/logging.h>
+#include "singa/neuralnet/neuron_layer.h"
 #include "singa/utils/singleton.h"
 
 namespace singa {

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/eb97097a/src/neuralnet/neuron_layer/cudnn_activation.cc
----------------------------------------------------------------------
diff --git a/src/neuralnet/neuron_layer/cudnn_activation.cc b/src/neuralnet/neuron_layer/cudnn_activation.cc
index e8a7b41..9088d84 100644
--- a/src/neuralnet/neuron_layer/cudnn_activation.cc
+++ b/src/neuralnet/neuron_layer/cudnn_activation.cc
@@ -33,14 +33,13 @@ void CudnnActivationLayer::InitCudnn() {
     mode_ = CUDNN_ACTIVATION_TANH;
   else if (layer_conf_.activation_conf().type() == RELU)
     mode_ = CUDNN_ACTIVATION_RELU;
-  else {
+  else
     LOG(FATAL) << "Unkown activation: " << layer_conf_.activation_conf().type();
-  }
 
   const auto& shape = data_.shape();
   CHECK_GT(shape.size(), 0);
   // size of each dimension
-  int* sdim= new int[shape.size()];
+  int* sdim = new int[shape.size()];
   int* stride = new int[shape.size()];
   stride[shape.size() -1] = 1;
   int i = shape.size() - 1;
@@ -97,4 +96,4 @@ void CudnnActivationLayer::ComputeGradient(int flag,
         src_desc_,
         srclayers[0]->mutable_grad(this)->mutable_gpu_data()));
 }
-}  // namespace singa
+}   // namespace singa

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/eb97097a/src/neuralnet/neuron_layer/cudnn_convolution.cc
----------------------------------------------------------------------
diff --git a/src/neuralnet/neuron_layer/cudnn_convolution.cc b/src/neuralnet/neuron_layer/cudnn_convolution.cc
index e08b57a..2cf376c 100644
--- a/src/neuralnet/neuron_layer/cudnn_convolution.cc
+++ b/src/neuralnet/neuron_layer/cudnn_convolution.cc
@@ -34,7 +34,8 @@ CudnnConvLayer::~CudnnConvLayer() {
 void CudnnConvLayer::InitCudnn() {
   CudnnLayer::InitCudnn();
   // convert MB to bytes
-  workspace_byte_limit_ = layer_conf_.convolution_conf().workspace_byte_limit() << 20;
+  workspace_byte_limit_
+    = layer_conf_.convolution_conf().workspace_byte_limit() << 20;
 
   CHECK_CUDNN(cudnnCreateTensorDescriptor(&bias_desc_));
   CHECK_CUDNN(cudnnCreateFilterDescriptor(&filter_desc_));
@@ -162,8 +163,8 @@ void CudnnConvLayer::ComputeFeature(int flag, const vector<Layer*>& srclayers) {
   }
 }
 
-void CudnnConvLayer::ComputeGradient(int flag, const vector<Layer*>& srclayers)
-{
+void
+CudnnConvLayer::ComputeGradient(int flag, const vector<Layer*>& srclayers) {
   float alpha = 1.f, beta = 0.f;
   Blob<float> workspace(vector<int>{static_cast<int>(workspace_count_)});
   if (bias_) {
@@ -204,4 +205,4 @@ void CudnnConvLayer::ComputeGradient(int flag, const vector<Layer*>& srclayers)
           srclayers[0]->mutable_grad(this)->mutable_gpu_data()));
   }
 }
-}  /* singa */
+}  // namespace singa

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/eb97097a/src/neuralnet/neuron_layer/cudnn_lrn.cc
----------------------------------------------------------------------
diff --git a/src/neuralnet/neuron_layer/cudnn_lrn.cc b/src/neuralnet/neuron_layer/cudnn_lrn.cc
index 4a2b695..f976b16 100644
--- a/src/neuralnet/neuron_layer/cudnn_lrn.cc
+++ b/src/neuralnet/neuron_layer/cudnn_lrn.cc
@@ -57,31 +57,33 @@ void CudnnLRNLayer::InitCudnn() {
 void CudnnLRNLayer::ComputeFeature(int flag, const vector<Layer*>& srclayers) {
   if (!has_init_cudnn_)
     InitCudnn();
+  float alpha = 1.0f, beta = 0.0f;
   CHECK_CUDNN(cudnnLRNCrossChannelForward(handle_,
       norm_desc_,
       mode_,
-      &alpha_,
+      &alpha,
       src_desc_,
       srclayers[0]->data(this).gpu_data(),
-      &beta_,
+      &beta,
       my_desc_,
       data_.mutable_gpu_data()));
 }
 void CudnnLRNLayer::ComputeGradient(int flag, const vector<Layer*>& srclayers) {
+  float alpha = 1.0f, beta = 0.0f;
   CHECK_CUDNN(cudnnLRNCrossChannelBackward(handle_,
         norm_desc_,
         mode_,
-        &alpha_,
-        my_desc_, // ???
+        &alpha,
+        my_desc_,
         data_.gpu_data(),
         my_desc_,
         grad_.gpu_data(),
         src_desc_,
         srclayers[0]->data(this).gpu_data(),
-        &beta_,
+        &beta,
         src_desc_,
         srclayers[0]->mutable_grad(this)->mutable_gpu_data()));
 }
 
 
-} /* singa */
+}  // namespace singa

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/eb97097a/src/neuralnet/neuron_layer/cudnn_pooling.cc
----------------------------------------------------------------------
diff --git a/src/neuralnet/neuron_layer/cudnn_pooling.cc b/src/neuralnet/neuron_layer/cudnn_pooling.cc
index ffdfb3b..217df51 100644
--- a/src/neuralnet/neuron_layer/cudnn_pooling.cc
+++ b/src/neuralnet/neuron_layer/cudnn_pooling.cc
@@ -57,7 +57,6 @@ void CudnnPoolLayer::InitCudnn() {
         pad_x_,
         stride_y_,
         stride_x_));
-
 }
 
 void CudnnPoolLayer::ComputeFeature(int flag, const vector<Layer*>& srclayers) {
@@ -76,8 +75,8 @@ void CudnnPoolLayer::ComputeFeature(int flag, const vector<Layer*>& srclayers) {
         data_.mutable_gpu_data()));
 }
 
-void CudnnPoolLayer::ComputeGradient(int flag, const vector<Layer*>& srclayers)
-{
+void
+CudnnPoolLayer::ComputeGradient(int flag, const vector<Layer*>& srclayers) {
   float alpha = 1.0f, beta = 0.0f;
   CHECK_CUDNN(cudnnPoolingBackward(handle_,
         pool_desc_,
@@ -92,5 +91,5 @@ void CudnnPoolLayer::ComputeGradient(int flag, const vector<Layer*>& srclayers)
         src_desc_,
         srclayers[0]->mutable_grad(this)->mutable_gpu_data()));
 }
-}  /* singa */
+}  // namespace singa
 

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/eb97097a/src/neuralnet/neuron_layer/cudnn_softmax.cc
----------------------------------------------------------------------
diff --git a/src/neuralnet/neuron_layer/cudnn_softmax.cc b/src/neuralnet/neuron_layer/cudnn_softmax.cc
index 7fade3e..21d17c4 100644
--- a/src/neuralnet/neuron_layer/cudnn_softmax.cc
+++ b/src/neuralnet/neuron_layer/cudnn_softmax.cc
@@ -72,4 +72,4 @@ void CudnnSoftmaxLayer::ComputeGradient(int flag,
         src_desc_,
         srclayers[0]->mutable_grad(this)->mutable_gpu_data()));
 }
-}  /* singa */
+}  // namespace singa

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/eb97097a/src/neuralnet/neuron_layer/dropout.cc
----------------------------------------------------------------------
diff --git a/src/neuralnet/neuron_layer/dropout.cc b/src/neuralnet/neuron_layer/dropout.cc
index 030a663..6e0db76 100644
--- a/src/neuralnet/neuron_layer/dropout.cc
+++ b/src/neuralnet/neuron_layer/dropout.cc
@@ -19,9 +19,8 @@
 *
 *************************************************************/
 
-#include "singa/neuralnet/neuron_layer.h"
-
 #include <glog/logging.h>
+#include "singa/neuralnet/neuron_layer.h"
 #include "singa/utils/singleton.h"
 #include "singa/utils/singa_op.h"
 #include "singa/utils/math_blob.h"

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/eb97097a/src/neuralnet/neuron_layer/inner_product.cc
----------------------------------------------------------------------
diff --git a/src/neuralnet/neuron_layer/inner_product.cc b/src/neuralnet/neuron_layer/inner_product.cc
index 6b5ec36..7e6318d 100644
--- a/src/neuralnet/neuron_layer/inner_product.cc
+++ b/src/neuralnet/neuron_layer/inner_product.cc
@@ -19,9 +19,8 @@
 *
 *************************************************************/
 
-#include "singa/neuralnet/neuron_layer.h"
-
 #include <glog/logging.h>
+#include "singa/neuralnet/neuron_layer.h"
 #include "singa/utils/singleton.h"
 #include "singa/utils/math_blob.h"
 

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/eb97097a/src/neuralnet/neuron_layer/lrn.cc
----------------------------------------------------------------------
diff --git a/src/neuralnet/neuron_layer/lrn.cc b/src/neuralnet/neuron_layer/lrn.cc
index 178b2bf..9a5ba37 100644
--- a/src/neuralnet/neuron_layer/lrn.cc
+++ b/src/neuralnet/neuron_layer/lrn.cc
@@ -19,11 +19,11 @@
 *
 *************************************************************/
 
-#include "singa/neuralnet/neuron_layer.h"
-
 #include <glog/logging.h>
+#include "singa/neuralnet/neuron_layer.h"
 #include "singa/utils/singleton.h"
 
+
 namespace singa {
 
 using std::vector;

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/eb97097a/src/neuralnet/neuron_layer/pooling.cc
----------------------------------------------------------------------
diff --git a/src/neuralnet/neuron_layer/pooling.cc b/src/neuralnet/neuron_layer/pooling.cc
index 5b408ba..4eda2e4 100644
--- a/src/neuralnet/neuron_layer/pooling.cc
+++ b/src/neuralnet/neuron_layer/pooling.cc
@@ -19,11 +19,11 @@
 *
 *************************************************************/
 
-#include "singa/neuralnet/neuron_layer.h"
-
 #include <glog/logging.h>
+#include "singa/neuralnet/neuron_layer.h"
 #include "singa/utils/singleton.h"
 
+
 namespace singa {
 
 using std::vector;

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/eb97097a/src/neuralnet/neuron_layer/rbm.cc
----------------------------------------------------------------------
diff --git a/src/neuralnet/neuron_layer/rbm.cc b/src/neuralnet/neuron_layer/rbm.cc
index 53a1bd9..fadd1df 100644
--- a/src/neuralnet/neuron_layer/rbm.cc
+++ b/src/neuralnet/neuron_layer/rbm.cc
@@ -19,9 +19,8 @@
 *
 *************************************************************/
 
-#include "singa/neuralnet/neuron_layer.h"
-
 #include <glog/logging.h>
+#include "singa/neuralnet/neuron_layer.h"
 #include "singa/utils/singleton.h"
 
 namespace singa {

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/eb97097a/src/neuralnet/neuron_layer/relu.cc
----------------------------------------------------------------------
diff --git a/src/neuralnet/neuron_layer/relu.cc b/src/neuralnet/neuron_layer/relu.cc
index 1e030a0..5d4d954 100644
--- a/src/neuralnet/neuron_layer/relu.cc
+++ b/src/neuralnet/neuron_layer/relu.cc
@@ -19,11 +19,11 @@
 *
 *************************************************************/
 
-#include "singa/neuralnet/neuron_layer.h"
-
 #include <glog/logging.h>
+#include "singa/neuralnet/neuron_layer.h"
 #include "singa/utils/singleton.h"
 
+
 namespace singa {
 
 using std::vector;

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/eb97097a/src/neuralnet/neuron_layer/sigmoid.cc
----------------------------------------------------------------------
diff --git a/src/neuralnet/neuron_layer/sigmoid.cc b/src/neuralnet/neuron_layer/sigmoid.cc
index c449b36..9348011 100644
--- a/src/neuralnet/neuron_layer/sigmoid.cc
+++ b/src/neuralnet/neuron_layer/sigmoid.cc
@@ -19,11 +19,11 @@
 *
 *************************************************************/
 
-#include "singa/neuralnet/neuron_layer.h"
-
 #include <glog/logging.h>
+#include "singa/neuralnet/neuron_layer.h"
 #include "singa/utils/singleton.h"
 
+
 namespace singa {
 
 using std::vector;

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/eb97097a/src/neuralnet/output_layer/argsort.cc
----------------------------------------------------------------------
diff --git a/src/neuralnet/output_layer/argsort.cc b/src/neuralnet/output_layer/argsort.cc
index 63969b3..869bc65 100644
--- a/src/neuralnet/output_layer/argsort.cc
+++ b/src/neuralnet/output_layer/argsort.cc
@@ -19,8 +19,8 @@
 *
 *************************************************************/
 
-#include "singa/neuralnet/output_layer.h"
 #include <algorithm>
+#include "singa/neuralnet/output_layer.h"
 
 namespace singa {
 

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/eb97097a/src/utils/math_kernel.cu
----------------------------------------------------------------------
diff --git a/src/utils/math_kernel.cu b/src/utils/math_kernel.cu
index a4cd513..9c54520 100644
--- a/src/utils/math_kernel.cu
+++ b/src/utils/math_kernel.cu
@@ -21,7 +21,7 @@
 #include <cmath>
 #include <algorithm>
 #include "singa/utils/math_kernel.h"
-#include "mshadow/tensor.h"  //FLT_MIN?
+#include "mshadow/tensor.h"  // FLT_MIN?
 
 #define CU2DBLOCK_X 32
 #define CU2DBLOCK_Y 32
@@ -32,25 +32,25 @@
 // Cuda Kernel Functions
 
 __global__
-void kernel_softmax_loss(const float *prob, const int *label ,
-	float *loss, int n, int dim) {
+void kernel_softmax_loss(const float *prob, const int *label , float *loss,
+    int n, int dim) {
   int index = blockIdx.x * blockDim.x + threadIdx.x;
   int num_threads = blockDim.x * gridDim.x;
   for (; index < n; index += num_threads) {
-	const int label_value = static_cast<int>(label[index]);
-	float prob_of_truth = prob[index * dim + label_value];
-	loss[index] -= log(max(prob_of_truth, FLT_MIN));
+    const int label_value = static_cast<int>(label[index]);
+    float prob_of_truth = prob[index * dim + label_value];
+    loss[index] -= log(max(prob_of_truth, FLT_MIN));
   }
 }
 
 __global__
 void kernel_softmax_gradient(float *grad, const int *label ,
-	int n, int dim, float scale) {
+    int n, int dim, float scale) {
   int index = blockIdx.x * blockDim.x + threadIdx.x;
   int num_threads = blockDim.x * gridDim.x;
   for (; index < n; index += num_threads) {
-	int pos = index * dim + static_cast<int>(label[index]);
-	grad[pos] = (grad[pos] - 1.0f) * scale / (1.0 * n);
+    int pos = index * dim + static_cast<int>(label[index]);
+    grad[pos] = (grad[pos] - 1.0f) * scale / (1.0 * n);
   }
 }
 
@@ -128,9 +128,9 @@ void kernel_sum_by_row(const float *src_mat_data,
   int num_threads = blockDim.x * gridDim.x;
   for (; index < rows; index += num_threads) {
     dst_vec_data[index] = 0.0f;
-	for (int k = 0; k < cols; k++) {
-	  dst_vec_data[index] += src_mat_data[index * stride + k];
-	}
+    for (int k = 0; k < cols; k++) {
+      dst_vec_data[index] += src_mat_data[index * stride + k];
+    }
   }
 }
 
@@ -317,16 +317,16 @@ void kernel_threshold(const float *src_data, float *des_data,
 //
 namespace singa {
 
-void singa_gpu_softmax_loss(const float *prob, const int *label,
-	float *loss, int n, int dim) {
-  kernel_softmax_loss<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>
-	(prob, label, loss, n, dim);
+void singa_gpu_softmaxloss_forward(int n, int dim, const float *prob,
+    const int *label, float *loss) {
+  kernel_softmax_loss<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(prob, label, loss, n,
+      dim);
 }
 
-void singa_gpu_softmax_gradient(float *grad, const int *label,
-	int n, int dim, float scale) {
-  kernel_softmax_gradient<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>
-	(grad, label, n, dim, scale);
+void singa_gpu_softmaxloss_backward(int n, int dim, float scale,
+    const int *label, float *grad) {
+  kernel_softmax_gradient<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(grad, label, n,
+      dim, scale);
 }
 
 void singa_gpu_sum_vec(float *data, float *sum , int n) {
@@ -342,8 +342,8 @@ void singa_gpu_sum_by_col(const float *src_mat_data, float *dst_vec_data,
   int threads_per_block = rows > CU1DBLOCK ? CU1DBLOCK : rows;
   int num_blocks = cols;
 
-  kernel_sum_by_col<<<num_blocks, threads_per_block>>>(src_mat_data, dst_vec_data,
-      rows, cols, stride);
+  kernel_sum_by_col<<<num_blocks, threads_per_block>>>(src_mat_data,
+      dst_vec_data, rows, cols, stride);
 }
 
 void singa_gpu_sum_by_row(const float *src_mat_data, float *dst_vec_data,
@@ -351,8 +351,8 @@ void singa_gpu_sum_by_row(const float *src_mat_data, float *dst_vec_data,
   int threads_per_block = cols > CU1DBLOCK ? CU1DBLOCK : cols;
   int num_blocks = rows;
 
-  kernel_sum_by_row<<<num_blocks, threads_per_block>>>(src_mat_data, dst_vec_data,
-      rows, cols, stride);
+  kernel_sum_by_row<<<num_blocks, threads_per_block>>>(src_mat_data,
+      dst_vec_data, rows, cols, stride);
 }
 
 void singa_gpu_add_vec_row(const float *src_vec_data, const float *src_mat_data,

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/eb97097a/src/worker.cc
----------------------------------------------------------------------
diff --git a/src/worker.cc b/src/worker.cc
index d9380b6..a41e3a8 100644
--- a/src/worker.cc
+++ b/src/worker.cc
@@ -61,11 +61,12 @@ Worker::~Worker() {
 }
 
 void Worker::Run() {
-  LOG(ERROR) << "Worker (group = " << grp_id_ <<", id = " << id_ << ") start";
   // setup gpu device
   auto context = Singleton<Context>::Instance();
   int device = context->device_id(std::this_thread::get_id());
-  if (device > 0)
+  LOG(ERROR) << "Worker (group = " << grp_id_ <<", id = " << id_
+    << ") start on device " << device;
+  if (device >= 0)
     context->ActivateDevice(device);
   auto cluster = Cluster::Get();
   int svr_grp = grp_id_ / cluster->nworker_groups_per_server_group();
@@ -90,8 +91,10 @@ void Worker::Run() {
       job_conf_.set_step(step_);
     }
     TrainOneBatch(step_, train_net_);
-    if (DisplayNow(step_) && grp_id_ == 0 && id_ == 0)
-      Display(kTrain | kForward | kBackward, "Train @ step " + std::to_string(step_), train_net_);
+    if (DisplayNow(step_) && grp_id_ == 0 && id_ == 0) {
+      Display(kTrain | kForward | kBackward,
+          "Train @ step " + std::to_string(step_), train_net_);
+    }
     step_++;
   }
 

