From: wangwei@apache.org
To: commits@singa.incubator.apache.org
Date: Fri, 24 Jun 2016 06:51:37 -0000
Subject: [4/6] incubator-singa git commit: changed all device pointer to shared pointer

changed all device pointer to shared pointer

Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/5651383f
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/5651383f
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/5651383f
Branch: refs/heads/dev
Commit: 5651383f5dbe0ab17eeda70f491d837a24bcb4ab
Parents: 077d13e
Author: liyuchenmike@gmail.com
Authored: Wed Jun 22 21:06:38 2016 +0800
Committer: liyuchenmike@gmail.com 
Committed: Wed Jun 22 21:06:38 2016 +0800 ---------------------------------------------------------------------- include/singa/core/device.h | 7 ++-- include/singa/core/tensor.h | 10 ++--- include/singa/model/layer.h | 2 +- src/core/device/cpp_cpu.cc | 2 +- src/core/device/cuda_gpu.cc | 5 --- src/core/device/device.cc | 2 +- src/core/memory/memory.cc | 3 -- src/core/tensor/tensor.cc | 19 +++++----- src/model/layer/batchnorm.cc | 2 +- src/model/layer/batchnorm.h | 2 +- src/model/layer/dense.cc | 2 +- src/model/layer/dense.h | 2 +- src/model/layer/dropout.cc | 2 +- src/model/layer/dropout.h | 2 +- test/singa/test_dense.cc | 33 +++++++---------- test/singa/test_memory.cc | 6 +-- test/singa/test_mse.cc | 17 ++++----- test/singa/test_sgd.cc | 8 ++-- test/singa/test_tensor.cc | 6 +-- test/singa/test_tensor_math.cc | 74 ++++++++++++++++++------------------- 20 files changed, 94 insertions(+), 112 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5651383f/include/singa/core/device.h ---------------------------------------------------------------------- diff --git a/include/singa/core/device.h b/include/singa/core/device.h index fc98a23..d2b5b12 100644 --- a/include/singa/core/device.h +++ b/include/singa/core/device.h @@ -23,6 +23,7 @@ #include #include #include +#include #include "singa/singa_config.h" #include "singa/core/common.h" #include "singa/core/memory.h" @@ -75,7 +76,7 @@ class Device { return lang_; } - Device* host() const { return host_;} + std::shared_ptr host() const { return host_;} Context* context(int k) { return &ctx_; @@ -107,7 +108,7 @@ class Device { // SafeQueue op_queue_; // SafeQueue op_log_; /// The host device - Device* host_; + std::shared_ptr host_; // TODO(wangwei) define multiple contexts, one per executor Context ctx_; }; @@ -134,7 +135,7 @@ class CppCPU : public Device { }; /// a singleton CppDevice as the host for all devices. -extern CppCPU defaultDevice; +extern std::shared_ptr defaultDevice; // Implement Device using OpenCL libs. http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5651383f/include/singa/core/tensor.h ---------------------------------------------------------------------- diff --git a/include/singa/core/tensor.h b/include/singa/core/tensor.h index bb8d7f8..8f73047 100644 --- a/include/singa/core/tensor.h +++ b/include/singa/core/tensor.h @@ -67,8 +67,8 @@ class Tensor { Tensor(); explicit Tensor(Shape &&shape, DataType dtype = kFloat32); explicit Tensor(const Shape &shape, DataType dtype = kFloat32); - Tensor(Shape &&shape, Device *dev, DataType dtype = kFloat32); - Tensor(const Shape &shape, Device *dev, DataType dtype = kFloat32); + Tensor(Shape &&shape, std::shared_ptr dev, DataType dtype = kFloat32); + Tensor(const Shape &shape, std::shared_ptr dev, DataType dtype = kFloat32); /// Copy Tensor to share the internal data. No deep copy. Tensor(const Tensor &from); @@ -80,7 +80,7 @@ class Tensor { /// blob_ is allocated in constructors. Blob *blob() const { return blob_; } - Device *device() const { return device_; } + std::shared_ptr device() const { return device_; } /// Return immutable Tensor values with given type. template @@ -125,7 +125,7 @@ class Tensor { /// Reset the device. /// If the target device is a diff device, then do deep data copy. - void ToDevice(Device *dev); + void ToDevice(std::shared_ptr dev); /// Equivalent to ToDevice(host_dev). 
void ToHost(); @@ -192,7 +192,7 @@ class Tensor { protected: bool transpose_ = false; DataType data_type_ = kFloat32; - Device *device_ = nullptr; + std::shared_ptr device_ = nullptr; /// Note: blob_ is allocated in lazy manner to avoid frequent malloc/free. /// If you want to get an allocated Blob, use blob() instead of blob_. Blob *blob_ = nullptr; http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5651383f/include/singa/model/layer.h ---------------------------------------------------------------------- diff --git a/include/singa/model/layer.h b/include/singa/model/layer.h index 82c8edc..ee2b42b 100644 --- a/include/singa/model/layer.h +++ b/include/singa/model/layer.h @@ -125,7 +125,7 @@ class Layer { /// Move the layer (including its parameters and other internal Tensor) onto /// the given device - virtual void ToDevice(Device* device) { + virtual void ToDevice(std::shared_ptr device) { //for (auto p : param_values_) p->ToDevice(device); } http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5651383f/src/core/device/cpp_cpu.cc ---------------------------------------------------------------------- diff --git a/src/core/device/cpp_cpu.cc b/src/core/device/cpp_cpu.cc index 44f614a..6884e35 100644 --- a/src/core/device/cpp_cpu.cc +++ b/src/core/device/cpp_cpu.cc @@ -17,7 +17,7 @@ */ #include "singa/core/device.h" namespace singa { -CppCPU defaultDevice(-1, 1); +std::shared_ptr defaultDevice=std::make_shared(-1, 1); CppCPU::CppCPU(int id, int num_executors, string scheduler, string vm) : Device(id, num_executors, scheduler, vm) { lang_ = kCpp; http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5651383f/src/core/device/cuda_gpu.cc ---------------------------------------------------------------------- diff --git a/src/core/device/cuda_gpu.cc b/src/core/device/cuda_gpu.cc index d9a0985..4da292f 100644 --- a/src/core/device/cuda_gpu.cc +++ b/src/core/device/cuda_gpu.cc @@ -43,7 +43,6 @@ CudaGPU::~CudaGPU() { } #endif delete pool; - LOG(INFO) << "device has been deleted"; } CudaGPU::CudaGPU(int id, int num_executors, @@ -143,14 +142,10 @@ void* CudaGPU::Malloc(int size) { /// Free cpu memory. void CudaGPU::Free(void* ptr) { - LOG(INFO) << "Cuda free is called"; - LOG(INFO) << "pool pointer" << pool << "\n"; - LOG(INFO) << "pool status:" << ((CnMemPool*)pool)->status; if (ptr != nullptr) { //CUDA_CHECK(cudaFree(ptr)); pool->Free(ptr); } - LOG(INFO) << "free memory is successed"; } http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5651383f/src/core/device/device.cc ---------------------------------------------------------------------- diff --git a/src/core/device/device.cc b/src/core/device/device.cc index 1d3c446..1889339 100644 --- a/src/core/device/device.cc +++ b/src/core/device/device.cc @@ -22,7 +22,7 @@ namespace singa { Device::Device(int id, int num_executors, string scheduler, string vm) : id_(id), num_executors_(num_executors) { // TODO(wangwei) create scheduler and vm. 
- host_ = &defaultDevice; + host_ = defaultDevice; } void Device::Exec(function&& fn, const vector read_blobs, http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5651383f/src/core/memory/memory.cc ---------------------------------------------------------------------- diff --git a/src/core/memory/memory.cc b/src/core/memory/memory.cc index c5878a6..304c101 100644 --- a/src/core/memory/memory.cc +++ b/src/core/memory/memory.cc @@ -60,7 +60,6 @@ CnMemPool::~CnMemPool() { initialized = false; } mtx.unlock(); - LOG(INFO) << "cnmem has been freed"; } @@ -70,10 +69,8 @@ void CnMemPool::Malloc(void** ptr, const size_t size) { } void CnMemPool::Free(void* ptr) { - LOG(INFO) << "cnmem free is called !!!!!!!!!!!"; cnmemStatus_t status = cnmemFree(ptr,NULL); CHECK_EQ(status, cnmemStatus_t::CNMEM_STATUS_SUCCESS) << " " << cnmemGetErrorString(status); - LOG(INFO) << "cnmem free is terminated"; } void CudaMemPool::Malloc(void** ptr, const size_t size) { http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5651383f/src/core/tensor/tensor.cc ---------------------------------------------------------------------- diff --git a/src/core/tensor/tensor.cc b/src/core/tensor/tensor.cc index 5ae375c..a5b43d8 100644 --- a/src/core/tensor/tensor.cc +++ b/src/core/tensor/tensor.cc @@ -25,29 +25,28 @@ namespace singa { Tensor::~Tensor() { - // LOG(ERROR) << "~"; if (blob_ != nullptr && blob_->DecRefCount() == 0) device_->FreeBlob(blob_); blob_ = nullptr; } -Tensor::Tensor() { device_ = &defaultDevice; } +Tensor::Tensor() { device_ = defaultDevice; } Tensor::Tensor(const Shape &shape, DataType dtype) - : data_type_(dtype), device_(&defaultDevice), shape_(shape) { - device_ = &defaultDevice; + : data_type_(dtype), device_(defaultDevice), shape_(shape) { + device_ = defaultDevice; blob_ = device_->NewBlob(Product(shape_) * SizeOf(data_type_)); } Tensor::Tensor(Shape &&shape, DataType dtype) - : data_type_(dtype), device_(&defaultDevice), shape_(shape) { - device_ = &defaultDevice; + : data_type_(dtype), device_(defaultDevice), shape_(shape) { + device_ = defaultDevice; blob_ = device_->NewBlob(Product(shape_) * SizeOf(data_type_)); } -Tensor::Tensor(const Shape &shape, Device *device, DataType dtype) +Tensor::Tensor(const Shape &shape, std::shared_ptr device, DataType dtype) : data_type_(dtype), device_(device), shape_(shape) { blob_ = device_->NewBlob(Product(shape_) * SizeOf(data_type_)); } -Tensor::Tensor(Shape &&shape, Device *device, DataType dtype) +Tensor::Tensor(Shape &&shape, std::shared_ptr device, DataType dtype) : data_type_(dtype), device_(device), shape_(shape) { blob_ = device_->NewBlob(Product(shape_) * SizeOf(data_type_)); } @@ -104,7 +103,7 @@ void Tensor::AsType(DataType type) { } } -void Tensor::ToDevice(Device *dst) { +void Tensor::ToDevice(std::shared_ptr dst) { // TODO(wangwei) the comparison is very strict. May compare against device ID? 
if (device_ != dst) { Tensor tmp(shape_, dst, data_type_); @@ -234,7 +233,7 @@ void CopyDataToFrom(Tensor *dst, const Tensor &src, size_t num, CHECK_GE(src.MemSize(), src_offset + nBytes); CHECK_GE(dst->MemSize(), dst_offset + nBytes); - Device *src_dev = src.device(), *dst_dev = dst->device(); + std::shared_ptr src_dev = src.device(), dst_dev = dst->device(); Blob *from = src.blob(), *to = dst->blob(); if (dst_dev->lang() != src_dev->lang()) { // let the none cpp device conduct copy op http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5651383f/src/model/layer/batchnorm.cc ---------------------------------------------------------------------- diff --git a/src/model/layer/batchnorm.cc b/src/model/layer/batchnorm.cc index bcd0870..1e6c39b 100644 --- a/src/model/layer/batchnorm.cc +++ b/src/model/layer/batchnorm.cc @@ -44,7 +44,7 @@ void BatchNorm::Setup(const LayerConf& conf) { param_values_.push_back(&runningVariance_); } -void BatchNorm::ToDevice(Device* device) { +void BatchNorm::ToDevice(std::shared_ptr device) { bnScale_.ToDevice(device); bnBias_.ToDevice(device); dbnScale_.ToDevice(device); http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5651383f/src/model/layer/batchnorm.h ---------------------------------------------------------------------- diff --git a/src/model/layer/batchnorm.h b/src/model/layer/batchnorm.h index 0255179..83f143d 100644 --- a/src/model/layer/batchnorm.h +++ b/src/model/layer/batchnorm.h @@ -67,7 +67,7 @@ class BatchNorm : public Layer { runningVariance_.ResetLike(x); runningVariance_.CopyData(x); } - virtual void ToDevice(Device* device) override; + virtual void ToDevice(std::shared_ptr device) override; protected: float factor_; http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5651383f/src/model/layer/dense.cc ---------------------------------------------------------------------- diff --git a/src/model/layer/dense.cc b/src/model/layer/dense.cc index b349787..d47c1db 100644 --- a/src/model/layer/dense.cc +++ b/src/model/layer/dense.cc @@ -79,7 +79,7 @@ const std::pair> Dense::Backward(int flag, return std::make_pair(dx, param_grad); } -void Dense::ToDevice(Device *device) { +void Dense::ToDevice(std::shared_ptr device) { weight_.ToDevice(device); bias_.ToDevice(device); } http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5651383f/src/model/layer/dense.h ---------------------------------------------------------------------- diff --git a/src/model/layer/dense.h b/src/model/layer/dense.h index a5a6f66..49cb986 100644 --- a/src/model/layer/dense.h +++ b/src/model/layer/dense.h @@ -40,7 +40,7 @@ class Dense : public Layer { const std::pair> Backward(int flag, const Tensor& grad) override; - void ToDevice(Device* device) override; + void ToDevice(std::shared_ptr device) override; size_t num_output() const { return hdim_; } size_t num_input() const { return vdim_; } http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5651383f/src/model/layer/dropout.cc ---------------------------------------------------------------------- diff --git a/src/model/layer/dropout.cc b/src/model/layer/dropout.cc index c2c97be..695008e 100644 --- a/src/model/layer/dropout.cc +++ b/src/model/layer/dropout.cc @@ -52,7 +52,7 @@ const std::pair> Dropout::Backward(int flag, return std::make_pair(input_grad, param_grad); } -void Dropout::ToDevice(Device* device) { +void Dropout::ToDevice(std::shared_ptr device) { Layer::ToDevice(device); mask_.ToDevice(device); } 
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5651383f/src/model/layer/dropout.h ---------------------------------------------------------------------- diff --git a/src/model/layer/dropout.h b/src/model/layer/dropout.h index 5efaf6a..d5da79c 100644 --- a/src/model/layer/dropout.h +++ b/src/model/layer/dropout.h @@ -43,7 +43,7 @@ class Dropout : public Layer { const std::pair> Backward(int flag, const Tensor& grad) override; - void ToDevice(Device* device) override; + void ToDevice(std::shared_ptr device) override; float dropout_ratio() const { return dropout_ratio_; http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5651383f/test/singa/test_dense.cc ---------------------------------------------------------------------- diff --git a/test/singa/test_dense.cc b/test/singa/test_dense.cc index 052d0e8..7ed4d33 100644 --- a/test/singa/test_dense.cc +++ b/test/singa/test_dense.cc @@ -66,7 +66,6 @@ TEST(Dense, ForwardCpp) { dense.set_bias(bias); singa::Tensor out1 = dense.Forward(singa::kTrain, in); - singa::CppCPU host(0, 1); const float *outptr1 = out1.data(); EXPECT_EQ(9u, out1.Size()); for (int i = 0; i < 3; i++) @@ -76,7 +75,6 @@ TEST(Dense, ForwardCpp) { outptr1[i * 3 + j]); } #endif // USE_CBLAS -#ifdef USE_CUDA TEST(Dense, BackwardCpp) { Dense dense; @@ -89,7 +87,6 @@ TEST(Dense, BackwardCpp) { const size_t batchsize = 3, vdim = 2, hdim = 3; const float x[batchsize * vdim] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}; - singa::CudaGPU cuda(0, 1); singa::Tensor in(singa::Shape{batchsize, vdim}); in.CopyDataFromHostPtr(x, batchsize * vdim); @@ -114,7 +111,6 @@ TEST(Dense, BackwardCpp) { grad.CopyDataFromHostPtr(dy, batchsize * hdim); const auto ret = dense.Backward(singa::kTrain, grad); - singa::CppCPU host(0, 1); singa::Tensor in_grad = ret.first; singa::Tensor dweight = ret.second.at(0); singa::Tensor dbias = ret.second.at(1); @@ -139,7 +135,6 @@ TEST(Dense, BackwardCpp) { for (int i = 0; i < 3; i++) EXPECT_FLOAT_EQ((dy[0 * 3 + i] + dy[1 * 3 + i] + dy[2 * 3 + i]), dbiasx[i]); } -#endif #ifdef USE_CUDA TEST(Dense, ForwardCuda) { @@ -154,25 +149,24 @@ TEST(Dense, ForwardCuda) { const size_t batchsize = 3, vdim = 2, hdim = 3; const float x[batchsize * vdim] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}; - singa::CudaGPU cuda(0, 1); - singa::Tensor in(singa::Shape{batchsize, vdim}, &cuda); + auto cuda = std::make_shared(0, 1); + singa::Tensor in(singa::Shape{batchsize, vdim}, cuda); in.CopyDataFromHostPtr(x, batchsize * vdim); // set weight const float we[hdim * vdim] = {1.0f, 1.0f, 1.0f, 2.0f, 0.0f, 1.0f}; - singa::Tensor weight(singa::Shape{hdim, vdim}, &cuda); + singa::Tensor weight(singa::Shape{hdim, vdim}, cuda); weight.CopyDataFromHostPtr(we, hdim * vdim); const float bia[hdim] = {1.0f, 1.0f, 1.0f}; - singa::Tensor bias(singa::Shape{hdim}, &cuda); + singa::Tensor bias(singa::Shape{hdim}, cuda); bias.CopyDataFromHostPtr(bia, hdim); dense.set_weight(weight); dense.set_bias(bias); singa::Tensor out1 = dense.Forward(singa::kTrain, in); - singa::CppCPU host(0, 1); - out1.ToDevice(&host); + out1.ToHost(); const float *outptr1 = out1.data(); EXPECT_EQ(9u, out1.Size()); for (int i = 0; i < 3; i++) @@ -193,17 +187,17 @@ TEST(Dense, BackwardCuda) { const size_t batchsize = 3, vdim = 2, hdim = 3; const float x[batchsize * vdim] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}; - singa::CudaGPU cuda(0, 1); - singa::Tensor in(singa::Shape{batchsize, vdim}, &cuda); + auto cuda = std::make_shared(0, 1); + singa::Tensor in(singa::Shape{batchsize, vdim}, cuda); in.CopyDataFromHostPtr(x, batchsize * 
vdim); // set weight const float we[hdim * vdim] = {1.0f, 1.0f, 1.0f, 2.0f, 0.0f, 1.0f}; - singa::Tensor weight(singa::Shape{hdim, vdim}, &cuda); + singa::Tensor weight(singa::Shape{hdim, vdim}, cuda); weight.CopyDataFromHostPtr(we, hdim * vdim); const float bia[hdim] = {1.0f, 1.0f, 1.0f}; - singa::Tensor bias(singa::Shape{hdim}, &cuda); + singa::Tensor bias(singa::Shape{hdim}, cuda); bias.CopyDataFromHostPtr(bia, hdim); dense.set_weight(weight); @@ -214,15 +208,14 @@ TEST(Dense, BackwardCuda) { // grad const float dy[batchsize * hdim] = {1.0f, 1.0f, 1.0f, 2.0f, 2.0f, 2.0f, 3.0f, 3.0f, 3.0f}; - singa::Tensor grad(singa::Shape{batchsize, hdim}, &cuda); + singa::Tensor grad(singa::Shape{batchsize, hdim}, cuda); grad.CopyDataFromHostPtr(dy, batchsize * hdim); const auto ret = dense.Backward(singa::kTrain, grad); - singa::CppCPU host(0, 1); singa::Tensor in_grad = ret.first; singa::Tensor dweight = ret.second.at(0); singa::Tensor dbias = ret.second.at(1); - in_grad.ToDevice(&host); + in_grad.ToHost(); const float *dx = in_grad.data(); EXPECT_EQ(6u, in_grad.Size()); for (int i = 0; i < 3; i++) @@ -231,7 +224,7 @@ TEST(Dense, BackwardCuda) { (dy[i * 3 + 0] * we[0 * 2 + j] + dy[i * 3 + 1] * we[1 * 2 + j] + dy[i * 3 + 2] * we[2 * 2 + j]), dx[i * 2 + j]); - dweight.ToDevice(&host); + dweight.ToHost(); const float *dweightx = dweight.data(); EXPECT_EQ(6u, dweight.Size()); for (int i = 0; i < 3; i++) @@ -240,7 +233,7 @@ TEST(Dense, BackwardCuda) { (dy[0 * 3 + i] * x[0 * 2 + j] + dy[1 * 3 + i] * x[1 * 2 + j] + dy[2 * 3 + i] * x[2 * 2 + j]), dweightx[i * 2 + j]); - dbias.ToDevice(&host); + dbias.ToHost(); const float *dbiasx = dbias.data(); EXPECT_EQ(3u, dbias.Size()); for (int i = 0; i < 3; i++) http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5651383f/test/singa/test_memory.cc ---------------------------------------------------------------------- diff --git a/test/singa/test_memory.cc b/test/singa/test_memory.cc index f5e464d..90fc99a 100644 --- a/test/singa/test_memory.cc +++ b/test/singa/test_memory.cc @@ -75,7 +75,7 @@ TEST(MemPool, CompareCudaCnmem) { singa::CnMemPool cnPool; cnPool.InitPool(); - int numOfTests = 10000; + int numOfTests = 5000; int allocSize = 1000000U; struct timeval start,end; double t1,t2; @@ -93,7 +93,7 @@ TEST(MemPool, CompareCudaCnmem) { t1 = start.tv_sec * 1000 + start.tv_usec/1000; t2 = end.tv_sec * 1000 + end.tv_usec/1000; - LOG(INFO) << "cnmem time: " << t2-t1 << " ms" << std::endl; + LOG(INFO) << "cnmem memory time: " << t2-t1 << " ms" << std::endl; pool = &cudaPool; gettimeofday(&start,NULL); @@ -106,6 +106,6 @@ TEST(MemPool, CompareCudaCnmem) { t1 = start.tv_sec * 1000 + start.tv_usec/1000; t2 = end.tv_sec * 1000 + end.tv_usec/1000; - LOG(INFO) << "cuda time: " << t2-t1 << " ms" << std::endl; + LOG(INFO) << "cuda memory time: " << t2-t1 << " ms" << std::endl; } #endif // USE_CUDA http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5651383f/test/singa/test_mse.cc ---------------------------------------------------------------------- diff --git a/test/singa/test_mse.cc b/test/singa/test_mse.cc index 7c6066e..d2c5125 100644 --- a/test/singa/test_mse.cc +++ b/test/singa/test_mse.cc @@ -69,9 +69,9 @@ TEST_F(TestMSE, CppBackward) { #ifdef USE_CUDA TEST_F(TestMSE, CudaForward) { singa::MSE* mse = new singa::MSE(); - singa::CudaGPU dev; - p.ToDevice(&dev); - t.ToDevice(&dev); + auto dev = std::make_shared(); + p.ToDevice(dev); + t.ToDevice(dev); Tensor loss = mse->Forward(p, t); loss.ToHost(); @@ -85,18 +85,15 @@ TEST_F(TestMSE, CudaForward) { } 
EXPECT_FLOAT_EQ(ldat[i], 0.5 * l); } - LOG(INFO) << "Before delete pxxxxxxxxxxxxxxxxxxxxxxxx"; p.ToHost(); - LOG(INFO) << "Before delete tyyyyyyyyyyyyyyyyyyyyyyy"; t.ToHost(); - LOG(INFO) << "terminate-xxxxxxxxxxxxxxxxxx-"; - delete mse; } + TEST_F(TestMSE, CudaBackward) { singa::MSE mse; - singa::CudaGPU dev; - p.ToDevice(&dev); - t.ToDevice(&dev); + auto dev = std::make_shared(); + p.ToDevice(dev); + t.ToDevice(dev); mse.Forward(p, t); Tensor grad = mse.Backward(); grad.ToHost(); http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5651383f/test/singa/test_sgd.cc ---------------------------------------------------------------------- diff --git a/test/singa/test_sgd.cc b/test/singa/test_sgd.cc index 71ab15e..3b04ab6 100644 --- a/test/singa/test_sgd.cc +++ b/test/singa/test_sgd.cc @@ -88,8 +88,8 @@ TEST(SGD, ApplyWithoutMomentumCuda) { const float v[4] = {0.1, 0.2, 0.3, 0.4}; const float g[4] = {0.1, 0.1, 0.1, 0.1}; - singa::CudaGPU dev; - singa::Tensor value(singa::Shape{4}, &dev), grad(singa::Shape{4}, &dev); + auto dev = std::make_shared(); + singa::Tensor value(singa::Shape{4}, dev), grad(singa::Shape{4}, dev); value.CopyDataFromHostPtr(v, 4); grad.CopyDataFromHostPtr(g, 4); @@ -124,8 +124,8 @@ TEST(SGD, ApplyWithMomentumCuda) { const float v[4] = {0.1, 0.2, 0.3, 0.4}; const float g[4] = {0.01, 0.02, 0.03, 0.04}; - singa::CudaGPU dev; - singa::Tensor value(singa::Shape{4}, &dev), grad(singa::Shape{4}, &dev); + auto dev = std::make_shared(); + singa::Tensor value(singa::Shape{4}, dev), grad(singa::Shape{4}, dev); value.CopyDataFromHostPtr(v, 4); grad.CopyDataFromHostPtr(g, 4); http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5651383f/test/singa/test_tensor.cc ---------------------------------------------------------------------- diff --git a/test/singa/test_tensor.cc b/test/singa/test_tensor.cc index bd039ad..c351174 100644 --- a/test/singa/test_tensor.cc +++ b/test/singa/test_tensor.cc @@ -59,10 +59,10 @@ TEST(TensorClass, AsType) { TEST(TensorClass, ToDevice) { Tensor t(Shape{2,3}); - EXPECT_EQ(static_cast(&singa::defaultDevice), t.device()); - singa::CppCPU *dev = new singa::CppCPU(0, 1); + EXPECT_EQ(singa::defaultDevice, t.device()); + auto dev = std::make_shared(0, 1); t.ToDevice(dev); - EXPECT_NE(static_cast(&singa::defaultDevice), t.device()); + EXPECT_NE(singa::defaultDevice, t.device()); } TEST(TensorClass, CopyDataFromHostPtr) { http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5651383f/test/singa/test_tensor_math.cc ---------------------------------------------------------------------- diff --git a/test/singa/test_tensor_math.cc b/test/singa/test_tensor_math.cc index b18e465..0f998c0 100644 --- a/test/singa/test_tensor_math.cc +++ b/test/singa/test_tensor_math.cc @@ -255,10 +255,10 @@ TEST_F(TestTensorMath, SumColumnsCpp) { #ifdef USE_CUDA TEST_F(TestTensorMath, MultCuda) { const float x[4] = {1.0f, 2.0f, 3.0f, 4.0f}; - singa::CudaGPU dev; - Tensor t(Shape{2, 2}, &dev); + auto dev = std::make_shared(); + Tensor t(Shape{2, 2}, dev); t.CopyDataFromHostPtr(x, 4); - d.ToDevice(&dev); + d.ToDevice(dev); d.CopyDataFromHostPtr(dat1, 6); Tensor C = Mult(d, t); C.ToHost(); @@ -274,7 +274,7 @@ TEST_F(TestTensorMath, MultCuda) { } const float y[8] = {1.0f, 2.0f, 3.0f, 4.0f, 1.1f, 2.1f, 3.1f, 4.1f}; - Tensor s(Shape{4, 2}, &dev); + Tensor s(Shape{4, 2}, dev); s.CopyDataFromHostPtr(y, 8); Tensor D = Mult(d, s.T()); D.ToHost(); @@ -288,11 +288,11 @@ TEST_F(TestTensorMath, MultCuda) { EXPECT_FLOAT_EQ(DPtr[i * 4 + j], tmp); } } - Tensor p(Shape{4, 1}, &dev); + 
Tensor p(Shape{4, 1}, dev); p.CopyDataFromHostPtr(x, 4); - Tensor q(Shape{1, 4}, &dev); + Tensor q(Shape{1, 4}, dev); q.SetValue(1.0f); - Tensor o(Shape{4, 4}, &dev); + Tensor o(Shape{4, 4}, dev); Mult(p, q, &o); o.ToHost(); @@ -308,11 +308,11 @@ TEST_F(TestTensorMath, MultCuda) { TEST_F(TestTensorMath, AddColumnCuda) { const float x[3] = {1.0f, 2.0f, 3.0f}; - singa::CudaGPU dev; - Tensor t(Shape{3}, &dev); + auto dev = std::make_shared(); + Tensor t(Shape{3}, dev); t.CopyDataFromHostPtr(x, 3); d.CopyDataFromHostPtr(dat1, 6); - d.ToDevice(&dev); + d.ToDevice(dev); AddColumn(t, &d); d.ToHost(); const float *xptr = d.data(); @@ -326,11 +326,11 @@ TEST_F(TestTensorMath, AddColumnCuda) { TEST_F(TestTensorMath, SubColumnCuda) { const float x[3] = {1.0f, 2.0f, 3.0f}; - singa::CudaGPU dev; - Tensor t(Shape{3}, &dev); + auto dev = std::make_shared(); + Tensor t(Shape{3}, dev); t.CopyDataFromHostPtr(x, 3); d.CopyDataFromHostPtr(dat1, 6); - d.ToDevice(&dev); + d.ToDevice(dev); SubColumn(t, &d); d.ToHost(); const float *xptr = d.data(); @@ -357,11 +357,11 @@ TEST_F(TestTensorMath, MultColumnCpp) { #ifdef USE_CUDA TEST_F(TestTensorMath, MultColumnCuda) { const float x[3] = {1.0f, 2.0f, 3.0f}; - singa::CudaGPU dev; - Tensor t(Shape{3}, &dev); + auto dev = std::make_shared(); + Tensor t(Shape{3}, dev); t.CopyDataFromHostPtr(x, 3); d.CopyDataFromHostPtr(dat1, 6); - d.ToDevice(&dev); + d.ToDevice(dev); MultColumn(t, &d); d.ToHost(); const float *xptr = d.data(); @@ -373,11 +373,11 @@ TEST_F(TestTensorMath, MultColumnCuda) { } TEST_F(TestTensorMath, DivColumnCuda) { const float x[3] = {1.0f, 2.0f, 3.0f}; - singa::CudaGPU dev; - Tensor t(Shape{3}, &dev); + auto dev = std::make_shared(); + Tensor t(Shape{3}, dev); t.CopyDataFromHostPtr(x, 3); d.CopyDataFromHostPtr(dat1, 6); - d.ToDevice(&dev); + d.ToDevice(dev); DivColumn(t, &d); d.ToHost(); const float *xptr = d.data(); @@ -389,11 +389,11 @@ TEST_F(TestTensorMath, DivColumnCuda) { } TEST_F(TestTensorMath, AddRowCuda) { const float x[2] = {1.1f, 2.1f}; - singa::CudaGPU dev; - Tensor t(Shape{2}, &dev); + auto dev = std::make_shared(); + Tensor t(Shape{2}, dev); t.CopyDataFromHostPtr(x, 2); d.CopyDataFromHostPtr(dat1, 6); - d.ToDevice(&dev); + d.ToDevice(dev); AddRow(t, &d); d.ToHost(); const float *xptr = d.data(); @@ -405,11 +405,11 @@ TEST_F(TestTensorMath, AddRowCuda) { } TEST_F(TestTensorMath, SubRowCuda) { const float x[2] = {1.1f, 2.1f}; - singa::CudaGPU dev; - Tensor t(Shape{2}, &dev); + auto dev = std::make_shared(); + Tensor t(Shape{2}, dev); t.CopyDataFromHostPtr(x, 2); d.CopyDataFromHostPtr(dat1, 6); - d.ToDevice(&dev); + d.ToDevice(dev); SubRow(t, &d); d.ToHost(); const float *xptr = d.data(); @@ -421,11 +421,11 @@ TEST_F(TestTensorMath, SubRowCuda) { } TEST_F(TestTensorMath, MultRowCuda) { const float x[2] = {1.1f, 2.1f}; - singa::CudaGPU dev; - Tensor t(Shape{2}, &dev); + auto dev = std::make_shared(); + Tensor t(Shape{2}, dev); t.CopyDataFromHostPtr(x, 2); d.CopyDataFromHostPtr(dat1, 6); - d.ToDevice(&dev); + d.ToDevice(dev); MultRow(t, &d); d.ToHost(); const float *xptr = d.data(); @@ -452,11 +452,11 @@ TEST_F(TestTensorMath, DivRowCpp) { #ifdef USE_CUDA TEST_F(TestTensorMath, DivRowCuda) { const float x[2] = {1.1f, 2.1f}; - singa::CudaGPU dev; - Tensor t(Shape{2}, &dev); + auto dev = std::make_shared(); + Tensor t(Shape{2}, dev); t.CopyDataFromHostPtr(x, 2); d.CopyDataFromHostPtr(dat1, 6); - d.ToDevice(&dev); + d.ToDevice(dev); DivRow(t, &d); d.ToHost(); const float *xptr = d.data(); @@ -467,10 +467,10 @@ TEST_F(TestTensorMath, DivRowCuda) { 
} } TEST_F(TestTensorMath, SumRowsCuda) { - singa::CudaGPU dev; - Tensor t(Shape{2}, &dev); + auto dev = std::make_shared(); + Tensor t(Shape{2}, dev); d.CopyDataFromHostPtr(dat1, 6); - d.ToDevice(&dev); + d.ToDevice(dev); SumRows(d, &t); t.ToHost(); const float *tptr = t.data(); @@ -484,10 +484,10 @@ TEST_F(TestTensorMath, SumRowsCuda) { d.ToHost(); } TEST_F(TestTensorMath, SumColumnCuda) { - singa::CudaGPU dev; - Tensor t(Shape{3}, &dev); + auto dev = std::make_shared(); + Tensor t(Shape{3}, dev); d.CopyDataFromHostPtr(dat1, 6); - d.ToDevice(&dev); + d.ToDevice(dev); SumColumns(d, &t); t.ToHost(); const float *tptr = t.data();
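
The pattern this commit applies throughout the tree is: raw `Device*` handles (the global `defaultDevice`, `Tensor::device_`, `Layer::ToDevice` parameters) become `std::shared_ptr<Device>`, and the tests stop stack-allocating devices in favour of `std::make_shared`. The following is a minimal standalone sketch of that ownership model, not SINGA's actual API; the class bodies and member names are simplified stand-ins, with only the shapes of `host()`, `defaultDevice`, and `ToDevice()` mirroring the patch.

```cpp
// Sketch of the Device*-to-shared_ptr migration: a Device now stays alive
// for as long as any Tensor (or another Device's host_ alias) co-owns it.
#include <iostream>
#include <memory>

class Device {
 public:
  explicit Device(int id) : id_(id) {}
  virtual ~Device() = default;          // safe deletion through shared_ptr<Device>
  int id() const { return id_; }
  // Was: Device* host() const;  now shared ownership of the host device.
  std::shared_ptr<Device> host() const { return host_; }
  void set_host(std::shared_ptr<Device> h) { host_ = std::move(h); }

 private:
  int id_;
  std::shared_ptr<Device> host_;
};

class CppCPU : public Device {
 public:
  explicit CppCPU(int id) : Device(id) {}
};

// Was: extern CppCPU defaultDevice;  now a shared_ptr created once.
std::shared_ptr<Device> defaultDevice = std::make_shared<CppCPU>(-1);

// A Tensor-like holder keeps its device alive via shared ownership
// (was: Device* device_ = nullptr;).
class Tensor {
 public:
  explicit Tensor(std::shared_ptr<Device> dev = defaultDevice)
      : device_(std::move(dev)) {}
  std::shared_ptr<Device> device() const { return device_; }
  void ToDevice(std::shared_ptr<Device> dst) {
    if (device_ != dst) device_ = std::move(dst);  // deep data copy omitted
  }

 private:
  std::shared_ptr<Device> device_;
};

int main() {
  Tensor t;                                // lives on defaultDevice
  auto gpu = std::make_shared<Device>(0);  // stand-in for a GPU device
  gpu->set_host(defaultDevice);            // mirrors host_ = defaultDevice
  t.ToDevice(gpu);                         // tensor now co-owns the device
  std::cout << "tensor device id: " << t.device()->id() << "\n";
  // gpu may go out of scope here; the Tensor keeps the Device alive.
  return 0;
}
```

This is also why the tests above switch from stack-allocated devices (e.g. `singa::CudaGPU cuda(0, 1)` plus `&cuda`) to `std::make_shared`: with shared ownership, tensors that outlive the scope creating the device no longer hold a dangling pointer, and the explicit `LOG(INFO)` traces around device/memory teardown that the commit removes are no longer needed to debug destruction order.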