singa-commits mailing list archives

From kaip...@apache.org
Subject incubator-singa git commit: SINGA-194 Add a Platform singleton
Date Fri, 01 Jul 2016 09:51:39 GMT
Repository: incubator-singa
Updated Branches:
  refs/heads/dev f0bc22889 -> dc013f34f


SINGA-194 Add a Platform singleton

Add the Platform class, whose methods are all static.
It includes methods for querying the hardware GPUs, e.g., the number of
GPUs and the memory of each GPU.
It also creates a vector of singa::Device instances, e.g., CudaGPU.
NOTE:
If multiple CudaGPU devices are created and they all use CnMemPool,
then they must share the same CnMemPool instance (otherwise there would
be problems in destructing the CnMemPool).
It is preferred to use Platform to create CudaGPU devices, which avoids
the above problem; see the sketch below.
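
A minimal sketch of the two creation paths (class and method names follow
the headers in this commit; the device IDs, device count, and the
availability of two free GPUs are illustrative assumptions):

  #include "singa/core/device.h"
  #include "singa/core/memory.h"

  void CreateSharedPoolDevices() {
    // Preferred: Platform picks free GPUs and hands every CudaGPU the
    // same CnMemPool instance, so the pool is destructed exactly once.
    auto devs = singa::Platform::CreateCudaGPUs(2);

    // Equivalent manual setup: build one pool and pass it to each device.
    singa::MemPoolConf conf;
    conf.add_device(0);  // assumed device IDs
    conf.add_device(1);
    auto pool = std::make_shared<singa::CnMemPool>(conf);
    auto dev0 = std::make_shared<singa::CudaGPU>(0, pool);
    auto dev1 = std::make_shared<singa::CudaGPU>(1, pool);
  }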

Updated the APIs for Device and DeviceMemPool.
GetAllocatedMem is not accurate (it reports more than the actually
requested memory size); a usage sketch follows.
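
For example (a hedged sketch; the block size below is arbitrary), the
reported value only bounds the requested bytes from above:

  // GetAllocatedMem() returns total - free as reported by the pool,
  // which includes pool overhead, so it over-estimates the bytes
  // actually requested via NewBlock.
  auto dev = singa::Platform::CreateCudaGPUs(1).at(0);
  singa::Block* blk = dev->NewBlock(1024);  // request 1 KB
  size_t used = dev->GetAllocatedMem();     // >= 1024, not exact
  dev->FreeBlock(blk);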

NOTE:
SWIG would report errors if USE_CUDA is OFF, as CudaGPU is not available.
TODO(wangwei) resolve this problem in a later PR for the Python setup.


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/dc013f34
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/dc013f34
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/dc013f34

Branch: refs/heads/dev
Commit: dc013f34fa1c5cb647290ce185b3c2d6e43dcfe0
Parents: f0bc228
Author: Wei Wang <wangwei@comp.nus.edu.sg>
Authored: Tue Jun 28 22:17:57 2016 +0800
Committer: Wei Wang <wangwei@comp.nus.edu.sg>
Committed: Fri Jul 1 17:37:22 2016 +0800

----------------------------------------------------------------------
 include/singa/core/device.h          | 133 +++++++++++++++-------------
 include/singa/core/memory.h          |  44 ++++++----
 include/singa/utils/cuda_utils.h     |   1 +
 src/core/device/cpp_cpu.cc           |   5 +-
 src/core/device/cuda_gpu.cc          | 126 ++++++---------------------
 src/core/device/device.cc            |   4 +-
 src/core/device/platform.cc          | 138 ++++++++++++++++++++++++++++++
 src/core/memory/memory.cc            |  55 ++++++++----
 src/proto/core.proto                 |  17 ++--
 src/python/swig/core_device.i        |   6 +-
 test/singa/test_cpp_cpu.cc           |   8 +-
 test/singa/test_cudnn_activation.cc  |   4 +-
 test/singa/test_cudnn_batchnorm.cc   |   4 +-
 test/singa/test_cudnn_convolution.cc |   8 +-
 test/singa/test_cudnn_dropout.cc     |   4 +-
 test/singa/test_cudnn_lrn.cc         |   4 +-
 test/singa/test_cudnn_pooling.cc     |   4 +-
 test/singa/test_cudnn_softmax.cc     |   8 +-
 test/singa/test_dense.cc             |   4 +-
 test/singa/test_memory.cc            |  15 ++--
 test/singa/test_platform.cc          |  80 +++++++++++++++++
 test/singa/test_tensor.cc            |   2 +-
 22 files changed, 430 insertions(+), 244 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dc013f34/include/singa/core/device.h
----------------------------------------------------------------------
diff --git a/include/singa/core/device.h b/include/singa/core/device.h
index 8c4546f..17fa66a 100644
--- a/include/singa/core/device.h
+++ b/include/singa/core/device.h
@@ -33,18 +33,17 @@
 using std::vector;
 using std::string;
 using std::function;
+using std::shared_ptr;
 namespace singa {
 /// Allocate memory and execute Tensor operations.
 /// There are three types of devices distinguished by their programming
 /// languages, namely cpp, cuda and opencl.
 class Device {
   public:
-  Device() = default;
   /// Constructor with device ID, num of executors (e.g., cuda streams),
-  /// max mem size to use (in MB), identifier of scheduler type (default
-  /// scheduler run operations synchronously) and virtual memory type (default
-  /// vm only provides garbage collection).
-  Device(int id, int num_executors, string scheduler, string vm);
+  /// max mem size to use (in MB)
+  Device(int id, int num_executors);
+
   virtual void SetRandSeed(unsigned seed) = 0;
 
   /// Called by Tensor.
@@ -53,6 +52,12 @@ class Device {
   /// Called by Tensor.
   void FreeBlock(Block* block);
 
+  /// Return the size (bytes) of memory in use
+  /// TODO(wangwei) override this function for all devices.
+  virtual size_t GetAllocatedMem() {
+    return 0u;
+  }
+
   /// Copy data within or across devices.
   void CopyDataToFrom(Block* dst, Block* src, size_t nBytes,
                       CopyDirection direction, int dst_offset, int src_offset);
@@ -84,7 +89,10 @@ class Device {
 
   int id() const { return id_; }
 
-  virtual ~Device() = 0;
+  virtual ~Device();
+
+ private:
+  Device() {};
 
  protected:
   /// Execute one operation on one executor.
@@ -120,8 +128,7 @@ class Device {
 class CppCPU : public Device {
  public:
   ~CppCPU() {};
-  CppCPU(int id = -1, int num_executors = 1,
-            string scheduler = "sync", string vm = "gc-only");
+  CppCPU();
 
   void SetRandSeed(unsigned seed) override;
  protected:
@@ -149,32 +156,14 @@ extern std::shared_ptr<Device> defaultDevice;
 class CudaGPU : public Device {
  public:
   ~CudaGPU();
-  CudaGPU(int id = 0, int num_executors = 1, string scheduler = "sync",
-         string vm = "gc-only");
-	CudaGPU(const MemPoolConf& mem_conf,
-					int id = 0, int num_executors = 1, string scheduler = "sync");
+  /// Construct the device using default mem pool setting.
+  CudaGPU(int id = 0);
+  /// Construct the device given the physical device ID and memory pool.
+  CudaGPU(int id, std::shared_ptr<DeviceMemPool> pool);
 
   void SetRandSeed(unsigned seed) override;
-  static void DeviceQuery();
-  /// This function checks the availability of GPU #device_id.
-  /// It attempts to create a context on the device by calling cudaFree(0).
-  /// cudaSetDevice() alone is not sufficient to check the availability.
-  /// It lazily records device_id, however, does not initialize a
-  /// context. So it does not know if the host thread has the permission to use
-  /// the device or not.
-  ///
-  /// In a shared environment where the devices are set to EXCLUSIVE_PROCESS
-  /// or EXCLUSIVE_THREAD mode, cudaSetDevice() returns cudaSuccess
-  /// even if the device is exclusively occupied by another process or thread.
-  /// Cuda operations that initialize the context are needed to check
-  /// the permission. cudaFree(0) is one of those with no side effect,
-  /// except the context initialization.
-  static bool CheckDevice(const int device_id);
-  /// This function finds the first available device by checking devices with
-  /// ordinal from start_id to the highest available value. In the
-  /// EXCLUSIVE_PROCESS or EXCLUSIVE_THREAD mode, if it succeeds, it also
-  /// claims the device due to the initialization of the context.
-  static int FindDevice(const int start_id);
+  size_t GetAllocatedMem() override;
+
  protected:
   void DoExec(function<void(Context*)>&& fn, int executor) override;
 
@@ -187,41 +176,67 @@ class CudaGPU : public Device {
   /// Free cpu memory.
   void Free(void* ptr) override;
 
-	private:
-	DeviceMemPool* pool;
+ private:
+  void Setup();
+
+ private:
+	shared_ptr<DeviceMemPool> pool_;
 };
 
 /// CudaCPU which uses cudaMallocHost to allocate pinned memory for host.
 
 #endif  // USE_CUDA
 
-// Implement a CudaHost device, which used cuda functions for memory
-// malloc/free.
-// class CudaHost : public Device {}
-//
-/// The base type of callback argument structure.
-/// The specific arg should inherit from this one.
-/*
-class CallbackArg {
+/// For querying physical devices and creating singa::Device instances.
+class Platform {
  public:
-  template <typename T>
-  T* CastTo() {
-    static_assert(std::is_base_of<CallbackArg, T>::value,
-                  "The casted type must be a sub-class of CallbackArg");
-    return static_cast<T*>(this);
-  }
+  /// Return the total number of available GPUs
+  static int GetNumGPUs();
+
+  /// Return the device IDs of available GPUs.
+  /// TODO(wangwei) return the IDs according to free memory in descending order
+  static const vector<int> GetGPUIDs();
+
+  /// Return the memory of a GPU <free, total>.
+  static const std::pair<size_t, size_t> GetGPUMemSize(const int device);
+  static const vector<std::pair<size_t, size_t>> GetGPUMemSize();
+
+  /// Return a string containing all hardware info, e.g., version, memory size.
+  static const string DeviceQuery(int id, bool verbose = false);
+
+  /// Create a set of CudaGPU Device using 'num_devices' free GPUs.
+  static const vector<shared_ptr<Device> >
+  CreateCudaGPUs(const size_t num_devices, size_t init_size = 0);
+
+  /// Create a set of CudaGPU Device using given GPU IDs.
+  static const vector<shared_ptr<Device> >
+  CreateCudaGPUs(const vector<int> &devices, size_t init_size = 0);
+
+  /// Create a set of OpenclGPU Device using 'num_devices' free GPUs.
+  const vector<shared_ptr<Device>> CreateOpenclGPUs(const size_t num_devices);
+
+  /// Create a set of OpenclGPU Device using given GPU IDs.
+  const vector<shared_ptr<Device>> CreateOpenclGPUs(const vector<int>& id);
+  /// This function is adapted from Caffe (http://caffe.berkeleyvision.org/).
+  /// This function checks the availability of GPU #device_id.
+  /// It attempts to create a context on the device by calling cudaFree(0).
+  /// cudaSetDevice() alone is not sufficient to check the availability.
+  /// It lazily records device_id, however, does not initialize a
+  /// context. So it does not know if the host thread has the permission to use
+  /// the device or not.
+  ///
+  /// In a shared environment where the devices are set to EXCLUSIVE_PROCESS
+  /// or EXCLUSIVE_THREAD mode, cudaSetDevice() returns cudaSuccess
+  /// even if the device is exclusively occupied by another process or thread.
+  /// Cuda operations that initialize the context are needed to check
+  /// the permission. cudaFree(0) is one of those with no side effect,
+  /// except the context initialization.
+  static bool CheckDevice(const int device_id);
+
+ private:
+  Platform() {};  // No need to construct an instance as it has no member fields
 };
-/// Type of callback functions for executing tensor ops.
-typedef function<void(CallbackArg*)> CallbackFn;
-public:
-  /// Operation has a function, and read/write blocks.
-  typedef struct _Operation {
-    function<void(Context*)> fn;
-    const vector<Block*> read_blocks;
-    const vector<Block*> write_blocks;
-  } Operation;
-
-*/
+
 }  // namespace singa
 
 #endif  // SINGA_CORE_DEVICE_H_

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dc013f34/include/singa/core/memory.h
----------------------------------------------------------------------
diff --git a/include/singa/core/memory.h b/include/singa/core/memory.h
index 7c22ea3..1019ed3 100644
--- a/include/singa/core/memory.h
+++ b/include/singa/core/memory.h
@@ -19,13 +19,14 @@
 #ifndef SINGA_CORE_MEMORY_H_
 #define SINGA_CORE_MEMORY_H_
 
+#include <mutex>
+#include "singa/proto/core.pb.h"
 #include "singa/singa_config.h"
 
 #ifdef USE_CUDA
 #include "cnmem.h"
 #endif
 
-#include <mutex>
 
 namespace singa {
 
@@ -34,34 +35,41 @@ class VirtualMemory {};
 
 class DeviceMemPool {
  public:
-  virtual void InitPool() = 0;
-  virtual void Malloc(void** ptr, const size_t size) = 0;
-  virtual void Free(void* ptr) = 0;
+  virtual void Malloc(void** ptr, const size_t size)  = 0;
+  virtual void Free(void* ptr)  = 0;
+
+  /// Return a pair for free and total memory managed by this pool.
+  virtual std::pair<size_t, size_t> GetMemUsage() {
+    return std::make_pair(0u, 0u);
+  }
   virtual ~DeviceMemPool(){};
+
+ protected:
+  size_t usage_;
+//  size_t init_size_ = 0, max_size_ = 0;
 };
 
 #ifdef USE_CUDA
 class CnMemPool : public DeviceMemPool {
  public:
-  int status = 1;
-
-  void InitPool();
-
-  /// numDevices: total number of available GPU cards.
-  /// initSize: all devices will be allocated with this size
-  /// manager_flags: pool manager flag (one for all devices)
-  /// flag = 0; default flag
-  /// flag = 1: Prevent the manager from growing its memory consumption
-  /// flag = 2; Prevent the manager from stealing memory.
-  void InitPool(int numDevices, size_t initSize, unsigned flag);
+  // Create the mem pool for devices [0, numDevices), with the given
+  // initial pool size (MB) and max pool size (currently no effect).
+  CnMemPool(int numDevices = 1, size_t init_size = 256, size_t max_size = 0);
+  CnMemPool(const MemPoolConf& conf);
 
   void Malloc(void** ptr, const size_t size);
   void Free(void* ptr);
 
+  std::pair<size_t, size_t> GetMemUsage() override;
+
   // release all memory and set cnmem manager to unintialized
   ~CnMemPool();
 
+ protected:
+  void Init();
+
  private:
+  MemPoolConf conf_;
   // whether the (global) memory pool has been initialized
   static bool initialized;
   // lock on the initialized variable
@@ -70,10 +78,8 @@ class CnMemPool : public DeviceMemPool {
 
 class CudaMemPool : public DeviceMemPool {
  public:
-  void InitPool(){};
-  void Malloc(void** ptr, const size_t size);
-  void Free(void* ptr);
-  ~CudaMemPool(){};
+  void Malloc(void** ptr, const size_t size) override;
+  void Free(void* ptr) override;
 };
 #endif
 }  // namespace singa

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dc013f34/include/singa/utils/cuda_utils.h
----------------------------------------------------------------------
diff --git a/include/singa/utils/cuda_utils.h b/include/singa/utils/cuda_utils.h
index 24f3eb9..2fe7d27 100644
--- a/include/singa/utils/cuda_utils.h
+++ b/include/singa/utils/cuda_utils.h
@@ -7,6 +7,7 @@
 #include <cublas_v2.h>
 #include <cuda.h>
 #include <cuda_runtime.h>
+#include <curand.h>
 
 inline const char* cublasGetErrorString(cublasStatus_t error) {
   switch (error) {

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dc013f34/src/core/device/cpp_cpu.cc
----------------------------------------------------------------------
diff --git a/src/core/device/cpp_cpu.cc b/src/core/device/cpp_cpu.cc
index 8ccfaf6..401645d 100644
--- a/src/core/device/cpp_cpu.cc
+++ b/src/core/device/cpp_cpu.cc
@@ -17,9 +17,8 @@
  */
 #include "singa/core/device.h"
 namespace singa {
-std::shared_ptr<Device> defaultDevice=std::make_shared<CppCPU>(-1, 1);
-CppCPU::CppCPU(int id, int num_executors, string scheduler,
-         string vm) : Device(id, num_executors, scheduler, vm) {
+std::shared_ptr<Device> defaultDevice=std::make_shared<CppCPU>();
+CppCPU::CppCPU() : Device(0, 1) {
   lang_ = kCpp;
   //host_ = nullptr;
 }

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dc013f34/src/core/device/cuda_gpu.cc
----------------------------------------------------------------------
diff --git a/src/core/device/cuda_gpu.cc b/src/core/device/cuda_gpu.cc
index 5f6ac17..0164752 100644
--- a/src/core/device/cuda_gpu.cc
+++ b/src/core/device/cuda_gpu.cc
@@ -41,44 +41,28 @@ CudaGPU::~CudaGPU() {
     CHECK_EQ(status, CUDNN_STATUS_SUCCESS) << cudnnGetErrorString(status);
   }
 #endif
-  delete pool;
 }
+const int kNumCudaStream = 1;
 
-CudaGPU::CudaGPU(int id, int num_executors, string scheduler, string vm)
-    : Device(id, num_executors, scheduler, vm) {
-  if (id == -1) id = FindDevice(0);
-  lang_ = kCuda;
-  ctx_.stream = NULL;  // use the default sync stream
-  // TODO(wangwei) create one handle for each steam?
-  CUDA_CHECK(cudaSetDevice(FindDevice(0)));
-  // use curandCreateGeneratorHost for CudaHost device
-  CURAND_CHECK(
-      curandCreateGenerator(&ctx_.curand_generator, CURAND_RNG_PSEUDO_DEFAULT));
-  auto seed = std::chrono::system_clock::now().time_since_epoch().count();
-  SetRandSeed(seed);
-  // TODO(wangwei) if one generator per stream, then need diff offset per gen?
-  CURAND_CHECK(curandSetGeneratorOffset(ctx_.curand_generator, 0));
-  CUBLAS_CHECK(cublasCreate(&(ctx_.cublas_handle)));
-
-#ifdef USE_CUDNN
-  // TODO(wangwei) create one handle for each stream?
-  auto status = cudnnCreate(&ctx_.cudnn_handle);
-  CHECK_EQ(status, CUDNN_STATUS_SUCCESS) << cudnnGetErrorString(status);
-#endif  // USE_CUDNN
+CudaGPU::CudaGPU(int id) : Device(id, kNumCudaStream) {
+  MemPoolConf conf;
+  conf.add_device(id);
+  pool_ = std::make_shared<CnMemPool>(conf);
+  Setup();
+}
 
-  // initialize cnmem memory management as default
-  pool = new CnMemPool();
-  ((CnMemPool*)pool)->InitPool();
+CudaGPU::CudaGPU(int id, std::shared_ptr<DeviceMemPool> pool)
+    : Device(id, kNumCudaStream) {
+  CHECK_NE(pool, nullptr);
+  pool_ = pool;
+  Setup();
 }
 
-CudaGPU::CudaGPU(const MemPoolConf& mem_conf, int id, int num_executors,
-                 string scheduler)
-    : Device(id, num_executors, scheduler, "gc-only") {
-  if (id == -1) id = FindDevice(0);
+void CudaGPU::Setup() {
   lang_ = kCuda;
   ctx_.stream = NULL;  // use the default sync stream
   // TODO(wangwei) create one handle for each steam?
-  CUDA_CHECK(cudaSetDevice(FindDevice(0)));
+  CUDA_CHECK(cudaSetDevice(id_));
   // use curandCreateGeneratorHost for CudaHost device
   CURAND_CHECK(
       curandCreateGenerator(&ctx_.curand_generator, CURAND_RNG_PSEUDO_DEFAULT));
@@ -93,18 +77,6 @@ CudaGPU::CudaGPU(const MemPoolConf& mem_conf, int id, int num_executors,
   auto status = cudnnCreate(&ctx_.cudnn_handle);
   CHECK_EQ(status, CUDNN_STATUS_SUCCESS) << cudnnGetErrorString(status);
 #endif  // USE_CUDNN
-
-  // initialize memory management for cuda devices
-  string memoryPoolType = mem_conf.type();
-  if (memoryPoolType.compare("cnmem") == 0) {
-    pool = new CnMemPool();
-    int num_devices = mem_conf.num_devices();
-    size_t alloc_size = mem_conf.alloc_size();
-    unsigned flag = mem_conf.cnmemflag();
-    ((CnMemPool*)pool)->InitPool(num_devices, alloc_size, flag);
-  } else {
-    pool = new CudaMemPool();
-  }
 }
 
 void CudaGPU::SetRandSeed(unsigned seed) {
@@ -121,12 +93,22 @@ void CudaGPU::CopyToFrom(void* dst, const void* src, size_t nBytes,
   // cudaMemcpyAsync(dst, src, nBytes,cudaMemcpyDefault, ctx_.stream);
 }
 
+size_t CudaGPU::GetAllocatedMem() {
+  if (pool_ != nullptr) {
+    auto ret = pool_->GetMemUsage();
+    return ret.second - ret.first;
+  }
+  LOG(ERROR) << "The memory pool is not set";
+  return 0u;
+}
+
 /// Allocate gpu memory.
 void* CudaGPU::Malloc(int size) {
   void* ptr = nullptr;
   if (size > 0) {
-    // CUDA_CHECK(cudaMalloc((void**)&ptr,size));
-    pool->Malloc((void**)&ptr, size);
+    CUDA_CHECK(cudaSetDevice(id_));
+    pool_->Malloc((void**)&ptr, size);
+    // TODO(wangwei) remove the memset.
     CUDA_CHECK(cudaMemset(ptr, 0, size));
   }
   return ptr;
@@ -135,61 +117,9 @@ void* CudaGPU::Malloc(int size) {
 /// Free gpu memory.
 void CudaGPU::Free(void* ptr) {
   if (ptr != nullptr) {
-    // CUDA_CHECK(cudaFree(ptr));
-    pool->Free(ptr);
-  }
-}
-
-// ==========Following code is from Caffe src/caffe/common.cpp=================
-
-void CudaGPU::DeviceQuery() {
-  cudaDeviceProp prop;
-  int device;
-  if (cudaSuccess != cudaGetDevice(&device)) {
-    printf("No cuda device present.\n");
-    return;
-  }
-  CUDA_CHECK(cudaGetDeviceProperties(&prop, device));
-  LOG(INFO) << "Device id:                     " << device;
-  LOG(INFO) << "Major revision number:         " << prop.major;
-  LOG(INFO) << "Minor revision number:         " << prop.minor;
-  LOG(INFO) << "Name:                          " << prop.name;
-  LOG(INFO) << "Total global memory:           " << prop.totalGlobalMem;
-  LOG(INFO) << "Total shared memory per block: " << prop.sharedMemPerBlock;
-  LOG(INFO) << "Total registers per block:     " << prop.regsPerBlock;
-  LOG(INFO) << "Warp size:                     " << prop.warpSize;
-  LOG(INFO) << "Maximum memory pitch:          " << prop.memPitch;
-  LOG(INFO) << "Maximum threads per block:     " << prop.maxThreadsPerBlock;
-  LOG(INFO) << "Maximum dimension of block:    " << prop.maxThreadsDim[0]
-            << ", " << prop.maxThreadsDim[1] << ", " << prop.maxThreadsDim[2];
-  LOG(INFO) << "Maximum dimension of grid:     " << prop.maxGridSize[0] << ", "
-            << prop.maxGridSize[1] << ", " << prop.maxGridSize[2];
-  LOG(INFO) << "Clock rate:                    " << prop.clockRate;
-  LOG(INFO) << "Total constant memory:         " << prop.totalConstMem;
-  LOG(INFO) << "Texture alignment:             " << prop.textureAlignment;
-  LOG(INFO) << "Concurrent copy and execution: " << (prop.deviceOverlap ? "Yes"
-                                                                        : "No");
-  LOG(INFO) << "Number of multiprocessors:     " << prop.multiProcessorCount;
-  LOG(INFO) << "Kernel execution timeout:      "
-            << (prop.kernelExecTimeoutEnabled ? "Yes" : "No");
-  return;
-}
-
-bool CudaGPU::CheckDevice(const int device_id) {
-  bool r = ((cudaSuccess == cudaSetDevice(device_id)) &&
-            (cudaSuccess == cudaFree(0)));
-  // reset any error that may have occurred.
-  cudaGetLastError();
-  return r;
-}
-
-int CudaGPU::FindDevice(const int start_id) {
-  int count = 0;
-  CUDA_CHECK(cudaGetDeviceCount(&count));
-  for (int i = start_id; i < count; i++) {
-    if (CheckDevice(i)) return i;
+    CUDA_CHECK(cudaSetDevice(id_));
+    pool_->Free(ptr);
   }
-  return -1;
 }
 
 }  // namespace singa

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dc013f34/src/core/device/device.cc
----------------------------------------------------------------------
diff --git a/src/core/device/device.cc b/src/core/device/device.cc
index 071b891..31d2b2a 100644
--- a/src/core/device/device.cc
+++ b/src/core/device/device.cc
@@ -19,9 +19,9 @@
 #include "singa/core/device.h"
 
 namespace singa {
-Device::Device(int id, int num_executors, string scheduler, string vm)
+Device::Device(int id, int num_executors)
     : id_(id), num_executors_(num_executors) {
-      // TODO(wangwei) create scheduler and vm.
+  // TODO(wangwei) create scheduler and vm.
   host_ = defaultDevice;
 }
 

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dc013f34/src/core/device/platform.cc
----------------------------------------------------------------------
diff --git a/src/core/device/platform.cc b/src/core/device/platform.cc
new file mode 100644
index 0000000..1e2dc4a
--- /dev/null
+++ b/src/core/device/platform.cc
@@ -0,0 +1,138 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "singa/core/device.h"
+#include "singa/singa_config.h"
+#ifdef USE_CUDA
+#include "singa/utils/cuda_utils.h"
+namespace singa {
+int Platform::GetNumGPUs() {
+  int count;
+  CUDA_CHECK(cudaGetDeviceCount(&count));
+  return count;
+}
+
+bool Platform::CheckDevice(const int device_id) {
+  bool r = ((cudaSuccess == cudaSetDevice(device_id)) &&
+            (cudaSuccess == cudaFree(0)));
+  // reset any error that may have occurred.
+  cudaGetLastError();
+  return r;
+}
+
+/// Return the total num of free GPUs
+const vector<int> Platform::GetGPUIDs() {
+  vector<int> gpus;
+  int count = Platform::GetNumGPUs();
+  for (int i = 0; i < count; i++) {
+    if (Platform::CheckDevice(i)) {
+      gpus.push_back(i);
+    }
+  }
+  return gpus;
+}
+
+const std::pair<size_t, size_t> Platform::GetGPUMemSize(const int device) {
+  std::pair<size_t, size_t> ret{ 0, 0 };
+  if (Platform::CheckDevice(device)) {
+    CUDA_CHECK(cudaSetDevice(device));
+    size_t free = 0, total = 0;
+    CUDA_CHECK(cudaMemGetInfo(&free, &total));
+    ret = std::make_pair(free, total);
+  } else {
+    LOG(ERROR) << "The device (ID = " << device << ") is not available";
+  }
+  return ret;
+}
+
+const vector<std::pair<size_t, size_t>> Platform::GetGPUMemSize() {
+  vector<std::pair<size_t, size_t>> mem;
+  int count = Platform::GetNumGPUs();
+  for (int i = 0; i < count; i++) {
+    mem.push_back(Platform::GetGPUMemSize(i));
+  }
+  return mem;
+}
+
+const string Platform::DeviceQuery(int device, bool verbose) {
+  if (cudaSuccess != cudaGetDevice(&device)) {
+    return "The device (ID = " + std::to_string(device) + " is not available" ;
+  }
+  cudaDeviceProp prop;
+  CUDA_CHECK(cudaGetDeviceProperties(&prop, device));
+  std::ostringstream out;
+  out << "Device id:                     " << device << '\n';
+  out << "Total global memory:           " << prop.totalGlobalMem << '\n';
+  out << "Total shared memory per block: " << prop.sharedMemPerBlock
+      << '\n';
+  out << "Maximum threads per block:     " << prop.maxThreadsPerBlock
+      << '\n';
+  out << "Maximum dimension of block:    "
+      << prop.maxThreadsDim[0 << '\n'] << ", " << prop.maxThreadsDim[1]
+      << ", " << prop.maxThreadsDim[2] << '\n';
+  out << "Maximum dimension of grid:     " << prop.maxGridSize[0] << ", "
+      << "Concurrent copy and execution: "
+      << (prop.deviceOverlap ? "Yes" : "No") << '\n';
+
+  if (verbose) {
+    out << "Major revision number:         " << prop.major << '\n';
+    out << "Minor revision number:         " << prop.minor << '\n';
+    out << "Name:                          " << prop.name << '\n';
+    out << "Total registers per block:     " << prop.regsPerBlock << '\n';
+    out << "Maximum memory pitch:          " << prop.memPitch << '\n';
+    out << "Warp size:                     " << prop.warpSize
+      << prop.maxGridSize[1] << ", " << prop.maxGridSize[2] << '\n';
+    out << "Clock rate:                    " << prop.clockRate << '\n';
+    out << "Number of multiprocessors:     " << prop.multiProcessorCount
+        << '\n';
+    out << "Kernel execution timeout:      "
+        << (prop.kernelExecTimeoutEnabled ? "Yes" : "No") << '\n';
+  }
+  return out.str();
+}
+
+const vector<shared_ptr<Device> >
+Platform::CreateCudaGPUs(const size_t num_devices, size_t init_size) {
+  const vector<int> gpus = GetGPUIDs();
+  CHECK_LE(num_devices, gpus.size());
+  vector<int> use_gpus(gpus.begin(), gpus.begin() + num_devices);
+  return CreateCudaGPUs(use_gpus, init_size);
+}
+
+const vector<shared_ptr<Device> >
+Platform::CreateCudaGPUs(const vector<int> &devices, size_t init_size) {
+  MemPoolConf conf;
+  if (init_size > 0)
+    conf.set_init_size(init_size);
+  size_t bytes = conf.init_size() << 20;
+  for (auto device : devices) {
+    conf.add_device(device);
+    CHECK_LE(bytes, Platform::GetGPUMemSize(device).first);
+  }
+  auto pool = std::make_shared<CnMemPool>(conf);
+
+  vector<shared_ptr<Device> > ret;
+  for (auto device : devices) {
+    auto dev = std::make_shared<CudaGPU>(device, pool);
+    ret.push_back(dev);
+  }
+  return ret;
+}
+
+}  // namespace singa
+#endif  // USE_CUDA

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dc013f34/src/core/memory/memory.cc
----------------------------------------------------------------------
diff --git a/src/core/memory/memory.cc b/src/core/memory/memory.cc
index 7ac6792..63ffc2d 100644
--- a/src/core/memory/memory.cc
+++ b/src/core/memory/memory.cc
@@ -25,20 +25,41 @@
 namespace singa {
 bool singa::CnMemPool::initialized = false;
 std::mutex singa::CnMemPool::mtx;
-void CnMemPool::InitPool(int numDevices, size_t initSize, unsigned flag) {
+
+std::pair<size_t, size_t> CnMemPool::GetMemUsage() {
+  size_t free, total;
+  auto status = cnmemMemGetInfo(&free, &total, NULL);
+  CHECK_EQ(status, cnmemStatus_t::CNMEM_STATUS_SUCCESS)
+    << cnmemGetErrorString(status);
+  return std::make_pair(free, total);
+}
+
+CnMemPool::CnMemPool(int numDevices, size_t init_size, size_t max_size) {
+  for (int i = 0; i < numDevices; i++)
+    conf_.add_device(i);
+  conf_.set_init_size(init_size);
+  conf_.set_max_size(max_size);
+}
+
+CnMemPool::CnMemPool(const MemPoolConf &conf) { conf_ = conf; }
+
+void CnMemPool::Init() {
   mtx.lock();
-  const size_t kNBytesPerMB = (1u << 20);
   if (!initialized) {
-    CHECK_GE(numDevices, 1);
-    cnmemDevice_t* settingPtr = new cnmemDevice_t[numDevices];
-    for (int i = 0; i < numDevices; i++) {
-      settingPtr[i].device = i;
-      settingPtr[i].size = initSize * kNBytesPerMB;
+    const size_t kNBytesPerMB = (1u << 20);
+    CHECK_GE(conf_.device_size(), 1);
+    cnmemDevice_t *settingPtr = new cnmemDevice_t[conf_.device_size()];
+    CHECK_GT(conf_.init_size(), 0);
+    int i = 0;
+    for (auto device : conf_.device()) {
+      settingPtr[i].device = device;
+      settingPtr[i].size = conf_.init_size() * kNBytesPerMB;
       settingPtr[i].numStreams = 0;
       settingPtr[i].streams = NULL;
       settingPtr[i].streamSizes = 0;
+      i++;
     }
-    cnmemStatus_t status = cnmemInit(numDevices, settingPtr, flag);
+    auto status = cnmemInit(conf_.device_size(), settingPtr, conf_.flag());
     CHECK_EQ(status, cnmemStatus_t::CNMEM_STATUS_SUCCESS)
         << " " << cnmemGetErrorString(status);
     delete[] settingPtr;
@@ -47,12 +68,6 @@ void CnMemPool::InitPool(int numDevices, size_t initSize, unsigned flag) {
   mtx.unlock();
 }
 
-void CnMemPool::InitPool() {
-  MemPoolConf conf;
-  InitPool(conf.num_devices(), conf.alloc_size(),
-           cnmemManagerFlags_t::CNMEM_FLAGS_DEFAULT);
-}
-
 CnMemPool::~CnMemPool() {
   mtx.lock();
   if (initialized) {
@@ -64,24 +79,28 @@ CnMemPool::~CnMemPool() {
   mtx.unlock();
 }
 
-void CnMemPool::Malloc(void** ptr, const size_t size) {
+void CnMemPool::Malloc(void **ptr, const size_t size) {
+  if (!initialized)
+    Init();
   cnmemStatus_t status = cnmemMalloc(ptr, size, NULL);
   CHECK_EQ(status, cnmemStatus_t::CNMEM_STATUS_SUCCESS)
       << " " << cnmemGetErrorString(status);
 }
 
-void CnMemPool::Free(void* ptr) {
+void CnMemPool::Free(void *ptr) {
+  CHECK(initialized) << "Cannot free the memory as the pool is not initialized";
   cnmemStatus_t status = cnmemFree(ptr, NULL);
   CHECK_EQ(status, cnmemStatus_t::CNMEM_STATUS_SUCCESS)
       << " " << cnmemGetErrorString(status);
 }
 
-void CudaMemPool::Malloc(void** ptr, const size_t size) {
+// ===========================================================================
+void CudaMemPool::Malloc(void **ptr, const size_t size) {
   cudaError_t status = cudaMalloc(ptr, size);
   CHECK_EQ(status, cudaError_t::cudaSuccess);
 }
 
-void CudaMemPool::Free(void* ptr) {
+void CudaMemPool::Free(void *ptr) {
   cudaError_t status = cudaFree(ptr);
   CHECK_EQ(status, cudaError_t::cudaSuccess);
 }

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dc013f34/src/proto/core.proto
----------------------------------------------------------------------
diff --git a/src/proto/core.proto b/src/proto/core.proto
index da32bc9..c88bee9 100644
--- a/src/proto/core.proto
+++ b/src/proto/core.proto
@@ -49,14 +49,19 @@ enum CopyDirection {
 // configuration for device memory pool
 message MemPoolConf {
 	optional string type = 1 [default = "cnmem"];
-	optional uint32 num_devices = 2 [default = 1];
 	// allocation size for each device, default is 256 MB
-	optional uint32 alloc_size = 3 [default = 256];
+	optional uint32 init_size = 2 [default = 256];
+  // size limit in MB; report error/warning if this limit is reached.
+  // 0 for unlimited memory, i.e., use as much of the device's currently
+  // free memory as needed.
+	optional uint32 max_size = 3 [default = 0];
+
 	// memory manager flag for cnmem
-	// cnmemflag = 0: default flag
-	// cnmemflag = 1: prevent the manager from growing its memory consumption
-	// cnmemflag = 2: prevent the manager from stealing memory
-	optional uint32 cnmemflag = 4 [default = 0];
+	// flag = 0: default flag
+	// flag = 1: prevent the manager from growing its memory consumption
+	// flag = 2: prevent the manager from stealing memory
+	optional uint32 flag = 11 [default = 0];
+  repeated uint32 device = 12;
 }
 
 // For tensor serialization

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dc013f34/src/python/swig/core_device.i
----------------------------------------------------------------------
diff --git a/src/python/swig/core_device.i b/src/python/swig/core_device.i
index 7430620..e410982 100644
--- a/src/python/swig/core_device.i
+++ b/src/python/swig/core_device.i
@@ -46,8 +46,7 @@ namespace singa{
 
   class CppCPU : public Device {
    public:
-    CppCPU(int id = -1, int num_executors = 1,
-           std::string scheduler = "sync", std::string vm = "gc-only");
+    CppCPU();
     void SetRandSeed(unsigned seed) override;
     /* (TODO) add necessary functions of CppCPU class
     */
@@ -55,8 +54,7 @@ namespace singa{
 
   class CudaGPU : public Device {
    public:
-    CudaGPU(int id = 0, int num_executors = 1,
-            std::string scheduler = "sync", std::string vm = "gc-only");
+    CudaGPU();
     void SetRandSeed(unsigned seed) override;
     /* (TODO) add necessary functions of CudaGPU class
     */

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dc013f34/test/singa/test_cpp_cpu.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_cpp_cpu.cc b/test/singa/test_cpp_cpu.cc
index ec5c7e1..0ef6aac 100644
--- a/test/singa/test_cpp_cpu.cc
+++ b/test/singa/test_cpp_cpu.cc
@@ -26,12 +26,12 @@
 using singa::CppCPU;
 using singa::Block;
 TEST(CppCPU, Constructor) {
-  CppCPU dev(0, 1);
+  CppCPU dev;
   EXPECT_EQ(0, dev.id());
 }
 
 TEST(CppCPU, MemoryMallocFree) {
-  CppCPU dev(0, 1);
+  CppCPU dev;
   Block* b = dev.NewBlock(4);
   EXPECT_NE(nullptr, b);
   EXPECT_EQ(4u, b->size());
@@ -39,7 +39,7 @@ TEST(CppCPU, MemoryMallocFree) {
 }
 
 TEST(CppCPU, Exec) {
-  CppCPU dev(0, 1);
+  CppCPU dev;
   Block* b = dev.NewBlock(4);
   int x = 1, y =3, z = 0;
   dev.Exec([x, y, &z](singa::Context *ctx) {
@@ -49,7 +49,7 @@ TEST(CppCPU, Exec) {
 }
 
 TEST(CppCPU, CopyData) {
-  CppCPU dev(0, 1);
+  CppCPU dev;
   Block* b = dev.NewBlock(4);
   char s[] = {'a', 'b', 'c', 'x'};
   dev.CopyDataFromHostPtr(b, s, 4);

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dc013f34/test/singa/test_cudnn_activation.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_cudnn_activation.cc b/test/singa/test_cudnn_activation.cc
index 1a619e7..a9ac1a3 100644
--- a/test/singa/test_cudnn_activation.cc
+++ b/test/singa/test_cudnn_activation.cc
@@ -46,7 +46,7 @@ TEST(TCudnnActivation, Setup) {
 TEST(TCudnnActivation, Forward) {
   const float x[] = {1.0f, 2.0f, 3.0f, -2.0f, -3.0f, -4.0};
   size_t n = sizeof(x) / sizeof(float);
-  auto cuda = std::make_shared<singa::CudaGPU>(0, 1);
+  auto cuda = std::make_shared<singa::CudaGPU>();
   singa::Tensor in(singa::Shape{n}, cuda);
   in.CopyDataFromHostPtr<float>(x, n);
 
@@ -85,7 +85,7 @@ TEST(TCudnnActivation, Forward) {
 TEST(TCudnnActivation, Backward) {
   const float x[] = {2.0f, 3.0f, 3.0f, 7.f, 0.0f, 5.0, 1.5, 2.5, -2.5, 1.5};
   size_t n = sizeof(x) / sizeof(float);
-  auto cuda = std::make_shared<singa::CudaGPU>(0, 1);
+  auto cuda = std::make_shared<singa::CudaGPU>();
   singa::Tensor in(singa::Shape{n}, cuda);
   in.CopyDataFromHostPtr<float>(x, n);
   float neg_slope = 0.5f;

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dc013f34/test/singa/test_cudnn_batchnorm.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_cudnn_batchnorm.cc b/test/singa/test_cudnn_batchnorm.cc
index 7067b16..4f6a38b 100644
--- a/test/singa/test_cudnn_batchnorm.cc
+++ b/test/singa/test_cudnn_batchnorm.cc
@@ -53,7 +53,7 @@ TEST(CudnnBatchNorm, Forward) {
     0.150676, 0.153442, -0.0929899, -0.148675,
     -0.112459, -0.106284, -0.103074, -0.0668811
   };
-  auto cuda = std::make_shared<singa::CudaGPU>(0, 1);
+  auto cuda = std::make_shared<singa::CudaGPU>();
   singa::Tensor in(singa::Shape{1,2,4,4}, cuda);
   in.CopyDataFromHostPtr(x, 1*2*4*4);
   const float alpha_[] = {1, 1};
@@ -129,7 +129,7 @@ TEST(CudnnBatchNorm, Backward) {
     0.150676, 0.153442, -0.0929899, -0.148675,
     -0.112459, -0.106284, -0.103074, -0.0668811
   };
-  auto cuda = std::make_shared<singa::CudaGPU>(0, 1);
+  auto cuda = std::make_shared<singa::CudaGPU>();
   singa::Tensor x_tensor(singa::Shape{1,2,4,4}, cuda);
   x_tensor.CopyDataFromHostPtr(x, 1*2*4*4);
 

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dc013f34/test/singa/test_cudnn_convolution.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_cudnn_convolution.cc b/test/singa/test_cudnn_convolution.cc
index 3b84645..a13016b 100644
--- a/test/singa/test_cudnn_convolution.cc
+++ b/test/singa/test_cudnn_convolution.cc
@@ -63,7 +63,7 @@ TEST(CudnnConvolution, Forward) {
   const size_t batchsize = 1, c = 1, h = 3, w = 3;
   const float x[batchsize * c * h * w] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f,
                                           6.0f, 7.0f, 8.0f, 9.0f};
-  auto cuda = std::make_shared<singa::CudaGPU>(0, 1);
+  auto cuda = std::make_shared<singa::CudaGPU>();
   singa::Tensor in(singa::Shape{batchsize, c, h, w}, cuda);
   in.CopyDataFromHostPtr(x, batchsize * c * h * w);
 
@@ -113,7 +113,7 @@ TEST(CudnnConvolution, Backward) {
   const size_t batchsize = 1, c = 1, src_h = 3, src_w = 3;
   const float x[batchsize * c * src_h * src_w] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f,
                                                   6.0f, 7.0f, 8.0f, 9.0f};
-  auto cuda = std::make_shared<singa::CudaGPU>(0, 1);
+  auto cuda = std::make_shared<singa::CudaGPU>();
   singa::Tensor in(singa::Shape{batchsize, c, src_h, src_w}, cuda);
   in.CopyDataFromHostPtr(x, batchsize * c * src_h * src_w);
 
@@ -234,7 +234,7 @@ TEST(CudnnConvolution_AT, Forward) {
   const float x[batchsize * c * h * w] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f,
                                           6.0f, 7.0f, 8.0f, 9.0f};
 
-  auto cuda = std::make_shared<singa::CudaGPU>(0, 1);
+  auto cuda = std::make_shared<singa::CudaGPU>();
   singa::Tensor in(singa::Shape{batchsize, c, h, w}, cuda);
   in.CopyDataFromHostPtr(x, batchsize * c * h * w);
 
@@ -285,7 +285,7 @@ TEST(CudnnConvolution_AT, Backward) {
   const float x[batchsize * c * src_h * src_w] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f,
                                                   6.0f, 7.0f, 8.0f, 9.0f};
 
-  auto cuda = std::make_shared<singa::CudaGPU>(0, 1);
+  auto cuda = std::make_shared<singa::CudaGPU>();
   singa::Tensor in(singa::Shape{batchsize, c, src_h, src_w}, cuda);
   in.CopyDataFromHostPtr(x, batchsize * c * src_h * src_w);
 

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dc013f34/test/singa/test_cudnn_dropout.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_cudnn_dropout.cc b/test/singa/test_cudnn_dropout.cc
index d06a254..7f28aca 100644
--- a/test/singa/test_cudnn_dropout.cc
+++ b/test/singa/test_cudnn_dropout.cc
@@ -49,7 +49,7 @@ TEST(CudnnDropout, Setup) {
 TEST(CudnnDropout, Forward) {
   const float x[] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f};
   size_t n = sizeof(x) / sizeof(float);
-  auto cuda = std::make_shared<singa::CudaGPU>(0, 1);
+  auto cuda = std::make_shared<singa::CudaGPU>();
   singa::Tensor in(singa::Shape{n}, cuda);
   in.CopyDataFromHostPtr(x, n);
 
@@ -90,7 +90,7 @@ TEST(CudnnDropout, Forward) {
 TEST(CudnnDropout, Backward) {
   const float x[] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f};
   size_t n = sizeof(x) / sizeof(float);
-  auto cuda = std::make_shared<singa::CudaGPU>(0, 1);
+  auto cuda = std::make_shared<singa::CudaGPU>();
   singa::Tensor in(singa::Shape{n}, cuda);
   in.CopyDataFromHostPtr(x, n);
 

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dc013f34/test/singa/test_cudnn_lrn.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_cudnn_lrn.cc b/test/singa/test_cudnn_lrn.cc
index 4ee0c54..23fbe2e 100644
--- a/test/singa/test_cudnn_lrn.cc
+++ b/test/singa/test_cudnn_lrn.cc
@@ -58,7 +58,7 @@ TEST(CudnnLRN, Forward) {
     0.0597329, -0.0530868, 0.0124246, 0.108429,
     0.0451175, 0.0247055, 0.0304345, 0.0179575
   };
-  auto cuda = std::make_shared<singa::CudaGPU>(0, 1);
+  auto cuda = std::make_shared<singa::CudaGPU>();
   singa::Tensor in(singa::Shape{1,2,4,4}, cuda);
   in.CopyDataFromHostPtr(x, 1*2*4*4);
 
@@ -127,7 +127,7 @@ TEST(CudnnLRN, Backward) {
     0.0597329, -0.0530868, 0.0124246, 0.108429,
     0.0451175, 0.0247055, 0.0304345, 0.0179575
   };
-  auto cuda = std::make_shared<singa::CudaGPU>(0, 1);
+  auto cuda = std::make_shared<singa::CudaGPU>();
   singa::Tensor x_tensor(singa::Shape{1,2,4,4}, cuda);
   x_tensor.CopyDataFromHostPtr(x, 1*2*4*4);
 

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dc013f34/test/singa/test_cudnn_pooling.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_cudnn_pooling.cc b/test/singa/test_cudnn_pooling.cc
index 79051a3..5c01889 100644
--- a/test/singa/test_cudnn_pooling.cc
+++ b/test/singa/test_cudnn_pooling.cc
@@ -56,7 +56,7 @@ TEST(CudnnPooling, Forward) {
   const size_t batchsize = 1, c = 1, h = 3, w = 3;
   const float x[batchsize * c * h * w] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f,
                                           6.0f, 7.0f, 8.0f, 9.0f};
-  auto cuda = std::make_shared<singa::CudaGPU>(0, 1);
+  auto cuda = std::make_shared<singa::CudaGPU>();
   singa::Tensor in(singa::Shape{batchsize, c, h, w}, cuda);
   in.CopyDataFromHostPtr(x, batchsize * c * h * w);
 
@@ -89,7 +89,7 @@ TEST(CudnnPooling, Backward) {
   const size_t batchsize = 1, c = 1, src_h = 3, src_w = 3;
   const float x[batchsize * src_h * src_w] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f,
                                               6.0f, 7.0f, 8.0f, 9.0f};
-  auto cuda = std::make_shared<singa::CudaGPU>(0, 1);
+  auto cuda = std::make_shared<singa::CudaGPU>();
   singa::Tensor in(singa::Shape{batchsize, c, src_h, src_w}, cuda);
   in.CopyDataFromHostPtr(x, batchsize * c * src_h * src_w);
 

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dc013f34/test/singa/test_cudnn_softmax.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_cudnn_softmax.cc b/test/singa/test_cudnn_softmax.cc
index d715b33..a7cbe9d 100644
--- a/test/singa/test_cudnn_softmax.cc
+++ b/test/singa/test_cudnn_softmax.cc
@@ -43,7 +43,7 @@ TEST(CudnnSoftmax, Setup) {
 TEST(CudnnSoftmax, Forward1D) {
   const float x[] = {1.f, 2.f, 0.f, -2.f, -3.f, -1.f};
   size_t n = sizeof(x) / sizeof(float);
-  auto cuda = std::make_shared<singa::CudaGPU>(0, 1);
+  auto cuda = std::make_shared<singa::CudaGPU>();
   singa::Shape shape = {n};
   singa::Tensor in(shape, cuda);
   in.CopyDataFromHostPtr<float>(x, n);
@@ -69,7 +69,7 @@ TEST(CudnnSoftmax, Backward1D) {
   const float x[] = {1.f, 2.f, 3.f, -2.f, -3.f, -1.f};
   size_t n = sizeof(x) / sizeof(float);
   singa::Shape shape = {n};
-  auto cuda = std::make_shared<singa::CudaGPU>(0, 1);
+  auto cuda = std::make_shared<singa::CudaGPU>();
   singa::Tensor in(shape, cuda);
   in.CopyDataFromHostPtr<float>(x, n);
 
@@ -103,7 +103,7 @@ TEST(CudnnSoftmax, Forward2D) {
   size_t n = sizeof(x) / sizeof(float);
   size_t batch = 2, c = 3;
   singa::Shape shape = {batch, c};
-  auto cuda = std::make_shared<singa::CudaGPU>(0, 1);
+  auto cuda = std::make_shared<singa::CudaGPU>();
   singa::Tensor in(shape, cuda);
   in.CopyDataFromHostPtr<float>(x, n);
 
@@ -130,7 +130,7 @@ TEST(CudnnSoftmax, Backward2D) {
   const float x[] = {1.f, 2.f, 3.f, -2.f, -3.f, -1.f};
   size_t n = sizeof(x) / sizeof(float);
   size_t batch = 2, c = 3;
-  auto cuda = std::make_shared<singa::CudaGPU>(0, 1);
+  auto cuda = std::make_shared<singa::CudaGPU>();
   singa::Shape shape = {batch, c};
   singa::Tensor in(shape, cuda);
   in.CopyDataFromHostPtr<float>(x, n);

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dc013f34/test/singa/test_dense.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_dense.cc b/test/singa/test_dense.cc
index e80384f..f4ecdfc 100644
--- a/test/singa/test_dense.cc
+++ b/test/singa/test_dense.cc
@@ -146,7 +146,7 @@ TEST(Dense, ForwardCuda) {
 
   const size_t batchsize = 3, vdim = 2, hdim = 3;
   const float x[batchsize * vdim] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f};
-  auto cuda = std::make_shared<singa::CudaGPU>(0, 1);
+  auto cuda = std::make_shared<singa::CudaGPU>();
   singa::Tensor in(singa::Shape{batchsize, vdim}, cuda);
   in.CopyDataFromHostPtr(x, batchsize * vdim);
 
@@ -183,7 +183,7 @@ TEST(Dense, BackwardCuda) {
 
   const size_t batchsize = 3, vdim = 2, hdim = 3;
   const float x[batchsize * vdim] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f};
-  auto cuda = std::make_shared<singa::CudaGPU>(0, 1);
+  auto cuda = std::make_shared<singa::CudaGPU>();
   singa::Tensor in(singa::Shape{batchsize, vdim}, cuda);
   in.CopyDataFromHostPtr(x, batchsize * vdim);
 

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dc013f34/test/singa/test_memory.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_memory.cc b/test/singa/test_memory.cc
index b0df226..33a3747 100644
--- a/test/singa/test_memory.cc
+++ b/test/singa/test_memory.cc
@@ -24,20 +24,15 @@
 #include "singa/core/memory.h"
 #include "singa/singa_config.h"
 #include "singa/utils/timer.h"
-#include <sys/time.h>
+#include "singa/utils/cuda_utils.h"
 
 #ifdef USE_CUDA
-TEST(CnmemPool, PoolInit) {
-  singa::CnMemPool pool;
-  pool.InitPool();
-}
-
+/*
 TEST(CnmemPool, PoolInitAll) {
-  singa::CnMemPool pool;
+  singa::CnMemPool pool(1);
   int nDevices;
   cudaGetDeviceCount(&nDevices);
   CHECK_GE(nDevices, 1);
-  pool.InitPool(nDevices, 32, 0);
 }
 
 TEST(CnmemPool, UsePool) {
@@ -55,7 +50,6 @@ TEST(CnmemPool, UsePool) {
     delete[] memPtrs;
   }
 }
-
 TEST(CudaMemPool, UsePool) {
   singa::CudaMemPool pool;
   int numOfTests = 10;
@@ -70,11 +64,11 @@ TEST(CudaMemPool, UsePool) {
     delete[] memPtrs;
   }
 }
+*/
 
 TEST(MemPool, CompareCudaCnmem) {
   singa::CudaMemPool cudaPool;
   singa::CnMemPool cnPool;
-  cnPool.InitPool();
 
   int numOfTests = 5000;
   int allocSize = 32;
@@ -82,6 +76,7 @@ TEST(MemPool, CompareCudaCnmem) {
   singa::DeviceMemPool* pool = NULL;
   pool = &cnPool;
 
+  CUDA_CHECK(cudaSetDevice(0));
   singa::Timer tick;
   for (int i = 0; i < numOfTests; i++) {
     int* memPtrs = NULL;

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dc013f34/test/singa/test_platform.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_platform.cc b/test/singa/test_platform.cc
new file mode 100644
index 0000000..a7c2b10
--- /dev/null
+++ b/test/singa/test_platform.cc
@@ -0,0 +1,80 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+#include "gtest/gtest.h"
+#include "singa/core/device.h"
+
+#ifdef USE_CUDA
+using singa::Platform;
+TEST(Platform, NumGPUs) {
+  int n = Platform::GetNumGPUs();
+  EXPECT_GE(n, 0);
+  EXPECT_LE(n, 32);
+}
+
+TEST(Platform, QueryMem) {
+  int n = Platform::GetNumGPUs();
+  auto ids = Platform::GetGPUIDs();
+  EXPECT_EQ(ids.size(), n);
+  auto mem = Platform::GetGPUMemSize();
+  for (auto x : mem)
+    EXPECT_GT(x.second, x.first);
+}
+
+TEST(Platform, CreateDevice) {
+  auto dev = Platform::CreateCudaGPUs(1).at(0);
+  int size[] = { 128, 256, 3, 24 };
+  {
+    auto ptr = dev->NewBlock(size[0]);
+    auto allocated = dev->GetAllocatedMem();
+    EXPECT_LE(size[0], allocated);
+    dev->FreeBlock(ptr);
+    allocated = dev->GetAllocatedMem();
+  }
+  {
+    auto ptr0 = dev->NewBlock(size[0]);
+    auto ptr1 = dev->NewBlock(size[1]);
+    auto ptr2 = dev->NewBlock(size[2]);
+    auto allocated = dev->GetAllocatedMem();
+    EXPECT_LE(size[0] + size[1] + size[2], allocated);
+    auto ptr3 = dev->NewBlock(size[3]);
+    allocated = dev->GetAllocatedMem();
+    EXPECT_LE(size[0] + size[1] + size[2] + size[3], allocated);
+    dev->FreeBlock(ptr0);
+    dev->FreeBlock(ptr1);
+    dev->FreeBlock(ptr2);
+//    allocated = dev->GetAllocatedMem();
+//    EXPECT_EQ(size[3], allocated);
+    dev->FreeBlock(ptr3);
+//    allocated = dev->GetAllocatedMem();
+//    EXPECT_EQ(0, allocated);
+  }
+}
+
+TEST(Platform, CreateMultDevice) {
+  int n = Platform::GetNumGPUs();
+  auto devs = Platform::CreateCudaGPUs(n);
+  for (auto dev : devs) {
+    auto b = dev->NewBlock(32);
+    EXPECT_LE(32, dev->GetAllocatedMem());
+    dev->FreeBlock(b);
+  }
+}
+#endif
+

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dc013f34/test/singa/test_tensor.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_tensor.cc b/test/singa/test_tensor.cc
index cc7f60f..f6f2ca3 100644
--- a/test/singa/test_tensor.cc
+++ b/test/singa/test_tensor.cc
@@ -60,7 +60,7 @@ TEST(TensorClass, AsType) {
 TEST(TensorClass, ToDevice) {
   Tensor t(Shape{2,3});
   EXPECT_EQ(singa::defaultDevice, t.device());
-  auto dev = std::make_shared<singa::CppCPU>(0, 1);
+  auto dev = std::make_shared<singa::CppCPU>();
   t.ToDevice(dev);
   EXPECT_NE(singa::defaultDevice, t.device());
 }

