singa-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From wang...@apache.org
Subject [6/6] incubator-singa git commit: SINGA-172 OpenCL device support and implementation
Date Sat, 30 Jul 2016 05:11:27 GMT
SINGA-172 OpenCL device support and implementation

Move opencl_device.h into device.h.
Remove the option for building the OpenCL tests.

One test of OpenCL failed, TensorMult.


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/464dcda6
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/464dcda6
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/464dcda6

Branch: refs/heads/dev
Commit: 464dcda634f6198fe74d8932ac999e538e5f4065
Parents: 3f6b5e3
Author: Wei Wang <wangwei@comp.nus.edu.sg>
Authored: Sat Jul 30 13:09:05 2016 +0800
Committer: Wei Wang <wangwei@comp.nus.edu.sg>
Committed: Sat Jul 30 13:09:05 2016 +0800

----------------------------------------------------------------------
 CMakeLists.txt                     |   2 +-
 include/singa/core/common.h        |   6 +-
 include/singa/core/device.h        | 101 +++++++++++++++-
 include/singa/core/opencl_device.h | 132 ---------------------
 src/core/device/opencl_device.cc   |  20 ++--
 test/CMakeLists.txt                |   4 +-
 test/singa/test_opencl.cc          | 198 ++++++++++++++++----------------
 7 files changed, 215 insertions(+), 248 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/464dcda6/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8f862f0..23f8ef6 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -25,7 +25,7 @@ OPTION(USE_OPENCV "Use opencv" OFF)
 OPTION(USE_LMDB "Use LMDB libs" OFF)
 OPTION(USE_PYTHON "Generate py wrappers" ON)
 OPTION(USE_OPENCL "Use OpenCL" OFF)
-OPTION(BUILD_OPENCL_TESTS "Build OpenCL tests" OFF)
+#OPTION(BUILD_OPENCL_TESTS "Build OpenCL tests" OFF)
 
 INCLUDE("cmake/Dependencies.cmake")
 INCLUDE("cmake/Utils.cmake")

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/464dcda6/include/singa/core/common.h
----------------------------------------------------------------------
diff --git a/include/singa/core/common.h b/include/singa/core/common.h
index 9586286..caa7c67 100644
--- a/include/singa/core/common.h
+++ b/include/singa/core/common.h
@@ -39,8 +39,8 @@
 #define CL_HPP_MINIMUM_OPENCL_VERSION 120
 #define CL_HPP_TARGET_OPENCL_VERSION 120
 #include <CL/cl2.hpp>
-#include <map>
-#endif
+#include <unordered_map>
+#endif  // USE_OPENCL
 
 using std::atomic;
 
@@ -99,7 +99,7 @@ typedef struct _Context {
 #endif // USE_CUDA
 
 #ifdef USE_OPENCL
-  std::shared_ptr<std::map<std::string, cl::Kernel>> kernels;
+  std::shared_ptr<std::unordered_map<std::string, cl::Kernel>> kernels;
   cl::CommandQueue ocl_cmdq;
   cl::Context ocl_ctx;
 #endif

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/464dcda6/include/singa/core/device.h
----------------------------------------------------------------------
diff --git a/include/singa/core/device.h b/include/singa/core/device.h
index 36c9dc2..cd9a811 100644
--- a/include/singa/core/device.h
+++ b/include/singa/core/device.h
@@ -36,6 +36,12 @@
 #endif // USE_CUDA
 
 #ifdef USE_OPENCL
+// http://github.khronos.org/OpenCL-CLHPP/
+// cl2.hpp includes cl.h, do not re-include.
+#define CL_HPP_MINIMUM_OPENCL_VERSION 120
+#define CL_HPP_TARGET_OPENCL_VERSION 120
+#include <unordered_map>
+#include <CL/cl2.hpp>
 #include "singa/utils/opencl_utils.h"
 #endif // USE_OPENCL
 
@@ -198,12 +204,103 @@ class CudaGPU : public Device {
 
 #endif  // USE_CUDA
 
+#ifdef USE_OPENCL
+// Implement Device using OpenCL libs.
+class OpenclDevice : public singa::Device {
+public:
+
+  // TODO: Constructor arguments to consider:
+  // Path to kernel sources?
+  // Select only certain device types?
+  OpenclDevice(int id = 0, int num_executors = 1);
+  ~OpenclDevice();
+
+  /// Get the specified kernel.
+  cl::Kernel GetKernel(const std::string& kname, cl_int* status = nullptr);
+
+  /// Get the command queue associated with this device.
+  cl::CommandQueue GetCmdQ() { return cmdq; }
+
+  /// Prints information about all Devices in each Platform.
+  void PrintAllDeviceInfo();
+
+  /// Prints status about CL source code builds.
+  void PrintClBuildInfo(cl::Program &p);
+
+// Overridden, inherited methods
+  void SetRandSeed(unsigned seed) override;
+
+  void CopyDataToFrom(Block* dst, Block* src, size_t nBytes,
+                      CopyDirection direction, int dst_offset = 0,
+                      int src_offset = 0);
+/*
+  void CopyDataFromHostPtr(Block* dst, const void* src, size_t nBytes = 0,
+                           size_t dst_offset = 0) override;*/
+
+protected:
+  /// The OpenCL device that this object represents.
+  /// Each OpenclDevice contains exactly one cl::Device for the lifetime of the
+  /// object.
+  cl::Device this_device;
+
+  /// Each OpenclDevice has one OpenCL context. It is created along with the
+  /// creation of this object.
+  cl::Context ocl_ctx;
+
+  /// The CommandQueue that is associated with this device.
+  /// Since each OpenclDevice contains only one cl::Device and one cl::Context,
+  /// it naturally also contains one cl::CommandQueue that is associated
+  /// with said Device and Context.
+  cl::CommandQueue cmdq;
+
+  /// A list of kernels that have been compiled on this device.
+  std::shared_ptr<std::unordered_map<std::string, cl::Kernel>> kernels;
 
+  /// Searches the given paths for all .cl files and builds
+  /// OpenCL programs, then stores them in the Kernels map.
+  void BuildPrograms(const std::string &kdir = cl_src_path);
+
+// Overridden, inherited methods.
+
+  void DoExec(function<void(Context*)>&& fn, int executor) override;
+
+  void CopyToFrom(void* dst, const void* src, size_t nBytes,
+                  CopyDirection direction, Context* ctx = nullptr) override;
+
+  /// Allocates memory on this OpenCL device
+  /// by creating and returning an empty cl::Buffer object.
+  /// with the indicated size.
+  void* Malloc(int size) override;
+
+  /// Converts the void pointer into a Buffer object, then deletes the object.
+  /// This has the effect of freeing up device memory.
+  void Free(void* ptr) override;
+
+private:
+
+  /// Copies a data block from host to device.
+  /// src: a pointer to an array of data.
+  /// dst: a pointer to a cl::Buffer object.
+  void WriteToDevice(cl::Buffer* dst, const void* src, const size_t size);
+
+  /// Reads a data block from device to host.
+  /// src: a pointer to a cl::Buffer object.
+  /// dst: a pointer to a malloc'ed empty array.
+  void ReadFromDevice(void* dst, const cl::Buffer* src, const size_t size);
+
+  /// Duplicates a block of data on the device.
+  /// src: a pointer to the original cl::Buffer object.
+  /// dst: a pointer to the new cl::Buffer object to copy the data into.
+  void CopyDeviceBuffer(cl::Buffer* dst, const cl::Buffer* src, const size_t size);
+
+  static const std::string cl_src_path;
+};
+#endif  // USE_OPENCL
 
 /// This class queries all available calculating devices on a given machine
 /// grouped according to manufacturer or device drivers. All methods should be static.
 /// If CUDA or OPENCL are not enabled, then the respective related methods should
-/// return something that indicates their absence (for example, 0 devices); 
+/// return something that indicates their absence (for example, 0 devices);
 /// however they should always be available regardless of compile-time switches.
 class Platform {
 public:
@@ -261,7 +358,7 @@ public:
 private:
 #ifdef USE_OPENCL
   cl::Platform clPlatform;
-#endif
+#endif  // USE_OPENCL
 };
 
 }  // namespace singa

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/464dcda6/include/singa/core/opencl_device.h
----------------------------------------------------------------------
diff --git a/include/singa/core/opencl_device.h b/include/singa/core/opencl_device.h
deleted file mode 100644
index 14b6fe7..0000000
--- a/include/singa/core/opencl_device.h
+++ /dev/null
@@ -1,132 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef SINGA_CORE_OPENCL_DEVICE_H_
-#define SINGA_CORE_OPENCL_DEVICE_H_
-
-#include "singa/core/device.h"
-
-#ifdef USE_OPENCL
-// http://github.khronos.org/OpenCL-CLHPP/
-// cl2.hpp includes cl.h, do not re-include.
-#define CL_HPP_MINIMUM_OPENCL_VERSION 120
-#define CL_HPP_TARGET_OPENCL_VERSION 120
-#include <map>
-#include <memory>
-#include <CL/cl2.hpp>
-
-#include "singa/utils/opencl_utils.h"
-
-namespace singa {
-
-// Implement Device using OpenCL libs.
-class OpenclDevice : public singa::Device {
-public:
-
-  // TODO: Constructor arguments to consider:
-  // Path to kernel sources?
-  // Select only certain device types?
-  OpenclDevice(int id = 0, int num_executors = 1);
-  ~OpenclDevice();
-
-  /// Get the specified kernel.
-  cl::Kernel GetKernel(const std::string& kname, cl_int* status = nullptr);
-
-  /// Get the command queue associated with this device.
-  cl::CommandQueue GetCmdQ() { return cmdq; }
-
-  /// Prints information about all Devices in each Platform.
-  void PrintAllDeviceInfo();
-
-  /// Prints status about CL source code builds.
-  void PrintClBuildInfo(cl::Program &p);
-
-// Overridden, inherited methods
-  void SetRandSeed(unsigned seed) override;
-
-  void CopyDataToFrom(Block* dst, Block* src, size_t nBytes,
-                      CopyDirection direction, int dst_offset = 0, 
-                      int src_offset = 0);
-/*
-  void CopyDataFromHostPtr(Block* dst, const void* src, size_t nBytes = 0, 
-                           size_t dst_offset = 0) override;*/
-
-protected:
-  /// The OpenCL device that this object represents.
-  /// Each OpenclDevice contains exactly one cl::Device for the lifetime of the
-  /// object.
-  cl::Device this_device;
-
-  /// Each OpenclDevice has one OpenCL context. It is created along with the 
-  /// creation of this object.
-  cl::Context ocl_ctx;
-
-  /// The CommandQueue that is associated with this device.
-  /// Since each OpenclDevice contains only one cl::Device and one cl::Context,
-  /// it naturally also contains one cl::CommandQueue that is associated
-  /// with said Device and Context. 
-  cl::CommandQueue cmdq;
-  
-  /// A list of kernels that has been compiled on this device.
-  std::shared_ptr<std::map<std::string, cl::Kernel>> kernels;
-  
-  /// Searches the given paths for all .cl files and builds
-  /// OpenCL programs, then stores them in the Kernels map.
-  void BuildPrograms(const std::string &kdir = cl_src_path);
-
-// Overridden, inherited methods.
-
-  void DoExec(function<void(Context*)>&& fn, int executor) override;
-
-  void CopyToFrom(void* dst, const void* src, size_t nBytes,
-                  CopyDirection direction, Context* ctx = nullptr) override;
-
-  /// Allocates memory on this OpenCL device
-  /// by creating and returning an empty cl::Buffer object.
-  /// with the indicated size.
-  void* Malloc(int size) override;
-
-  /// Converts the void pointer into a Buffer object, then deletes the object.
-  /// This has the effect of freeing up device memory.
-  void Free(void* ptr) override;
-
-private:
-
-  /// Copies a data block from host to device.
-  /// src: a pointer to an array of data.
-  /// dst: a pointer to a cl::Buffer object.
-  void WriteToDevice(cl::Buffer* dst, const void* src, const size_t size);
-
-  /// Reads a data block from device to host.
-  /// src: a pointer to an cl::Buffer object.
-  /// dst: a pointer to an malloc'ed empty array.
-  void ReadFromDevice(void* dst, const cl::Buffer* src, const size_t size);
-
-  /// Duplicates a block of data on the device.
-  /// src: a pointer to the original cl::Buffer object.
-  /// dst: a pointer to the new cl::Buffer object to copy the data into.
-  void CopyDeviceBuffer(cl::Buffer* dst, const cl::Buffer* src, const size_t size);
-
-  static const std::string cl_src_path;
-};
-
-} // namespace singa
-
-#endif // USE_OPENCL
-
-#endif // SINGA_CORE_OPENCL_DEVICE_H_

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/464dcda6/src/core/device/opencl_device.cc
----------------------------------------------------------------------
diff --git a/src/core/device/opencl_device.cc b/src/core/device/opencl_device.cc
index d4d1fe5..b941cd2 100644
--- a/src/core/device/opencl_device.cc
+++ b/src/core/device/opencl_device.cc
@@ -21,7 +21,7 @@
 #include <sstream>
 #include <string>
 
-#include "singa/core/opencl_device.h"
+#include "singa/core/device.h"
 #include "singa/utils/tinydir.h"
 
 #ifdef USE_OPENCL
@@ -32,11 +32,11 @@ namespace singa {
 
 const string OpenclDevice::cl_src_path = "../src/core/tensor";
 
-OpenclDevice::OpenclDevice(int id, int num_executors) 
+OpenclDevice::OpenclDevice(int id, int num_executors)
 	: Device(id, num_executors) {
   lang_ = kOpencl;
-  this->kernels = std::make_shared<std::map<std::string, cl::Kernel>>();
-  
+  this->kernels = std::make_shared<std::unordered_map<string, cl::Kernel>>();
+
   // Create the OpenCL Device, Context, and CommandQueue.
   /// TODO: This merely chooses the first device on the first platform.
   cl_int status = CL_SUCCESS;
@@ -44,7 +44,7 @@ OpenclDevice::OpenclDevice(int id, int num_executors)
   std::vector<cl::Platform> platforms;
   status = cl::Platform::get(&platforms);
   OCL_CHECK(status, "Failed to find any OpenCL platforms!");
-  
+
   std::vector<cl::Device> devices;
   status = platforms[0].getDevices(CL_DEVICE_TYPE_ALL, &devices);
   OCL_CHECK(status, "Failed to get list of devices from platform!");
@@ -57,7 +57,7 @@ OpenclDevice::OpenclDevice(int id, int num_executors)
   OCL_CHECK(status, "Failed to create a command queue!");
 
   BuildPrograms();
-  
+
   ctx_.kernels = kernels;
   ctx_.ocl_cmdq = cmdq;
   ctx_.ocl_ctx = ocl_ctx;
@@ -65,7 +65,7 @@ OpenclDevice::OpenclDevice(int id, int num_executors)
 
 
 OpenclDevice::~OpenclDevice() {
-  
+
   // Flush and finish the command queue.
   cmdq.flush();
   cmdq.finish();
@@ -150,7 +150,7 @@ void OpenclDevice::BuildPrograms(const std::string &kdir) {
 	  std::vector<cl::Kernel> built_kernels;
 	  status = program.createKernels(&built_kernels);
 	  OCL_CHECK(status, "Failed to create kernels in built program.");
-	  
+
 	  for (auto k : built_kernels) {
 		std::string name = k.getInfo<CL_KERNEL_FUNCTION_NAME>(&status);
 		this->kernels->insert(std::make_pair(name, k));
@@ -221,7 +221,7 @@ void OpenclDevice::Free(void* p) {
 
 void OpenclDevice::WriteToDevice(cl::Buffer* dst, const void* src, const size_t size) {
   cl_int status = CL_SUCCESS;
-  
+
   status = cmdq.enqueueWriteBuffer(*dst, CL_TRUE, 0, size, src);
   OCL_CHECK(status, "Unable to write data to OpenCL device.");
 }
@@ -229,7 +229,7 @@ void OpenclDevice::WriteToDevice(cl::Buffer* dst, const void* src, const
size_t
 
 void OpenclDevice::ReadFromDevice(void* dst, const cl::Buffer* src, const size_t size) {
   cl_int status = CL_SUCCESS;
-  
+
   status = cmdq.enqueueReadBuffer(*src, CL_TRUE, 0, size, dst);
   OCL_CHECK(status, "Unable to read data from OpenCL device.");
 }

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/464dcda6/test/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 632a2cd..044d65a 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -4,12 +4,12 @@ ADD_LIBRARY(gtest STATIC EXCLUDE_FROM_ALL "gtest/gtest.h" "gtest/gtest-all.cc")
 
 AUX_SOURCE_DIRECTORY(singa singa_test_source)
 
-IF(NOT BUILD_OPENCL_TESTS)
+IF(NOT USE_OPENCL)
     MESSAGE(STATUS "Skipping OpenCL tests")
     LIST(REMOVE_ITEM singa_test_source "singa/test_opencl.cc")
 ENDIF()
 
-ADD_EXECUTABLE(test_singa "gtest/gtest_main.cc" ${singa_test_source}) 
+ADD_EXECUTABLE(test_singa "gtest/gtest_main.cc" ${singa_test_source})
 ADD_DEPENDENCIES(test_singa singa_core singa_utils)
 MESSAGE(STATUS "link libs" ${singa_linker_libs})
 TARGET_LINK_LIBRARIES(test_singa gtest singa_core singa_utils singa_model

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/464dcda6/test/singa/test_opencl.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_opencl.cc b/test/singa/test_opencl.cc
index 0a335e5..3ce1889 100644
--- a/test/singa/test_opencl.cc
+++ b/test/singa/test_opencl.cc
@@ -20,16 +20,16 @@
 *************************************************************/
 
 #include "gtest/gtest.h"
-#include "singa/core/opencl_device.h"
+#include "singa/core/device.h"
 #include "singa/core/tensor.h"
 #include "singa/proto/core.pb.h"
-
-using singa::OpenclDevice;
 using singa::CppCPU;
 using singa::Block;
 using singa::Shape;
 using singa::Tensor;
 
+#ifdef USE_OPENCL
+using singa::OpenclDevice;
 class OpenCL_TensorMath : public ::testing::Test {
 protected:
 
@@ -38,36 +38,36 @@ protected:
       float4[i] = (float)i;
       float4zero[i] = 0.0f;
     }
-    
+
     for (int i = 0; i < 16; i++) {
       float16[i] = (float)i;
       float16zero[i] = 0.0f;
     }
-    
+
     auto ocl_dev = std::make_shared<OpenclDevice>();
-    
+
     tf4in = Tensor(Shape{1, 4}, ocl_dev);
     tf4in.CopyDataFromHostPtr(float4, 4);
-    
+
     tf4zin = Tensor(Shape{1, 4}, ocl_dev);
     tf4zin.CopyDataFromHostPtr(float4zero, 4);
 
     tf16in = Tensor(Shape{4, 4}, ocl_dev);
     tf16in.CopyDataFromHostPtr(float16, 16);
-    
+
     tf16zin = Tensor(Shape{4, 4}, ocl_dev);
     tf16zin.CopyDataFromHostPtr(float16zero, 16);
-    
+
     float empty[10000] = {};
     empty10k = Tensor(Shape{10000}, ocl_dev);
     empty10k.CopyDataFromHostPtr(empty, 10000);
   }
-  
+
   float float4[4];
   float float4zero[4];
   float float16[16];
   float float16zero[16];
-  
+
   Tensor tf4in, tf16in;
   Tensor tf4zin, tf16zin;
   Tensor empty10k;
@@ -101,19 +101,19 @@ TEST(OpenclDevice, MemoryAllocFree) {
 TEST(OpenclDevice, CopyDataToFrom) {
   OpenclDevice dev;
   CppCPU host;
-  
+
   Block* a = host.NewBlock(4);
   Block* b = dev.NewBlock(4);
   Block* c = host.NewBlock(4);
-  
+
   // Allocate the Block object on the host.
   char s[] = {'a', 'b', 'c', 'x'};
   host.CopyDataFromHostPtr(a, s, 4);
-  
+
   // Copy back and forth.
   dev.CopyDataToFrom(b, a, 4, singa::kHostToDevice);
   dev.CopyDataToFrom(c, b, 4, singa::kDeviceToHost);
-  
+
   const char* astr = static_cast<const char*>(c->data());
   EXPECT_EQ('a', astr[0]);
   EXPECT_EQ('b', astr[1]);
@@ -124,21 +124,21 @@ TEST(OpenclDevice, CopyDataToFrom) {
 TEST(OpenclDevice, DuplicateDataOnDevice) {
   OpenclDevice dev;
   CppCPU host;
-  
+
   Block* a = host.NewBlock(4);
   Block* b = dev.NewBlock(4);
   Block* c = dev.NewBlock(4);
   Block* d = host.NewBlock(4);
-  
+
   // Allocate the Block object on the host.
   char s[] = {'a', 'b', 'c', 'x'};
   host.CopyDataFromHostPtr(a, s, 4);
-  
+
   // Copy to device and duplicate.
   dev.CopyDataToFrom(b, a, 4, singa::kHostToDevice);
   dev.CopyDataToFrom(c, b, 4, singa::kDeviceToDevice);
   dev.CopyDataToFrom(d, c, 4, singa::kDeviceToHost);
-  
+
   const char* astr = static_cast<const char*>(d->data());
   EXPECT_EQ('a', astr[0]);
   EXPECT_EQ('b', astr[1]);
@@ -150,7 +150,7 @@ TEST(OpenclDevice, DuplicateDataOnDevice) {
 TEST_F(OpenCL_TensorMath, CopyDataToDevice) {
   tf4in.ToHost();
   const float* out = tf4in.data<float>();
-  
+
   EXPECT_EQ(1.0f, out[1]);
   EXPECT_EQ(3.0f, out[3]);
 }
@@ -158,10 +158,10 @@ TEST_F(OpenCL_TensorMath, CopyDataToDevice) {
 
 TEST_F(OpenCL_TensorMath, MemberAbs) {
   tf4in = Abs(tf4in);
-  
+
   tf4in.ToHost();
   const float* out = tf4in.data<float>();
-  
+
   EXPECT_EQ(0.0f, out[0]);
   EXPECT_EQ(1.0f, out[1]);
   EXPECT_EQ(2.0f, out[2]);
@@ -171,10 +171,10 @@ TEST_F(OpenCL_TensorMath, MemberAbs) {
 
 TEST_F(OpenCL_TensorMath, MemberExp) {
   tf4in = Exp(tf4in);
-  
+
   tf4in.ToHost();
   const float* out = tf4in.data<float>();
-  
+
   EXPECT_NEAR(exp(0.0f), out[0], 1e-5);
   EXPECT_NEAR(exp(1.0f), out[1], 1e-5);
   EXPECT_NEAR(exp(2.0f), out[2], 1e-5);
@@ -184,10 +184,10 @@ TEST_F(OpenCL_TensorMath, MemberExp) {
 
 TEST_F(OpenCL_TensorMath, MemberLog) {
   tf4in = Log(tf4in);
-  
+
   tf4in.ToHost();
   const float* out = tf4in.data<float>();
-  
+
 //  EXPECT_NEAR(log(0.0f), out[0], 1e-5); // Evaluates to neg infinity.
   EXPECT_NEAR(log(1.0f), out[1], 1e-5);
   EXPECT_NEAR(log(2.0f), out[2], 1e-5);
@@ -198,10 +198,10 @@ TEST_F(OpenCL_TensorMath, MemberLog) {
 TEST_F(OpenCL_TensorMath, MemberReLU) {
   tf4in -= 2.0f;
   Tensor result = ReLU(tf4in);
-  
+
   result.ToHost();
   const float* out = result.data<float>();
-  
+
   EXPECT_NEAR(0.0f, out[0], 1e-5);
   EXPECT_NEAR(0.0f, out[1], 1e-5);
   EXPECT_NEAR(0.0f, out[2], 1e-5);
@@ -211,10 +211,10 @@ TEST_F(OpenCL_TensorMath, MemberReLU) {
 
 TEST_F(OpenCL_TensorMath, MemberSigmoid) {
   tf4in = Sigmoid(tf4in);
-  
+
   tf4in.ToHost();
   const float* out = tf4in.data<float>();
-  
+
   EXPECT_NEAR(1.0f / (1.0f + exp(-0.0f)), out[0], 1e-5);
   EXPECT_NEAR(1.0f / (1.0f + exp(-1.0f)), out[1], 1e-5);
   EXPECT_NEAR(1.0f / (1.0f + exp(-2.0f)), out[2], 1e-5);
@@ -223,10 +223,10 @@ TEST_F(OpenCL_TensorMath, MemberSigmoid) {
 
 TEST_F(OpenCL_TensorMath, MemberSign) {
   tf4in -= 1.0f;
-  
+
   tf4in.ToHost();
   const float* out = tf4in.data<float>();
-  
+
   EXPECT_NEAR(-1.0f, out[0], 1e-5);
   EXPECT_NEAR(0.0f, out[1], 1e-5);
   EXPECT_NEAR(1.0f, out[2], 1e-5);
@@ -236,10 +236,10 @@ TEST_F(OpenCL_TensorMath, MemberSign) {
 
 TEST_F(OpenCL_TensorMath, MemberSqrt) {
   tf4in = Sqrt(tf4in);
-  
+
   tf4in.ToHost();
   const float* out = tf4in.data<float>();
-  
+
   EXPECT_NEAR(0.0f, out[0], 1e-5);
   EXPECT_NEAR(1.0f, out[1], 1e-5);
   EXPECT_NEAR(sqrt(2.0f), out[2], 1e-5);
@@ -249,10 +249,10 @@ TEST_F(OpenCL_TensorMath, MemberSqrt) {
 
 TEST_F(OpenCL_TensorMath, MemberSquare) {
   tf4in = Square(tf4in);
-  
+
   tf4in.ToHost();
   const float* out = tf4in.data<float>();
-  
+
   EXPECT_NEAR(0.0f, out[0], 1e-5);
   EXPECT_NEAR(1.0f, out[1], 1e-5);
   EXPECT_NEAR(4.0f, out[2], 1e-5);
@@ -262,10 +262,10 @@ TEST_F(OpenCL_TensorMath, MemberSquare) {
 
 TEST_F(OpenCL_TensorMath, MemberTanh) {
   tf4in = Tanh(tf4in);
-  
+
   tf4in.ToHost();
   const float* out = tf4in.data<float>();
-  
+
   EXPECT_NEAR(0.0f, out[0], 1e-5);
   EXPECT_NEAR(tanh(1.0f), out[1], 1e-5);
   EXPECT_NEAR(tanh(2.0f), out[2], 1e-5);
@@ -278,7 +278,7 @@ TEST_F(OpenCL_TensorMath, Sum) {
 
   result.ToHost();
   const float* out = result.data<float>();
-  
+
   EXPECT_NEAR(0.0f, out[0], 1e-5);
   EXPECT_NEAR(1.0f, out[1], 1e-5);
   EXPECT_NEAR(2.0f, out[2], 1e-5);
@@ -287,10 +287,10 @@ TEST_F(OpenCL_TensorMath, Sum) {
 
 TEST_F(OpenCL_TensorMath, MemberLT) {
   Tensor result = tf4in < 2.0f;
-  
+
   result.ToHost();
   const float* out = result.data<float>();
-  
+
   EXPECT_FLOAT_EQ(1.0f, out[0]);
   EXPECT_FLOAT_EQ(1.0f, out[1]);
   EXPECT_FLOAT_EQ(0.0f, out[2]);
@@ -300,10 +300,10 @@ TEST_F(OpenCL_TensorMath, MemberLT) {
 
 TEST_F(OpenCL_TensorMath, MemberLE) {
   Tensor result = tf4in <= 2.0f;
-  
+
   result.ToHost();
   const float* out = result.data<float>();
-  
+
   EXPECT_FLOAT_EQ(1.0f, out[0]);
   EXPECT_FLOAT_EQ(1.0f, out[1]);
   EXPECT_FLOAT_EQ(1.0f, out[2]);
@@ -313,10 +313,10 @@ TEST_F(OpenCL_TensorMath, MemberLE) {
 
 TEST_F(OpenCL_TensorMath, MemberGT) {
   Tensor result = tf4in > 2.0f;
-  
+
   result.ToHost();
   const float* out = result.data<float>();
-  
+
   EXPECT_FLOAT_EQ(0.0f, out[0]);
   EXPECT_FLOAT_EQ(0.0f, out[1]);
   EXPECT_FLOAT_EQ(0.0f, out[2]);
@@ -326,10 +326,10 @@ TEST_F(OpenCL_TensorMath, MemberGT) {
 
 TEST_F(OpenCL_TensorMath, MemberGE) {
   Tensor result = tf4in >= 2.0f;
-  
+
   result.ToHost();
   const float* out = result.data<float>();
-  
+
   EXPECT_FLOAT_EQ(0.0f, out[0]);
   EXPECT_FLOAT_EQ(0.0f, out[1]);
   EXPECT_FLOAT_EQ(1.0f, out[2]);
@@ -342,17 +342,17 @@ TEST_F(OpenCL_TensorMath, MemberPow) {
 
   result.ToHost();
   const float* out = result.data<float>();
-  
+
   EXPECT_FLOAT_EQ(0.0f, out[0]);
   EXPECT_FLOAT_EQ(1.0f, out[1]);
   EXPECT_FLOAT_EQ(4.0f, out[2]);
   EXPECT_FLOAT_EQ(9.0f, out[3]);
-  
+
   result = Pow(tf4in, tf4in);
-  
+
   result.ToHost();
   const float* out1 = result.data<float>();
-  
+
   EXPECT_FLOAT_EQ(1.0f, out1[0]); // 0 ^ 0 is 1, apparently.
   EXPECT_FLOAT_EQ(1.0f, out1[1]);
   EXPECT_FLOAT_EQ(4.0f, out1[2]);
@@ -365,17 +365,17 @@ TEST_F(OpenCL_TensorMath, MemberSub) {
 
   result.ToHost();
   const float* out = result.data<float>();
-  
+
   EXPECT_FLOAT_EQ(0.0f, out[0]);
   EXPECT_FLOAT_EQ(1.0f, out[1]);
   EXPECT_FLOAT_EQ(2.0f, out[2]);
   EXPECT_FLOAT_EQ(3.0f, out[3]);
-  
+
   result = tf4in - 0.0f;
 
   result.ToHost();
   const float* out1 = result.data<float>();
-  
+
   EXPECT_FLOAT_EQ(0.0f, out1[0]);
   EXPECT_FLOAT_EQ(1.0f, out1[1]);
   EXPECT_FLOAT_EQ(2.0f, out1[2]);
@@ -388,17 +388,17 @@ TEST_F(OpenCL_TensorMath, MemberEltwiseMult) {
 
   result.ToHost();
   const float* out = result.data<float>();
-  
+
   EXPECT_FLOAT_EQ(0.0f, out[0]);
   EXPECT_FLOAT_EQ(0.0f, out[1]);
   EXPECT_FLOAT_EQ(0.0f, out[2]);
   EXPECT_FLOAT_EQ(0.0f, out[3]);
-  
+
   result = tf4in * 10.0f;
 
   result.ToHost();
   const float* out1 = result.data<float>();
-  
+
   EXPECT_FLOAT_EQ(0.0f, out1[0]);
   EXPECT_FLOAT_EQ(10.0f, out1[1]);
   EXPECT_FLOAT_EQ(20.0f, out1[2]);
@@ -411,27 +411,27 @@ TEST_F(OpenCL_TensorMath, MemberDiv) {
 
   result.ToHost();
   const float* out = result.data<float>();
-  
+
 //  EXPECT_FLOAT_EQ(0.0f, out[0]); // Divide by zero.
   EXPECT_FLOAT_EQ(1.0f, out[1]);
   EXPECT_FLOAT_EQ(1.0f, out[2]);
   EXPECT_FLOAT_EQ(1.0f, out[3]);
-  
+
   result = tf4in / 10.0f;
 
   result.ToHost();
   const float* out1 = result.data<float>();
-  
+
   EXPECT_FLOAT_EQ(0.0f, out1[0]);
   EXPECT_FLOAT_EQ(0.1f, out1[1]);
   EXPECT_FLOAT_EQ(0.2f, out1[2]);
   EXPECT_FLOAT_EQ(0.3f, out1[3]);
-  
+
   result = Div(10.0f, tf4in);
 
   result.ToHost();
   const float* out2 = result.data<float>();
-  
+
 //  EXPECT_FLOAT_EQ(0.0f, out[0]); // Divide by 0.
   EXPECT_FLOAT_EQ(10.0f, out2[1]);
   EXPECT_FLOAT_EQ(5.0f, out2[2]);
@@ -446,7 +446,7 @@ TEST_F(OpenCL_TensorMath, Bernoulli) {
   const float p = 0.3f;
 
   Bernoulli(p, &empty10k);
-  
+
   empty10k.ToHost();
   const float* out = empty10k.data<float>();
 
@@ -454,53 +454,53 @@ TEST_F(OpenCL_TensorMath, Bernoulli) {
   for (int i = 0; i < 10000; i++) sum += out[i];
 
   float mean = sum / 10000;
-  
+
   EXPECT_NEAR(mean, p, 1e-2);
-  
+
   sum = 0.0f;
   for (int i = 0; i < 10000; i++) sum += (out[i] - mean) * (out[i] - mean);
   float variance = sum / 9999;
-  
+
   EXPECT_NEAR(variance, p * (1 - p), 1e-2);
 }
 
 
 TEST_F(OpenCL_TensorMath, Gaussian) {
   Gaussian(0.0f, 1.0f, &empty10k);
-  
+
   empty10k.ToHost();
   const float* out = empty10k.data<float>();
-  
+
   float sum = 0.0f;
   for (int i = 0; i < 10000; i++) sum += out[i];
   float mean = sum / 10000;
-  
+
   EXPECT_NEAR(mean, 0.0f, 1e-2);
-  
+
   sum = 0.0f;
   for (int i = 0; i < 10000; i++) sum += (out[i] - mean) * (out[i] - mean);
   float variance = sum / 9999;
-  
+
   EXPECT_NEAR(variance, 1.0f, 1e-2);
 }
 
 
 TEST_F(OpenCL_TensorMath, Uniform) {
   Uniform(0.1f, 0.2f, &empty10k);
-  
+
   empty10k.ToHost();
   const float* out = empty10k.data<float>();
-  
+
   float sum = 0.0f;
   for (int i = 0; i < 10000; i++) sum += out[i];
   float mean = sum / 10000;
-  
+
   EXPECT_NEAR(mean, 0.15f, 1e-2);
-  
+
   sum = 0.0f;
   for (int i = 0; i < 10000; i++) sum += (out[i] - mean) * (out[i] - mean);
   float variance = sum / 9999;
-  
+
   EXPECT_NEAR(variance, 0.01f, 1e-2);
 }
 
@@ -514,38 +514,38 @@ TEST_F(OpenCL_TensorMath, EltwiseAdd) {
 
   result.ToHost();
   const float* out = result.data<float>();
-  
+
   EXPECT_EQ(0.0f, out[0]);
   EXPECT_EQ(2.0f, out[1]);
   EXPECT_EQ(4.0f, out[2]);
   EXPECT_EQ(6.0f, out[3]);
-  
+
   result = tf4in + tf4zin;
 
   result.ToHost();
   const float* out1 = result.data<float>();
-  
+
   EXPECT_EQ(0.0f, out1[0]);
   EXPECT_EQ(1.0f, out1[1]);
   EXPECT_EQ(2.0f, out1[2]);
   EXPECT_EQ(3.0f, out1[3]);
-  
+
   result = Tensor(tf4in.shape(), tf4in.device(), tf4in.data_type());
   Add(tf4in, tf4in, &result);
 
   result.ToHost();
   const float* out2 = result.data<float>();
-  
+
   EXPECT_EQ(0.0f, out2[0]);
   EXPECT_EQ(2.0f, out2[1]);
   EXPECT_EQ(4.0f, out2[2]);
   EXPECT_EQ(6.0f, out2[3]);
-  
+
   result = tf4in + 1.0f;
-  
+
   result.ToHost();
   const float* out3 = result.data<float>();
-  
+
   EXPECT_EQ(1.0f, out3[0]);
   EXPECT_EQ(2.0f, out3[1]);
   EXPECT_EQ(3.0f, out3[2]);
@@ -556,10 +556,10 @@ TEST_F(OpenCL_TensorMath, EltwiseAdd) {
 TEST_F(OpenCL_TensorMath, SetValue) {
   const float one_third = 1.0f / 3.0f;
   empty10k.SetValue(one_third);
-  
+
   empty10k.ToHost();
   const float* out = empty10k.data<float>();
-  
+
   EXPECT_EQ(one_third, out[0]);
   EXPECT_EQ(one_third, out[1]);
   EXPECT_EQ(one_third, out[1024]);
@@ -571,10 +571,10 @@ TEST_F(OpenCL_TensorMath, SetValue) {
 
 TEST_F(OpenCL_TensorMath, Axpy) {
   Axpy(10.0f, tf4in, &tf4in);
-  
+
   tf4in.ToHost();
   const float* out = tf4in.data<float>();
-  
+
   EXPECT_EQ(0.0f, out[0]);  // 0 * 10 + 0 = 0
   EXPECT_EQ(11.0f, out[1]); // 1 * 10 + 1 = 11
   EXPECT_EQ(22.0f, out[2]); // 2 * 10 + 2 = 22
@@ -583,39 +583,39 @@ TEST_F(OpenCL_TensorMath, Axpy) {
 
 TEST_F(OpenCL_TensorMath, Mult) {
   Tensor result = Mult(tf4in, tf4zin.T()); // Multiply with zero.
-  
+
   result.ToHost();
   const float* out = result.data<float>();
-  
+
   EXPECT_EQ(0.0f, out[0]); // 1x4 * 4x1 = 1x1.
-  
+
   result = Mult(tf4in, tf4in.T());
-  
+
   result.ToHost();
   const float* out0 = result.data<float>();
-  
+
   EXPECT_EQ(14.0f, out0[0]); // 1x4 * 4x1 = 1x1.
-  
+
   tf16zin.SetValue(10.0f); // Multiply with 10.0.
   result = Mult(tf16in, tf16zin); // 4x4 * 4x4 = 4x4.
-  
+
   result.ToHost();
   const float* out1 = result.data<float>();
   EXPECT_EQ(240.0f, out1[0]);
   EXPECT_EQ(280.0f, out1[1]);
   EXPECT_EQ(320.0f, out1[2]);
   EXPECT_EQ(360.0f, out1[3]);
-  
+
   EXPECT_EQ(240.0f, out1[4]);
   EXPECT_EQ(280.0f, out1[5]);
   EXPECT_EQ(320.0f, out1[6]);
   EXPECT_EQ(360.0f, out1[7]);
-  
+
   EXPECT_EQ(240.0f, out1[8]);
   EXPECT_EQ(280.0f, out1[9]);
   EXPECT_EQ(320.0f, out1[10]);
   EXPECT_EQ(360.0f, out1[11]);
-  
+
   EXPECT_EQ(240.0f, out1[12]);
   EXPECT_EQ(280.0f, out1[13]);
   EXPECT_EQ(320.0f, out1[14]);
@@ -625,3 +625,5 @@ TEST_F(OpenCL_TensorMath, Mult) {
 
 
 // TODO: ComputeCrossEntropy, SoftmaxCrossEntropy
+//
+#endif  // USE_OPENCL


Mime
View raw message