From: kaiping@apache.org
To: commits@singa.incubator.apache.org
Date: Mon, 16 Nov 2015 06:08:51 -0000
Subject: [05/19] incubator-singa git commit: SINGA-80 New Blob Level and Address Level Math Operation Interface

SINGA-80 New Blob Level and Address Level Math Operation Interface

Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/4728f7ce
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/4728f7ce
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/4728f7ce

Branch: refs/heads/master
Commit: 4728f7ce4fb4e1e019690600550c039748de50a7
Parents: c13e037
Author: seaok
Authored: Tue Nov 3 10:45:17 2015 +0800
Committer: Wei Wang
Committed: Mon Nov 9 17:04:48 2015 +0800

----------------------------------------------------------------------
 include/singa/blob/math_addr.h   |  34 +++-
 include/singa/blob/math_blob.h   |  86 ++++----
 include/singa/blob/math_kernel.h |  49 ++++-
 include/singa/blob/singa_op.h    |  92 ++++++++-
 src/blob/math_blob.cc            |   4 +-
 src/blob/math_kernel.cu          | 371 +++++++++++++++++++++++++++++++++-
 src/test/test_math.cc            | 128 +++++++++++-
 7 files changed, 708 insertions(+), 56 deletions(-)
----------------------------------------------------------------------
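For orientation, here is a brief usage sketch of the blob-level interface this commit fills in. It assumes the E_Func template and XPU enum declared in math_blob.h and the Gtanh functor from singa_op.h, all shown in the diff below; the wrapper function and the exact namespace qualifiers are illustrative only.

// Hedged usage sketch: apply an element-wise op through the new
// blob-level interface; E_Func checks shapes and then dispatches to
// cpu_e_f<Op> or gpu_e_f<Op> depending on the XPU argument.
#include "singa/blob/math_blob.h"
#include "singa/blob/singa_op.h"

void tanh_example(const singa::Blob<float>& A, singa::Blob<float>* B) {
  // CPU path: B[i] = tanh(1.0f * A[i]) via Gtanh::Map.
  singa::E_Func<singa_op::Gtanh>(singa::cpu, A, B, 1.0f);
  // GPU path: same computation via Gtanh::CudaMap -> singa_gpu_tanh,
  // assuming A and B wrap device memory (gpu_data()/mutable_gpu_data()).
  singa::E_Func<singa_op::Gtanh>(singa::gpu, A, B, 1.0f);
}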
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/4728f7ce/include/singa/blob/math_addr.h
----------------------------------------------------------------------
diff --git a/include/singa/blob/math_addr.h b/include/singa/blob/math_addr.h
index a6663ab..4895343 100644
--- a/include/singa/blob/math_addr.h
+++ b/include/singa/blob/math_addr.h
@@ -79,18 +79,42 @@ template<typename Op> void cpu_expand_f(const float * A,const int m, const int n
 void gpu_gemm(const float * A, const float * B, const int m, const int n, const int k, const float alpha, const float beta, const bool TranA, const bool TranB, float * C);
 void gpu_gemv(const float * A, const float * B, const int m, const int n, const float alpha, const float beta, const bool TranA, float * C);
 void gpu_axpy(const float * A, const int n, const float alpha, float * B);
+float gpu_dot(const float * A, const float * B, const int n);

 //element-wise
-template<typename Op> void gpu_e_f(const int n, const float alpha, float * A);
-template<typename Op> void gpu_e_f(const int n,const float * A,const float alpha, const float beta,float * B);
-template<typename Op> void gpu_e_f(const int n,const float * A,const float * B,const float alpha, const float beta,float * C);
+template<typename Op> void gpu_e_f(const int n, const float alpha, float * A)
+{
+  Op::CudaMap(alpha, A, n);
+}
+
+template<typename Op> void gpu_e_f(const int n,const float * A,const float alpha, float * B)
+{
+  Op::CudaMap(alpha, A, B, n);
+}
+
+template<typename Op> void gpu_e_f(const int n,const float * A,const float * B,const float alpha, const float beta,float * C)
+{
+  Op::CudaMap(alpha, beta, A, B, C, n);
+}
 // element-wise generalized operation defined in Op

 //matrix/vector expand/reduce
-template<typename Op> void gpu_reduce_f(const float * A,const int m, const int n, float * B);
+template<typename Op> void gpu_reduce_f(const float * A,const int m, const int n, float * B)
+{
+  for(int i = 0 ; i < m ; i++)
+  {
+    Op::CudaMap(A+i*n, n, B[i]);
+  }
+}
 //reduce each row of A to an element of B e.g.
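The gpu_e_f and gpu_reduce_f templates above only require that Op expose matching static CudaMap overloads. A minimal sketch of such a functor follows; the Negate op itself is hypothetical, not part of this commit, and simply reuses the singa_gpu_scale wrapper declared in math_kernel.h.

// Hypothetical functor satisfying the contract assumed by
// gpu_e_f<Op>(n, A, alpha, B), which calls Op::CudaMap(alpha, A, B, n).
struct Negate {
  // CPU form used by cpu_e_f: b = -a (alpha is ignored here).
  inline static void Map(float alpha, const float & a, float & b) {
    b = -a;
  }
  // GPU form used by gpu_e_f: singa_gpu_scale computes des = alpha * src,
  // so scaling by -1 negates the whole array in one kernel launch.
  inline static void CudaMap(float alpha, const float * a, float * b, int n) {
    singa::singa_gpu_scale(a, b, -1.0f, n);
  }
};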
the sum operation in softmax -template void gpu_expand_f(const float * A,const int m, const int n, float * B); +template void gpu_expand_f(const float * A,const int m, const int n, float * B) +{ + for(int i = 0 ; i < m ; i++) + { + Op::CudaMap(A[i], n, B+i*n); + } +} //expand each element in A into a row of B http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/4728f7ce/include/singa/blob/math_blob.h ---------------------------------------------------------------------- diff --git a/include/singa/blob/math_blob.h b/include/singa/blob/math_blob.h index d5991a7..ee0fb60 100644 --- a/include/singa/blob/math_blob.h +++ b/include/singa/blob/math_blob.h @@ -207,46 +207,52 @@ void E_Func(XPU xpu, Blob * A, float alpha) if(xpu == gpu) { //gpu part + int n = get_size(A->shape()); + gpu_e_f(n, alpha, A->mutable_gpu_data()); } } template void E_Func(XPU xpu, const Blob & A, Blob * B, float alpha) { - if(xpu == cpu) + if(check_shape_equal(A, *B, *B)) { - if(check_shape_equal(A, *B, *B)) - { int n = get_size(A.shape()); - cpu_e_f(n, A.cpu_data(), alpha, B->mutable_cpu_data()); - } - else{ - // report errors here - } - } - if(xpu == gpu) - { - //gpu part + if(xpu == cpu) + { + cpu_e_f(n, A.cpu_data(), alpha, B->mutable_cpu_data()); + } + + if(xpu == gpu) + { + //gpu part + gpu_e_f(n, A.gpu_data(), alpha, B->mutable_gpu_data()); + } } + else{ + // report errors here + } } template void E_Func(XPU xpu, const Blob & A, const Blob & B, Blob * C, float alpha, float beta) { - if(xpu == cpu) + if(check_shape_equal(A, B, *C)) { - if(check_shape_equal(A, B, *C)) + int n = get_size(A.shape()); + + if(xpu == cpu) { - int n = get_size(A.shape()); cpu_e_f(n, A.cpu_data(), B.cpu_data(), alpha, beta, C->mutable_cpu_data()); } - else{ - // report errors here + if(xpu == gpu) + { + //gpu part + gpu_e_f(n, A.gpu_data(), B.gpu_data(), alpha, beta, C->mutable_gpu_data()); } } - if(xpu == gpu) - { - //gpu part + else{ + // report errors here } } @@ -394,21 +400,23 @@ void Bernoulli(XPU xpu, Blob & A, float p, int n = 1); template void Reduce_F(XPU xpu, const Blob & A, Blob * B) { - if(xpu == cpu) + if(check_shape_mv(A, *B)) { - if(check_shape_mv(A, *B)) + int m = get_size(B->shape()); + int n = get_size(A.shape()) / m; + + if(xpu == cpu) { - int m = get_size(B->shape()); - int n = get_size(A.shape()) / m; cpu_reduce_f(A.cpu_data(), m, n, B->mutable_cpu_data()); } - else{ - // report errors here + if(xpu == gpu) + { + //gpu part + gpu_reduce_f(A.gpu_data(), m, n, B->mutable_gpu_data()); } } - if(xpu == gpu) - { - //gpu part + else{ + // report errors here } } //reduce each row of A to an element of B e.g. 
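Reduce_F pairs a matrix blob with a vector blob and drives the per-row Op; a short hedged sketch of the intended call follows, using the Sum functor from singa_op.h. The shapes and namespace qualifiers are assumptions: A holds m*n values, B holds m.

// Hedged sketch: row-wise sums, e.g. the softmax denominator.
// check_shape_mv(A, *B) must hold before either branch runs.
void row_sums(const singa::Blob<float>& A, singa::Blob<float>* B) {
  // CPU path: cpu_reduce_f<Sum> calls Sum::Map(row, n, B[i]) for each row.
  singa::Reduce_F<singa_op::Sum>(singa::cpu, A, B);
  // GPU path: gpu_reduce_f<Sum> calls Sum::CudaMap per row, which
  // launches singa_gpu_sum_vec over that row's n elements.
  singa::Reduce_F<singa_op::Sum>(singa::gpu, A, B);
}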
the sum operation in softmax @@ -416,21 +424,23 @@ void Reduce_F(XPU xpu, const Blob & A, Blob * B) template void Expand_F(XPU xpu, const Blob & A, Blob * B) { - if(xpu == cpu) + if(check_shape_mv(*B, A)) { - if(check_shape_mv(*B, A)) + int m = get_size(A.shape()); + int n = get_size(B->shape()) / m; + + if(xpu == cpu) { - int m = get_size(A.shape()); - int n = get_size(B->shape()) / m; cpu_expand_f(A.cpu_data(), m, n, B->mutable_cpu_data()); } - else{ - // report errors here + if(xpu == gpu) + { + //gpu part + gpu_expand_f(A.gpu_data(), m, n, B->mutable_gpu_data()); } } - if(xpu == gpu) - { - //gpu part + else{ + // report errors here } } //expand each element in A into a row of B http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/4728f7ce/include/singa/blob/math_kernel.h ---------------------------------------------------------------------- diff --git a/include/singa/blob/math_kernel.h b/include/singa/blob/math_kernel.h index 9aaf4c2..f5d3e34 100644 --- a/include/singa/blob/math_kernel.h +++ b/include/singa/blob/math_kernel.h @@ -4,9 +4,54 @@ namespace singa{ extern "C" { - void singa_sum_col(float *src_mat_data, float *dst_vec_data, long rows, long cols, long stride); + void singa_gpu_sum_vec(float *data, float *sum , long n); + + void singa_gpu_sum_col(const float *src_mat_data, float *dst_vec_data, long rows, long cols, long stride); + + void singa_gpu_add_vec_row(const float *src_vec_data, const float *src_mat_data, float *des_mat_data, long rows, long cols, long stride); + + void singa_gpu_set_value(float *data, float value, long n); + + void singa_gpu_scale(const float *src_data, float *des_data, float alpha, long n); + + void singa_gpu_scale_grad(float *data, float alpha, long n); + + void singa_gpu_exp(const float *src_data, float *des_data, float alpha, long n); + + void singa_gpu_exp_grad(const float *src_data, float *des_data, float alpha, long n); + + void singa_gpu_sigmoid(const float *src_data, float *des_data, float alpha, long n); + + void singa_gpu_sigmoid_grad(const float *src_data, float *des_data, float alpha, long n); + + void singa_gpu_relu(const float *src_data, float *des_data, float alpha, long n); + + void singa_gpu_relu_grad(const float *src_data, float *des_data, float alpha, long n); + + void singa_gpu_tanh(const float *src_data, float *des_data, float alpha, long n); + + void singa_gpu_tanh_grad(const float *src_data, float *des_data, float alpha, long n); + + void singa_gpu_softplus(const float *src_data, float *des_data, float alpha, long n); + + void singa_gpu_softplus_grad(const float *src_data, float *des_data, float alpha, long n); + + void singa_gpu_square(const float *src_data, float *des_data, float alpha, long n); + + void singa_gpu_square_grad(const float *src_data, float *des_data, float alpha, long n); + + void singa_gpu_sqrt(const float *src_data, float *des_data, float alpha, long n); + + void singa_gpu_threshold(const float *src_data, float *des_data, float alpha, long n); + + void singa_gpu_add(const float *src_data_a, const float *src_data_b, float *des_data, float alpha, float beta, long n); + + void singa_gpu_sub(const float *src_data_a, const float *src_data_b, float *des_data, float alpha, float beta, long n); + + void singa_gpu_mult(const float *src_data_a, const float *src_data_b, float *des_data, float alpha, float beta, long n); + + void singa_gpu_div(const float *src_data_a, const float *src_data_b, float *des_data, float alpha, float beta, long n); - void singa_add_vec_row(float *src_vec_data, float *src_mat_data, float 
*des_mat_data, long rows, long cols, long stride); }; } http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/4728f7ce/include/singa/blob/singa_op.h ---------------------------------------------------------------------- diff --git a/include/singa/blob/singa_op.h b/include/singa/blob/singa_op.h index b36c001..33ef4f8 100644 --- a/include/singa/blob/singa_op.h +++ b/include/singa/blob/singa_op.h @@ -3,6 +3,9 @@ #include #include +#include +#include "cublas_v2.h" +#include "singa/blob/math_kernel.h" namespace singa { enum XPU { cpu, gpu, any}; @@ -14,28 +17,45 @@ namespace singa_op { inline static void Map(float alpha, float & a) { a= alpha; } + inline static void CudaMap(float alpha, float * a, int n) { + singa::singa_gpu_set_value(a, alpha, n); + } }; struct Scale { inline static void Map(float alpha, const float & a, float & b) { - b = alpha*a; + b = alpha* a; + } + inline static void CudaMap(float alpha, const float * a, float * b, int n) { + singa::singa_gpu_scale(a,b,alpha,n); } }; + struct Scale_grad { inline static void Map(float alpha, float & output) { output = alpha; } + inline static void CudaMap(float alpha, float *output, int n) { + singa::singa_gpu_scale_grad(output,alpha,n); + } }; struct Exp { inline static void Map(float alpha, const float & a, float & b) { b = pow(a, alpha); } + inline static void CudaMap(float alpha, const float * a, float * b, int n) { + singa::singa_gpu_exp(a,b,alpha,n); + } }; + struct Exp_grad { inline static void Map(float alpha, const float & a, float & b) { - b = a * log(alpha); // log is the natrual log based on e + b = a * log(alpha); + } + inline static void CudaMap(float alpha, const float * a, float * b, int n) { + singa::singa_gpu_exp_grad(a,b,alpha,n); } }; @@ -43,92 +63,144 @@ namespace singa_op { inline static void Map(float alpha, const float & a, float & b) { b = 1.0f / (1.0f + expf(-a * alpha)); } + inline static void CudaMap(float alpha, const float * a, float * b, int n) { + singa::singa_gpu_sigmoid(a,b,alpha,n); + } }; + struct Gsigmoid_grad { inline static void Map(float alpha, const float & a, float & b) { b = alpha * a * ( 1.0f - a ); } + inline static void CudaMap(float alpha, const float * a, float * b, int n) { + singa::singa_gpu_sigmoid_grad(a,b,alpha,n); + } }; struct Grelu { inline static void Map(float alpha, const float & a, float & b) { b = ( 1 - alpha ) * std::max( a, 0.0f ) + alpha * a; } + inline static void CudaMap(float alpha, const float * a, float * b, int n) { + singa::singa_gpu_relu(a,b,alpha,n); + } }; + struct Grelu_grad { inline static void Map(float alpha, const float & a, float & b) { b = a > 0.0f ? 
1.0f : alpha; } + inline static void CudaMap(float alpha, const float * a, float * b, int n) { + singa::singa_gpu_relu_grad(a,b,alpha,n); + } }; struct Gtanh { inline static void Map(float alpha, const float & a, float & b) { b = tanhf( a * alpha ); } + inline static void CudaMap(float alpha, const float * a, float * b, int n) { + singa::singa_gpu_tanh(a,b,alpha,n); + } }; + struct Gtanh_grad { inline static void Map(float alpha, const float & a, float & b) { b = alpha * ( 1.0f - a * a ); } + inline static void CudaMap(float alpha, const float * a, float * b, int n) { + singa::singa_gpu_tanh_grad(a,b,alpha,n); + } }; struct Softplus { inline static void Map(float alpha, const float & a, float & b) { b = logf(1 + expf(a)); } + inline static void CudaMap(float alpha, const float * a, float * b, int n) { + singa::singa_gpu_softplus(a,b,alpha,n); + } }; + struct Softplus_grad { inline static void Map(float alpha, const float & a, float & b) { b = 1.0f / (1.0f + expf(-a)); } + inline static void CudaMap(float alpha, const float * a, float * b, int n) { + singa::singa_gpu_softplus_grad(a,b,alpha,n); + } }; struct Square { inline static void Map(float alpha, const float & a, float & b) { b = a * a; } + inline static void CudaMap(float alpha, const float * a, float * b, int n) { + singa::singa_gpu_square(a,b,alpha,n); + } }; struct Square_grad { inline static void Map(float alpha, const float & a, float & b) { b = 2 * sqrt(a); } + inline static void CudaMap(float alpha, const float * a, float * b, int n) { + singa::singa_gpu_square_grad(a,b,alpha,n); + } }; struct Sqrt { inline static void Map(float alpha, const float & a, float & b) { b = sqrt(a); } + inline static void CudaMap(float alpha, const float * a, float * b, int n) { + singa::singa_gpu_sqrt(a,b,alpha,n); + } }; struct Threshold { inline static void Map(float alpha, const float & a, float & b) { b = a < alpha ? 
1.0f : 0.0f; } + inline static void CudaMap(float alpha, const float * a, float * b, int n) { + singa::singa_gpu_threshold(a,b,alpha,n); + } }; struct Add { inline static void Map(float alpha, float beta, const float & a, const float & b, float & c) { c = a + b; } + inline static void CudaMap(float alpha, float beta, const float * a, const float * b, float *c, int n) { + singa::singa_gpu_add(a,b,c,alpha,beta,n); + } }; struct Sub { inline static void Map(float alpha, float beta, const float & a, const float & b, float & c) { c = a - b; } + inline static void CudaMap(float alpha, float beta, const float * a, const float * b, float *c, int n) { + singa::singa_gpu_sub(a,b,c,alpha,beta,n); + } }; struct Mult { inline static void Map(float alpha, float beta, const float & a, const float & b, float & c) { c = a * b; } + inline static void CudaMap(float alpha, float beta, const float * a, const float * b, float *c, int n) { + singa::singa_gpu_mult(a,b,c,alpha,beta,n); + } }; struct Div { inline static void Map(float alpha, float beta, const float & a, const float & b, float & c) { c = a / b; } + inline static void CudaMap(float alpha, float beta, const float * a, const float * b, float *c, int n) { + singa::singa_gpu_div(a,b,c,alpha,beta,n); + } }; struct Sum { @@ -139,6 +211,16 @@ namespace singa_op { b += a[i]; } } + + inline static void CudaMap(const float * a, int n, float & b) { + float *sum = NULL; + cudaMalloc((void**)&sum, n*sizeof(float)); + + singa::singa_gpu_sum_vec(a,sum,n); + + cudaMemcpyAsync(&b, sum, sizeof(float), cudaMemcpyDeviceToDevice); + cudaFree(sum); + } }; struct Expand_Div { @@ -148,6 +230,9 @@ namespace singa_op { b[i] /= a; } } + inline static void CudaMap(const float & a, int n, float * b) { + singa::singa_gpu_scale(b,b,a,n); + } }; struct Repmat { @@ -157,6 +242,9 @@ namespace singa_op { b[i] = a; } } + inline static void CudaMap(const float & a, int n, float * b) { + singa::singa_gpu_set_value(b,a,n); + } }; }; // namespace op http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/4728f7ce/src/blob/math_blob.cc ---------------------------------------------------------------------- diff --git a/src/blob/math_blob.cc b/src/blob/math_blob.cc index 9421367..bd0e6ee 100644 --- a/src/blob/math_blob.cc +++ b/src/blob/math_blob.cc @@ -166,7 +166,7 @@ void MVAdd(XPU xpu, const Blob & A, Blob * B, float alpha, float b if(xpu == gpu) { - singa_add_vec_row(B->gpu_data(),A.gpu_data(),A.gpu_data(),m,n,n); + singa_gpu_add_vec_row(B->gpu_data(),A.gpu_data(),A.gpu_data(),m,n,n); //gpu part } } @@ -192,7 +192,7 @@ void MVSum(XPU xpu, const Blob & A, Blob * B, float alpha, float b } if(xpu == gpu) { - singa_sum_col(A.gpu_data(),B->gpu_data(),m,n,n); + singa_gpu_sum_col(A.gpu_data(),B->gpu_data(),m,n,n); //gpu part } } http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/4728f7ce/src/blob/math_kernel.cu ---------------------------------------------------------------------- diff --git a/src/blob/math_kernel.cu b/src/blob/math_kernel.cu index 6b2a709..4c828d5 100644 --- a/src/blob/math_kernel.cu +++ b/src/blob/math_kernel.cu @@ -1,3 +1,4 @@ +#include #include "singa/blob/math_kernel.h" #define CU2DBLOCK_X 32 @@ -8,8 +9,42 @@ //Cuda Kernel Functions + __global__ -void kernel_sum_col(float *src_mat_data, float *dst_vec_data, long rows, long cols, long stride) +void kernel_sum_vec(float *data, float *sum , long n) +{ + int THREADS = blockDim.x; + + __shared__ float aux[CU1DBLOCK]; + int steps = (n - 1) / THREADS + 1; + aux[threadIdx.x] = data[threadIdx.x]; + + for(int i=1; i 1) { 
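kernel_sum_vec above performs a block-wide shared-memory reduction. A hedged, self-contained sketch of that pattern follows; the 512-thread block size and the *_sketch name are illustrative, not the committed code.

// Hedged sketch of a single-block shared-memory sum reduction in the
// style of kernel_sum_vec: each thread accumulates a strided slice of
// the input, then the shared buffer is folded in half until one value
// remains. CU1DBLOCK is defined in math_kernel.cu; 512 is illustrative.
#ifndef CU1DBLOCK
#define CU1DBLOCK 512
#endif

__global__ void kernel_sum_vec_sketch(const float *data, float *sum, long n) {
  __shared__ float aux[CU1DBLOCK];
  const int tid = threadIdx.x;
  const int threads = blockDim.x;

  // strided accumulation over the input vector
  float acc = 0.0f;
  for (long i = tid; i < n; i += threads)
    acc += data[i];
  aux[tid] = acc;
  __syncthreads();

  // pairwise tree reduction over the shared buffer
  for (int total = threads; total > 1; total = (total + 1) >> 1) {
    int half = (total + 1) >> 1;
    if (tid < half && tid + half < total)
      aux[tid] += aux[tid + half];
    __syncthreads();
  }
  if (tid == 0)
    *sum = aux[0];
}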
+ int half_point = ((1+total_threads) >> 1); + if (threadIdx.x < half_point) { + if(threadIdx.x+half_point < total_threads) { + aux[threadIdx.x] += aux[threadIdx.x + half_point]; + } + } + __syncthreads(); + total_threads = ((total_threads+1) >> 1); + } + + __syncthreads(); + *sum = aux[0]; +} + +__global__ +void kernel_sum_col(const float *src_mat_data, float *dst_vec_data, long rows, long cols, long stride) { int j = blockIdx.x; int THREADS = blockDim.x; @@ -44,7 +79,7 @@ void kernel_sum_col(float *src_mat_data, float *dst_vec_data, long rows, long co } __global__ -void kernel_add_vec_row(float *src_vec_data, float *src_mat_data, float* des_mat_data,long rows, long cols, long stride) +void kernel_add_vec_row(const float *src_vec_data, const float *src_mat_data, float* des_mat_data,long rows, long cols, long stride) { long i = blockIdx.x * blockDim.x + threadIdx.x; long j = blockIdx.y * blockDim.y + threadIdx.y; @@ -57,10 +92,230 @@ void kernel_add_vec_row(float *src_vec_data, float *src_mat_data, float* des_mat } } +__global__ static +void kernel_set_value(float *data, float value, long n) +{ + long index = blockIdx.x * blockDim.x + threadIdx.x; + long num_threads = blockDim.x * gridDim.x; + for(; index 0.0f ? 1.0f : alpha; + } +} + + +__global__ +void kernel_tanh(const float *src_data, float *des_data, float alpha, long n) +{ + long index = blockIdx.x * blockDim.x + threadIdx.x; + long num_threads = blockDim.x * gridDim.x; + for(; index CU1DBLOCK ? CU1DBLOCK : n; + // here, we only need one block + long num_blocks = 1; + + kernel_sum_vec<<>>(data, sum, n); +} + +void singa_gpu_sum_col(const float *src_mat_data, float *dst_vec_data, long rows, long cols, long stride) { long threads_per_block = rows > CU1DBLOCK ? CU1DBLOCK : rows; long num_blocks = cols; @@ -68,11 +323,117 @@ void singa_sum_col(float *src_mat_data, float *dst_vec_data, long rows, long col kernel_sum_col<<>>(src_mat_data, dst_vec_data, rows, cols, stride); } -void singa_add_vec_row(float *src_vec_data, float *src_mat_data, float *des_mat_data ,long rows, long cols, long stride) +void singa_gpu_add_vec_row(const float *src_vec_data, const float *src_mat_data, float *des_mat_data ,long rows, long cols, long stride) { dim3 threads_per_block(CU2DBLOCK_X, CU2DBLOCK_Y); dim3 num_blocks(cols/threads_per_block.x + (cols%threads_per_block.x == 0 ? 0 : 1), rows/threads_per_block.y + (rows%threads_per_block.y == 0 ? 
0 : 1)); kernel_add_vec_row<<>>(src_vec_data, src_mat_data, des_mat_data,rows, cols, stride); } -}//namespace singa +void singa_gpu_set_value(float *data, float value, long n) +{ + kernel_set_value<<>>(data, value, n); +} + +void singa_gpu_scale(const float *src_data, float *des_data, float alpha, long n) +{ + kernel_scale<<>>(src_data, des_data, alpha, n); +} + +void singa_gpu_scale_grad(float *data, float alpha, long n) +{ + kernel_scale_grad<<>>(data, alpha, n); +} + +void singa_gpu_exp(const float *src_data, float *des_data, float alpha, long n) +{ + kernel_exp<<>>(src_data, des_data, alpha, n); +} + +void singa_gpu_exp_grad(const float *src_data, float *des_data, float alpha, long n) +{ + kernel_exp_grad<<>>(src_data, des_data, alpha, n); +} + +void singa_gpu_sigmoid(const float *src_data, float *des_data, float alpha, long n) +{ + kernel_sigmoid<<>>(src_data, des_data, alpha, n); +} + +void singa_gpu_sigmoid_grad(const float *src_data, float *des_data, float alpha, long n) +{ + kernel_sigmoid_grad<<>>(src_data, des_data, alpha, n); +} + +void singa_gpu_relu(const float *src_data, float *des_data, float alpha, long n) +{ + kernel_relu<<>>(src_data, des_data, alpha, n); +} + +void singa_gpu_relu_grad(const float *src_data, float *des_data, float alpha, long n) +{ + kernel_relu_grad<<>>(src_data, des_data, alpha, n); +} + +void singa_gpu_tanh(const float *src_data, float *des_data, float alpha, long n) +{ + kernel_tanh<<>>(src_data, des_data, alpha, n); +} + +void singa_gpu_tanh_grad(const float *src_data, float *des_data, float alpha, long n) +{ + kernel_tanh_grad<<>>(src_data, des_data, alpha, n); +} + +void singa_gpu_softplus(const float *src_data, float *des_data, float alpha, long n) +{ + kernel_softplus<<>>(src_data, des_data, alpha, n); +} + +void singa_gpu_softplus_grad(const float *src_data, float *des_data, float alpha, long n) +{ + kernel_softplus_grad<<>>(src_data, des_data, alpha, n); +} + +void singa_gpu_square(const float *src_data, float *des_data, float alpha, long n) +{ + kernel_square<<>>(src_data, des_data, alpha, n); +} + +void singa_gpu_square_grad(const float *src_data, float *des_data, float alpha, long n) +{ + kernel_square_grad<<>>(src_data, des_data, alpha, n); +} + +void singa_gpu_sqrt(const float *src_data, float *des_data, float alpha, long n) +{ + kernel_sqrt<<>>(src_data, des_data, alpha, n); +} + +void singa_gpu_threshold(const float *src_data, float *des_data, float alpha, long n) +{ + kernel_threshold<<>>(src_data, des_data, alpha, n); +} + +void singa_gpu_add(const float *src_data_a, const float *src_data_b, float *des_data, float alpha, float beta, long n) +{ + kernel_add<<>>(src_data_a, src_data_b, des_data, alpha, beta, n); +} + +void singa_gpu_sub(const float *src_data_a, const float *src_data_b, float *des_data, float alpha, float beta, long n) +{ + kernel_sub<<>>(src_data_a, src_data_b, des_data, alpha, beta, n); +} + +void singa_gpu_mult(const float *src_data_a, const float *src_data_b, float *des_data, float alpha, float beta, long n) +{ + kernel_mult<<>>(src_data_a, src_data_b, des_data, alpha, beta, n); +} + +void singa_gpu_div(const float *src_data_a, const float *src_data_b, float *des_data, float alpha, float beta, long n) +{ + kernel_div<<>>(src_data_a, src_data_b, des_data, alpha, beta, n); +} + + +}//namespace singa_gpu http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/4728f7ce/src/test/test_math.cc ---------------------------------------------------------------------- diff --git a/src/test/test_math.cc 
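Each singa_gpu_* wrapper in this file pairs a 1-D launch configuration with its kernel. A hedged sketch of that pattern follows, using kernel_set_value as the example; the grid-size formula is an assumption, and CU1DBLOCK is the block-size constant defined at the top of math_kernel.cu.

// Hedged sketch of the 1-D launch pattern used by the element-wise wrappers.
void singa_gpu_set_value_sketch(float *data, float value, long n) {
  long threads_per_block = n > CU1DBLOCK ? CU1DBLOCK : n;
  long num_blocks = (n + threads_per_block - 1) / threads_per_block;  // ceil(n / threads)
  kernel_set_value<<<num_blocks, threads_per_block>>>(data, value, n);
}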
b/src/test/test_math.cc index 3856d1d..a8a9490 100644 --- a/src/test/test_math.cc +++ b/src/test/test_math.cc @@ -100,6 +100,42 @@ TEST(MathTest, TestAxpyCPU) { } } +TEST(MathTest, TestEopCPU) { + + float A[10] = {}; + float B[10] = {}; + float C[10] = {}; + float D[10] = {}; + float O[10] = {}; + + for(int i = 0; i < 10; i++) + { + A[i] = i; + B[i] = -i; + C[i] = i; + + } + + cpu_e_f(5, 15, O); + for(int i = 0; i < 5; i++) + { + ASSERT_EQ(O[i]-15,0); + } + for(int i = 5; i < 10; i++) + { + ASSERT_EQ(O[i],0); + } + cpu_e_f(10, C, 2, C); + for(int i = 0; i < 10; i++) + { + ASSERT_EQ(C[i]-2*i,0); + } + cpu_e_f(10, A, B, 0, 0, O); + for(int i = 0; i < 10; i++) + { + ASSERT_EQ(O[i],0); + } +} TEST(MathTest, TestGemmGPU) { float A[3][2] = {}; @@ -314,7 +350,7 @@ TEST(MathTest, TestSingaSumColGPU) { cudaMalloc((void**)&B_gpu, 4*sizeof(float)); cudaMemcpy(A_gpu,A,12*sizeof(float),cudaMemcpyHostToDevice); - singa_sum_col(A_gpu,B_gpu,3,4,4); + singa_gpu_sum_col(A_gpu,B_gpu,3,4,4); cudaMemcpy(B,B_gpu,4*sizeof(float),cudaMemcpyDeviceToHost); @@ -367,7 +403,7 @@ TEST(MathTest, TestSingaAddVecRowGPU) { cudaMemcpy(A_gpu,A,3*4*sizeof(float),cudaMemcpyHostToDevice); cudaMemcpy(B_gpu,B,4*sizeof(float),cudaMemcpyHostToDevice); - singa_add_vec_row(B_gpu,A_gpu,C_gpu,3,4,4); + singa_gpu_add_vec_row(B_gpu,A_gpu,C_gpu,3,4,4); cudaMemcpy(C,C_gpu,3*4*sizeof(float),cudaMemcpyDeviceToHost); @@ -383,3 +419,91 @@ TEST(MathTest, TestSingaAddVecRowGPU) { cudaFree(B_gpu); cudaFree(C_gpu); } + + +TEST(MathTest, TestSingaSetValueGPU) { + + float A[3][4]; + + float* A_gpu=NULL; + float* B_gpu=NULL; + + cudaMalloc((void**)&A_gpu, 3*4*sizeof(float)); + + cudaMemcpy(A_gpu,A,3*4*sizeof(float),cudaMemcpyHostToDevice); + + singa_gpu_set_value(A_gpu,4.0,3*4); + + cudaMemcpy(A,A_gpu,3*4*sizeof(float),cudaMemcpyDeviceToHost); + + for(int i = 0; i < 3; i++) + { + for(int j = 0; j < 4; j++) + { + ASSERT_EQ(A[i][j],4.0f); + } + } + + cudaFree(A_gpu); +} + + +TEST(MathTest, TestEopGPU) { + + float A[10] = {}; + float B[10] = {}; + float C[10] = {}; + float D[10] = {}; + float O[10] = {}; + + for(int i = 0; i < 10; i++) + { + A[i] = i; + B[i] = -i; + C[i] = i; + O[i] = 0.0f; + + } + + float* A_gpu=NULL; + float* B_gpu=NULL; + float* C_gpu=NULL; + float* O_gpu=NULL; + + cudaMalloc((void**)&A_gpu, 10*sizeof(float)); + cudaMalloc((void**)&B_gpu, 10*sizeof(float)); + cudaMalloc((void**)&C_gpu, 10*sizeof(float)); + cudaMalloc((void**)&O_gpu, 10*sizeof(float)); + + cudaMemcpy(A_gpu,A,10*sizeof(float),cudaMemcpyHostToDevice); + cudaMemcpy(B_gpu,B,10*sizeof(float),cudaMemcpyHostToDevice); + cudaMemcpy(C_gpu,C,10*sizeof(float),cudaMemcpyHostToDevice); + cudaMemcpy(O_gpu,O,10*sizeof(float),cudaMemcpyHostToDevice); + + gpu_e_f(5, 15, O_gpu); + cudaMemcpy(O,O_gpu,10*sizeof(float),cudaMemcpyDeviceToHost); + + for(int i = 0; i < 5; i++) + { + ASSERT_EQ(O[i]-15,0); + } + for(int i = 5; i < 10; i++) + { + ASSERT_EQ(O[i],0); + } + gpu_e_f(10, C_gpu, 2, C_gpu); + cudaMemcpy(C,C_gpu,10*sizeof(float),cudaMemcpyDeviceToHost); + + for(int i = 0; i < 10; i++) + { + ASSERT_EQ(C[i]-2*i,0); + } + + gpu_e_f(10, A_gpu, B_gpu, 0, 0, O_gpu); + cudaMemcpy(O,O_gpu,10*sizeof(float),cudaMemcpyDeviceToHost); + + for(int i = 0; i < 10; i++) + { + ASSERT_EQ(O[i],0); + } +}
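In the same style as TestEopGPU, here is a hedged sketch of a test that drives one of the new unary wrappers directly. The test name is hypothetical, and it assumes kernel_relu mirrors Grelu::Map, so alpha = 0 reduces to plain ReLU; note the explicit cudaFree calls at the end.

// Hypothetical companion test exercising singa_gpu_relu end to end.
TEST(MathTest, TestSingaReluGPUSketch) {
  float A[10], B[10];
  for (int i = 0; i < 10; i++) A[i] = i - 5.0f;

  float *A_gpu = NULL, *B_gpu = NULL;
  cudaMalloc((void**)&A_gpu, 10 * sizeof(float));
  cudaMalloc((void**)&B_gpu, 10 * sizeof(float));
  cudaMemcpy(A_gpu, A, 10 * sizeof(float), cudaMemcpyHostToDevice);

  // alpha = 0 gives the plain ReLU: b = max(a, 0).
  singa_gpu_relu(A_gpu, B_gpu, 0.0f, 10);

  cudaMemcpy(B, B_gpu, 10 * sizeof(float), cudaMemcpyDeviceToHost);
  for (int i = 0; i < 10; i++) {
    ASSERT_EQ(B[i], A[i] > 0.0f ? A[i] : 0.0f);
  }

  cudaFree(A_gpu);
  cudaFree(B_gpu);
}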