singa-dev mailing list archives

From GitBox <...@apache.org>
Subject [GitHub] [singa] chrishkchris commented on issue #555: SINGA-490 Optimization on GPU Malloc and Cudastream
Date Wed, 13 Nov 2019 07:48:27 GMT
chrishkchris commented on issue #555: SINGA-490 Optimization on GPU Malloc and Cudastream
URL: https://github.com/apache/singa/pull/555#issuecomment-553281211
 
 
   Double-checking the CUDA code (I removed the cudaMemset call, so I need to verify everything related to CUDA).
   
   I have to check three files: 1. tensor.cc, 2. tensor_math_cuda.h, 3. math_kernel.cu
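   
   To make explicit what is being checked for: the hazard looks like the first of the two minimal sketches below (hypothetical kernels, not SINGA code). A kernel that accumulates into its output reads stale values unless the buffer was zeroed first, while a kernel that only assigns needs no memset.
   
    ```cpp
    // Hypothetical kernels (not SINGA code) showing the pattern under review.
    __global__ void unsafe_accumulate(const float* in, float* out, int n) {
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      // Reads out[i] before any assignment, so it is only correct if the
      // buffer was zeroed first (e.g. by the removed cudaMemset).
      if (i < n) out[i] += in[i];
    }
    
    __global__ void safe_assign(const float* in, float* out, int n) {
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < n) out[i] = in[i];  // pure write: needs no prior memset
    }
    ```
   
   Everything below checks that the code stays in the second category.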
   
   1. tensor.cc: check whether the values of (i) any newly created tensor and (ii) the output tensor are read before being assigned
   
   Examples:
   
   (a) SumColumns and SumRows are based on Mult of the input M (or its transpose) with the vector "one", and "one" is fully initialized with SetValue before Mult reads it.
   
   ```cpp
   void SumColumns(const Tensor &M, Tensor *v) {
     if (M.transpose()) {
       Tensor X = Transpose(M);
       SumRows(X, v);
     } else {
       CHECK_EQ(M.nDim(), 2u);
       // CHECK_EQ(v->nDim(), 1u); (chonho) shape of v is 2-element tuple
       size_t nb_row = M.shape().at(0), nb_col = M.shape().at(1);
       CHECK_EQ(nb_row, v->Size());
   
       Tensor one(Shape{nb_col}, M.device(), M.data_type());
       one.SetValue(1.0f);  // TODO(wangwei) cast type
       Mult(M, one, v);
     }
   }
   void SumRows(const Tensor &M, Tensor *v) {
     if (M.transpose()) {
       Tensor X = Transpose(M);
       SumColumns(X, v);
     } else {
       CHECK_EQ(M.nDim(), 2u);
       // CHECK_EQ(v->nDim(), 1u); (chonho) shape of v is 2-element tuple
       size_t nb_row = M.shape(0), nb_col = M.shape(1);
       CHECK_EQ(nb_col, v->Size());
   
       Tensor one(Shape{nb_row}, M.device(), M.data_type());
       one.SetValue(1.0f);  // TODO(wangwei) cast type
       Tensor X = Transpose(M);
       Mult(X, one, v);
     }
   }
   ```
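   
   (Why Mult with "one" computes the sums: multiplying by a ones vector reduces along one axis, as in the worked identity below, and "one" is fully written by SetValue before Mult reads it, so nothing uninitialized is touched.)
   
    ```latex
    % Multiplying by a ones vector sums the entries along one axis:
    \[
      (M\,\mathbf{1})_i = \sum_{j} M_{ij}, \qquad
      (M^{\top}\mathbf{1})_j = \sum_{i} M_{ij}
    \]
    ```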
   
   (b) Sum is based on SumRows and SumColumns; the newly created tensor out is used only as an output (it is never read)
   
   ```cpp
   Tensor Sum(const Tensor &M, int axis) {
     if (axis == 0) {
       Tensor out(Shape{M.shape(1)}, M.device(), M.data_type());
       SumRows(M, &out);
       return out;
     } else {
       CHECK_EQ(axis, 1) << "Not support Sum over axis = " << axis;
       Tensor out(Shape{M.shape(0)}, M.device(), M.data_type());
       SumColumns(M, &out);
       return out;
     }
   }
   ```
   
   
   (c) Mult of tensor A and tensor B: it creates an output tensor out and passes it to the GEMV or GEMM defined in tensor_math_cuda.h (see the note after the listing)
   
   ```cpp
   Tensor Mult(const Tensor &A, const Tensor &B) {
     Shape s;
     s.push_back(A.shape(0));
     if (B.nDim() == 2) s.push_back(B.shape(1));
     Tensor out(s, A.device(), A.data_type());
     Mult(A, B, &out);
     return out;
   }
   
   void Mult(const Tensor &A, const Tensor &B, Tensor *out) {
     Mult(1.0f, A, B, 0.0f, out);
   }
   
   template <typename SType>
   void Mult(const SType alpha, const Tensor &A, const Tensor &B, const SType beta,
             Tensor *C) {
     CHECK_EQ(A.shape().size(), 2u);
     if (B.nDim() == 1u) {
       TYPE_LANG_SWITCH(A.data_type(), DType, A.device()->lang(), Lang, {
         auto a = TypeCast<SType, DType>(alpha);
         auto b = TypeCast<SType, DType>(beta);
         C->device()->Exec([a, A, b, B, C](Context * ctx) {
           GEMV<DType, Lang>(a, A, B, b, C, ctx);
         }, {A.block(), B.block()}, {C->block()});
       });
     } else {
       CHECK(!C->transpose());
       TYPE_LANG_SWITCH(A.data_type(), DType, A.device()->lang(), Lang, {
         auto a = TypeCast<SType, DType>(alpha);
         auto b = TypeCast<SType, DType>(beta);
         C->device()->Exec([a, A, b, B, C](Context * ctx) {
        GEMM<DType, Lang>(a, A, B, b, C, ctx);
         }, {A.block(), B.block()}, {C->block()});
       });
     }
   }
   ```
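   
   A point worth making explicit: Mult(A, B, out) forwards beta = 0.0f, and cuBLAS documents that when beta is zero, C does not have to be a valid input, so GEMM/GEMV treat the freshly allocated out as write-only. A minimal sketch of that pattern (plain cuBLAS, not SINGA's wrapper):
   
    ```cpp
    // Minimal sketch (plain cuBLAS, not SINGA code): with beta == 0,
    // C is treated as write-only, so an uninitialized output buffer is fine.
    #include <cublas_v2.h>
    
    void gemm_beta_zero(cublasHandle_t handle, int m, int n, int k,
                        const float* A, const float* B, float* C) {
      const float alpha = 1.0f;
      const float beta = 0.0f;  // beta == 0: cuBLAS never reads C
      // Column-major C = alpha * A * B + beta * C
      cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k,
                  &alpha, A, /*lda=*/m, B, /*ldb=*/k, &beta, C, /*ldc=*/m);
    }
    ```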
   
   2. tensor_math_cuda.h: check whether the values of (i) any newly created tensor and (ii) the output tensor are read before being assigned
   
   Examples:
   
   (a) EltwiseMult of two tensors: a new tensor t is created but is passed into the cuDNN transform only as an output (to store the transformed input), as is the original output pointer.
   
   ```cpp
    /// out = in1 * in2
    template <>
    void EltwiseMult<float, lang::Cuda>(const Tensor& in1, const Tensor& in2,
                                        Tensor* out, Context* ctx) {
     const float* inPtr1 = static_cast<const float*>(in1.block()->data());
     const float* inPtr2 = static_cast<const float*>(in2.block()->data());
     float* outPtr = static_cast<float*>(out->block()->mutable_data());
     const size_t num = in1.Size();
   
      // if both in1 and in2 are not transposed and have the same strides,
      // we proceed to the normal cuda::mult
      if (!in1.transpose() && !in2.transpose() &&
          (in1.stride() == in2.stride())) {
       cuda::mult(num, inPtr1, inPtr2, outPtr, ctx->stream);
     } else { //else we check whether in1 or in2 or both are transposed
       if (in1.transpose() && in2.transpose()) {
         Tensor t(in1.shape(), in1.device(), in1.data_type());
         Transform<float, lang::Cuda>(in1, &t, ctx);
         Transform<float, lang::Cuda>(in2, out, ctx);
         float* tPtr = static_cast<float*>(t.block()->mutable_data());
         cuda::mult(num, tPtr, outPtr, outPtr, ctx->stream);
       } else if (in1.transpose()) {
         Transform<float, lang::Cuda>(in1, out, ctx);
         cuda::mult(num, outPtr, inPtr2, outPtr, ctx->stream);
       } else if (in2.transpose()) {
         Transform<float, lang::Cuda>(in2, out, ctx);
         cuda::mult(num, inPtr1, outPtr, outPtr, ctx->stream);
       }
     }
   }
   ```
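   
   The destination passed to Transform is safe uninitialized because cudnnTransformTensor computes y = alpha * transform(x) + beta * y; assuming, as the text above implies, it is called with beta == 0, the destination is fully overwritten and never read. A minimal sketch of that call pattern (plain cuDNN, not SINGA code):
   
    ```cpp
    // Minimal sketch (plain cuDNN, not SINGA code): with beta == 0,
    // cudnnTransformTensor overwrites y in full, so a freshly allocated
    // tensor (t or out above) needs no prior memset.
    #include <cudnn.h>
    
    void transform_overwrite(cudnnHandle_t handle,
                             cudnnTensorDescriptor_t xDesc, const float* x,
                             cudnnTensorDescriptor_t yDesc, float* y) {
      const float alpha = 1.0f;
      const float beta = 0.0f;  // beta == 0: y is write-only
      cudnnTransformTensor(handle, &alpha, xDesc, x, &beta, yDesc, y);
    }
    ```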
   
   (b) EltwiseMult of one tensor and one constant: passes the data directly to the CUDA kernel (math_kernel.cu)
   
   ```cpp
   /// out = in * x
   template <>
   void EltwiseMult<float, lang::Cuda>(const Tensor& in,
                                       const float x, Tensor* out, Context* ctx) {
     const float* inPtr = static_cast<const float*>(in.block()->data());
     float* outPtr = static_cast<float*>(out->block()->mutable_data());
     const size_t num = in.Size();
     cuda::mult(num, inPtr, x, outPtr, ctx->stream);
   }
   
   ```
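   
   The kernel behind cuda::mult is a plain grid-stride assignment loop; sketched below from the pattern above (names approximate, not a verbatim quote of math_kernel.cu):
   
    ```cpp
    // Approximate shape of cuda::mult's kernel (grid-stride loop); every
    // output element is assigned exactly once and never read beforehand.
    __global__ void KernelScalarMult(const size_t n, const float* in,
                                     const float x, float* out) {
      for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
           i += blockDim.x * gridDim.x) {
        out[i] = in[i] * x;  // pure write: safe without a prior cudaMemset
      }
    }
    ```
   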
   3. math_kernel.cu: check whether any CUDA kernel reads from the output before it is assigned
   
   Examples:
   
   (a) CUDA sum-column kernel (I think this kernel is no longer in use; the general Mult-based version at tensor.cc line 1162 is used instead): the output is set to 0 before all the columns are added up
    
   ```cpp
   __global__ void kernel_sum_col(const float *src_mat_data, float *dst_vec_data,
                                  int rows, int cols, int stride) {
     int index = blockIdx.x * blockDim.x + threadIdx.x;
     int num_threads = blockDim.x * gridDim.x;
     for (; index < rows; index += num_threads) {
       dst_vec_data[index] = 0.0f;
       for (int k = 0; k < cols; k++) {
         dst_vec_data[index] += src_mat_data[index * stride + k];
       }
     }
   }
   ```
   
   (b) CUDA sum-row kernel (I think this kernel is no longer in use; the general Mult-based version at tensor.cc line 1147 is used instead): the output is assigned from the shared-memory array aux only after the reduction is complete
   
   ```cpp
   __global__ void kernel_sum_row(const float *src_mat_data, float *dst_vec_data,
                                  int rows, int cols, int stride) {
     int j = blockIdx.x;
     int THREADS = blockDim.x;
     if (j >= cols) {
       return;
     }
   
     __shared__ float aux[CU1DBLOCK];
     int steps = (rows - 1) / THREADS + 1;
     aux[threadIdx.x] = src_mat_data[j + threadIdx.x * stride];
     for (int i = 1; i < steps; ++i) {
       if (threadIdx.x + i * THREADS < rows) {
         aux[threadIdx.x] +=
             src_mat_data[j + (threadIdx.x + i * THREADS) * stride];
       }
     }
   
     int total_threads = THREADS;
     __syncthreads();
     while (total_threads > 1) {
       int half_point = ((1 + total_threads) >> 1);
       if (threadIdx.x < half_point) {
         if (threadIdx.x + half_point < total_threads) {
           aux[threadIdx.x] += aux[threadIdx.x + half_point];
         }
       }
       __syncthreads();
       total_threads = ((total_threads + 1) >> 1);
     }
   
     __syncthreads();
     dst_vec_data[j] = aux[0];
   }
   ```
   
   (c) KernelComputeCrossEntropy: each loss element is assigned directly, or set to 0, before it is read
   
   ```cpp
   __global__ void KernelComputeCrossEntropy(const bool int_target, const size_t batchsize,
                                             const size_t dim, const float *p,
                                             const int *t, float *loss) {
     size_t sample = blockIdx.x * blockDim.x + threadIdx.x;
     size_t num_threads = blockDim.x * gridDim.x;
     if (int_target) {
       for (; sample < batchsize; sample += num_threads) {
         float prob_of_truth = p[sample * dim + t[sample]];
         loss[sample] = -std::log(max(prob_of_truth, FLT_MIN));
       }
     } else {
       for (; sample < batchsize; sample += num_threads) {
         float sum = 0.f;
         for (size_t j = 0; j < dim; j++) {
           sum += t[sample * dim + j];
         }
         loss[sample] = 0;
         for (size_t j = 0, offset = sample * dim; j < dim; j++, offset++) {
           loss[sample] -= t[offset] / sum * std::log(max(p[offset], FLT_MIN));
         }
       }
     }
   }
   ```
   
   Result: in all three files, new values are assigned to the outputs and no output is read before assignment, so removing the cudaMemset is safe
   
