mxnet-commits mailing list archives

From GitBox <...@apache.org>
Subject [GitHub] [incubator-mxnet] haojin2 commented on a change in pull request #17014: [NumPy] Add NumPy support for norm
Date Thu, 09 Jan 2020 22:18:10 GMT
haojin2 commented on a change in pull request #17014: [NumPy] Add NumPy support for norm
URL: https://github.com/apache/incubator-mxnet/pull/17014#discussion_r364987599
 
 

 ##########
 File path: src/operator/numpy/linalg/broadcast_reduce_customized-inl.cuh
 ##########
 @@ -0,0 +1,416 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * Copyright (c) 2015-2017 by Contributors
+ * \file broadcast_reduce_customized-inl.cuh
+ * \brief CUDA implementations for binary broadcast and reduce
+ * \author Antti-Pekka Hynninen
+*/
+#ifndef MXNET_OPERATOR_NUMPY_LINALG_BROADCAST_REDUCE_INL_CUSTOMIZED_CUH_
+#define MXNET_OPERATOR_NUMPY_LINALG_BROADCAST_REDUCE_INL_CUSTOMIZED_CUH_
+
+#include "../../tensor/broadcast_reduce-inl.cuh"
+
+using namespace mshadow::cuda;
+
+template<typename Reducer, int ndim, typename AType, typename DType, typename OType, typename OP, int unroll>
+__launch_bounds__(nthread_reduce)
+__global__ void reduce_kernel_wr(const int N, const int M, const bool addto,
+                                 const DType* __restrict big, OType *small,
+                                 const Shape<ndim> big_shape0, const Shape<ndim> small_shape,
+                                 const Shape<ndim> big_shape, const Shape<ndim> big_stride,
+                                 const int Mnext, const bool do_transpose,
+                                 Reducer* reducer = nullptr) {
+  extern __shared__ char shTileChar[];
+  AType* shTile = (AType*)(shTileChar);
+  const int tid = threadIdx.x + threadIdx.y*blockDim.x;
+  const int bx = (do_transpose) ? blockDim.y : blockDim.x;
+  const int by = (do_transpose) ? blockDim.x : blockDim.y;
+  const int tidx = (do_transpose) ? tid / by : threadIdx.x;
+  const int tidy = (do_transpose) ? tid % by : threadIdx.y;
+  bool need_clean = !reducer;
+  reducer = reducer ? reducer : new Reducer();
+  for (int m0 = blockIdx.y; m0 < Mnext; m0 += gridDim.y) {
+    // This TB handles M range [Mstart, ...., Mend - 1]
+    const int Mstart = (int)((uint64_t)M*(uint64_t)m0/(uint64_t)Mnext);
+    const int Mend   = (int)((uint64_t)M*(uint64_t)(m0 + 1)/(uint64_t)Mnext);
+    for (int idx0 = blockIdx.x*bx; idx0 < N; idx0 += bx*gridDim.x) {
+      int idx = idx0 + tidx;
+      Shape<ndim> coord = unravel(idx, small_shape);
+      int idx_big0 = ravel(coord, big_shape0);
+
+      AType val, residual;
+      reducer->SetInitValue(val, residual);
+      if (idx < N) {
+        for (int k = tidy + Mstart; k < Mend; k += by*unroll) {
+          int idx_big[unroll];
+          #pragma unroll
+          for (int u=0;u < unroll;u++) {
+            idx_big[u] = idx_big0 + unravel_dot(k + u*by, big_shape, big_stride);
+          }
+          DType tmp[unroll];
+          #pragma unroll
+          for (int u=0;u < unroll;u++) {
+            if (k + u*by < Mend) {
+              tmp[u] = OP::Map(big[idx_big[u]]);
+            }
+          }
+          #pragma unroll
+          for (int u=0;u < unroll;u++) {
+            if (k + u*by < Mend) reducer->Reduce(val, AType(tmp[u]), residual);
+          }
+        }
+      }
+
+      // Shared memory block bx * by. Reduction is along by. Final result is in tidy=0
+      if (by > 1) {
+        // Fix bx to avoid bank conflicts. Assumes warpSize number of banks
+        const int fbx = (do_transpose && ((bx & (warpSize - 1)) == 0)) ? (bx + 1) : bx;
+        const int it0 = tidx + tidy*fbx;
+        shTile[it0 * 2] = val;
+        shTile[it0 * 2 + 1] = residual;
+        __syncthreads();
+        for (int t=1;t < by;t <<= 1) {
+          AType tmp, tmp_residual;
+          reducer->SetInitValue(tmp, tmp_residual);
+          if (tidy + t < by) {
+            tmp = shTile[(it0 + t*fbx) * 2];
+            tmp_residual = shTile[(it0 + t*fbx) * 2 + 1];
+          }
+          __syncthreads();
+          reducer->Merge(shTile[it0 * 2], shTile[it0 * 2 + 1], tmp, tmp_residual);
+          __syncthreads();
+        }
+        if (idx < N && tidy == 0) {
+          reducer->Finalize(shTile[tidx * 2], shTile[tidx * 2 + 1]);
+          assign(&small[idx + m0*N], addto, OType(shTile[tidx * 2]));
+        }
+      } else {
+        if (idx < N) {
+          reducer->Finalize(val, residual);
+          assign(&small[idx + m0*N], addto, OType(val));
+        }
+      }
+    }
+  }
+  if (need_clean) {
+    delete reducer;
+  }
+}
+
+template<typename Reducer, int ndim, typename DType, typename OP1, typename OP2, int unroll>
+__launch_bounds__(nthread_reduce)
+__global__ void reduce_kernel_wr(const int N, const int M, const bool addto,
+                                 const DType* __restrict big, const DType* __restrict lhs,
+                                 const DType* __restrict rhs, DType *small,
+                                 const Shape<ndim> big_shape0, const Shape<ndim> lhs_shape0,
+                                 const Shape<ndim> rhs_shape0, const Shape<ndim> small_shape,
+                                 const Shape<ndim> big_shape, const Shape<ndim> lhs_shape,
+                                 const Shape<ndim> rhs_shape, const Shape<ndim> big_stride,
+                                 const Shape<ndim> lhs_stride, const Shape<ndim> rhs_stride,
+                                 const int Mnext, const bool do_transpose,
+                                 Reducer* reducer = nullptr) {
+  extern __shared__ char shTileChar[];
+  DType* shTile = (DType*)(shTileChar);
+  const int tid = threadIdx.x + threadIdx.y*blockDim.x;
+  const int bx = (do_transpose) ? blockDim.y : blockDim.x;
+  const int by = (do_transpose) ? blockDim.x : blockDim.y;
+  const int tidx = (do_transpose) ? tid / by : threadIdx.x;
+  const int tidy = (do_transpose) ? tid % by : threadIdx.y;
+  bool need_clean = !reducer;
+  reducer = reducer ? reducer : new Reducer();
+  for (int m0 = blockIdx.y; m0 < Mnext; m0 += gridDim.y) {
+    // This TB handles M range [Mstart, ...., Mend - 1]
+    const int Mstart = (int)((uint64_t)M*(uint64_t)m0/(uint64_t)Mnext);
+    const int Mend   = (int)((uint64_t)M*(uint64_t)(m0 + 1)/(uint64_t)Mnext);
+    for (int idx0 = blockIdx.x*bx; idx0 < N; idx0 += bx*gridDim.x) {
+      int idx = idx0 + tidx;
+      Shape<ndim> coord = unravel(idx, small_shape);
+      int idx_big0 = ravel(coord, big_shape0);
+      int idx_lhs0 = ravel(coord, lhs_shape0);
+      int idx_rhs0 = ravel(coord, rhs_shape0);
+
+      DType val, residual;
+      reducer->SetInitValue(val, residual);
+      if (idx < N) {
+        for (int k = tidy + Mstart; k < Mend; k += by*unroll) {
+          int idx_big[unroll];
+          int idx_lhs[unroll];
+          int idx_rhs[unroll];
+          #pragma unroll
+          for (int u=0;u < unroll;u++) {
+            idx_big[u] = idx_big0 + unravel_dot(k + u*by, big_shape, big_stride);
+            idx_lhs[u] = idx_lhs0 + unravel_dot(k + u*by, lhs_shape, lhs_stride);
+            idx_rhs[u] = idx_rhs0 + unravel_dot(k + u*by, rhs_shape, rhs_stride);
+          }
+          DType tmp[unroll];
+          #pragma unroll
+          for (int u=0;u < unroll;u++) {
+            if (k + u*by < Mend) {
+              tmp[u] = OP1::Map(big[idx_big[u]], OP2::Map(lhs[idx_lhs[u]], rhs[idx_rhs[u]]));
+            }
+          }
+          #pragma unroll
+          for (int u=0;u < unroll;u++) {
+            if (k + u*by < Mend) reducer->Reduce(val, tmp[u], residual);
+          }
+        }
+      }
+
+      // Shared memory block bx * by. Reduction is along by. Final result is in tidy=0
+      if (by > 1) {
+        // Fix bx to avoid bank conflicts. Assumes warpSize number of banks
+        const int fbx = (do_transpose && ((bx & (warpSize - 1)) == 0)) ? (bx + 1) : bx;
+        const int it0 = tidx + tidy*fbx;
+        shTile[it0 * 2] = val;
+        shTile[it0 * 2 + 1] = residual;
+        __syncthreads();
+        for (int t=1;t < by;t <<= 1) {
+          DType tmp, tmp_residual;
+          reducer->SetInitValue(tmp, tmp_residual);
+          if (tidy + t < by) {
+            tmp = shTile[(it0 + t*fbx) * 2];
+            tmp_residual = shTile[(it0 + t*fbx) * 2 + 1];
+          }
+          __syncthreads();
+          reducer->Merge(shTile[it0 * 2], shTile[it0 * 2 + 1], tmp, tmp_residual);
+          __syncthreads();
+        }
+        if (idx < N && tidy == 0) {
+          reducer->Finalize(shTile[tidx * 2], shTile[tidx * 2 + 1]);
+          assign(&small[idx + m0*N], addto, shTile[tidx * 2]);
+        }
+      } else {
+        if (idx < N) {
+          reducer->Finalize(val, residual);
+          assign(&small[idx + m0*N], addto, val);
+        }
+      }
+    }
+  }
+  if (need_clean) {
+    delete reducer;
+  }
+}
+
+// Simple reduction of lines when M is small
+template<typename Reducer, typename DType>
+__launch_bounds__(kMaxThreadsPerBlock)
+__global__ void reduce_lines_kernel_wr(const int N, const int M, const bool addto,
+  const int small_in_stride, const DType* __restrict small_in, DType *small_out,
+  Reducer* reducer = nullptr) {
+  bool need_clean = !reducer;
+  reducer = reducer ? reducer : new Reducer();
+  for (int idx = threadIdx.x + blockIdx.x*blockDim.x; idx < N; idx += blockDim.x*gridDim.x) {
+
+    DType val, residual;
+    reducer->SetInitValue(val, residual);
+    for (int k = 0; k < M; k++) {
+      reducer->Reduce(val, small_in[idx + k*small_in_stride], residual);
+    }
+
+    if (idx < N) {
+      reducer->Finalize(val, residual);
+      assign(&small_out[idx], addto, val);
+    }
+
+  }
+  if (need_clean) {
+    delete reducer;
+  }
+}
+
+template<typename Reducer, int ndim, typename AType, typename DType, typename OType, typename OP>
+__launch_bounds__(kMaxThreadsPerBlock)
+__global__ void reduce_kernel_M1_wr(const int N, const bool addto,
+                                    const DType* __restrict big, OType *small, const Shape<ndim> bshape,
+                                    const Shape<ndim> sshape, Reducer* reducer = nullptr) {
+  bool need_clean = !reducer;
+  reducer = reducer ? reducer : new Reducer();
 
 Review comment:
   I think we can probably move all those reducer creations out of the kernels and put them on the host side.
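   For illustration only, a minimal sketch (not this PR's code) of one way to do that: construct the reducer on the host and pass it to the kernel by value, so the kernel never calls new/delete on the device. SumReducer, reduce_rows, and all other names below are hypothetical stand-ins, not the templated Reducer / reduce_kernel_wr in this file.

   // Sketch: host-constructed reducer passed to a CUDA kernel by value.
   // Assumes the reducer type is trivially copyable.
   #include <cstdio>
   #include <cuda_runtime.h>

   struct SumReducer {  // hypothetical reducer with the same interface shape
     __host__ __device__ void SetInitValue(float &val, float &residual) const {
       val = 0.f; residual = 0.f;
     }
     __host__ __device__ void Reduce(float &val, float x, float &residual) const {
       val += x;  // residual unused in this toy version
     }
     __host__ __device__ void Finalize(float &val, float &residual) const {}
   };

   template <typename Reducer>
   __global__ void reduce_rows(const float *in, float *out, int N, int M,
                               Reducer reducer) {  // copied to the device as a kernel argument
     int idx = blockIdx.x * blockDim.x + threadIdx.x;
     if (idx >= N) return;
     float val, residual;
     reducer.SetInitValue(val, residual);
     for (int k = 0; k < M; ++k) reducer.Reduce(val, in[idx * M + k], residual);
     reducer.Finalize(val, residual);
     out[idx] = val;
   }

   int main() {
     const int N = 4, M = 8;
     float h_in[N * M], h_out[N];
     for (int i = 0; i < N * M; ++i) h_in[i] = 1.f;
     float *d_in, *d_out;
     cudaMalloc(&d_in, sizeof(h_in));
     cudaMalloc(&d_out, sizeof(h_out));
     cudaMemcpy(d_in, h_in, sizeof(h_in), cudaMemcpyHostToDevice);
     SumReducer reducer;  // created on the host, no device-side new/delete
     reduce_rows<<<1, 32>>>(d_in, d_out, N, M, reducer);
     cudaMemcpy(h_out, d_out, sizeof(h_out), cudaMemcpyDeviceToHost);
     for (int i = 0; i < N; ++i) printf("%f\n", h_out[i]);
     cudaFree(d_in); cudaFree(d_out);
     return 0;
   }

   Another option would be to allocate and initialize the reducer on the host with cudaMalloc/cudaMemcpy and keep the existing pointer-style signature; either approach would avoid the per-kernel device-side allocation shown in the diff above.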

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


With regards,
Apache Git Services
