From: apalumbo@apache.org
To: commits@mahout.apache.org
Reply-To: dev@mahout.apache.org
Date: Wed, 08 Jun 2016 21:40:28 -0000
Message-Id: <80a90c38ab8b48d69bd0b5b71f0b0e6c@git.apache.org>
Subject: [31/51] [partial] mahout git commit: (nojira) add native-viennaCL module to codebase. closes apache/mahout#241

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/sparse_matrix_operations.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/sparse_matrix_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/sparse_matrix_operations.hpp
new file mode 100644
index 0000000..51d99e1
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/sparse_matrix_operations.hpp
@@ -0,0 +1,2809 @@
+#ifndef VIENNACL_LINALG_CUDA_SPARSE_MATRIX_OPERATIONS_HPP_
+#define VIENNACL_LINALG_CUDA_SPARSE_MATRIX_OPERATIONS_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/cuda/sparse_matrix_operations.hpp
+    @brief Implementations of operations using sparse matrices using CUDA
+*/
+
+#include "viennacl/forwards.h"
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/linalg/cuda/common.hpp"
+#include "viennacl/linalg/cuda/vector_operations.hpp"
+
+#include "viennacl/linalg/cuda/sparse_matrix_operations_solve.hpp"
+
+//#ifdef VIENNACL_WITH_SPGEMM_RMERGE
+ #include "viennacl/linalg/cuda/spgemm_rmerge.hpp"
+//#else
+// #include "viennacl/linalg/cuda/spgemm.hpp"
+//#endif
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace cuda
+{
+//
+// Compressed matrix
+//
+
+namespace detail
+{
+
+  template<typename NumericT>
+  __global__ void csr_row_info_extractor_kernel(
+            const unsigned int * row_indices,
+            const unsigned int * column_indices,
+            const NumericT * elements,
+            NumericT * result,
+            unsigned int size,
+            unsigned int option)
+  {
+    for (unsigned int row  = blockDim.x * blockIdx.x + threadIdx.x;
+                      row  < size;
+                      row += gridDim.x * blockDim.x)
+    {
+      NumericT value = 0;
+      unsigned int row_end = row_indices[row+1];
+
+      switch (option)
+      {
+      case 0: //inf-norm
+        for (unsigned int i = row_indices[row]; i < row_end; ++i)
+          value = max(value, fabs(elements[i]));
+        break;
+
+      case 1: //1-norm
+        for (unsigned int i = row_indices[row]; i < row_end; ++i)
+          value += fabs(elements[i]);
+        break;
+
+      case 2: //2-norm
+        for (unsigned int i = row_indices[row]; i < row_end; ++i)
+          value += elements[i] * elements[i];
+        value = sqrt(value);
+        break;
+
+      case 3: //diagonal entry
+        for (unsigned int i = row_indices[row]; i < row_end; ++i)
+        {
+          if (column_indices[i] == row)
+          {
+            value = elements[i];
+            break;
+          }
+        }
+        break;
+
+      default:
+        break;
+      }
+      result[row] = value;
+    }
+  }
+
+
+  template<typename NumericT, unsigned int AlignmentV>
+  void row_info(compressed_matrix<NumericT, AlignmentV> const & mat,
+                vector_base<NumericT> & vec,
+                viennacl::linalg::detail::row_info_types info_selector)
+  {
+    csr_row_info_extractor_kernel<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(mat.handle1()),
+                                                viennacl::cuda_arg<unsigned int>(mat.handle2()),
+                                                viennacl::cuda_arg<NumericT>(mat.handle()),
+                                                viennacl::cuda_arg(vec),
+                                                static_cast<unsigned int>(mat.size1()),
+                                                static_cast<unsigned int>(info_selector)
+                                               );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("csr_row_info_extractor_kernel");
+  }
+
+  struct spmv_pure
+  {
+    template<typename NumericT>
+    __device__ static void apply(NumericT & result, NumericT alpha, NumericT Ax, NumericT beta) { result = Ax; }
+  };
+
+  struct spmv_alpha_beta
+  {
+    template<typename NumericT>
+    __device__ static void apply(NumericT & result, NumericT alpha, NumericT Ax, NumericT beta) { result = alpha * Ax + ((beta != 0) ? beta * result : 0); }
+  };
+
+} //namespace detail
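+// Editor's note (illustrative sketch, not part of the original ViennaCL sources):
+// the two handler structs above let the SpMV kernels fold the generic update
+//   result = alpha * (A*x) + beta * result
+// into a compile-time choice, so the common alpha == 1, beta == 0 case never
+// reads 'result' from global memory. A minimal host-side sketch of the same
+// static-dispatch pattern:
+//
+//   struct pure       { static void apply(double& r, double,   double Ax, double)   { r = Ax; } };
+//   struct alpha_beta { static void apply(double& r, double a, double Ax, double b) { r = a * Ax + (b != 0 ? b * r : 0); } };
+//
+//   template<typename HandlerT>
+//   void update_row(double& r, double alpha, double Ax, double beta) {
+//     HandlerT::apply(r, alpha, Ax, beta);   // resolved at compile time, no per-element branch
+//   }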
+
+
+
+template<unsigned int SubWarpSizeV, typename AlphaBetaHandlerT, typename NumericT>
+__global__ void compressed_matrix_vec_mul_kernel(
+          const unsigned int * row_indices,
+          const unsigned int * column_indices,
+          const NumericT * elements,
+          const NumericT * x,
+          unsigned int start_x,
+          unsigned int inc_x,
+          NumericT alpha,
+          NumericT * result,
+          unsigned int start_result,
+          unsigned int inc_result,
+          unsigned int size_result,
+          NumericT beta)
+{
+  __shared__ NumericT shared_elements[512];
+
+  const unsigned int id_in_row       = threadIdx.x % SubWarpSizeV;
+  const unsigned int block_increment = blockDim.x * ((size_result - 1) / (gridDim.x * blockDim.x) + 1);
+  const unsigned int block_start     = blockIdx.x * block_increment;
+  const unsigned int block_stop      = min(block_start + block_increment, size_result);
+
+  for (unsigned int row  = block_start + threadIdx.x / SubWarpSizeV;
+                    row  < block_stop;
+                    row += blockDim.x / SubWarpSizeV)
+  {
+    NumericT dot_prod = NumericT(0);
+    unsigned int row_end = row_indices[row+1];
+    for (unsigned int i = row_indices[row] + id_in_row; i < row_end; i += SubWarpSizeV)
+      dot_prod += elements[i] * x[column_indices[i] * inc_x + start_x];
+
+    shared_elements[threadIdx.x] = dot_prod;
+    if (1  < SubWarpSizeV) shared_elements[threadIdx.x] += shared_elements[threadIdx.x ^  1];
+    if (2  < SubWarpSizeV) shared_elements[threadIdx.x] += shared_elements[threadIdx.x ^  2];
+    if (4  < SubWarpSizeV) shared_elements[threadIdx.x] += shared_elements[threadIdx.x ^  4];
+    if (8  < SubWarpSizeV) shared_elements[threadIdx.x] += shared_elements[threadIdx.x ^  8];
+    if (16 < SubWarpSizeV) shared_elements[threadIdx.x] += shared_elements[threadIdx.x ^ 16];
+
+    if (id_in_row == 0)
+      AlphaBetaHandlerT::apply(result[row * inc_result + start_result], alpha, shared_elements[threadIdx.x], beta);
+  }
+}
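+// Editor's note (illustrative sketch, not part of the original ViennaCL sources):
+// the XOR ("butterfly") steps above reduce the partial sums of the SubWarpSizeV
+// threads sharing one row without divergent control flow. For SubWarpSizeV == 4,
+// each lane t accumulates as follows:
+//
+//   s[t] += s[t ^ 1];   // pairs:  (0,1) (2,3) exchanged
+//   s[t] += s[t ^ 2];   // quads:  every lane now holds the sum of all four
+//
+// so lane 0 of each sub-warp (id_in_row == 0) ends up with the full dot product
+// and is the one that writes the result.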
+
+
+template<typename AlphaBetaHandlerT, typename NumericT>
+__global__ void compressed_matrix_vec_mul_adaptive_kernel(
+          const unsigned int * row_indices,
+          const unsigned int * column_indices,
+          const unsigned int * row_blocks,
+          const NumericT * elements,
+          unsigned int num_blocks,
+          const NumericT * x,
+          unsigned int start_x,
+          unsigned int inc_x,
+          NumericT alpha,
+          NumericT * result,
+          unsigned int start_result,
+          unsigned int inc_result,
+          unsigned int size_result,
+          NumericT beta)
+{
+  __shared__ NumericT shared_elements[1024];
+
+  for (unsigned int block_id = blockIdx.x; block_id < num_blocks; block_id += gridDim.x)
+  {
+    unsigned int row_start = row_blocks[block_id];
+    unsigned int row_stop  = row_blocks[block_id + 1];
+    unsigned int element_start = row_indices[row_start];
+    unsigned int element_stop  = row_indices[row_stop];
+    unsigned int rows_to_process = row_stop - row_start;
+
+    if (rows_to_process > 1)  // CSR stream with one thread per row
+    {
+      // load to shared buffer:
+      for (unsigned int i = element_start + threadIdx.x; i < element_stop; i += blockDim.x)
+        shared_elements[i - element_start] = elements[i] * x[column_indices[i] * inc_x + start_x];
+
+      __syncthreads();
+
+      // use one thread per row to sum:
+      for (unsigned int row = row_start + threadIdx.x; row < row_stop; row += blockDim.x)
+      {
+        NumericT dot_prod = 0;
+        unsigned int thread_row_start = row_indices[row]     - element_start;
+        unsigned int thread_row_stop  = row_indices[row + 1] - element_start;
+        for (unsigned int i = thread_row_start; i < thread_row_stop; ++i)
+          dot_prod += shared_elements[i];
+        AlphaBetaHandlerT::apply(result[row * inc_result + start_result], alpha, dot_prod, beta);
+      }
+    }
+    // TODO here: Consider CSR vector for two to four rows (cf. OpenCL implementation. Experience on Fermi suggests that this may not be necessary)
+    else // CSR vector for a single row
+    {
+      // load and sum to shared buffer:
+      shared_elements[threadIdx.x] = 0;
+      for (unsigned int i = element_start + threadIdx.x; i < element_stop; i += blockDim.x)
+        shared_elements[threadIdx.x] += elements[i] * x[column_indices[i] * inc_x + start_x];
+
+      // reduction to obtain final result
+      for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
+      {
+        __syncthreads();
+        if (threadIdx.x < stride)
+          shared_elements[threadIdx.x] += shared_elements[threadIdx.x+stride];
+      }
+
+      if (threadIdx.x == 0)
+        AlphaBetaHandlerT::apply(result[row_start * inc_result + start_result], alpha, shared_elements[0], beta);
+    }
+
+    __syncthreads();  // avoid race conditions
+  }
+}
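+// Editor's note (illustrative sketch, not part of the original ViennaCL sources):
+// the kernel above is the "CSR-adaptive" scheme: mat.handle3() carries
+// 'row_blocks', a host-built partition of rows such that the nonzeros of each
+// partition fit the 1024-entry shared buffer. A minimal way such a partition
+// can be built (assuming the buffer size of 1024 hard-coded above):
+//
+//   std::vector<unsigned int> row_blocks(1, 0);
+//   unsigned int nnz_in_block = 0;
+//   for (unsigned int row = 0; row < num_rows; ++row)
+//   {
+//     unsigned int nnz_in_row = row_ptr[row + 1] - row_ptr[row];
+//     if (nnz_in_block + nnz_in_row > 1024 && nnz_in_block > 0)
+//     {
+//       row_blocks.push_back(row);   // close the current block before this row
+//       nnz_in_block = 0;
+//     }
+//     nnz_in_block += nnz_in_row;    // a single row may exceed 1024 nonzeros and
+//   }                                // then forms a block of its own ('CSR vector' path)
+//   row_blocks.push_back(num_rows);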
+
+
+
+
+/** @brief Carries out matrix-vector multiplication with a compressed_matrix
+*
+* Implementation of the convenience expression result = prod(mat, vec);
+*
+*   @param mat    The matrix
+*   @param vec    The vector
+*   @param result The result vector
+*/
+template<typename NumericT, unsigned int AlignmentV>
+void prod_impl(const viennacl::compressed_matrix<NumericT, AlignmentV> & mat,
+               const viennacl::vector_base<NumericT> & vec,
+               NumericT alpha,
+               viennacl::vector_base<NumericT> & result,
+               NumericT beta)
+{
+  static bool first = true;
+  static bool is_maxwell = false;
+
+  // check whether the CUDA device is from the Maxwell family.
+  // Only run once, because the query to the backend takes about the same time as a kernel launch (~15us), thus being too expensive to query each time.
+  //
+  // Note: This might result in non-optimal kernels being selected if multiple Maxwell- and non-Maxwell GPUs are available in the system and devices are switched at runtime.
+  //       However, this situation is certainly rare, hence the benefits of this singleton outweigh the disadvantages encountered in such a corner case.
+  if (first)
+  {
+    cudaDeviceProp prop;
+    int device_index = 0;
+
+    cudaError_t err_flag = cudaGetDevice(&device_index);
+    if (err_flag == cudaSuccess)
+    {
+      err_flag = cudaGetDeviceProperties(&prop, device_index);
+      if (err_flag == cudaSuccess && prop.major >= 5)
+        is_maxwell = true;
+    }
+    first = false;
+  }
+
+  if (is_maxwell && double(mat.nnz()) / double(mat.size1()) > 6.4)  // less than 10% of threads expected to idle
+  {
+    if (alpha < NumericT(1) || alpha > NumericT(1) || beta < 0 || beta > 0)
+      compressed_matrix_vec_mul_kernel<8, detail::spmv_alpha_beta, NumericT><<<512, 256>>>(   // experience on a GTX 750 Ti suggests that 8 is a substantially better choice here
+                                        viennacl::cuda_arg<unsigned int>(mat.handle1()),
+                                        viennacl::cuda_arg<unsigned int>(mat.handle2()),
+                                        viennacl::cuda_arg<NumericT>(mat.handle()),
+                                        viennacl::cuda_arg(vec),
+                                        static_cast<unsigned int>(vec.start()),
+                                        static_cast<unsigned int>(vec.stride()),
+                                        alpha,
+                                        viennacl::cuda_arg(result),
+                                        static_cast<unsigned int>(result.start()),
+                                        static_cast<unsigned int>(result.stride()),
+                                        static_cast<unsigned int>(result.size()),
+                                        beta
+                                       );
+    else
+      compressed_matrix_vec_mul_kernel<8, detail::spmv_pure, NumericT><<<512, 256>>>(   // experience on a GTX 750 Ti suggests that 8 is a substantially better choice here
+                                        viennacl::cuda_arg<unsigned int>(mat.handle1()),
+                                        viennacl::cuda_arg<unsigned int>(mat.handle2()),
+                                        viennacl::cuda_arg<NumericT>(mat.handle()),
+                                        viennacl::cuda_arg(vec),
+                                        static_cast<unsigned int>(vec.start()),
+                                        static_cast<unsigned int>(vec.stride()),
+                                        alpha,
+                                        viennacl::cuda_arg(result),
+                                        static_cast<unsigned int>(result.start()),
+                                        static_cast<unsigned int>(result.stride()),
+                                        static_cast<unsigned int>(result.size()),
+                                        beta
+                                       );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_vec_mul_kernel");
+  }
+  else if (!is_maxwell && double(mat.nnz()) / double(mat.size1()) > 12.0)  // less than 25% of threads expected to idle
+  {
+    if (alpha < NumericT(1) || alpha > NumericT(1) || beta < 0 || beta > 0)
+      compressed_matrix_vec_mul_kernel<16, detail::spmv_alpha_beta, NumericT><<<512, 256>>>(   // Fermi and Kepler prefer 16 threads per row (half-warp)
+                                        viennacl::cuda_arg<unsigned int>(mat.handle1()),
+                                        viennacl::cuda_arg<unsigned int>(mat.handle2()),
+                                        viennacl::cuda_arg<NumericT>(mat.handle()),
+                                        viennacl::cuda_arg(vec),
+                                        static_cast<unsigned int>(vec.start()),
+                                        static_cast<unsigned int>(vec.stride()),
+                                        alpha,
+                                        viennacl::cuda_arg(result),
+                                        static_cast<unsigned int>(result.start()),
+                                        static_cast<unsigned int>(result.stride()),
+                                        static_cast<unsigned int>(result.size()),
+                                        beta
+                                       );
+    else
+      compressed_matrix_vec_mul_kernel<16, detail::spmv_pure, NumericT><<<512, 256>>>(   // Fermi and Kepler prefer 16 threads per row (half-warp)
+                                        viennacl::cuda_arg<unsigned int>(mat.handle1()),
+                                        viennacl::cuda_arg<unsigned int>(mat.handle2()),
+                                        viennacl::cuda_arg<NumericT>(mat.handle()),
+                                        viennacl::cuda_arg(vec),
+                                        static_cast<unsigned int>(vec.start()),
+                                        static_cast<unsigned int>(vec.stride()),
+                                        alpha,
+                                        viennacl::cuda_arg(result),
+                                        static_cast<unsigned int>(result.start()),
+                                        static_cast<unsigned int>(result.stride()),
+                                        static_cast<unsigned int>(result.size()),
+                                        beta
+                                       );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_vec_mul_kernel");
+  }
+  else
+  {
+    if (alpha < NumericT(1) || alpha > NumericT(1) || beta < 0 || beta > 0)
+      compressed_matrix_vec_mul_adaptive_kernel<detail::spmv_alpha_beta><<<512, 256>>>(viennacl::cuda_arg<unsigned int>(mat.handle1()),
+                                                viennacl::cuda_arg<unsigned int>(mat.handle2()),
+                                                viennacl::cuda_arg<unsigned int>(mat.handle3()),
+                                                viennacl::cuda_arg<NumericT>(mat.handle()),
+                                                static_cast<unsigned int>(mat.blocks1()),
+                                                viennacl::cuda_arg(vec),
+                                                static_cast<unsigned int>(vec.start()),
+                                                static_cast<unsigned int>(vec.stride()),
+                                                alpha,
+                                                viennacl::cuda_arg(result),
+                                                static_cast<unsigned int>(result.start()),
+                                                static_cast<unsigned int>(result.stride()),
+                                                static_cast<unsigned int>(result.size()),
+                                                beta
+                                               );
+    else
+      compressed_matrix_vec_mul_adaptive_kernel<detail::spmv_pure><<<512, 256>>>(viennacl::cuda_arg<unsigned int>(mat.handle1()),
+                                                viennacl::cuda_arg<unsigned int>(mat.handle2()),
+                                                viennacl::cuda_arg<unsigned int>(mat.handle3()),
+                                                viennacl::cuda_arg<NumericT>(mat.handle()),
+                                                static_cast<unsigned int>(mat.blocks1()),
+                                                viennacl::cuda_arg(vec),
+                                                static_cast<unsigned int>(vec.start()),
+                                                static_cast<unsigned int>(vec.stride()),
+                                                alpha,
+                                                viennacl::cuda_arg(result),
+                                                static_cast<unsigned int>(result.start()),
+                                                static_cast<unsigned int>(result.stride()),
+                                                static_cast<unsigned int>(result.size()),
+                                                beta
+                                               );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_vec_mul_adaptive_kernel");
+  }
+}
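+// Editor's note (usage sketch, assuming the usual ViennaCL host entry points):
+// this prod_impl is what viennacl::linalg::prod() dispatches to on the CUDA
+// backend, e.g.
+//
+//   viennacl::compressed_matrix<float> A(m, n);   // filled via copy() from host data
+//   viennacl::vector<float> x(n), y(m);
+//   y = viennacl::linalg::prod(A, x);             // ends up in prod_impl above
+//
+// The nnz/rows thresholds (6.4 and 12.0) select the sub-warp kernel only when
+// the average row length keeps most lanes of an 8- or 16-thread group busy;
+// shorter rows fall back to the adaptive kernel.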
+
+/** @brief Helper struct for accessing an element of a row- or column-major matrix.
+  *
+  * @param LayoutT  The layout tag: Either row_major or column_major
+  */
+template<typename LayoutT>
+struct mat_mult_matrix_index
+{
+  static __device__ unsigned int apply(unsigned int i, unsigned int j,
+                                       unsigned int row_start, unsigned int row_inc,
+                                       unsigned int col_start, unsigned int col_inc,
+                                       unsigned int internal_rows, unsigned int internal_cols)
+  {
+    return (row_start + i * row_inc) * internal_cols + col_start + j * col_inc;
+  }
+};
+
+/** \cond */
+template<>
+struct mat_mult_matrix_index<viennacl::column_major>
+{
+  static __device__ unsigned int apply(unsigned int i, unsigned int j,
+                                       unsigned int row_start, unsigned int row_inc,
+                                       unsigned int col_start, unsigned int col_inc,
+                                       unsigned int internal_rows, unsigned int internal_cols)
+  {
+    return (row_start + i * row_inc) + (col_start + j * col_inc) * internal_rows;
+  }
+};
+/** \endcond */
+
+
+template<typename DMatIndexT, typename ResultIndexT, typename NumericT>
+__global__ void compressed_matrix_d_mat_mul_kernel(
+          const unsigned int * sp_mat_row_indices,
+          const unsigned int * sp_mat_col_indices,
+          const NumericT * sp_mat_elements,
+          const NumericT * d_mat,
+          unsigned int d_mat_row_start,
+          unsigned int d_mat_col_start,
+          unsigned int d_mat_row_inc,
+          unsigned int d_mat_col_inc,
+          unsigned int d_mat_row_size,
+          unsigned int d_mat_col_size,
+          unsigned int d_mat_internal_rows,
+          unsigned int d_mat_internal_cols,
+          NumericT * result,
+          unsigned int result_row_start,
+          unsigned int result_col_start,
+          unsigned int result_row_inc,
+          unsigned int result_col_inc,
+          unsigned int result_row_size,
+          unsigned int result_col_size,
+          unsigned int result_internal_rows,
+          unsigned int result_internal_cols)
+{
+  for (unsigned int row = blockIdx.x; row < result_row_size; row += gridDim.x)
+  {
+    unsigned int row_start = sp_mat_row_indices[row];
+    unsigned int row_end   = sp_mat_row_indices[row+1];
+
+    for ( unsigned int col = threadIdx.x; col < result_col_size; col += blockDim.x)
+    {
+      NumericT r = 0;
+
+      for (unsigned int k = row_start; k < row_end; k++)
+      {
+        unsigned int j = sp_mat_col_indices[k];
+        NumericT x = sp_mat_elements[k];
+        NumericT y = d_mat[ DMatIndexT::apply(j, col,
+                                              d_mat_row_start, d_mat_row_inc,
+                                              d_mat_col_start, d_mat_col_inc,
+                                              d_mat_internal_rows, d_mat_internal_cols) ];
+
+        r += x * y;
+      }
+
+      result[ResultIndexT::apply(row, col,
+                                 result_row_start, result_row_inc,
+                                 result_col_start, result_col_inc,
+                                 result_internal_rows, result_internal_cols)] = r;
+    }
+  }
+}
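+// Editor's note (illustrative sketch, not part of the original ViennaCL sources):
+// the two index mappers above encode the usual dense layouts. For a matrix
+// padded to internal size 2x4 (row-major), entry (i = 1, j = 2) with unit
+// strides and zero offsets maps to
+//
+//   row-major:    (0 + 1*1) * 4 + 0 + 2*1 = 6
+//   column-major: (0 + 1*1) + (0 + 2*1) * internal_rows
+//
+// which is why the kernels take internal_rows/internal_cols separately from
+// the logical sizes: padding changes the address, not the extent.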
+
+
+/** @brief Carries out sparse_matrix-dense_matrix multiplication, the first matrix being compressed
+*
+* Implementation of the convenience expression result = prod(sp_mat, d_mat);
+*
+*   @param sp_mat   The sparse matrix
+*   @param d_mat    The dense matrix
+*   @param result   The result matrix
+*/
+template<typename NumericT, unsigned int AlignmentV>
+void prod_impl(const viennacl::compressed_matrix<NumericT, AlignmentV> & sp_mat,
+               const viennacl::matrix_base<NumericT> & d_mat,
+               viennacl::matrix_base<NumericT> & result)
+{
+  if (d_mat.row_major() && result.row_major())
+  {
+    compressed_matrix_d_mat_mul_kernel<mat_mult_matrix_index<row_major>, mat_mult_matrix_index<row_major> ><<<128, 128>>>
+      (viennacl::cuda_arg<unsigned int>(sp_mat.handle1()),
+       viennacl::cuda_arg<unsigned int>(sp_mat.handle2()),
+       viennacl::cuda_arg<NumericT>(sp_mat.handle()),
+
+       viennacl::cuda_arg(d_mat),
+       static_cast<unsigned int>(viennacl::traits::start1(d_mat)),         static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
+       static_cast<unsigned int>(viennacl::traits::stride1(d_mat)),        static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
+       static_cast<unsigned int>(viennacl::traits::size1(d_mat)),          static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
+       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),
+
+       viennacl::cuda_arg(result),
+       static_cast<unsigned int>(viennacl::traits::start1(result)),         static_cast<unsigned int>(viennacl::traits::start2(result)),
+       static_cast<unsigned int>(viennacl::traits::stride1(result)),        static_cast<unsigned int>(viennacl::traits::stride2(result)),
+       static_cast<unsigned int>(viennacl::traits::size1(result)),          static_cast<unsigned int>(viennacl::traits::size2(result)),
+       static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
+      );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_d_mat_mul_kernel");
+  }
+  else if (d_mat.row_major() && !result.row_major())
+  {
+    compressed_matrix_d_mat_mul_kernel<mat_mult_matrix_index<row_major>, mat_mult_matrix_index<column_major> ><<<128, 128>>>
+      (viennacl::cuda_arg<unsigned int>(sp_mat.handle1()),
+       viennacl::cuda_arg<unsigned int>(sp_mat.handle2()),
+       viennacl::cuda_arg<NumericT>(sp_mat.handle()),
+
+       viennacl::cuda_arg(d_mat),
+       static_cast<unsigned int>(viennacl::traits::start1(d_mat)),         static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
+       static_cast<unsigned int>(viennacl::traits::stride1(d_mat)),        static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
+       static_cast<unsigned int>(viennacl::traits::size1(d_mat)),          static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
+       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),
+
+       viennacl::cuda_arg(result),
+       static_cast<unsigned int>(viennacl::traits::start1(result)),         static_cast<unsigned int>(viennacl::traits::start2(result)),
+       static_cast<unsigned int>(viennacl::traits::stride1(result)),        static_cast<unsigned int>(viennacl::traits::stride2(result)),
+       static_cast<unsigned int>(viennacl::traits::size1(result)),          static_cast<unsigned int>(viennacl::traits::size2(result)),
+       static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
+      );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_d_mat_mul_kernel");
+  }
+  else if (!d_mat.row_major() && result.row_major())
+  {
+    compressed_matrix_d_mat_mul_kernel<mat_mult_matrix_index<column_major>, mat_mult_matrix_index<row_major> ><<<128, 128>>>
+      (viennacl::cuda_arg<unsigned int>(sp_mat.handle1()),
+       viennacl::cuda_arg<unsigned int>(sp_mat.handle2()),
+       viennacl::cuda_arg<NumericT>(sp_mat.handle()),
+
+       viennacl::cuda_arg(d_mat),
+       static_cast<unsigned int>(viennacl::traits::start1(d_mat)),         static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
+       static_cast<unsigned int>(viennacl::traits::stride1(d_mat)),        static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
+       static_cast<unsigned int>(viennacl::traits::size1(d_mat)),          static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
+       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),
+
+       viennacl::cuda_arg(result),
+       static_cast<unsigned int>(viennacl::traits::start1(result)),         static_cast<unsigned int>(viennacl::traits::start2(result)),
+       static_cast<unsigned int>(viennacl::traits::stride1(result)),        static_cast<unsigned int>(viennacl::traits::stride2(result)),
+       static_cast<unsigned int>(viennacl::traits::size1(result)),          static_cast<unsigned int>(viennacl::traits::size2(result)),
+       static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
+      );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_d_mat_mul_kernel");
+  }
+  else
+  {
+    compressed_matrix_d_mat_mul_kernel<mat_mult_matrix_index<column_major>, mat_mult_matrix_index<column_major> ><<<128, 128>>>
+      (viennacl::cuda_arg<unsigned int>(sp_mat.handle1()),
+       viennacl::cuda_arg<unsigned int>(sp_mat.handle2()),
+       viennacl::cuda_arg<NumericT>(sp_mat.handle()),
+
+       viennacl::cuda_arg(d_mat),
+       static_cast<unsigned int>(viennacl::traits::start1(d_mat)),         static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
+       static_cast<unsigned int>(viennacl::traits::stride1(d_mat)),        static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
+       static_cast<unsigned int>(viennacl::traits::size1(d_mat)),          static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
+       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),
+
+       viennacl::cuda_arg(result),
+       static_cast<unsigned int>(viennacl::traits::start1(result)),         static_cast<unsigned int>(viennacl::traits::start2(result)),
+       static_cast<unsigned int>(viennacl::traits::stride1(result)),        static_cast<unsigned int>(viennacl::traits::stride2(result)),
+       static_cast<unsigned int>(viennacl::traits::size1(result)),          static_cast<unsigned int>(viennacl::traits::size2(result)),
+       static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
+      );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_d_mat_mul_kernel");
+  }
+}
+
+
+template<typename DMatIndexT, typename ResultIndexT, typename NumericT>
+__global__ void compressed_matrix_d_tr_mat_mul_kernel(
+          const unsigned int * sp_mat_row_indices,
+          const unsigned int * sp_mat_col_indices,
+          const NumericT * sp_mat_elements,
+          const NumericT * d_mat,
+          unsigned int d_mat_row_start,
+          unsigned int d_mat_col_start,
+          unsigned int d_mat_row_inc,
+          unsigned int d_mat_col_inc,
+          unsigned int d_mat_row_size,
+          unsigned int d_mat_col_size,
+          unsigned int d_mat_internal_rows,
+          unsigned int d_mat_internal_cols,
+          NumericT * result,
+          unsigned int result_row_start,
+          unsigned int result_col_start,
+          unsigned int result_row_inc,
+          unsigned int result_col_inc,
+          unsigned int result_row_size,
+          unsigned int result_col_size,
+          unsigned int result_internal_rows,
+          unsigned int result_internal_cols)
+{
+  for (unsigned int row = blockIdx.x; row < result_row_size; row += gridDim.x)
+  {
+    unsigned int row_start = sp_mat_row_indices[row];
+    unsigned int row_end   = sp_mat_row_indices[row+1];
+
+    for ( unsigned int col = threadIdx.x; col < result_col_size; col += blockDim.x)
+    {
+      NumericT r = 0;
+
+      for (unsigned int k = row_start; k < row_end; k++)
+      {
+        unsigned int j = sp_mat_col_indices[k];
+        NumericT x = sp_mat_elements[k];
+        NumericT y = d_mat[ DMatIndexT::apply(col, j,
+                                              d_mat_row_start, d_mat_row_inc,
+                                              d_mat_col_start, d_mat_col_inc,
+                                              d_mat_internal_rows, d_mat_internal_cols) ];
+
+        r += x * y;
+      }
+
+      result [ ResultIndexT::apply(row, col,
+                                   result_row_start, result_row_inc,
+                                   result_col_start, result_col_inc,
+                                   result_internal_rows, result_internal_cols) ] = r;
+    }
+  }
+
+}
+
+/** @brief Carries out sparse_matrix-dense_matrix multiplication, the first matrix being compressed and the second transposed
+*
+* Implementation of the convenience expression result = prod(sp_mat, trans(d_mat));
+*
+*   @param sp_mat   The sparse matrix
+*   @param d_mat    The transposed dense matrix proxy
+*   @param result   The result matrix
+*/
+template<typename NumericT, unsigned int AlignmentV>
+void prod_impl(const viennacl::compressed_matrix<NumericT, AlignmentV> & sp_mat,
+               const viennacl::matrix_expression< const viennacl::matrix_base<NumericT>,
+                                                  const viennacl::matrix_base<NumericT>,
+                                                  viennacl::op_trans > & d_mat,
+               viennacl::matrix_base<NumericT> & result)
+{
+
+  if (d_mat.lhs().row_major() && result.row_major())
+  {
+    compressed_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<row_major>, mat_mult_matrix_index<row_major> ><<<128, 128>>>
+      (viennacl::cuda_arg<unsigned int>(sp_mat.handle1()),
+       viennacl::cuda_arg<unsigned int>(sp_mat.handle2()),
+       viennacl::cuda_arg<NumericT>(sp_mat.handle()),
+
+       viennacl::cuda_arg(d_mat.lhs()),
+       static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())),         static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
+       static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())),        static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
+       static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())),          static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
+       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),
+
+       viennacl::cuda_arg(result),
+       static_cast<unsigned int>(viennacl::traits::start1(result)),         static_cast<unsigned int>(viennacl::traits::start2(result)),
+       static_cast<unsigned int>(viennacl::traits::stride1(result)),        static_cast<unsigned int>(viennacl::traits::stride2(result)),
+       static_cast<unsigned int>(viennacl::traits::size1(result)),          static_cast<unsigned int>(viennacl::traits::size2(result)),
+       static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
+      );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_d_tr_mat_mul_kernel");
+  }
+  else if (d_mat.lhs().row_major() && !result.row_major())
+  {
+    compressed_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<row_major>, mat_mult_matrix_index<column_major> ><<<128, 128>>>
+      (viennacl::cuda_arg<unsigned int>(sp_mat.handle1()),
+       viennacl::cuda_arg<unsigned int>(sp_mat.handle2()),
+       viennacl::cuda_arg<NumericT>(sp_mat.handle()),
+
+       viennacl::cuda_arg(d_mat.lhs()),
+       static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())),         static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
+       static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())),        static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
+       static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())),          static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
+       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),
+
+       viennacl::cuda_arg(result),
+       static_cast<unsigned int>(viennacl::traits::start1(result)),         static_cast<unsigned int>(viennacl::traits::start2(result)),
+       static_cast<unsigned int>(viennacl::traits::stride1(result)),        static_cast<unsigned int>(viennacl::traits::stride2(result)),
+       static_cast<unsigned int>(viennacl::traits::size1(result)),          static_cast<unsigned int>(viennacl::traits::size2(result)),
+       static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
+      );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_d_tr_mat_mul_kernel");
+  }
+  else if (!d_mat.lhs().row_major() && result.row_major())
+  {
+    compressed_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<column_major>, mat_mult_matrix_index<row_major> ><<<128, 128>>>
+      (viennacl::cuda_arg<unsigned int>(sp_mat.handle1()),
+       viennacl::cuda_arg<unsigned int>(sp_mat.handle2()),
+       viennacl::cuda_arg<NumericT>(sp_mat.handle()),
+
+       viennacl::cuda_arg(d_mat.lhs()),
+       static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())),         static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
+       static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())),        static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
+       static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())),          static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
+       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),
+
+       viennacl::cuda_arg(result),
+       static_cast<unsigned int>(viennacl::traits::start1(result)),         static_cast<unsigned int>(viennacl::traits::start2(result)),
+       static_cast<unsigned int>(viennacl::traits::stride1(result)),        static_cast<unsigned int>(viennacl::traits::stride2(result)),
+       static_cast<unsigned int>(viennacl::traits::size1(result)),          static_cast<unsigned int>(viennacl::traits::size2(result)),
+       static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
+      );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_d_tr_mat_mul_kernel");
+  }
+  else
+  {
+    compressed_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<column_major>, mat_mult_matrix_index<column_major> ><<<128, 128>>>
+      (viennacl::cuda_arg<unsigned int>(sp_mat.handle1()),
+       viennacl::cuda_arg<unsigned int>(sp_mat.handle2()),
+       viennacl::cuda_arg<NumericT>(sp_mat.handle()),
+
+       viennacl::cuda_arg(d_mat.lhs()),
+       static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())),         static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
+       static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())),        static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
+       static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())),          static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
+       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),
+
+       viennacl::cuda_arg(result),
+       static_cast<unsigned int>(viennacl::traits::start1(result)),         static_cast<unsigned int>(viennacl::traits::start2(result)),
+       static_cast<unsigned int>(viennacl::traits::stride1(result)),        static_cast<unsigned int>(viennacl::traits::stride2(result)),
+       static_cast<unsigned int>(viennacl::traits::size1(result)),          static_cast<unsigned int>(viennacl::traits::size2(result)),
+       static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
+      );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_d_tr_mat_mul_kernel");
+  }
+}
+
+
+//
+// triangular solves for compressed_matrix
+//
+
+template<typename NumericT>
+__global__ void compressed_matrix_diagonal_kernel(
+          const unsigned int * row_indices,
+          const unsigned int * column_indices,
+          const NumericT * elements,
+          NumericT * result,
+          unsigned int size)
+{
+  for (unsigned int row  = blockDim.x * blockIdx.x + threadIdx.x;
+                    row  < size;
+                    row += gridDim.x * blockDim.x)
+  {
+    NumericT diag = NumericT(0);
+    unsigned int row_end = row_indices[row+1];
+    for (unsigned int i = row_indices[row]; i < row_end; ++i)
+    {
+      unsigned int col_index = column_indices[i];
+      if (col_index == row)
+      {
+        diag = elements[i];
+        break;
+      }
+    }
+    result[row] = diag;
+  }
+}
+
+
+/** @brief Carries out triangular inplace solves
+*
+* @param mat    The matrix
+* @param vec    The vector holding the right hand side. Is overwritten by the solution.
+*/
+template<typename SparseMatrixT, typename NumericT>
+typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixT>::value>::type
+inplace_solve(const SparseMatrixT & mat,
+              viennacl::vector_base<NumericT> & vec,
+              viennacl::linalg::unit_lower_tag)
+{
+  csr_unit_lu_forward_kernel<<<1, 128>>>(viennacl::cuda_arg<unsigned int>(mat.handle1()),
+                                         viennacl::cuda_arg<unsigned int>(mat.handle2()),
+                                         viennacl::cuda_arg<NumericT>(mat.handle()),
+                                         viennacl::cuda_arg(vec),
+                                         static_cast<unsigned int>(mat.size1())
+                                        );
+  VIENNACL_CUDA_LAST_ERROR_CHECK("csr_unit_lu_forward_kernel");
+}
+
+
+/** @brief Carries out triangular inplace solves
+*
+* @param mat    The matrix
+* @param vec    The vector holding the right hand side. Is overwritten by the solution.
+*/
+template<typename SparseMatrixT, typename NumericT>
+typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixT>::value>::type
+inplace_solve(const SparseMatrixT & mat,
+              viennacl::vector_base<NumericT> & vec,
+              viennacl::linalg::lower_tag)
+{
+  csr_lu_forward_kernel<<<1, 128>>>(viennacl::cuda_arg<unsigned int>(mat.handle1()),
+                                    viennacl::cuda_arg<unsigned int>(mat.handle2()),
+                                    viennacl::cuda_arg<NumericT>(mat.handle()),
+                                    viennacl::cuda_arg(vec),
+                                    static_cast<unsigned int>(mat.size1())
+                                   );
+  VIENNACL_CUDA_LAST_ERROR_CHECK("csr_lu_forward_kernel");
+}
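+// Editor's note (usage sketch, assuming the usual ViennaCL host entry points):
+// these kernels back viennacl::linalg::inplace_solve for CSR matrices, e.g. a
+// forward substitution treating the lower triangle as unit-diagonal:
+//
+//   viennacl::compressed_matrix<double> L(n, n);
+//   viennacl::vector<double> rhs(n);
+//   viennacl::linalg::inplace_solve(L, rhs, viennacl::linalg::unit_lower_tag());
+//
+// Note the <<<1, 128>>> launches: substitution is inherently sequential across
+// rows, so a single thread block walks the system with only intra-row
+// parallelism.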
+
+
+/** @brief Carries out triangular inplace solves
+*
+* @param mat    The matrix
+* @param vec    The vector holding the right hand side. Is overwritten by the solution.
+*/
+template<typename SparseMatrixT, typename NumericT>
+typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixT>::value>::type
+inplace_solve(const SparseMatrixT & mat,
+              viennacl::vector_base<NumericT> & vec,
+              viennacl::linalg::unit_upper_tag)
+{
+  csr_unit_lu_backward_kernel<<<1, 128>>>(viennacl::cuda_arg<unsigned int>(mat.handle1()),
+                                          viennacl::cuda_arg<unsigned int>(mat.handle2()),
+                                          viennacl::cuda_arg<NumericT>(mat.handle()),
+                                          viennacl::cuda_arg(vec),
+                                          static_cast<unsigned int>(mat.size1())
+                                         );
+  VIENNACL_CUDA_LAST_ERROR_CHECK("csr_unit_lu_backward_kernel");
+}
+
+
+/** @brief Carries out triangular inplace solves
+*
+* @param mat    The matrix
+* @param vec    The vector holding the right hand side. Is overwritten by the solution.
+*/
+template<typename SparseMatrixT, typename NumericT>
+typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixT>::value>::type
+inplace_solve(const SparseMatrixT & mat,
+              viennacl::vector_base<NumericT> & vec,
+              viennacl::linalg::upper_tag)
+{
+  csr_lu_backward_kernel<<<1, 128>>>(viennacl::cuda_arg<unsigned int>(mat.handle1()),
+                                     viennacl::cuda_arg<unsigned int>(mat.handle2()),
+                                     viennacl::cuda_arg<NumericT>(mat.handle()),
+                                     viennacl::cuda_arg(vec),
+                                     static_cast<unsigned int>(mat.size1())
+                                    );
+  VIENNACL_CUDA_LAST_ERROR_CHECK("csr_lu_backward_kernel");
+}
+
+
+
+// transposed
+
+/** @brief Carries out triangular inplace solves
+*
+* @param mat    The matrix
+* @param vec    The vector holding the right hand side. Is overwritten by the solution.
+*/
+template<typename SparseMatrixT, typename NumericT>
+typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixT>::value>::type
+inplace_solve(const matrix_expression<const SparseMatrixT, const SparseMatrixT, op_trans> & mat,
+              viennacl::vector_base<NumericT> & vec,
+              viennacl::linalg::unit_lower_tag)
+{
+  csr_trans_unit_lu_forward_kernel<<<1, 128>>>(viennacl::cuda_arg<unsigned int>(mat.lhs().handle1()),
+                                               viennacl::cuda_arg<unsigned int>(mat.lhs().handle2()),
+                                               viennacl::cuda_arg<NumericT>(mat.lhs().handle()),
+                                               viennacl::cuda_arg(vec),
+                                               static_cast<unsigned int>(mat.lhs().size1())
+                                              );
+  VIENNACL_CUDA_LAST_ERROR_CHECK("csr_trans_unit_lu_forward_kernel");
+}
+
+
+/** @brief Carries out triangular inplace solves
+*
+* @param mat    The matrix
+* @param vec    The vector holding the right hand side. Is overwritten by the solution.
+*/
+template<typename SparseMatrixT, typename NumericT>
+typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixT>::value>::type
+inplace_solve(const matrix_expression<const SparseMatrixT, const SparseMatrixT, op_trans> & mat,
+              viennacl::vector_base<NumericT> & vec,
+              viennacl::linalg::lower_tag)
+{
+  viennacl::vector<NumericT> diagonal(vec.size());
+
+  compressed_matrix_diagonal_kernel<<<1, 128>>>(viennacl::cuda_arg<unsigned int>(mat.lhs().handle1()),
+                                                viennacl::cuda_arg<unsigned int>(mat.lhs().handle2()),
+                                                viennacl::cuda_arg<NumericT>(mat.lhs().handle()),
+                                                viennacl::cuda_arg(diagonal),
+                                                static_cast<unsigned int>(mat.size1())
+                                               );
+
+  csr_trans_lu_forward_kernel<<<1, 128>>>(viennacl::cuda_arg<unsigned int>(mat.lhs().handle1()),
+                                          viennacl::cuda_arg<unsigned int>(mat.lhs().handle2()),
+                                          viennacl::cuda_arg<NumericT>(mat.lhs().handle()),
+                                          viennacl::cuda_arg(diagonal),
+                                          viennacl::cuda_arg(vec),
+                                          static_cast<unsigned int>(mat.lhs().size1())
+                                         );
+  VIENNACL_CUDA_LAST_ERROR_CHECK("csr_trans_lu_forward_kernel");
+}
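+// Editor's note (clarifying sketch, not part of the original ViennaCL sources):
+// for the non-unit transposed solves the diagonal of A cannot be picked up
+// cheaply while traversing trans(A) column by column, so it is extracted once
+// into a dense vector (compressed_matrix_diagonal_kernel above) and the
+// substitution kernel can then divide by diagonal[row] directly, conceptually:
+//
+//   // vec[row] = (vec[row] - contributions_of_already_solved_rows) / diagonal[row];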
+
+
+/** @brief Carries out triangular inplace solves
+*
+* @param mat    The matrix
+* @param vec    The vector holding the right hand side. Is overwritten by the solution.
+*/
+template<typename SparseMatrixT, typename NumericT>
+typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixT>::value>::type
+inplace_solve(const matrix_expression<const SparseMatrixT, const SparseMatrixT, op_trans> & mat,
+              viennacl::vector_base<NumericT> & vec,
+              viennacl::linalg::unit_upper_tag)
+{
+  csr_trans_unit_lu_backward_kernel<<<1, 128>>>(viennacl::cuda_arg<unsigned int>(mat.lhs().handle1()),
+                                                viennacl::cuda_arg<unsigned int>(mat.lhs().handle2()),
+                                                viennacl::cuda_arg<NumericT>(mat.lhs().handle()),
+                                                viennacl::cuda_arg(vec),
+                                                static_cast<unsigned int>(mat.lhs().size1())
+                                               );
+  VIENNACL_CUDA_LAST_ERROR_CHECK("csr_trans_unit_lu_backward_kernel");
+}
+
+
+/** @brief Carries out triangular inplace solves
+*
+* @param mat    The matrix
+* @param vec    The vector holding the right hand side. Is overwritten by the solution.
+*/
+template<typename SparseMatrixT, typename NumericT>
+typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixT>::value>::type
+inplace_solve(const matrix_expression<const SparseMatrixT, const SparseMatrixT, op_trans> & mat,
+              viennacl::vector_base<NumericT> & vec,
+              viennacl::linalg::upper_tag)
+{
+  viennacl::vector<NumericT> diagonal(vec.size());
+
+  compressed_matrix_diagonal_kernel<<<1, 128>>>(viennacl::cuda_arg<unsigned int>(mat.lhs().handle1()),
+                                                viennacl::cuda_arg<unsigned int>(mat.lhs().handle2()),
+                                                viennacl::cuda_arg<NumericT>(mat.lhs().handle()),
+                                                viennacl::cuda_arg(diagonal),
+                                                static_cast<unsigned int>(mat.size1())
+                                               );
+
+  csr_trans_lu_backward_kernel<<<1, 128>>>(viennacl::cuda_arg<unsigned int>(mat.lhs().handle1()),
+                                           viennacl::cuda_arg<unsigned int>(mat.lhs().handle2()),
+                                           viennacl::cuda_arg<NumericT>(mat.lhs().handle()),
+                                           viennacl::cuda_arg(diagonal),
+                                           viennacl::cuda_arg(vec),
+                                           static_cast<unsigned int>(mat.lhs().size1())
+                                          );
+  VIENNACL_CUDA_LAST_ERROR_CHECK("csr_trans_lu_backward_kernel");
+}
+
+namespace detail
+{
+  //
+  // block solves
+  //
+  template<typename NumericT, unsigned int AlignmentV>
+  void block_inplace_solve(const matrix_expression<const compressed_matrix<NumericT, AlignmentV>,
+                                                   const compressed_matrix<NumericT, AlignmentV>,
+                                                   op_trans> & L,
+                           viennacl::backend::mem_handle const & block_indices, vcl_size_t num_blocks,
+                           vector_base<NumericT> const & /* L_diagonal */,  //ignored
+                           vector_base<NumericT> & vec,
+                           viennacl::linalg::unit_lower_tag)
+  {
+    csr_block_trans_unit_lu_forward<<<num_blocks, 128>>>(viennacl::cuda_arg<unsigned int>(L.lhs().handle1()),
+                                                         viennacl::cuda_arg<unsigned int>(L.lhs().handle2()),
+                                                         viennacl::cuda_arg<NumericT>(L.lhs().handle()),
+                                                         viennacl::cuda_arg<unsigned int>(block_indices),
+                                                         viennacl::cuda_arg(vec),
+                                                         static_cast<unsigned int>(L.lhs().size1())
+                                                        );
+  }
+
+
+  template<typename NumericT, unsigned int AlignmentV>
+  void block_inplace_solve(const matrix_expression<const compressed_matrix<NumericT, AlignmentV>,
+                                                   const compressed_matrix<NumericT, AlignmentV>,
+                                                   op_trans> & U,
+                           viennacl::backend::mem_handle const & block_indices, vcl_size_t num_blocks,
+                           vector_base<NumericT> const & U_diagonal,
+                           vector_base<NumericT> & vec,
+                           viennacl::linalg::upper_tag)
+  {
+    csr_block_trans_lu_backward<<<num_blocks, 128>>>(viennacl::cuda_arg<unsigned int>(U.lhs().handle1()),
+                                                     viennacl::cuda_arg<unsigned int>(U.lhs().handle2()),
+                                                     viennacl::cuda_arg<NumericT>(U.lhs().handle()),
+                                                     viennacl::cuda_arg(U_diagonal),
+                                                     viennacl::cuda_arg<unsigned int>(block_indices),
+                                                     viennacl::cuda_arg(vec),
+                                                     static_cast<unsigned int>(U.lhs().size1())
+                                                    );
+  }
+
+
+}
+
+
+//
+// Compressed Compressed Matrix
+//
+
+template<typename NumericT>
+__global__ void compressed_compressed_matrix_vec_mul_kernel(
+          const unsigned int * row_jumper,
+          const unsigned int * row_indices,
+          const unsigned int * column_indices,
+          const NumericT * elements,
+          unsigned int nonzero_rows,
+          const NumericT * x,
+          unsigned int start_x,
+          unsigned int inc_x,
+          NumericT alpha,
+          NumericT * result,
+          unsigned int start_result,
+          unsigned int inc_result,
+          unsigned int size_result,
+          NumericT beta)
+{
+  for (unsigned int i  = blockDim.x * blockIdx.x + threadIdx.x;
+                    i  < nonzero_rows;
+                    i += gridDim.x * blockDim.x)
+  {
+    NumericT dot_prod = NumericT(0);
+    unsigned int row_end = row_jumper[i+1];
+    for (unsigned int j = row_jumper[i]; j < row_end; ++j)
+      dot_prod += elements[j] * x[column_indices[j] * inc_x + start_x];
+
+    unsigned int index = row_indices[i] * inc_result + start_result;
+    if (beta != 0) result[index] += alpha * dot_prod;
+    else           result[index]  = alpha * dot_prod;
+  }
+}
+
+
+/** @brief Carries out matrix-vector multiplication with a compressed_compressed_matrix
+*
+* Implementation of the convenience expression result = prod(mat, vec);
+*
+*   @param mat    The matrix
+*   @param vec    The vector
+*   @param result The result vector
+*/
+template<typename NumericT>
+void prod_impl(const viennacl::compressed_compressed_matrix<NumericT> & mat,
+               const viennacl::vector_base<NumericT> & vec,
+               NumericT alpha,
+               viennacl::vector_base<NumericT> & result,
+               NumericT beta)
+{
+  if (beta < 0 || beta > 0)
+    viennacl::linalg::cuda::av(result, result, beta, 1, false, false);
+  else
+    result.clear();
+
+  compressed_compressed_matrix_vec_mul_kernel<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(mat.handle1()),
+                                                            viennacl::cuda_arg<unsigned int>(mat.handle3()),
+                                                            viennacl::cuda_arg<unsigned int>(mat.handle2()),
+                                                            viennacl::cuda_arg<NumericT>(mat.handle()),
+                                                            static_cast<unsigned int>(mat.nnz1()),
+                                                            viennacl::cuda_arg(vec),
+                                                            static_cast<unsigned int>(vec.start()),
+                                                            static_cast<unsigned int>(vec.stride()),
+                                                            alpha,
+                                                            viennacl::cuda_arg(result),
+                                                            static_cast<unsigned int>(result.start()),
+                                                            static_cast<unsigned int>(result.stride()),
+                                                            static_cast<unsigned int>(result.size()),
+                                                            beta
+                                                           );
+  VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_compressed_matrix_vec_mul_kernel");
+}
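+// Editor's note (clarifying sketch, not part of the original ViennaCL sources):
+// a compressed_compressed_matrix stores only the nonzero rows of a CSR matrix:
+// row_jumper[] is the usual CSR pointer array over the nnz1() retained rows,
+// and row_indices[] scatters packed row i back to its original row index.
+// Hence the kernel writes
+//
+//   result[row_indices[i] * inc_result + start_result] (+)= alpha * dot_prod;
+//
+// and prod_impl pre-scales (beta != 0) or clears (beta == 0) the whole result
+// vector beforehand, because rows absent from the packed matrix are never
+// touched by the kernel.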
+
+//
+// Coordinate Matrix
+//
+
+
+namespace detail
+{
+
+  template<typename NumericT>
+  __global__ void coo_row_info_extractor( const unsigned int * coords, //(row_index, column_index)
+                                          const NumericT * elements,
+                                          const unsigned int * group_boundaries,
+                                          NumericT * result,
+                                          unsigned int option)
+  {
+    __shared__ unsigned int shared_rows[128];
+    __shared__ NumericT inter_results[128];
+
+    uint2 tmp;
+    NumericT val;
+    unsigned int last_index  = blockDim.x - 1;
+    unsigned int group_start = group_boundaries[blockIdx.x];
+    unsigned int group_end   = group_boundaries[blockIdx.x + 1];
+    unsigned int k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / blockDim.x : 0;   // -1 in order to have correct behavior if group_end - group_start == j * blockDim.x
+
+    unsigned int local_index = 0;
+
+    for (unsigned int k = 0; k < k_end; ++k)
+    {
+      local_index = group_start + k * blockDim.x + threadIdx.x;
+
+      tmp = (local_index < group_end) ? ((const uint2 *)coords)[local_index] : ::make_uint2(0, 0);
+      val = (local_index < group_end && (option != 3 || tmp.x == tmp.y) ) ? elements[local_index] : 0;
+
+      //check for carry from previous loop run:
+      if (threadIdx.x == 0 && k > 0)
+      {
+        if (tmp.x == shared_rows[last_index])
+        {
+          switch (option)
+          {
+          case 0: //inf-norm
+          case 3: //diagonal entry
+            val = max(val, fabs(inter_results[last_index]));
+            break;
+
+          case 1: //1-norm
+            val = fabs(val) + inter_results[last_index];
+            break;
+
+          case 2: //2-norm
+            val = sqrt(val * val + inter_results[last_index]);
+            break;
+
+          default:
+            break;
+          }
+        }
+        else
+        {
+          switch (option)
+          {
+          case 0: //inf-norm
+          case 1: //1-norm
+          case 3: //diagonal entry
+            result[shared_rows[last_index]] = inter_results[last_index];
+            break;
+
+          case 2: //2-norm
+            result[shared_rows[last_index]] = sqrt(inter_results[last_index]);
+          default:
+            break;
+          }
+        }
+      }
+
+      //segmented parallel reduction begin
+      __syncthreads();
+      shared_rows[threadIdx.x] = tmp.x;
+      switch (option)
+      {
+      case 0:
+      case 3:
+        inter_results[threadIdx.x] = val;
+        break;
+      case 1:
+        inter_results[threadIdx.x] = fabs(val);
+        break;
+      case 2:
+        inter_results[threadIdx.x] = val * val;
+      default:
+        break;
+      }
+      __syncthreads();
+
+      for (unsigned int stride = 1; stride < blockDim.x; stride *= 2)
+      {
+        NumericT left = (threadIdx.x >= stride && tmp.x == shared_rows[threadIdx.x - stride]) ? inter_results[threadIdx.x - stride] : 0;
+        __syncthreads();
+        switch (option)
+        {
+        case 0: //inf-norm
+        case 3: //diagonal entry
+          inter_results[threadIdx.x] = max(inter_results[threadIdx.x], left);
+          break;
+
+        case 1: //1-norm
+          inter_results[threadIdx.x] += left;
+          break;
+
+        case 2: //2-norm
+          inter_results[threadIdx.x] += left;
+          break;
+
+        default:
+          break;
+        }
+        __syncthreads();
+      }
+      //segmented parallel reduction end
+
+      if (threadIdx.x != last_index &&
+          shared_rows[threadIdx.x] != shared_rows[threadIdx.x + 1] &&
+          inter_results[threadIdx.x] != 0)
+      {
+        result[tmp.x] = (option == 2) ? sqrt(inter_results[threadIdx.x]) : inter_results[threadIdx.x];
+      }
+
+      __syncthreads();
+    } //for k
+
+    if (local_index + 1 == group_end && inter_results[threadIdx.x] != 0)
+      result[tmp.x] = (option == 2) ? sqrt(inter_results[threadIdx.x]) : inter_results[threadIdx.x];
+  }
+
+  template<typename NumericT, unsigned int AlignmentV>
+  void row_info(coordinate_matrix<NumericT, AlignmentV> const & mat,
+                vector_base<NumericT> & vec,
+                viennacl::linalg::detail::row_info_types info_selector)
+  {
+    coo_row_info_extractor<<<64, 128>>>(viennacl::cuda_arg<unsigned int>(mat.handle12()),
+                                        viennacl::cuda_arg<NumericT>(mat.handle()),
+                                        viennacl::cuda_arg<unsigned int>(mat.handle3()),
+                                        viennacl::cuda_arg(vec),
+                                        static_cast<unsigned int>(info_selector)
+                                       );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("coo_row_info_extractor");
+  }
+
+} //namespace detail
+
+
+template<typename NumericT>
+__global__ void coordinate_matrix_vec_mul_kernel(const unsigned int * coords, //(row_index, column_index)
+                                                 const NumericT * elements,
+                                                 const unsigned int * group_boundaries,
+                                                 const NumericT * x,
+                                                 unsigned int start_x,
+                                                 unsigned int inc_x,
+                                                 NumericT alpha,
+                                                 NumericT * result,
+                                                 unsigned int start_result,
+                                                 unsigned int inc_result,
+                                                 NumericT beta)
+{
+  __shared__ unsigned int shared_rows[128];
+  __shared__ NumericT inter_results[128];
+
+  uint2 tmp;
+  NumericT val;
+  unsigned int group_start = group_boundaries[blockIdx.x];
+  unsigned int group_end   = group_boundaries[blockIdx.x + 1];
+  unsigned int k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / blockDim.x : 0;   // -1 in order to have correct behavior if group_end - group_start == j * blockDim.x
+
+  unsigned int local_index = 0;
+
+  for (unsigned int k = 0; k < k_end; ++k)
+  {
+    local_index = group_start + k * blockDim.x + threadIdx.x;
+
+    tmp = (local_index < group_end) ? ((const uint2 *)coords)[local_index] : ::make_uint2(0, 0);
+    val = (local_index < group_end) ? elements[local_index] * x[tmp.y * inc_x + start_x] : 0;
+
+    //check for carry from previous loop run:
+    if (threadIdx.x == 0 && k > 0)
+    {
+      if (tmp.x == shared_rows[blockDim.x-1])
+        val += inter_results[blockDim.x-1];
+      else if (beta != 0)
+        result[shared_rows[blockDim.x-1] * inc_result + start_result] += alpha * inter_results[blockDim.x-1];
+      else
+        result[shared_rows[blockDim.x-1] * inc_result + start_result]  = alpha * inter_results[blockDim.x-1];
+    }
+
+    //segmented parallel reduction begin
+    __syncthreads();
+    shared_rows[threadIdx.x] = tmp.x;
+    inter_results[threadIdx.x] = val;
+    NumericT left = 0;
+    __syncthreads();
+
+    for (unsigned int stride = 1; stride < blockDim.x; stride *= 2)
+    {
+      left = (threadIdx.x >= stride && tmp.x == shared_rows[threadIdx.x - stride]) ? inter_results[threadIdx.x - stride] : 0;
+      __syncthreads();
+      inter_results[threadIdx.x] += left;
+      __syncthreads();
+    }
+    //segmented parallel reduction end
+
+    if (local_index < group_end - 1 && threadIdx.x < blockDim.x-1 &&
+        shared_rows[threadIdx.x] != shared_rows[threadIdx.x + 1])
+    {
+      if (beta != 0) result[tmp.x * inc_result + start_result] += alpha * inter_results[threadIdx.x];
+      else           result[tmp.x * inc_result + start_result]  = alpha * inter_results[threadIdx.x];
+    }
+
+    __syncthreads();
+  } //for k
+
+  if (local_index + 1 == group_end) {
+    if (beta != 0) result[tmp.x * inc_result + start_result] += alpha * inter_results[threadIdx.x];
+    else           result[tmp.x * inc_result + start_result]  = alpha * inter_results[threadIdx.x];
+  }
+}
+
+
+/** @brief Carries out matrix-vector multiplication with a coordinate_matrix
+*
+* Implementation of the convenience expression result = prod(mat, vec);
+*
+*   @param mat    The matrix
+*   @param vec    The vector
+*   @param result The result vector
+*/
+template<typename NumericT, unsigned int AlignmentV>
+void prod_impl(const viennacl::coordinate_matrix<NumericT, AlignmentV> & mat,
+               const viennacl::vector_base<NumericT> & vec,
+               NumericT alpha,
+               viennacl::vector_base<NumericT> & result,
+               NumericT beta)
+{
+  if (beta < 0 || beta > 0)
+    viennacl::linalg::cuda::av(result, result, beta, 1, false, false);
+  else
+    result.clear();
+
+  coordinate_matrix_vec_mul_kernel<<<64, 128>>>(viennacl::cuda_arg<unsigned int>(mat.handle12()),
+                                                viennacl::cuda_arg<NumericT>(mat.handle()),
+                                                viennacl::cuda_arg<unsigned int>(mat.handle3()),
+                                                viennacl::cuda_arg(vec),
+                                                static_cast<unsigned int>(vec.start()),
+                                                static_cast<unsigned int>(vec.stride()),
+                                                alpha,
+                                                viennacl::cuda_arg(result),
+                                                static_cast<unsigned int>(result.start()),
+                                                static_cast<unsigned int>(result.stride()),
+                                                beta
+                                               );
+  VIENNACL_CUDA_LAST_ERROR_CHECK("coordinate_matrix_vec_mul_kernel");
+}
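+// Editor's note (clarifying sketch, not part of the original ViennaCL sources):
+// COO kernels cannot assign rows to threads up front, so they use a segmented
+// parallel reduction: each thread loads one (row, value) pair, and the
+// doubling-stride loop adds a neighbor's partial sum only while both entries
+// belong to the same row:
+//
+//   for (unsigned int stride = 1; stride < blockDim.x; stride *= 2) {
+//     left = (threadIdx.x >= stride && row[threadIdx.x - stride] == row[threadIdx.x])
+//          ? partial[threadIdx.x - stride] : 0;
+//     __syncthreads();
+//     partial[threadIdx.x] += left;
+//     __syncthreads();
+//   }
+//
+// The last thread of each row segment then owns the row's full sum; sums that
+// straddle a chunk boundary are carried over by thread 0 of the next
+// iteration ("check for carry" above).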
+
+
+
+
+template<typename DMatIndexT, typename ResultIndexT, typename NumericT>
+__global__ void coordinate_matrix_d_mat_mul_kernel(const unsigned int * coords, //(row_index, column_index)
+                                                   const NumericT * elements,
+                                                   const unsigned int * group_boundaries,
+                                                   const NumericT * d_mat,
+                                                   unsigned int d_mat_row_start,
+                                                   unsigned int d_mat_col_start,
+                                                   unsigned int d_mat_row_inc,
+                                                   unsigned int d_mat_col_inc,
+                                                   unsigned int d_mat_row_size,
+                                                   unsigned int d_mat_col_size,
+                                                   unsigned int d_mat_internal_rows,
+                                                   unsigned int d_mat_internal_cols,
+                                                   NumericT * result,
+                                                   unsigned int result_row_start,
+                                                   unsigned int result_col_start,
+                                                   unsigned int result_row_inc,
+                                                   unsigned int result_col_inc,
+                                                   unsigned int result_row_size,
+                                                   unsigned int result_col_size,
+                                                   unsigned int result_internal_rows,
+                                                   unsigned int result_internal_cols)
+{
+  __shared__ unsigned int shared_rows[128];
+  __shared__ NumericT inter_results[128];
+
+  uint2 tmp;
+  NumericT val;
+  unsigned int group_start = group_boundaries[blockIdx.x];
+  unsigned int group_end   = group_boundaries[blockIdx.x + 1];
+  unsigned int k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / blockDim.x : 0;   // -1 in order to have correct behavior if group_end - group_start == j * blockDim.x
+
+  unsigned int local_index = 0;
+
+  for (unsigned int result_col = 0; result_col < result_col_size; ++result_col)
+  {
+    for (unsigned int k = 0; k < k_end; ++k)
+    {
+      local_index = group_start + k * blockDim.x + threadIdx.x;
+
+      tmp = (local_index < group_end) ? ((const uint2 *)coords)[local_index] : ::make_uint2(0, 0);
+      val = (local_index < group_end) ? elements[local_index] * d_mat[DMatIndexT::apply(tmp.y, result_col,
+                                                                                        d_mat_row_start, d_mat_row_inc,
+                                                                                        d_mat_col_start, d_mat_col_inc,
+                                                                                        d_mat_internal_rows, d_mat_internal_cols) ] : 0;
+
+      //check for carry from previous loop run:
+      if (threadIdx.x == 0 && k > 0)
+      {
+        if (tmp.x == shared_rows[blockDim.x-1])
+          val += inter_results[blockDim.x-1];
+        else
+          result[ResultIndexT::apply(shared_rows[blockDim.x-1], result_col,
+                                     result_row_start, result_row_inc,
+                                     result_col_start, result_col_inc,
+                                     result_internal_rows, result_internal_cols)] = inter_results[blockDim.x-1];
+      }
+
+      //segmented parallel reduction begin
+      __syncthreads();
+      shared_rows[threadIdx.x] = tmp.x;
+      inter_results[threadIdx.x] = val;
+      NumericT left = 0;
+      __syncthreads();
+
+      for (unsigned int stride = 1; stride < blockDim.x; stride *= 2)
+      {
+        left = (threadIdx.x >= stride && tmp.x == shared_rows[threadIdx.x - stride]) ? inter_results[threadIdx.x - stride] : 0;
+        __syncthreads();
+        inter_results[threadIdx.x] += left;
+        __syncthreads();
+      }
+      //segmented parallel reduction end
+
+      if (local_index < group_end && threadIdx.x < blockDim.x-1 &&
+          shared_rows[threadIdx.x] != shared_rows[threadIdx.x + 1])
+      {
+        result[ResultIndexT::apply(tmp.x, result_col,
+                                   result_row_start, result_row_inc,
+                                   result_col_start, result_col_inc,
+                                   result_internal_rows, result_internal_cols)] = inter_results[threadIdx.x];
+      }
+
+      __syncthreads();
+    } //for k
+
+    if (local_index + 1 == group_end)
+      result[ResultIndexT::apply(tmp.x, result_col,
+                                 result_row_start, result_row_inc,
+                                 result_col_start, result_col_inc,
+                                 result_internal_rows, result_internal_cols)] = inter_results[threadIdx.x];
+  }
+}
+
+
+/** @brief Carries out Coordinate Matrix (COO) - Dense Matrix multiplication
+*
+* Implementation of the convenience expression result = prod(sp_mat, d_mat);
+*
+*   @param sp_mat   The Sparse Matrix (Coordinate format)
+*   @param d_mat    The Dense Matrix
+*   @param result   The Result Matrix
+*/
+template<typename NumericT, unsigned int AlignmentV>
+void prod_impl(const viennacl::coordinate_matrix<NumericT, AlignmentV> & sp_mat,
+               const viennacl::matrix_base<NumericT> & d_mat,
+               viennacl::matrix_base<NumericT> & result)
+{
+  if (d_mat.row_major() && result.row_major())
+  {
+    coordinate_matrix_d_mat_mul_kernel<mat_mult_matrix_index<row_major>, mat_mult_matrix_index<row_major> ><<<64, 128>>>
+      (viennacl::cuda_arg<unsigned int>(sp_mat.handle12()),
+       viennacl::cuda_arg<NumericT>(sp_mat.handle()),
+       viennacl::cuda_arg<unsigned int>(sp_mat.handle3()),
+
+       viennacl::cuda_arg(d_mat),
+       static_cast<unsigned int>(viennacl::traits::start1(d_mat)),         static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
+       static_cast<unsigned int>(viennacl::traits::stride1(d_mat)),        static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
+       static_cast<unsigned int>(viennacl::traits::size1(d_mat)),          static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
+       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),
+
+       viennacl::cuda_arg(result),
+       static_cast<unsigned int>(viennacl::traits::start1(result)),         static_cast<unsigned int>(viennacl::traits::start2(result)),
+       static_cast<unsigned int>(viennacl::traits::stride1(result)),        static_cast<unsigned int>(viennacl::traits::stride2(result)),
+       static_cast<unsigned int>(viennacl::traits::size1(result)),          static_cast<unsigned int>(viennacl::traits::size2(result)),
+       static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
+      );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("coordinate_matrix_d_mat_mul_kernel");
+  }
+  else if (d_mat.row_major() && !result.row_major())
+  {
+    coordinate_matrix_d_mat_mul_kernel<mat_mult_matrix_index<row_major>, mat_mult_matrix_index<column_major> ><<<64, 128>>>
+      (viennacl::cuda_arg<unsigned int>(sp_mat.handle12()),
+       viennacl::cuda_arg<NumericT>(sp_mat.handle()),
+       viennacl::cuda_arg<unsigned int>(sp_mat.handle3()),
+
+       viennacl::cuda_arg(d_mat),
+       static_cast<unsigned int>(viennacl::traits::start1(d_mat)),         static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
+       static_cast<unsigned int>(viennacl::traits::stride1(d_mat)),        static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
+       static_cast<unsigned int>(viennacl::traits::size1(d_mat)),          static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
+       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),
+
+       viennacl::cuda_arg(result),
+       static_cast<unsigned int>(viennacl::traits::start1(result)),         static_cast<unsigned int>(viennacl::traits::start2(result)),
+       static_cast<unsigned int>(viennacl::traits::stride1(result)),        static_cast<unsigned int>(viennacl::traits::stride2(result)),
+       static_cast<unsigned int>(viennacl::traits::size1(result)),          static_cast<unsigned int>(viennacl::traits::size2(result)),
+       static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
+      );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("coordinate_matrix_d_mat_mul_kernel");
+  }
VIENNACL_CUDA_LAST_ERROR_CHECK("coordinate_matrix_d_mat_mul_kernel"); + } + else if (!d_mat.row_major() && result.row_major()) + { + coordinate_matrix_d_mat_mul_kernel, mat_mult_matrix_index ><<<64, 128>>> + (viennacl::cuda_arg(sp_mat.handle12()), + viennacl::cuda_arg(sp_mat.handle()), + viennacl::cuda_arg(sp_mat.handle3()), + + viennacl::cuda_arg(d_mat), + static_cast(viennacl::traits::start1(d_mat)), static_cast(viennacl::traits::start2(d_mat)), + static_cast(viennacl::traits::stride1(d_mat)), static_cast(viennacl::traits::stride2(d_mat)), + static_cast(viennacl::traits::size1(d_mat)), static_cast(viennacl::traits::size2(d_mat)), + static_cast(viennacl::traits::internal_size1(d_mat)), static_cast(viennacl::traits::internal_size2(d_mat)), + + viennacl::cuda_arg(result), + static_cast(viennacl::traits::start1(result)), static_cast(viennacl::traits::start2(result)), + static_cast(viennacl::traits::stride1(result)), static_cast(viennacl::traits::stride2(result)), + static_cast(viennacl::traits::size1(result)), static_cast(viennacl::traits::size2(result)), + static_cast(viennacl::traits::internal_size1(result)), static_cast(viennacl::traits::internal_size2(result)) + ); + VIENNACL_CUDA_LAST_ERROR_CHECK("coordinate_matrix_d_mat_mul_kernel"); + } + else + { + coordinate_matrix_d_mat_mul_kernel, mat_mult_matrix_index ><<<64, 128>>> + (viennacl::cuda_arg(sp_mat.handle12()), + viennacl::cuda_arg(sp_mat.handle()), + viennacl::cuda_arg(sp_mat.handle3()), + + viennacl::cuda_arg(d_mat), + static_cast(viennacl::traits::start1(d_mat)), static_cast(viennacl::traits::start2(d_mat)), + static_cast(viennacl::traits::stride1(d_mat)), static_cast(viennacl::traits::stride2(d_mat)), + static_cast(viennacl::traits::size1(d_mat)), static_cast(viennacl::traits::size2(d_mat)), + static_cast(viennacl::traits::internal_size1(d_mat)), static_cast(viennacl::traits::internal_size2(d_mat)), + + viennacl::cuda_arg(result), + static_cast(viennacl::traits::start1(result)), static_cast(viennacl::traits::start2(result)), + static_cast(viennacl::traits::stride1(result)), static_cast(viennacl::traits::stride2(result)), + static_cast(viennacl::traits::size1(result)), static_cast(viennacl::traits::size2(result)), + static_cast(viennacl::traits::internal_size1(result)), static_cast(viennacl::traits::internal_size2(result)) + ); + VIENNACL_CUDA_LAST_ERROR_CHECK("coordinate_matrix_d_mat_mul_kernel"); + } + +} + +template +__global__ void coordinate_matrix_d_tr_mat_mul_kernel(const unsigned int * coords, //(row_index, column_index) + const NumericT * elements, + const unsigned int * group_boundaries, + const NumericT * d_mat, + unsigned int d_mat_row_start, + unsigned int d_mat_col_start, + unsigned int d_mat_row_inc, + unsigned int d_mat_col_inc, + unsigned int d_mat_row_size, + unsigned int d_mat_col_size, + unsigned int d_mat_internal_rows, + unsigned int d_mat_internal_cols, + NumericT * result, + unsigned int result_row_start, + unsigned int result_col_start, + unsigned int result_row_inc, + unsigned int result_col_inc, + unsigned int result_row_size, + unsigned int result_col_size, + unsigned int result_internal_rows, + unsigned int result_internal_cols) +{ + __shared__ unsigned int shared_rows[128]; + __shared__ NumericT inter_results[128]; + + uint2 tmp; + NumericT val; + unsigned int group_start = group_boundaries[blockIdx.x]; + unsigned int group_end = group_boundaries[blockIdx.x + 1]; + unsigned int k_end = (group_end > group_start) ? 
+  unsigned int k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / blockDim.x : 0;   // -1 in order to have correct behavior if group_end - group_start == j * blockDim.x
+
+  uns