From: apalumbo@apache.org
To: commits@mahout.apache.org
Reply-To: dev@mahout.apache.org
Date: Fri, 10 Jun 2016 16:52:26 -0000
Message-Id: <12070017985a4cfab0095d7e7e1b4830@git.apache.org>
In-Reply-To: <889bf3d38fb948a99fb684162c4bd182@git.apache.org>
References: <889bf3d38fb948a99fb684162c4bd182@git.apache.org>
Subject: [21/51] [partial] mahout git commit: Revert "(nojira) add native-viennaCL module to codebase. closes apache/mahout#241"

http://git-wip-us.apache.org/repos/asf/mahout/blob/7ae549fa/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/iterative_operations.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/iterative_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/iterative_operations.hpp
deleted file mode 100644
index ee6626c..0000000
--- a/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/iterative_operations.hpp
+++ /dev/null
@@ -1,880 +0,0 @@
-#ifndef VIENNACL_LINALG_HOST_BASED_ITERATIVE_OPERATIONS_HPP_
-#define VIENNACL_LINALG_HOST_BASED_ITERATIVE_OPERATIONS_HPP_
-
-/* =========================================================================
-   Copyright (c) 2010-2016, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-   Portions of this software are copyright by UChicago Argonne, LLC.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
-
-   (A list of authors and contributors can be found in the manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= */
-
-/** @file viennacl/linalg/host_based/iterative_operations.hpp
-    @brief Implementations of specialized kernels for fast iterative solvers using OpenMP on the CPU
-*/
-
-#include <cmath>
-#include <algorithm> //for std::max and std::min
-
-#include "viennacl/forwards.h"
-#include "viennacl/scalar.hpp"
-#include "viennacl/tools/tools.hpp"
-#include "viennacl/meta/predicate.hpp"
-#include "viennacl/meta/enable_if.hpp"
-#include "viennacl/traits/size.hpp"
-#include "viennacl/traits/start.hpp"
-#include "viennacl/linalg/host_based/common.hpp"
-#include "viennacl/linalg/detail/op_applier.hpp"
-#include "viennacl/traits/stride.hpp"
-
-
-// Minimum vector size for using OpenMP on vector operations:
-#ifndef VIENNACL_OPENMP_VECTOR_MIN_SIZE
-  #define VIENNACL_OPENMP_VECTOR_MIN_SIZE  5000
-#endif
-
-namespace viennacl
-{
-namespace linalg
-{
-namespace host_based
-{
-
-namespace detail
-{
-  /** @brief Implementation of a fused matrix-vector product with a compressed_matrix for an efficient pipelined CG algorithm.
-   *
-   * This routine computes for a matrix A and vectors 'p', 'Ap', and 'r0star':
-   *   Ap = prod(A, p);
-   * and the reduction stages for computing inner_prod(p,Ap), inner_prod(Ap,Ap), and inner_prod(Ap, r0star).
-   */
-  template<typename NumericT, unsigned int AlignmentV>
-  void pipelined_prod_impl(compressed_matrix<NumericT, AlignmentV> const & A,
-                           vector_base<NumericT> const & p,
-                           vector_base<NumericT> & Ap,
-                           NumericT const * r0star,
-                           vector_base<NumericT> & inner_prod_buffer,
-                           vcl_size_t buffer_chunk_size,
-                           vcl_size_t buffer_chunk_offset)
-  {
-    typedef NumericT     value_type;
-
-    value_type         * Ap_buf      = detail::extract_raw_pointer<value_type>(Ap.handle()) + viennacl::traits::start(Ap);
-    value_type   const * p_buf       = detail::extract_raw_pointer<value_type>(p.handle()) + viennacl::traits::start(p);
-    value_type   const * elements    = detail::extract_raw_pointer<value_type>(A.handle());
-    unsigned int const * row_buffer  = detail::extract_raw_pointer<unsigned int>(A.handle1());
-    unsigned int const * col_buffer  = detail::extract_raw_pointer<unsigned int>(A.handle2());
-    value_type         * data_buffer = detail::extract_raw_pointer<value_type>(inner_prod_buffer);
-
-    value_type inner_prod_ApAp = 0;
-    value_type inner_prod_pAp = 0;
-    value_type inner_prod_Ap_r0star = 0;
-    for (long row = 0; row < static_cast<long>(A.size1()); ++row)
-    {
-      value_type dot_prod = 0;
-      value_type val_p_diag = p_buf[static_cast<vcl_size_t>(row)]; //likely to be loaded from cache if required again in this row
-
-      vcl_size_t row_end = row_buffer[row+1];
-      for (vcl_size_t i = row_buffer[row]; i < row_end; ++i)
-        dot_prod += elements[i] * p_buf[col_buffer[i]];
-
-      // update contributions for the inner products (Ap, Ap) and (p, Ap)
-      Ap_buf[static_cast<vcl_size_t>(row)] = dot_prod;
-      inner_prod_ApAp += dot_prod * dot_prod;
-      inner_prod_pAp  += val_p_diag * dot_prod;
-      inner_prod_Ap_r0star += r0star ? dot_prod * r0star[static_cast<vcl_size_t>(row)] : value_type(0);
-    }
-
-    data_buffer[    buffer_chunk_size] = inner_prod_ApAp;
-    data_buffer[2 * buffer_chunk_size] = inner_prod_pAp;
-    if (r0star)
-      data_buffer[buffer_chunk_offset] = inner_prod_Ap_r0star;
-  }
-
-
-
-  /** @brief Implementation of a fused matrix-vector product with a coordinate_matrix for an efficient pipelined CG algorithm.
-   *
-   * This routine computes for a matrix A and vectors 'p', 'Ap', and 'r0star':
-   *   Ap = prod(A, p);
-   * and the reduction stages for computing inner_prod(p,Ap), inner_prod(Ap,Ap), and inner_prod(Ap, r0star).
-   */
-  template<typename NumericT, unsigned int AlignmentV>
-  void pipelined_prod_impl(coordinate_matrix<NumericT, AlignmentV> const & A,
-                           vector_base<NumericT> const & p,
-                           vector_base<NumericT> & Ap,
-                           NumericT const * r0star,
-                           vector_base<NumericT> & inner_prod_buffer,
-                           vcl_size_t buffer_chunk_size,
-                           vcl_size_t buffer_chunk_offset)
-  {
-    typedef NumericT     value_type;
-
-    value_type         * Ap_buf       = detail::extract_raw_pointer<value_type>(Ap.handle()) + viennacl::traits::start(Ap);
-    value_type   const * p_buf        = detail::extract_raw_pointer<value_type>(p.handle()) + viennacl::traits::start(p);
-    value_type   const * elements     = detail::extract_raw_pointer<value_type>(A.handle());
-    unsigned int const * coord_buffer = detail::extract_raw_pointer<unsigned int>(A.handle12());
-    value_type         * data_buffer  = detail::extract_raw_pointer<value_type>(inner_prod_buffer);
-
-    // flush result buffer (cannot be expected to be zero)
-    for (vcl_size_t i = 0; i < Ap.size(); ++i)
-      Ap_buf[i] = 0;
-
-    // matrix-vector product with a general COO format
-    for (vcl_size_t i = 0; i < A.nnz(); ++i)
-      Ap_buf[coord_buffer[2*i]] += elements[i] * p_buf[coord_buffer[2*i+1]];
-
-    // computing the inner products (Ap, Ap) and (p, Ap):
-    // Note: The COO format does not allow to inject the subsequent operations into the matrix-vector product, because row and column ordering assumptions are too weak
-    value_type inner_prod_ApAp = 0;
-    value_type inner_prod_pAp = 0;
-    value_type inner_prod_Ap_r0star = 0;
-    for (vcl_size_t i = 0; i < Ap.size(); ++i)
-    {
-      value_type value_Ap = Ap_buf[i];
-      value_type value_p  = p_buf[i];
-
-      inner_prod_ApAp += value_Ap * value_Ap;
-      inner_prod_pAp  += value_Ap * value_p;
-      inner_prod_Ap_r0star += r0star ? value_Ap * r0star[i] : value_type(0);
-    }
-
-    data_buffer[    buffer_chunk_size] = inner_prod_ApAp;
-    data_buffer[2 * buffer_chunk_size] = inner_prod_pAp;
-    if (r0star)
-      data_buffer[buffer_chunk_offset] = inner_prod_Ap_r0star;
-  }
-
-
-  /** @brief Implementation of a fused matrix-vector product with an ell_matrix for an efficient pipelined CG algorithm.
-   *
-   * This routine computes for a matrix A and vectors 'p', 'Ap', and 'r0star':
-   *   Ap = prod(A, p);
-   * and the reduction stages for computing inner_prod(p,Ap), inner_prod(Ap,Ap), and inner_prod(Ap, r0star).
-   */
-  template<typename NumericT, unsigned int AlignmentV>
-  void pipelined_prod_impl(ell_matrix<NumericT, AlignmentV> const & A,
-                           vector_base<NumericT> const & p,
-                           vector_base<NumericT> & Ap,
-                           NumericT const * r0star,
-                           vector_base<NumericT> & inner_prod_buffer,
-                           vcl_size_t buffer_chunk_size,
-                           vcl_size_t buffer_chunk_offset)
-  {
-    typedef NumericT     value_type;
-
-    value_type         * Ap_buf      = detail::extract_raw_pointer<value_type>(Ap.handle()) + viennacl::traits::start(Ap);
-    value_type   const * p_buf       = detail::extract_raw_pointer<value_type>(p.handle()) + viennacl::traits::start(p);
-    value_type   const * elements    = detail::extract_raw_pointer<value_type>(A.handle());
-    unsigned int const * coords      = detail::extract_raw_pointer<unsigned int>(A.handle2());
-    value_type         * data_buffer = detail::extract_raw_pointer<value_type>(inner_prod_buffer);
-
-    value_type inner_prod_ApAp = 0;
-    value_type inner_prod_pAp = 0;
-    value_type inner_prod_Ap_r0star = 0;
-    for (vcl_size_t row = 0; row < A.size1(); ++row)
-    {
-      value_type sum = 0;
-      value_type val_p_diag = p_buf[row]; //likely to be loaded from cache if required again in this row
-
-      for (unsigned int item_id = 0; item_id < A.internal_maxnnz(); ++item_id)
-      {
-        vcl_size_t offset = row + item_id * A.internal_size1();
-        value_type val = elements[offset];
-
-        if (val)
-          sum += (p_buf[coords[offset]] * val);
-      }
-
-      Ap_buf[row] = sum;
-      inner_prod_ApAp += sum * sum;
-      inner_prod_pAp  += val_p_diag * sum;
-      inner_prod_Ap_r0star += r0star ? sum * r0star[row] : value_type(0);
-    }
-
-    data_buffer[    buffer_chunk_size] = inner_prod_ApAp;
-    data_buffer[2 * buffer_chunk_size] = inner_prod_pAp;
-    if (r0star)
-      data_buffer[buffer_chunk_offset] = inner_prod_Ap_r0star;
-  }
-
-
-  /** @brief Implementation of a fused matrix-vector product with a sliced_ell_matrix for an efficient pipelined CG algorithm.
-   *
-   * This routine computes for a matrix A and vectors 'p', 'Ap', and 'r0star':
-   *   Ap = prod(A, p);
-   * and the reduction stages for computing inner_prod(p,Ap), inner_prod(Ap,Ap), and inner_prod(Ap, r0star).
-   */
-  template<typename NumericT, typename IndexT>
-  void pipelined_prod_impl(sliced_ell_matrix<NumericT, IndexT> const & A,
-                           vector_base<NumericT> const & p,
-                           vector_base<NumericT> & Ap,
-                           NumericT const * r0star,
-                           vector_base<NumericT> & inner_prod_buffer,
-                           vcl_size_t buffer_chunk_size,
-                           vcl_size_t buffer_chunk_offset)
-  {
-    typedef NumericT     value_type;
-
-    value_type       * Ap_buf            = detail::extract_raw_pointer<value_type>(Ap.handle()) + viennacl::traits::start(Ap);
-    value_type const * p_buf             = detail::extract_raw_pointer<value_type>(p.handle()) + viennacl::traits::start(p);
-    value_type const * elements          = detail::extract_raw_pointer<value_type>(A.handle());
-    IndexT     const * columns_per_block = detail::extract_raw_pointer<IndexT>(A.handle1());
-    IndexT     const * column_indices    = detail::extract_raw_pointer<IndexT>(A.handle2());
-    IndexT     const * block_start       = detail::extract_raw_pointer<IndexT>(A.handle3());
-    value_type       * data_buffer       = detail::extract_raw_pointer<value_type>(inner_prod_buffer);
-
-    vcl_size_t num_blocks = A.size1() / A.rows_per_block() + 1;
-    std::vector<value_type> result_values(A.rows_per_block());
-
-    value_type inner_prod_ApAp = 0;
-    value_type inner_prod_pAp = 0;
-    value_type inner_prod_Ap_r0star = 0;
-    for (vcl_size_t block_idx = 0; block_idx < num_blocks; ++block_idx)
-    {
-      vcl_size_t current_columns_per_block = columns_per_block[block_idx];
-
-      for (vcl_size_t i=0; i<result_values.size(); ++i)
-        result_values[i] = 0;
-
-      for (IndexT column_entry_index = 0; column_entry_index < current_columns_per_block; ++column_entry_index)
-      {
-        vcl_size_t stride_start = block_start[block_idx] + column_entry_index * A.rows_per_block();
-        for (IndexT row_in_block = 0; row_in_block < A.rows_per_block(); ++row_in_block)
-        {
-          value_type val = elements[stride_start + row_in_block];
-
-          result_values[row_in_block] += val ? p_buf[column_indices[stride_start + row_in_block]] * val : 0;
-        }
-      }
-
-      vcl_size_t first_row_in_matrix = block_idx * A.rows_per_block();
-      for (IndexT row_in_block = 0; row_in_block < A.rows_per_block(); ++row_in_block)
-      {
-        vcl_size_t row = first_row_in_matrix + row_in_block;
-        if (row < Ap.size())
-        {
-          value_type row_result = result_values[row_in_block];
-
-          Ap_buf[row] = row_result;
-          inner_prod_ApAp += row_result * row_result;
-          inner_prod_pAp  += p_buf[row] * row_result;
-          inner_prod_Ap_r0star += r0star ? row_result * r0star[row] : value_type(0);
-        }
-      }
-    }
-
-    data_buffer[    buffer_chunk_size] = inner_prod_ApAp;
-    data_buffer[2 * buffer_chunk_size] = inner_prod_pAp;
-    if (r0star)
-      data_buffer[buffer_chunk_offset] = inner_prod_Ap_r0star;
-  }
-
-
-  /** @brief Implementation of a fused matrix-vector product with a hyb_matrix for an efficient pipelined CG algorithm.
-   *
-   * This routine computes for a matrix A and vectors 'p', 'Ap', and 'r0star':
-   *   Ap = prod(A, p);
-   * and the reduction stages for computing inner_prod(p,Ap), inner_prod(Ap,Ap), and inner_prod(Ap, r0star).
-   */
-  template<typename NumericT, unsigned int AlignmentV>
-  void pipelined_prod_impl(hyb_matrix<NumericT, AlignmentV> const & A,
-                           vector_base<NumericT> const & p,
-                           vector_base<NumericT> & Ap,
-                           NumericT const * r0star,
-                           vector_base<NumericT> & inner_prod_buffer,
-                           vcl_size_t buffer_chunk_size,
-                           vcl_size_t buffer_chunk_offset)
-  {
-    typedef NumericT     value_type;
-    typedef unsigned int index_type;
-
-    value_type       * Ap_buf         = detail::extract_raw_pointer<value_type>(Ap.handle()) + viennacl::traits::start(Ap);
-    value_type const * p_buf          = detail::extract_raw_pointer<value_type>(p.handle()) + viennacl::traits::start(p);
-    value_type const * elements       = detail::extract_raw_pointer<value_type>(A.handle());
-    index_type const * coords         = detail::extract_raw_pointer<index_type>(A.handle2());
-    value_type const * csr_elements   = detail::extract_raw_pointer<value_type>(A.handle5());
-    index_type const * csr_row_buffer = detail::extract_raw_pointer<index_type>(A.handle3());
-    index_type const * csr_col_buffer = detail::extract_raw_pointer<index_type>(A.handle4());
-    value_type       * data_buffer    = detail::extract_raw_pointer<value_type>(inner_prod_buffer);
-
-    value_type inner_prod_ApAp = 0;
-    value_type inner_prod_pAp = 0;
-    value_type inner_prod_Ap_r0star = 0;
-    for (vcl_size_t row = 0; row < A.size1(); ++row)
-    {
-      value_type val_p_diag = p_buf[row]; //likely to be loaded from cache if required again in this row
-      value_type sum = 0;
-
-      //
-      // Part 1: Process ELL part
-      //
-      for (index_type item_id = 0; item_id < A.internal_ellnnz(); ++item_id)
-      {
-        vcl_size_t offset = row + item_id * A.internal_size1();
-        value_type val = elements[offset];
-
-        if (val)
-          sum += p_buf[coords[offset]] * val;
-      }
-
-      //
-      // Part 2: Process CSR part
-      //
-      vcl_size_t col_begin = csr_row_buffer[row];
-      vcl_size_t col_end   = csr_row_buffer[row + 1];
-
-      for (vcl_size_t item_id = col_begin; item_id < col_end; item_id++)
-        sum += p_buf[csr_col_buffer[item_id]] * csr_elements[item_id];
-
-      Ap_buf[row] = sum;
-      inner_prod_ApAp += sum * sum;
-      inner_prod_pAp  += val_p_diag * sum;
-      inner_prod_Ap_r0star += r0star ? sum * r0star[row] : value_type(0);
-    }
-
-    data_buffer[    buffer_chunk_size] = inner_prod_ApAp;
-    data_buffer[2 * buffer_chunk_size] = inner_prod_pAp;
-    if (r0star)
-      data_buffer[buffer_chunk_offset] = inner_prod_Ap_r0star;
-  }
-
-} // namespace detail
-
-
-/** @brief Performs a joint vector update operation needed for an efficient pipelined CG algorithm.
- *
- * This routine computes for vectors 'result', 'p', 'r', 'Ap':
- *   result += alpha * p;
- *   r      -= alpha * Ap;
- *   p       = r + beta * p;
- * and runs the parallel reduction stage for computing inner_prod(r,r)
- */
-template<typename NumericT>
-void pipelined_cg_vector_update(vector_base<NumericT> & result,
-                                NumericT alpha,
-                                vector_base<NumericT> & p,
-                                vector_base<NumericT> & r,
-                                vector_base<NumericT> const & Ap,
-                                NumericT beta,
-                                vector_base<NumericT> & inner_prod_buffer)
-{
-  typedef NumericT    value_type;
-
-  value_type       * data_result = detail::extract_raw_pointer<value_type>(result);
-  value_type       * data_p      = detail::extract_raw_pointer<value_type>(p);
-  value_type       * data_r      = detail::extract_raw_pointer<value_type>(r);
-  value_type const * data_Ap     = detail::extract_raw_pointer<value_type>(Ap);
-  value_type       * data_buffer = detail::extract_raw_pointer<value_type>(inner_prod_buffer);
-
-  // Note: Due to the special setting in CG, there is no need to check for sizes and strides
-  vcl_size_t size = viennacl::traits::size(result);
-
-  value_type inner_prod_r = 0;
-  for (long i = 0; i < static_cast<long>(size); ++i)
-  {
-    value_type value_p = data_p[static_cast<vcl_size_t>(i)];
-    value_type value_r = data_r[static_cast<vcl_size_t>(i)];
-
-    data_result[static_cast<vcl_size_t>(i)] += alpha * value_p;
-    value_r -= alpha * data_Ap[static_cast<vcl_size_t>(i)];
-    value_p  = value_r + beta * value_p;
-    inner_prod_r += value_r * value_r;
-
-    data_p[static_cast<vcl_size_t>(i)] = value_p;
-    data_r[static_cast<vcl_size_t>(i)] = value_r;
-  }
-
-  data_buffer[0] = inner_prod_r;
-}
-
-
-/** @brief Performs a fused matrix-vector product with a compressed_matrix for an efficient pipelined CG algorithm.
- *
- * This routine computes for a matrix A and vectors 'p' and 'Ap':
- *   Ap = prod(A, p);
- * and the reduction stages for computing inner_prod(p,Ap) and inner_prod(Ap,Ap).
- */
-template<typename NumericT>
-void pipelined_cg_prod(compressed_matrix<NumericT> const & A,
-                       vector_base<NumericT> const & p,
-                       vector_base<NumericT> & Ap,
-                       vector_base<NumericT> & inner_prod_buffer)
-{
-  typedef NumericT const *    PtrType;
-  viennacl::linalg::host_based::detail::pipelined_prod_impl(A, p, Ap, PtrType(NULL), inner_prod_buffer, inner_prod_buffer.size() / 3, 0);
-}
-
-
-
-/** @brief Performs a fused matrix-vector product with a coordinate_matrix for an efficient pipelined CG algorithm.
- *
- * This routine computes for a matrix A and vectors 'p' and 'Ap':
- *   Ap = prod(A, p);
- * and the reduction stages for computing inner_prod(p,Ap) and inner_prod(Ap,Ap).
- */
-template<typename NumericT>
-void pipelined_cg_prod(coordinate_matrix<NumericT> const & A,
-                       vector_base<NumericT> const & p,
-                       vector_base<NumericT> & Ap,
-                       vector_base<NumericT> & inner_prod_buffer)
-{
-  typedef NumericT const *    PtrType;
-  viennacl::linalg::host_based::detail::pipelined_prod_impl(A, p, Ap, PtrType(NULL), inner_prod_buffer, inner_prod_buffer.size() / 3, 0);
-}
-
-
-/** @brief Performs a fused matrix-vector product with an ell_matrix for an efficient pipelined CG algorithm.
- *
- * This routine computes for a matrix A and vectors 'p' and 'Ap':
- *   Ap = prod(A, p);
- * and the reduction stages for computing inner_prod(p,Ap) and inner_prod(Ap,Ap).
- */
-template<typename NumericT>
-void pipelined_cg_prod(ell_matrix<NumericT> const & A,
-                       vector_base<NumericT> const & p,
-                       vector_base<NumericT> & Ap,
-                       vector_base<NumericT> & inner_prod_buffer)
-{
-  typedef NumericT const *    PtrType;
-  viennacl::linalg::host_based::detail::pipelined_prod_impl(A, p, Ap, PtrType(NULL), inner_prod_buffer, inner_prod_buffer.size() / 3, 0);
-}
-
-
-/** @brief Performs a fused matrix-vector product with a sliced_ell_matrix for an efficient pipelined CG algorithm.
- *
- * This routine computes for a matrix A and vectors 'p' and 'Ap':
- *   Ap = prod(A, p);
- * and the reduction stages for computing inner_prod(p,Ap) and inner_prod(Ap,Ap).
- */
-template<typename NumericT, typename IndexT>
-void pipelined_cg_prod(sliced_ell_matrix<NumericT, IndexT> const & A,
-                       vector_base<NumericT> const & p,
-                       vector_base<NumericT> & Ap,
-                       vector_base<NumericT> & inner_prod_buffer)
-{
-  typedef NumericT const *    PtrType;
-  viennacl::linalg::host_based::detail::pipelined_prod_impl(A, p, Ap, PtrType(NULL), inner_prod_buffer, inner_prod_buffer.size() / 3, 0);
-}
-
-
-
-
-/** @brief Performs a fused matrix-vector product with a hyb_matrix for an efficient pipelined CG algorithm.
- *
- * This routine computes for a matrix A and vectors 'p' and 'Ap':
- *   Ap = prod(A, p);
- * and the reduction stages for computing inner_prod(p,Ap) and inner_prod(Ap,Ap).
- */
-template<typename NumericT>
-void pipelined_cg_prod(hyb_matrix<NumericT> const & A,
-                       vector_base<NumericT> const & p,
-                       vector_base<NumericT> & Ap,
-                       vector_base<NumericT> & inner_prod_buffer)
-{
-  typedef NumericT const *    PtrType;
-  viennacl::linalg::host_based::detail::pipelined_prod_impl(A, p, Ap, PtrType(NULL), inner_prod_buffer, inner_prod_buffer.size() / 3, 0);
-}
-
-//////////////////////////
-
-
-/** @brief Performs a joint vector update operation needed for an efficient pipelined BiCGStab algorithm.
- *
- * This routine computes for vectors 's', 'r', 'Ap':
- *   s = r - alpha * Ap
- * with alpha obtained from a reduction step on the 0th and the 3rd out of 6 chunks in inner_prod_buffer
- * and runs the parallel reduction stage for computing inner_prod(s,s)
- */
-template<typename NumericT>
-void pipelined_bicgstab_update_s(vector_base<NumericT> & s,
-                                 vector_base<NumericT> & r,
-                                 vector_base<NumericT> const & Ap,
-                                 vector_base<NumericT> & inner_prod_buffer,
-                                 vcl_size_t buffer_chunk_size,
-                                 vcl_size_t buffer_chunk_offset)
-{
-  typedef NumericT    value_type;
-
-  value_type       * data_s      = detail::extract_raw_pointer<value_type>(s);
-  value_type       * data_r      = detail::extract_raw_pointer<value_type>(r);
-  value_type const * data_Ap     = detail::extract_raw_pointer<value_type>(Ap);
-  value_type       * data_buffer = detail::extract_raw_pointer<value_type>(inner_prod_buffer);
-
-  // Note: Due to the special setting in CG, there is no need to check for sizes and strides
-  vcl_size_t size = viennacl::traits::size(s);
-
-  // part 1: compute alpha from the 0th chunk (inner_prod(r,r0)) and the 3rd chunk (inner_prod(Ap,r0)):
-  value_type r_in_r0 = 0;
-  value_type Ap_in_r0 = 0;
-  for (vcl_size_t i=0; i<buffer_chunk_size; ++i)
-  {
-    r_in_r0  += data_buffer[i];
-    Ap_in_r0 += data_buffer[i + 3 * buffer_chunk_size];
-  }
-  value_type alpha = r_in_r0 / Ap_in_r0;
-
-  // part 2: s = r - alpha * Ap, and first reduction stage for inner_prod(s,s):
-  value_type inner_prod_s = 0;
-  for (long i = 0; i < static_cast<long>(size); ++i)
-  {
-    value_type value_s = data_s[static_cast<vcl_size_t>(i)];
-
-    value_s = data_r[static_cast<vcl_size_t>(i)] - alpha * data_Ap[static_cast<vcl_size_t>(i)];
-    inner_prod_s += value_s * value_s;
-
-    data_s[static_cast<vcl_size_t>(i)] = value_s;
-  }
-
-  data_buffer[buffer_chunk_offset] = inner_prod_s;
-}
-
-/** @brief Performs a joint vector update operation needed for an efficient pipelined BiCGStab algorithm.
- *
- * This routine computes:
- *   x_{j+1} = x_j + alpha * p_j + omega * s_j
- *   r_{j+1} = s_j - omega * t_j
- *   p_{j+1} = r_{j+1} + beta * (p_j - omega * q_j)
- * and the first reduction stage for r_dot_r0 = inner_prod(r_{j+1}, r0star) for use in the next iteration.
- */
-template<typename NumericT>
-void pipelined_bicgstab_vector_update(vector_base<NumericT> & result, NumericT alpha, vector_base<NumericT> & p, NumericT omega, vector_base<NumericT> const & s,
-                                      vector_base<NumericT> & residual, vector_base<NumericT> const & As,
-                                      NumericT beta, vector_base<NumericT> const & Ap,
-                                      vector_base<NumericT> const & r0star,
-                                      vector_base<NumericT> & inner_prod_buffer,
-                                      vcl_size_t buffer_chunk_size)
-{
-  typedef NumericT    value_type;
-
-  value_type       * data_result   = detail::extract_raw_pointer<value_type>(result);
-  value_type       * data_p        = detail::extract_raw_pointer<value_type>(p);
-  value_type const * data_s        = detail::extract_raw_pointer<value_type>(s);
-  value_type       * data_residual = detail::extract_raw_pointer<value_type>(residual);
-  value_type const * data_As       = detail::extract_raw_pointer<value_type>(As);
-  value_type const * data_Ap       = detail::extract_raw_pointer<value_type>(Ap);
-  value_type const * data_r0star   = detail::extract_raw_pointer<value_type>(r0star);
-  value_type       * data_buffer   = detail::extract_raw_pointer<value_type>(inner_prod_buffer);
-
-  vcl_size_t size = viennacl::traits::size(result);
-
-  value_type inner_prod_r_r0star = 0;
-  for (long i = 0; i < static_cast<long>(size); ++i)
-  {
-    vcl_size_t index = static_cast<vcl_size_t>(i);
-    value_type value_result   = data_result[index];
-    value_type value_p        = data_p[index];
-    value_type value_s        = data_s[index];
-    value_type value_residual = data_residual[index];
-    value_type value_As       = data_As[index];
-    value_type value_Ap       = data_Ap[index];
-    value_type value_r0star   = data_r0star[index];
-
-    value_result  += alpha * value_p + omega * value_s;
-    value_residual = value_s - omega * value_As;
-    value_p        = value_residual + beta * (value_p - omega * value_Ap);
-    inner_prod_r_r0star += value_residual * value_r0star;
-
-    data_result[index]   = value_result;
-    data_residual[index] = value_residual;
-    data_p[index]        = value_p;
-  }
-
-  (void)buffer_chunk_size; // not needed here, just silence compiler warning (unused variable)
-  data_buffer[0] = inner_prod_r_r0star;
-}
-
-/** @brief Performs a fused matrix-vector product with a compressed_matrix for an efficient pipelined BiCGStab algorithm.
- *
- * This routine computes for a matrix A and vectors 'p', 'Ap', and 'r0star':
- *   Ap = prod(A, p);
- * and the reduction stages for computing inner_prod(p,Ap), inner_prod(Ap,Ap), and inner_prod(Ap, r0star).
- */
-template<typename NumericT>
-void pipelined_bicgstab_prod(compressed_matrix<NumericT> const & A,
-                             vector_base<NumericT> const & p,
-                             vector_base<NumericT> & Ap,
-                             vector_base<NumericT> const & r0star,
-                             vector_base<NumericT> & inner_prod_buffer,
-                             vcl_size_t buffer_chunk_size,
-                             vcl_size_t buffer_chunk_offset)
-{
-  NumericT const * data_r0star = detail::extract_raw_pointer<NumericT>(r0star);
-
-  viennacl::linalg::host_based::detail::pipelined_prod_impl(A, p, Ap, data_r0star, inner_prod_buffer, buffer_chunk_size, buffer_chunk_offset);
-}
-
-/** @brief Performs a fused matrix-vector product with a coordinate_matrix for an efficient pipelined BiCGStab algorithm.
- *
- * This routine computes for a matrix A and vectors 'p', 'Ap', and 'r0star':
- *   Ap = prod(A, p);
- * and the reduction stages for computing inner_prod(p,Ap), inner_prod(Ap,Ap), and inner_prod(Ap, r0star).
- */
-template<typename NumericT>
-void pipelined_bicgstab_prod(coordinate_matrix<NumericT> const & A,
-                             vector_base<NumericT> const & p,
-                             vector_base<NumericT> & Ap,
-                             vector_base<NumericT> const & r0star,
-                             vector_base<NumericT> & inner_prod_buffer,
-                             vcl_size_t buffer_chunk_size,
-                             vcl_size_t buffer_chunk_offset)
-{
-  NumericT const * data_r0star = detail::extract_raw_pointer<NumericT>(r0star);
-
-  viennacl::linalg::host_based::detail::pipelined_prod_impl(A, p, Ap, data_r0star, inner_prod_buffer, buffer_chunk_size, buffer_chunk_offset);
-}
-
-/** @brief Performs a fused matrix-vector product with an ell_matrix for an efficient pipelined BiCGStab algorithm.
- *
- * This routine computes for a matrix A and vectors 'p', 'Ap', and 'r0star':
- *   Ap = prod(A, p);
- * and the reduction stages for computing inner_prod(p,Ap), inner_prod(Ap,Ap), and inner_prod(Ap, r0star).
- */
-template<typename NumericT>
-void pipelined_bicgstab_prod(ell_matrix<NumericT> const & A,
-                             vector_base<NumericT> const & p,
-                             vector_base<NumericT> & Ap,
-                             vector_base<NumericT> const & r0star,
-                             vector_base<NumericT> & inner_prod_buffer,
-                             vcl_size_t buffer_chunk_size,
-                             vcl_size_t buffer_chunk_offset)
-{
-  NumericT const * data_r0star = detail::extract_raw_pointer<NumericT>(r0star);
-
-  viennacl::linalg::host_based::detail::pipelined_prod_impl(A, p, Ap, data_r0star, inner_prod_buffer, buffer_chunk_size, buffer_chunk_offset);
-}
-
-/** @brief Performs a fused matrix-vector product with a sliced_ell_matrix for an efficient pipelined BiCGStab algorithm.
- *
- * This routine computes for a matrix A and vectors 'p', 'Ap', and 'r0star':
- *   Ap = prod(A, p);
- * and the reduction stages for computing inner_prod(p,Ap), inner_prod(Ap,Ap), and inner_prod(Ap, r0star).
- */
-template<typename NumericT, typename IndexT>
-void pipelined_bicgstab_prod(sliced_ell_matrix<NumericT, IndexT> const & A,
-                             vector_base<NumericT> const & p,
-                             vector_base<NumericT> & Ap,
-                             vector_base<NumericT> const & r0star,
-                             vector_base<NumericT> & inner_prod_buffer,
-                             vcl_size_t buffer_chunk_size,
-                             vcl_size_t buffer_chunk_offset)
-{
-  NumericT const * data_r0star = detail::extract_raw_pointer<NumericT>(r0star);
-
-  viennacl::linalg::host_based::detail::pipelined_prod_impl(A, p, Ap, data_r0star, inner_prod_buffer, buffer_chunk_size, buffer_chunk_offset);
-}
-
-/** @brief Performs a fused matrix-vector product with a hyb_matrix for an efficient pipelined BiCGStab algorithm.
- *
- * This routine computes for a matrix A and vectors 'p', 'Ap', and 'r0star':
- *   Ap = prod(A, p);
- * and the reduction stages for computing inner_prod(p,Ap), inner_prod(Ap,Ap), and inner_prod(Ap, r0star).
- */
-template<typename NumericT>
-void pipelined_bicgstab_prod(hyb_matrix<NumericT> const & A,
-                             vector_base<NumericT> const & p,
-                             vector_base<NumericT> & Ap,
-                             vector_base<NumericT> const & r0star,
-                             vector_base<NumericT> & inner_prod_buffer,
-                             vcl_size_t buffer_chunk_size,
-                             vcl_size_t buffer_chunk_offset)
-{
-  NumericT const * data_r0star = detail::extract_raw_pointer<NumericT>(r0star);
-
-  viennacl::linalg::host_based::detail::pipelined_prod_impl(A, p, Ap, data_r0star, inner_prod_buffer, buffer_chunk_size, buffer_chunk_offset);
-}
-
-
-/////////////////////////////////////////////////////////////
-
-/** @brief Performs a vector normalization needed for an efficient pipelined GMRES algorithm.
- *
- * This routine computes for vectors 'r', 'v_k':
- *   Second reduction step for ||v_k||
- *   v_k /= ||v_k||
- *   First reduction step for inner_prod(r, v_k)
- */
-template <typename T>
-void pipelined_gmres_normalize_vk(vector_base<T> & v_k,
-                                  vector_base<T> const & residual,
-                                  vector_base<T> & R_buffer,
-                                  vcl_size_t offset_in_R,
-                                  vector_base<T> const & inner_prod_buffer,
-                                  vector_base<T> & r_dot_vk_buffer,
-                                  vcl_size_t buffer_chunk_size,
-                                  vcl_size_t buffer_chunk_offset)
-{
-  typedef T        value_type;
-
-  value_type       * data_v_k      = detail::extract_raw_pointer<value_type>(v_k);
-  value_type const * data_residual = detail::extract_raw_pointer<value_type>(residual);
-  value_type       * data_R        = detail::extract_raw_pointer<value_type>(R_buffer);
-  value_type const * data_buffer   = detail::extract_raw_pointer<value_type>(inner_prod_buffer);
-  value_type       * data_r_dot_vk = detail::extract_raw_pointer<value_type>(r_dot_vk_buffer);
-
-  // Note: Due to the special setting in GMRES, there is no need to check for sizes and strides
-  vcl_size_t size     = viennacl::traits::size(v_k);
-  vcl_size_t vk_start = viennacl::traits::start(v_k);
-
-  // part 1: compute norm of v_k (second reduction step over the second buffer chunk):
-  value_type norm_vk = 0;
-  for (vcl_size_t i=0; i<buffer_chunk_size; ++i)
-    norm_vk += data_buffer[i + buffer_chunk_size];
-  norm_vk = std::sqrt(norm_vk);
-  data_R[offset_in_R] = norm_vk;
-
-  // part 2: compute inner_prod(r, v_k) after normalization of v_k:
-  value_type inner_prod_r_dot_vk = 0;
-  for (long i = 0; i < static_cast<long>(size); ++i)
-  {
-    value_type value_vk = data_v_k[static_cast<vcl_size_t>(i) + vk_start] / norm_vk;
-
-    inner_prod_r_dot_vk += data_residual[static_cast<vcl_size_t>(i)] * value_vk;
-
-    data_v_k[static_cast<vcl_size_t>(i) + vk_start] = value_vk;
-  }
-
-  data_r_dot_vk[buffer_chunk_offset] = inner_prod_r_dot_vk;
-}
-
-
-
-/** @brief Computes the first reduction stage for multiple inner products inner_prod(v_i, v_k), i=0..k-1
- *
- * All vectors v_i are stored column-major in the array 'device_krylov_basis', where each vector has an actual length 'v_k_size', but might be padded to have 'v_k_internal_size'
- */
-template <typename T>
-void pipelined_gmres_gram_schmidt_stage1(vector_base<T> const & device_krylov_basis,
-                                         vcl_size_t v_k_size,
-                                         vcl_size_t v_k_internal_size,
-                                         vcl_size_t k,
-                                         vector_base<T> & vi_in_vk_buffer,
-                                         vcl_size_t buffer_chunk_size)
-{
-  typedef T        value_type;
-
-  value_type const * data_krylov_basis = detail::extract_raw_pointer<value_type>(device_krylov_basis);
-  value_type       * data_inner_prod   = detail::extract_raw_pointer<value_type>(vi_in_vk_buffer);
-
-  // reset buffer:
-  for (vcl_size_t j = 0; j < k; ++j)
-    data_inner_prod[j*buffer_chunk_size] = value_type(0);
-
-  // compute inner products:
-  for (vcl_size_t i = 0; i < v_k_size; ++i)
-  {
-    value_type value_vk = data_krylov_basis[static_cast<vcl_size_t>(i) + k * v_k_internal_size];
-
-    for (vcl_size_t j = 0; j < k; ++j)
-      data_inner_prod[j*buffer_chunk_size] += data_krylov_basis[static_cast<vcl_size_t>(i) + j * v_k_internal_size] * value_vk;
-  }
-}
-
-
-/** @brief Computes the second reduction stage for multiple inner products inner_prod(v_i, v_k), i=0..k-1, then updates v_k -= inner_prod(v_i, v_k) * v_i and computes the first reduction stage for ||v_k||
- *
- * All vectors v_i are stored column-major in the array 'device_krylov_basis', where each vector has an actual length 'v_k_size', but might be padded to have 'v_k_internal_size'
- */
-template <typename T>
-void pipelined_gmres_gram_schmidt_stage2(vector_base<T> & device_krylov_basis,
-                                         vcl_size_t v_k_size,
-                                         vcl_size_t v_k_internal_size,
-                                         vcl_size_t k,
-                                         vector_base<T> const & vi_in_vk_buffer,
-                                         vector_base<T> & R_buffer,
-                                         vcl_size_t krylov_dim,
-                                         vector_base<T> & inner_prod_buffer,
-                                         vcl_size_t buffer_chunk_size)
-{
-  typedef T        value_type;
-
-  value_type       * data_krylov_basis = detail::extract_raw_pointer<value_type>(device_krylov_basis);
-  value_type const * data_vi_in_vk     = detail::extract_raw_pointer<value_type>(vi_in_vk_buffer);
-  value_type       * data_R            = detail::extract_raw_pointer<value_type>(R_buffer);
-  value_type       * data_buffer       = detail::extract_raw_pointer<value_type>(inner_prod_buffer);
-
-  std::vector<value_type> values_vi_in_vk(k);
-
-  // Step 1: Finish reduction of inner_prod(v_i, v_k) to obtain scalars:
-  for (std::size_t i=0; i<k; ++i)
-    for (vcl_size_t j=0; j<buffer_chunk_size; ++j)
-      values_vi_in_vk[i] += data_vi_in_vk[i*buffer_chunk_size + j];
-
-  // Step 2: v_k -= inner_prod(v_i, v_k) * v_i and reduction on ||v_k||:
-  value_type norm_vk = 0;
-  for (vcl_size_t i = 0; i < v_k_size; ++i)
-  {
-    value_type value_vk = data_krylov_basis[static_cast<vcl_size_t>(i) + k * v_k_internal_size];
-
-    for (vcl_size_t j = 0; j < k; ++j)
-      value_vk -= values_vi_in_vk[j] * data_krylov_basis[static_cast<vcl_size_t>(i) + j * v_k_internal_size];
-
-    norm_vk += value_vk * value_vk;
-    data_krylov_basis[static_cast<vcl_size_t>(i) + k * v_k_internal_size] = value_vk;
-  }
-
-  // Step 3: Write values to R_buffer:
-  for (std::size_t i=0; i<k; ++i)
-    data_R[i + k*krylov_dim] = values_vi_in_vk[i];
-
-  // Write first reduction stage for ||v_k|| for use in the next normalization step:
-  data_buffer[buffer_chunk_size] = norm_vk;
-}
-
-
-/** @brief Computes x += eta_0 * r + sum_{i=1}^{k-1} eta_i * v_{i-1}, where x is the result vector, r the residual, and v_i the Krylov basis vectors.
- *
- * All vectors v_i are stored column-major in 'krylov_basis', analogous to the Gram-Schmidt stages above.
- */
-template <typename T>
-void pipelined_gmres_update_result(vector_base<T> & result,
-                                   vector_base<T> const & residual,
-                                   vector_base<T> const & krylov_basis,
-                                   vcl_size_t v_k_size,
-                                   vcl_size_t v_k_internal_size,
-                                   vector_base<T> const & coefficients,
-                                   vcl_size_t k)
-{
-  typedef T        value_type;
-
-  value_type       * data_result       = detail::extract_raw_pointer<value_type>(result);
-  value_type const * data_residual     = detail::extract_raw_pointer<value_type>(residual);
-  value_type const * data_krylov_basis = detail::extract_raw_pointer<value_type>(krylov_basis);
-  value_type const * data_coefficients = detail::extract_raw_pointer<value_type>(coefficients);
-
-  for (vcl_size_t i = 0; i < v_k_size; ++i)
-  {
-    value_type value_result = data_result[i];
-
-    value_result += data_coefficients[0] * data_residual[i];
-    for (vcl_size_t j = 1; j < k; ++j)
-      value_result += data_coefficients[j] * data_krylov_basis[i + (j-1) * v_k_internal_size];
-
-    data_result[i] = value_result;
-  }
-}
-
-
-/** @brief Performs a fused matrix-vector product with the respective sparse matrix type for an efficient pipelined GMRES algorithm.
- *
- * This routine computes Ap = prod(A, p) and the reduction stages for inner_prod(p,Ap) and inner_prod(Ap,Ap) by forwarding to the pipelined CG kernels.
- */
-template <typename MatrixType, typename T>
-void pipelined_gmres_prod(MatrixType const & A,
-                          vector_base<T> const & p,
-                          vector_base<T> & Ap,
-                          vector_base<T> & inner_prod_buffer)
-{
-  pipelined_cg_prod(A, p, Ap, inner_prod_buffer);
-}
-
-
-} //namespace host_based
-} //namespace linalg
-} //namespace viennacl
-
-
-#endif
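
Editor's note: the reverted kernels above all follow the same fusion pattern: a single sweep over memory produces Ap and, in the same pass, the partial sums that the pipelined solver needs for its inner products. The following standalone sketch is not part of the reverted ViennaCL sources; it illustrates that pattern for the CSR case under the assumption of plain std::vector storage, and the struct name CsrMatrix and all other identifiers are hypothetical.

// Illustrative only -- not part of the reverted ViennaCL sources.
#include <cstddef>
#include <vector>

struct CsrMatrix
{
  std::vector<std::size_t> row_buffer;  // size rows+1, row start offsets
  std::vector<std::size_t> col_buffer;  // column index per nonzero
  std::vector<double>      elements;    // value per nonzero
};

// Computes Ap = A*p and, in the same pass, the partial sums for
// inner_prod(Ap, Ap) and inner_prod(p, Ap) needed by pipelined CG,
// so the matrix and vectors are streamed through memory only once.
void fused_csr_prod(CsrMatrix const& A, std::vector<double> const& p,
                    std::vector<double>& Ap,
                    double& inner_prod_ApAp, double& inner_prod_pAp)
{
  inner_prod_ApAp = 0;
  inner_prod_pAp  = 0;
  for (std::size_t row = 0; row + 1 < A.row_buffer.size(); ++row)
  {
    double dot_prod = 0;
    for (std::size_t i = A.row_buffer[row]; i < A.row_buffer[row + 1]; ++i)
      dot_prod += A.elements[i] * p[A.col_buffer[i]];

    Ap[row] = dot_prod;
    inner_prod_ApAp += dot_prod * dot_prod;  // contribution to (Ap, Ap)
    inner_prod_pAp  += p[row] * dot_prod;    // contribution to (p, Ap)
  }
}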
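
The same fusion idea applies to the vector updates: pipelined_cg_vector_update performs three AXPY-style operations and the first reduction stage for inner_prod(r, r) in one loop. The sketch below is likewise not part of the reverted file; it is a minimal stand-in assuming std::vector storage, and the function name fused_cg_update is hypothetical. Returning the accumulated inner product mirrors what the original kernel writes into chunk 0 of inner_prod_buffer.

// Illustrative only -- not part of the reverted ViennaCL sources.
#include <cstddef>
#include <vector>

// Performs  result += alpha*p;  r -= alpha*Ap;  p = r + beta*p;
// and accumulates inner_prod(r, r) in the same sweep, so the four
// vectors are streamed through memory only once per CG iteration.
template <typename T>
T fused_cg_update(std::vector<T>& result, T alpha, std::vector<T>& p,
                  std::vector<T>& r, std::vector<T> const& Ap, T beta)
{
  T inner_prod_r = 0;
  for (std::size_t i = 0; i < result.size(); ++i)
  {
    T value_p = p[i];
    T value_r = r[i] - alpha * Ap[i];        // r -= alpha * Ap

    result[i] += alpha * value_p;            // result += alpha * p
    p[i] = value_r + beta * value_p;         // p = r + beta * p
    r[i] = value_r;
    inner_prod_r += value_r * value_r;       // first stage of inner_prod(r, r)
  }
  return inner_prod_r;
}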