From: apalumbo@apache.org
To: commits@mahout.apache.org
Date: Wed, 08 Jun 2016 21:40:19 -0000
Message-Id: <4fe0474417664a42be8691ee28982501@git.apache.org>
Subject: [22/51] [partial] mahout git commit: (nojira) add native-viennaCL module to codebase. closes apache/mahout#241

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/direct_solve.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/direct_solve.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/direct_solve.hpp
new file mode 100644
index 0000000..1d212c2
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/direct_solve.hpp
@@ -0,0 +1,307 @@
+#ifndef VIENNACL_LINALG_HOST_BASED_DIRECT_SOLVE_HPP
+#define VIENNACL_LINALG_HOST_BASED_DIRECT_SOLVE_HPP
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+   -----------------
+   ViennaCL - The Vienna Computing Library
+   -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/host_based/direct_solve.hpp
+    @brief Implementations of dense direct triangular solvers are found here.
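+
+    A lower-triangular system L x = b is solved by forward substitution,
+    x_i = (b_i - \sum_{j<i} L_ij x_j) / L_ii; the upper-triangular case runs
+    backwards through the rows. A minimal standalone sketch of the idea
+    (plain C++, independent of the ViennaCL types used below; illustrative only):
+    \code
+    #include <cstddef>
+    #include <vector>
+
+    // Solve L * x = b in place for a dense, row-major, lower-triangular L.
+    void forward_substitution(std::vector<double> const & L, std::vector<double> & b, std::size_t n)
+    {
+      for (std::size_t i = 0; i < n; ++i)
+      {
+        for (std::size_t j = 0; j < i; ++j)
+          b[i] -= L[i * n + j] * b[j];  // subtract contributions of already-solved entries
+        b[i] /= L[i * n + i];           // divide by the diagonal entry
+      }
+    }
+    \endcode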
+*/
+
+#include "viennacl/vector.hpp"
+#include "viennacl/matrix.hpp"
+
+#include "viennacl/linalg/host_based/common.hpp"
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace host_based
+{
+
+namespace detail
+{
+  //
+  // Upper solve:
+  //
+  template<typename MatrixT1, typename MatrixT2>
+  void upper_inplace_solve_matrix(MatrixT1 & A, MatrixT2 & B, vcl_size_t A_size, vcl_size_t B_size, bool unit_diagonal)
+  {
+    typedef typename MatrixT2::value_type   value_type;
+
+    for (vcl_size_t i = 0; i < A_size; ++i)
+    {
+      vcl_size_t current_row = A_size - i - 1;
+
+      for (vcl_size_t j = current_row + 1; j < A_size; ++j)
+      {
+        value_type A_element = A(current_row, j);
+        for (vcl_size_t k=0; k < B_size; ++k)
+          B(current_row, k) -= A_element * B(j, k);
+      }
+
+      if (!unit_diagonal)
+      {
+        value_type A_diag = A(current_row, current_row);
+        for (vcl_size_t k=0; k < B_size; ++k)
+          B(current_row, k) /= A_diag;
+      }
+    }
+  }
+
+  template<typename MatrixT1, typename MatrixT2>
+  void inplace_solve_matrix(MatrixT1 & A, MatrixT2 & B, vcl_size_t A_size, vcl_size_t B_size, viennacl::linalg::unit_upper_tag)
+  {
+    upper_inplace_solve_matrix(A, B, A_size, B_size, true);
+  }
+
+  template<typename MatrixT1, typename MatrixT2>
+  void inplace_solve_matrix(MatrixT1 & A, MatrixT2 & B, vcl_size_t A_size, vcl_size_t B_size, viennacl::linalg::upper_tag)
+  {
+    upper_inplace_solve_matrix(A, B, A_size, B_size, false);
+  }
+
+  //
+  // Lower solve:
+  //
+  template<typename MatrixT1, typename MatrixT2>
+  void lower_inplace_solve_matrix(MatrixT1 & A, MatrixT2 & B, vcl_size_t A_size, vcl_size_t B_size, bool unit_diagonal)
+  {
+    typedef typename MatrixT2::value_type   value_type;
+
+    for (vcl_size_t i = 0; i < A_size; ++i)
+    {
+      for (vcl_size_t j = 0; j < i; ++j)
+      {
+        value_type A_element = A(i, j);
+        for (vcl_size_t k=0; k < B_size; ++k)
+          B(i, k) -= A_element * B(j, k);
+      }
+
+      if (!unit_diagonal)
+      {
+        value_type A_diag = A(i, i);
+        for (vcl_size_t k=0; k < B_size; ++k)
+          B(i, k) /= A_diag;
+      }
+    }
+  }
+
+  template<typename MatrixT1, typename MatrixT2>
+  void inplace_solve_matrix(MatrixT1 & A, MatrixT2 & B, vcl_size_t A_size, vcl_size_t B_size, viennacl::linalg::unit_lower_tag)
+  {
+    lower_inplace_solve_matrix(A, B, A_size, B_size, true);
+  }
+
+  template<typename MatrixT1, typename MatrixT2>
+  void inplace_solve_matrix(MatrixT1 & A, MatrixT2 & B, vcl_size_t A_size, vcl_size_t B_size, viennacl::linalg::lower_tag)
+  {
+    lower_inplace_solve_matrix(A, B, A_size, B_size, false);
+  }
+
+}
+
+//
+// Note: By convention, all size checks are performed in the calling frontend. No need to double-check here.
+//
+
+////////////////// upper triangular solver (upper_tag) //////////////////////////////////////
+/** @brief Direct inplace solver for triangular systems with multiple right hand sides, i.e.
A \ B (MATLAB notation) +* +* @param A The system matrix +* @param B The matrix of row vectors, where the solution is directly written to +*/ +template +void inplace_solve(matrix_base const & A, + matrix_base & B, + SolverTagT) +{ + typedef NumericT value_type; + + value_type const * data_A = detail::extract_raw_pointer(A); + value_type * data_B = detail::extract_raw_pointer(B); + + vcl_size_t A_start1 = viennacl::traits::start1(A); + vcl_size_t A_start2 = viennacl::traits::start2(A); + vcl_size_t A_inc1 = viennacl::traits::stride1(A); + vcl_size_t A_inc2 = viennacl::traits::stride2(A); + //vcl_size_t A_size1 = viennacl::traits::size1(A); + vcl_size_t A_size2 = viennacl::traits::size2(A); + vcl_size_t A_internal_size1 = viennacl::traits::internal_size1(A); + vcl_size_t A_internal_size2 = viennacl::traits::internal_size2(A); + + vcl_size_t B_start1 = viennacl::traits::start1(B); + vcl_size_t B_start2 = viennacl::traits::start2(B); + vcl_size_t B_inc1 = viennacl::traits::stride1(B); + vcl_size_t B_inc2 = viennacl::traits::stride2(B); + //vcl_size_t B_size1 = viennacl::traits::size1(B); + vcl_size_t B_size2 = viennacl::traits::size2(B); + vcl_size_t B_internal_size1 = viennacl::traits::internal_size1(B); + vcl_size_t B_internal_size2 = viennacl::traits::internal_size2(B); + + + if (A.row_major() && B.row_major()) + { + detail::matrix_array_wrapper wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2); + detail::matrix_array_wrapper wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2); + + detail::inplace_solve_matrix(wrapper_A, wrapper_B, A_size2, B_size2, SolverTagT()); + } + else if (A.row_major() && !B.row_major()) + { + detail::matrix_array_wrapper wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2); + detail::matrix_array_wrapper wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2); + + detail::inplace_solve_matrix(wrapper_A, wrapper_B, A_size2, B_size2, SolverTagT()); + } + else if (!A.row_major() && B.row_major()) + { + detail::matrix_array_wrapper wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2); + detail::matrix_array_wrapper wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2); + + detail::inplace_solve_matrix(wrapper_A, wrapper_B, A_size2, B_size2, SolverTagT()); + } + else + { + detail::matrix_array_wrapper wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2); + detail::matrix_array_wrapper wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2); + + detail::inplace_solve_matrix(wrapper_A, wrapper_B, A_size2, B_size2, SolverTagT()); + } +} + + +// +// Solve on vector +// + +namespace detail +{ + // + // Upper solve: + // + template + void upper_inplace_solve_vector(MatrixT & A, VectorT & b, vcl_size_t A_size, bool unit_diagonal) + { + typedef typename VectorT::value_type value_type; + + for (vcl_size_t i = 0; i < A_size; ++i) + { + vcl_size_t current_row = A_size - i - 1; + + for (vcl_size_t j = current_row + 1; j < A_size; ++j) + { + value_type A_element = A(current_row, j); + b(current_row) -= A_element * b(j); + } + + if (!unit_diagonal) + b(current_row) /= A(current_row, current_row); + } + } + + template + void inplace_solve_vector(MatrixT & A, VectorT & b, vcl_size_t A_size, viennacl::linalg::unit_upper_tag) + { + upper_inplace_solve_vector(A, b, A_size, true); + } + + template + 
void inplace_solve_vector(MatrixT & A, VectorT & b, vcl_size_t A_size, viennacl::linalg::upper_tag) + { + upper_inplace_solve_vector(A, b, A_size, false); + } + + // + // Lower solve: + // + template + void lower_inplace_solve_vector(MatrixT & A, VectorT & b, vcl_size_t A_size, bool unit_diagonal) + { + typedef typename VectorT::value_type value_type; + + for (vcl_size_t i = 0; i < A_size; ++i) + { + for (vcl_size_t j = 0; j < i; ++j) + { + value_type A_element = A(i, j); + b(i) -= A_element * b(j); + } + + if (!unit_diagonal) + b(i) /= A(i, i); + } + } + + template + void inplace_solve_vector(MatrixT & A, VectorT & b, vcl_size_t A_size, viennacl::linalg::unit_lower_tag) + { + lower_inplace_solve_vector(A, b, A_size, true); + } + + template + void inplace_solve_vector(MatrixT & A, VectorT & b, vcl_size_t A_size, viennacl::linalg::lower_tag) + { + lower_inplace_solve_vector(A, b, A_size, false); + } + +} + +template +void inplace_solve(matrix_base const & mat, + vector_base & vec, + SolverTagT) +{ + typedef NumericT value_type; + + value_type const * data_A = detail::extract_raw_pointer(mat); + value_type * data_v = detail::extract_raw_pointer(vec); + + vcl_size_t A_start1 = viennacl::traits::start1(mat); + vcl_size_t A_start2 = viennacl::traits::start2(mat); + vcl_size_t A_inc1 = viennacl::traits::stride1(mat); + vcl_size_t A_inc2 = viennacl::traits::stride2(mat); + vcl_size_t A_size2 = viennacl::traits::size2(mat); + vcl_size_t A_internal_size1 = viennacl::traits::internal_size1(mat); + vcl_size_t A_internal_size2 = viennacl::traits::internal_size2(mat); + + vcl_size_t start1 = viennacl::traits::start(vec); + vcl_size_t inc1 = viennacl::traits::stride(vec); + + if (mat.row_major()) + { + detail::matrix_array_wrapper wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2); + detail::vector_array_wrapper wrapper_v(data_v, start1, inc1); + + detail::inplace_solve_vector(wrapper_A, wrapper_v, A_size2, SolverTagT()); + } + else + { + detail::matrix_array_wrapper wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2); + detail::vector_array_wrapper wrapper_v(data_v, start1, inc1); + + detail::inplace_solve_vector(wrapper_A, wrapper_v, A_size2, SolverTagT()); + } +} + + +} // namespace host_based +} // namespace linalg +} // namespace viennacl + +#endif http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/fft_operations.hpp ---------------------------------------------------------------------- diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/fft_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/fft_operations.hpp new file mode 100644 index 0000000..f53f8f2 --- /dev/null +++ b/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/fft_operations.hpp @@ -0,0 +1,856 @@ +#ifndef VIENNACL_LINALG_HOST_BASED_FFT_OPERATIONS_HPP_ +#define VIENNACL_LINALG_HOST_BASED_FFT_OPERATIONS_HPP_ + +/* ========================================================================= + Copyright (c) 2010-2016, Institute for Microelectronics, + Institute for Analysis and Scientific Computing, + TU Wien. + Portions of this software are copyright by UChicago Argonne, LLC. 
+
+   -----------------
+   ViennaCL - The Vienna Computing Library
+   -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/host_based/fft_operations.hpp
+    @brief Implementations of the Fast Fourier Transform using a plain single-threaded or OpenMP-enabled execution on CPU
+ */
+
+// TODO: OpenMP conditions
+#include <cmath>
+#include <complex>
+
+#include "viennacl/linalg/host_based/vector_operations.hpp"
+
+#include <limits>
+#include <stdexcept>
+#include <vector>
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace host_based
+{
+namespace detail
+{
+  namespace fft
+  {
+    const vcl_size_t MAX_LOCAL_POINTS_NUM = 512;
+
+    namespace FFT_DATA_ORDER
+    {
+      enum DATA_ORDER
+      {
+        ROW_MAJOR, COL_MAJOR
+      };
+    }
+
+    inline vcl_size_t num_bits(vcl_size_t size)
+    {
+      vcl_size_t bits_datasize = 0;
+      vcl_size_t ds = 1;
+
+      while (ds < size)
+      {
+        ds = ds << 1;
+        bits_datasize++;
+      }
+
+      return bits_datasize;
+    }
+
+    inline vcl_size_t next_power_2(vcl_size_t n)
+    {
+      n = n - 1;
+
+      vcl_size_t power = 1;
+
+      while (power < sizeof(vcl_size_t) * 8)
+      {
+        n = n | (n >> power);
+        power *= 2;
+      }
+
+      return n + 1;
+    }
+
+    inline vcl_size_t get_reorder_num(vcl_size_t v, vcl_size_t bit_size)
+    {
+      v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1);
+      v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2);
+      v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4);
+      v = ((v >> 8) & 0x00FF00FF) | ((v & 0x00FF00FF) << 8);
+      v = (v >> 16) | (v << 16);
+      v = v >> (32 - bit_size);
+      return v;
+    }
+
+    template<typename NumericT, unsigned int AlignmentV>
+    void copy_to_complex_array(std::complex<NumericT> * input_complex,
+                               viennacl::vector<NumericT, AlignmentV> const & in, vcl_size_t size)
+    {
+#ifdef VIENNACL_WITH_OPENMP
+      #pragma omp parallel for if (size > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+      for (long i2 = 0; i2 < long(size * 2); i2 += 2)
+      { //change array to complex array
+        vcl_size_t i = vcl_size_t(i2);
+        input_complex[i / 2] = std::complex<NumericT>(in[i], in[i + 1]);
+      }
+    }
+
+    template<typename NumericT>
+    void copy_to_complex_array(std::complex<NumericT> * input_complex,
+                               viennacl::vector_base<NumericT> const & in, vcl_size_t size)
+    {
+#ifdef VIENNACL_WITH_OPENMP
+      #pragma omp parallel for if (size > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+      for (long i2 = 0; i2 < long(size * 2); i2 += 2)
+      { //change array to complex array
+        vcl_size_t i = vcl_size_t(i2);
+        input_complex[i / 2] = std::complex<NumericT>(in[i], in[i + 1]);
+      }
+    }
+
+    template<typename NumericT, unsigned int AlignmentV>
+    void copy_to_vector(std::complex<NumericT> * input_complex,
+                        viennacl::vector<NumericT, AlignmentV> & in, vcl_size_t size)
+    {
+#ifdef VIENNACL_WITH_OPENMP
+      #pragma omp parallel for if (size > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+      for (long i2 = 0; i2 < long(size); i2++)
+      {
+        vcl_size_t i = vcl_size_t(i2);
+        in(i * 2)     = static_cast<NumericT>(std::real(input_complex[i]));
+        in(i * 2 + 1) = static_cast<NumericT>(std::imag(input_complex[i]));
+      }
+    }
+
+    template<typename NumericT>
+    void copy_to_complex_array(std::complex<NumericT> * input_complex,
+                               NumericT const * in, vcl_size_t size)
+    {
+#ifdef VIENNACL_WITH_OPENMP
+      #pragma omp parallel for if (size > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+      for (long i2 = 0; i2 < long(size * 2); i2 += 2)
+      { //change array to complex array
+        vcl_size_t i = vcl_size_t(i2);
+        input_complex[i / 2] = std::complex<NumericT>(in[i], in[i + 1]);
+      }
+    }
+
+    template<typename NumericT>
+    void copy_to_vector(std::complex<NumericT> * input_complex, NumericT * in, vcl_size_t size)
+    {
+#ifdef VIENNACL_WITH_OPENMP
+      #pragma omp parallel for if (size > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+      for (long i2 = 0; i2 < long(size); i2++)
+      {
+        vcl_size_t i = vcl_size_t(i2);
+        in[i * 2]     = static_cast<NumericT>(std::real(input_complex[i]));
+        in[i * 2 + 1] = static_cast<NumericT>(std::imag(input_complex[i]));
+      }
+    }
+
+    template<typename NumericT>
+    void copy_to_vector(std::complex<NumericT> * input_complex,
+                        viennacl::vector_base<NumericT> & in, vcl_size_t size)
+    {
+      std::vector<NumericT> temp(2 * size);
+#ifdef VIENNACL_WITH_OPENMP
+      #pragma omp parallel for if (size > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+      for (long i2 = 0; i2 < long(size); i2++)
+      {
+        vcl_size_t i = vcl_size_t(i2);
+        temp[i * 2]     = static_cast<NumericT>(std::real(input_complex[i]));
+        temp[i * 2 + 1] = static_cast<NumericT>(std::imag(input_complex[i]));
+      }
+      viennacl::copy(temp, in);
+    }
+
+    template<typename NumericT>
+    void zero2(NumericT *input1, NumericT *input2, vcl_size_t size)
+    {
+#ifdef VIENNACL_WITH_OPENMP
+      #pragma omp parallel for if (size > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+      for (long i2 = 0; i2 < long(size); i2++)
+      {
+        vcl_size_t i = vcl_size_t(i2);
+        input1[i] = 0;
+        input2[i] = 0;
+      }
+    }
+
+  } //namespace fft
+
+} //namespace detail
+
+/**
+ * @brief Direct algorithm kernel
+ */
+template<typename NumericT>
+void fft_direct(std::complex<NumericT> * input_complex, std::complex<NumericT> * output,
+                vcl_size_t size, vcl_size_t stride, vcl_size_t batch_num, NumericT sign,
+                viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
+{
+  NumericT const NUM_PI = NumericT(3.14159265358979323846);
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for
+#endif
+  for (long batch_id2 = 0; batch_id2 < long(batch_num); batch_id2++)
+  {
+    vcl_size_t batch_id = vcl_size_t(batch_id2);
+    for (vcl_size_t k = 0; k < size; k += 1)
+    {
+      std::complex<NumericT> f = 0;
+      for (vcl_size_t n = 0; n < size; n++)
+      {
+        std::complex<NumericT> input;
+        if (!data_order)
+          input = input_complex[batch_id * stride + n]; //input index here
+        else
+          input = input_complex[n * stride + batch_id];
+        // DFT exponent: arg = sign * 2*pi * k * n / size
+        NumericT arg = sign * 2 * NUM_PI * NumericT(k) * NumericT(n) / NumericT(size);
+        NumericT sn = std::sin(arg);
+        NumericT cs = std::cos(arg);
+
+        std::complex<NumericT> ex(cs, sn);
+        std::complex<NumericT> tmp(input.real() * ex.real() - input.imag() * ex.imag(),
+                                   input.real() * ex.imag() + input.imag() * ex.real());
+        f = f + tmp;
+      }
+      if (!data_order)
+        output[batch_id * stride + k] = f; // output index here
+      else
+        output[k * stride + batch_id] = f;
+    }
+  }
+
+}
+
+/**
+ * @brief Direct 1D algorithm for computing Fourier transformation.
+ *
+ * Works on any size of data.
+ * Serial implementation has o(n^2) complexity
+ */
+template<typename NumericT, unsigned int AlignmentV>
+void direct(viennacl::vector<NumericT, AlignmentV> const & in,
+            viennacl::vector<NumericT, AlignmentV> & out,
+            vcl_size_t size, vcl_size_t stride,
+            vcl_size_t batch_num, NumericT sign = NumericT(-1),
+            viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
+{
+  std::vector<std::complex<NumericT> > input_complex(size * batch_num);
+  std::vector<std::complex<NumericT> > output(size * batch_num);
+
+  viennacl::linalg::host_based::detail::fft::copy_to_complex_array(&input_complex[0], in, size * batch_num);
+
+  fft_direct(&input_complex[0], &output[0], size, stride, batch_num, sign, data_order);
+
+  viennacl::linalg::host_based::detail::fft::copy_to_vector(&output[0], out, size * batch_num);
+}
+
+/**
+ * @brief Direct 2D algorithm for computing Fourier transformation.
+ *
+ * Works on any size of data.
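+ *
+ * The transform computed per row/column is F(k) = sum_{n=0}^{N-1} f(n) * exp(sign * 2*pi*i*k*n / N).
+ * A reference evaluation of a single coefficient (illustrative sketch, not part
+ * of the ViennaCL API):
+ * \code
+ * #include <cmath>
+ * #include <complex>
+ * #include <cstddef>
+ * #include <vector>
+ *
+ * // One coefficient of a size-N DFT; sign = -1 gives the forward transform.
+ * std::complex<double> dft_coefficient(std::vector<std::complex<double> > const & f,
+ *                                      std::size_t k, double sign)
+ * {
+ *   std::complex<double> F(0, 0);
+ *   for (std::size_t n = 0; n < f.size(); ++n)
+ *   {
+ *     double arg = sign * 2.0 * 3.14159265358979323846 * double(k) * double(n) / double(f.size());
+ *     F += f[n] * std::complex<double>(std::cos(arg), std::sin(arg));  // f(n) * e^{i*arg}
+ *   }
+ *   return F;
+ * }
+ * \endcode
+ *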
+ * Serial implementation has o(n^2) complexity + */ +template +void direct(viennacl::matrix const & in, + viennacl::matrix & out, vcl_size_t size, + vcl_size_t stride, vcl_size_t batch_num, NumericT sign = NumericT(-1), + viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR) +{ + vcl_size_t row_num = in.internal_size1(); + vcl_size_t col_num = in.internal_size2() >> 1; + + vcl_size_t size_mat = row_num * col_num; + + std::vector > input_complex(size_mat); + std::vector > output(size_mat); + + NumericT const * data_A = detail::extract_raw_pointer(in); + NumericT * data_B = detail::extract_raw_pointer(out); + + viennacl::linalg::host_based::detail::fft::copy_to_complex_array(&input_complex[0], data_A, size_mat); + + fft_direct(&input_complex[0], &output[0], size, stride, batch_num, sign, data_order); + + viennacl::linalg::host_based::detail::fft::copy_to_vector(&output[0], data_B, size_mat); +} + +/* + * This function performs reorder of 1D input data. Indexes are sorted in bit-reversal order. + * Such reordering should be done before in-place FFT. + */ +template +void reorder(viennacl::vector& in, vcl_size_t size, vcl_size_t stride, + vcl_size_t bits_datasize, vcl_size_t batch_num, + viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR) +{ + std::vector > input(size * batch_num); + viennacl::linalg::host_based::detail::fft::copy_to_complex_array(&input[0], in, size * batch_num); +#ifdef VIENNACL_WITH_OPENMP + #pragma omp parallel for +#endif + for (long batch_id2 = 0; batch_id2 < long(batch_num); batch_id2++) + { + vcl_size_t batch_id = vcl_size_t(batch_id2); + for (vcl_size_t i = 0; i < size; i++) + { + vcl_size_t v = viennacl::linalg::host_based::detail::fft::get_reorder_num(i, bits_datasize); + if (i < v) + { + if (!data_order) + { + std::complex tmp = input[batch_id * stride + i]; // index + input[batch_id * stride + i] = input[batch_id * stride + v]; //index + input[batch_id * stride + v] = tmp; //index + } + else + { + std::complex tmp = input[i * stride + batch_id]; // index + input[i * stride + batch_id] = input[v * stride + batch_id]; //index + input[v * stride + batch_id] = tmp; //index + } + } + } + } + viennacl::linalg::host_based::detail::fft::copy_to_vector(&input[0], in, size * batch_num); +} + +/* + * This function performs reorder of 2D input data. Indexes are sorted in bit-reversal order. + * Such reordering should be done before in-place FFT. 
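+ *
+ * Example for size = 8 (bits_datasize = 3): index 1 = 001b swaps with 4 = 100b
+ * and 3 = 011b swaps with 6 = 110b, while 0, 2, 5 and 7 are fixed points; the
+ * i < v guard ensures each pair is exchanged exactly once. A loop-based helper
+ * equivalent to get_reorder_num above (illustrative sketch):
+ * \code
+ * // Reverse the lowest 'bits' bits of v.
+ * unsigned int bit_reverse(unsigned int v, unsigned int bits)
+ * {
+ *   unsigned int r = 0;
+ *   for (unsigned int b = 0; b < bits; ++b)
+ *   {
+ *     r = (r << 1) | (v & 1);  // append the next low bit of v to r
+ *     v >>= 1;
+ *   }
+ *   return r;
+ * }
+ * \endcode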
+ */ +template +void reorder(viennacl::matrix& in, + vcl_size_t size, vcl_size_t stride, vcl_size_t bits_datasize, vcl_size_t batch_num, + viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR) +{ + + NumericT * data = detail::extract_raw_pointer(in); + vcl_size_t row_num = in.internal_size1(); + vcl_size_t col_num = in.internal_size2() >> 1; + vcl_size_t size_mat = row_num * col_num; + + std::vector > input(size_mat); + + viennacl::linalg::host_based::detail::fft::copy_to_complex_array(&input[0], data, size_mat); + +#ifdef VIENNACL_WITH_OPENMP + #pragma omp parallel for +#endif + for (long batch_id2 = 0; batch_id2 < long(batch_num); batch_id2++) + { + vcl_size_t batch_id = vcl_size_t(batch_id2); + for (vcl_size_t i = 0; i < size; i++) + { + vcl_size_t v = viennacl::linalg::host_based::detail::fft::get_reorder_num(i, bits_datasize); + if (i < v) + { + if (!data_order) + { + std::complex tmp = input[batch_id * stride + i]; // index + input[batch_id * stride + i] = input[batch_id * stride + v]; //index + input[batch_id * stride + v] = tmp; //index + } else + { + std::complex tmp = input[i * stride + batch_id]; // index + input[i * stride + batch_id] = input[v * stride + batch_id]; //index + input[v * stride + batch_id] = tmp; //index + } + } + } + } + viennacl::linalg::host_based::detail::fft::copy_to_vector(&input[0], data, size_mat); +} + +/** + * @brief Radix-2 algorithm for computing Fourier transformation. + * Kernel for computing smaller amount of data + */ +template +void fft_radix2(std::complex * input_complex, vcl_size_t batch_num, + vcl_size_t bit_size, vcl_size_t size, vcl_size_t stride, NumericT sign, + viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR) +{ + NumericT const NUM_PI = NumericT(3.14159265358979323846); + + for (vcl_size_t step = 0; step < bit_size; step++) + { + vcl_size_t ss = 1 << step; + vcl_size_t half_size = size >> 1; + +#ifdef VIENNACL_WITH_OPENMP + #pragma omp parallel for +#endif + for (long batch_id2 = 0; batch_id2 < long(batch_num); batch_id2++) + { + vcl_size_t batch_id = vcl_size_t(batch_id2); + for (vcl_size_t tid = 0; tid < half_size; tid++) + { + vcl_size_t group = (tid & (ss - 1)); + vcl_size_t pos = ((tid >> step) << (step + 1)) + group; + std::complex in1; + std::complex in2; + vcl_size_t offset; + if (!data_order) + { + offset = batch_id * stride + pos; + in1 = input_complex[offset]; + in2 = input_complex[offset + ss]; + } + else + { + offset = pos * stride + batch_id; + in1 = input_complex[offset]; + in2 = input_complex[offset + ss * stride]; + } + NumericT arg = NumericT(group) * sign * NUM_PI / NumericT(ss); + NumericT sn = std::sin(arg); + NumericT cs = std::cos(arg); + std::complex ex(cs, sn); + std::complex tmp(in2.real() * ex.real() - in2.imag() * ex.imag(), + in2.real() * ex.imag() + in2.imag() * ex.real()); + if (!data_order) + input_complex[offset + ss] = in1 - tmp; + else + input_complex[offset + ss * stride] = in1 - tmp; + input_complex[offset] = in1 + tmp; + } + } + } + +} + +/** + * @brief Radix-2 algorithm for computing Fourier transformation. 
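+ *
+ * Each pass combines entry pairs via the butterfly (a, b) -> (a + w*b, a - w*b)
+ * with twiddle factor w = exp(sign * pi * i * group / ss). A minimal sketch of
+ * one butterfly step (illustrative only):
+ * \code
+ * #include <complex>
+ *
+ * // One radix-2 butterfly on the pair (a, b) with twiddle factor w.
+ * void butterfly(std::complex<double> & a, std::complex<double> & b, std::complex<double> w)
+ * {
+ *   std::complex<double> wb = w * b;
+ *   b = a - wb;
+ *   a = a + wb;
+ * }
+ * \endcode
+ *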
+ * Kernel for computing bigger amount of data + */ +template +void fft_radix2_local(std::complex * input_complex, + std::complex * lcl_input, vcl_size_t batch_num, vcl_size_t bit_size, + vcl_size_t size, vcl_size_t stride, NumericT sign, + viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR) +{ + NumericT const NUM_PI = NumericT(3.14159265358979323846); + + for (vcl_size_t batch_id = 0; batch_id < batch_num; batch_id++) + { +#ifdef VIENNACL_WITH_OPENMP + #pragma omp parallel for +#endif + for (long p2 = 0; p2 < long(size); p2 += 1) + { + vcl_size_t p = vcl_size_t(p2); + vcl_size_t v = viennacl::linalg::host_based::detail::fft::get_reorder_num(p, bit_size); + + if (!data_order) + lcl_input[v] = input_complex[batch_id * stride + p]; //index + else + lcl_input[v] = input_complex[p * stride + batch_id]; + } + + for (vcl_size_t s = 0; s < bit_size; s++) + { + vcl_size_t ss = 1 << s; +#ifdef VIENNACL_WITH_OPENMP + #pragma omp parallel for +#endif + for (long tid2 = 0; tid2 < long(size)/2; tid2++) + { + vcl_size_t tid = vcl_size_t(tid2); + vcl_size_t group = (tid & (ss - 1)); + vcl_size_t pos = ((tid >> s) << (s + 1)) + group; + + std::complex in1 = lcl_input[pos]; + std::complex in2 = lcl_input[pos + ss]; + + NumericT arg = NumericT(group) * sign * NUM_PI / NumericT(ss); + + NumericT sn = std::sin(arg); + NumericT cs = std::cos(arg); + std::complex ex(cs, sn); + + std::complex tmp(in2.real() * ex.real() - in2.imag() * ex.imag(), + in2.real() * ex.imag() + in2.imag() * ex.real()); + + lcl_input[pos + ss] = in1 - tmp; + lcl_input[pos] = in1 + tmp; + } + + } +#ifdef VIENNACL_WITH_OPENMP + #pragma omp parallel for +#endif + //copy local array back to global memory + for (long p2 = 0; p2 < long(size); p2 += 1) + { + vcl_size_t p = vcl_size_t(p2); + if (!data_order) + input_complex[batch_id * stride + p] = lcl_input[p]; + else + input_complex[p * stride + batch_id] = lcl_input[p]; + + } + + } + +} + +/** + * @brief Radix-2 1D algorithm for computing Fourier transformation. + * + * Works only on power-of-two sizes of data. + * Serial implementation has o(n * lg n) complexity. 
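+ *
+ * Hedged usage sketch (the vector stores interleaved real/imaginary parts, as
+ * assumed by the copy helpers above; the parameter choices are illustrative):
+ * \code
+ * viennacl::vector<float> v(2 * 8);                  // 8 complex values, interleaved
+ * // ... fill v with re0, im0, re1, im1, ...
+ * viennacl::linalg::host_based::radix2(v, 8, 1, 1);  // in-place forward FFT, one batch
+ * \endcode
+ *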
+ * This is a Cooley-Tukey algorithm + */ +template +void radix2(viennacl::vector& in, vcl_size_t size, vcl_size_t stride, + vcl_size_t batch_num, NumericT sign = NumericT(-1), + viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR) +{ + + vcl_size_t bit_size = viennacl::linalg::host_based::detail::fft::num_bits(size); + + std::vector > input_complex(size * batch_num); + std::vector > lcl_input(size * batch_num); + viennacl::linalg::host_based::detail::fft::copy_to_complex_array(&input_complex[0], in, size * batch_num); + + if (size <= viennacl::linalg::host_based::detail::fft::MAX_LOCAL_POINTS_NUM) + { + viennacl::linalg::host_based::fft_radix2_local(&input_complex[0], &lcl_input[0], batch_num, bit_size, size, stride, sign, data_order); + } + else + { + viennacl::linalg::host_based::reorder(in, size, stride, bit_size, batch_num, data_order); + viennacl::linalg::host_based::detail::fft::copy_to_complex_array(&input_complex[0], in, size * batch_num); + viennacl::linalg::host_based::fft_radix2(&input_complex[0], batch_num, bit_size, size, stride, sign, data_order); + } + + viennacl::linalg::host_based::detail::fft::copy_to_vector(&input_complex[0], in, size * batch_num); +} + +/** + * @brief Radix-2 2D algorithm for computing Fourier transformation. + * + * Works only on power-of-two sizes of data. + * Serial implementation has o(n * lg n) complexity. + * This is a Cooley-Tukey algorithm + */ +template +void radix2(viennacl::matrix& in, vcl_size_t size, + vcl_size_t stride, vcl_size_t batch_num, NumericT sign = NumericT(-1), + viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR) +{ + + vcl_size_t bit_size = viennacl::linalg::host_based::detail::fft::num_bits(size); + + NumericT * data = detail::extract_raw_pointer(in); + + vcl_size_t row_num = in.internal_size1(); + vcl_size_t col_num = in.internal_size2() >> 1; + vcl_size_t size_mat = row_num * col_num; + + std::vector > input_complex(size_mat); + + viennacl::linalg::host_based::detail::fft::copy_to_complex_array(&input_complex[0], data, size_mat); + if (size <= viennacl::linalg::host_based::detail::fft::MAX_LOCAL_POINTS_NUM) + { + //std::cout< > lcl_input(size_mat); + viennacl::linalg::host_based::fft_radix2_local(&input_complex[0], &lcl_input[0], batch_num, bit_size, size, stride, sign, data_order); + } + else + { + viennacl::linalg::host_based::reorder(in, size, stride, bit_size, batch_num, data_order); + viennacl::linalg::host_based::detail::fft::copy_to_complex_array(&input_complex[0], data, size_mat); + viennacl::linalg::host_based::fft_radix2(&input_complex[0], batch_num, bit_size, size, stride, sign, data_order); + } + + viennacl::linalg::host_based::detail::fft::copy_to_vector(&input_complex[0], data, size_mat); + +} + +/** + * @brief Bluestein's algorithm for computing Fourier transformation. + * + * Currently, Works only for sizes of input data which less than 2^16. + * Uses a lot of additional memory, but should be fast for any size of data. 
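+ *
+ * Bluestein's trick uses the identity n*k = (n^2 + k^2 - (k - n)^2) / 2 to
+ * rewrite the DFT as a convolution with a "chirp" sequence, which is then
+ * evaluated with zero-padded power-of-two FFTs (see convolve_i below).
+ * Worked instance: n = 3, k = 5 gives n*k = 15 and (9 + 25 - 4) / 2 = 15.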
+ * Serial implementation has something about o(n * lg n) complexity + */ +template +void bluestein(viennacl::vector& in, viennacl::vector& out, vcl_size_t /*batch_num*/) +{ + + vcl_size_t size = in.size() >> 1; + vcl_size_t ext_size = viennacl::linalg::host_based::detail::fft::next_power_2(2 * size - 1); + + viennacl::vector A(ext_size << 1); + viennacl::vector B(ext_size << 1); + viennacl::vector Z(ext_size << 1); + + std::vector > input_complex(size); + std::vector > output_complex(size); + + std::vector > A_complex(ext_size); + std::vector > B_complex(ext_size); + std::vector > Z_complex(ext_size); + + viennacl::linalg::host_based::detail::fft::copy_to_complex_array(&input_complex[0], in, size); +#ifdef VIENNACL_WITH_OPENMP + #pragma omp parallel for +#endif + for (long i2 = 0; i2 < long(ext_size); i2++) + { + vcl_size_t i = vcl_size_t(i2); + A_complex[i] = 0; + B_complex[i] = 0; + } + + vcl_size_t double_size = size << 1; + + NumericT const NUM_PI = NumericT(3.14159265358979323846); +#ifdef VIENNACL_WITH_OPENMP + #pragma omp parallel for +#endif + for (long i2 = 0; i2 < long(size); i2++) + { + vcl_size_t i = vcl_size_t(i2); + vcl_size_t rm = i * i % (double_size); + NumericT angle = NumericT(rm) / NumericT(size) * NumericT(NUM_PI); + + NumericT sn_a = std::sin(-angle); + NumericT cs_a = std::cos(-angle); + + std::complex a_i(cs_a, sn_a); + std::complex b_i(cs_a, -sn_a); + + A_complex[i] = std::complex(input_complex[i].real() * a_i.real() - input_complex[i].imag() * a_i.imag(), + input_complex[i].real() * a_i.imag() + input_complex[i].imag() * a_i.real()); + B_complex[i] = b_i; + + // very bad instruction, to be fixed + if (i) + B_complex[ext_size - i] = b_i; + } + + viennacl::linalg::host_based::detail::fft::copy_to_vector(&input_complex[0], in, size); + viennacl::linalg::host_based::detail::fft::copy_to_vector(&A_complex[0], A, ext_size); + viennacl::linalg::host_based::detail::fft::copy_to_vector(&B_complex[0], B, ext_size); + + viennacl::linalg::convolve_i(A, B, Z); + + viennacl::linalg::host_based::detail::fft::copy_to_complex_array(&Z_complex[0], Z, ext_size); + +#ifdef VIENNACL_WITH_OPENMP + #pragma omp parallel for +#endif + for (long i2 = 0; i2 < long(size); i2++) + { + vcl_size_t i = vcl_size_t(i2); + vcl_size_t rm = i * i % (double_size); + NumericT angle = NumericT(rm) / NumericT(size) * NumericT(-NUM_PI); + NumericT sn_a = std::sin(angle); + NumericT cs_a = std::cos(angle); + std::complex b_i(cs_a, sn_a); + output_complex[i] = std::complex(Z_complex[i].real() * b_i.real() - Z_complex[i].imag() * b_i.imag(), + Z_complex[i].real() * b_i.imag() + Z_complex[i].imag() * b_i.real()); + } + viennacl::linalg::host_based::detail::fft::copy_to_vector(&output_complex[0], out, size); + +} + +/** + * @brief Normalize vector with his own size + */ +template +void normalize(viennacl::vector & input) +{ + vcl_size_t size = input.size() >> 1; + NumericT norm_factor = static_cast(size); + for (vcl_size_t i = 0; i < size * 2; i++) + input[i] /= norm_factor; + +} + +/** + * @brief Complex multiplikation of two vectors + */ +template +void multiply_complex(viennacl::vector const & input1, + viennacl::vector const & input2, + viennacl::vector & output) +{ + vcl_size_t size = input1.size() >> 1; + + std::vector > input1_complex(size); + std::vector > input2_complex(size); + std::vector > output_complex(size); + viennacl::linalg::host_based::detail::fft::copy_to_complex_array(&input1_complex[0], input1, size); + viennacl::linalg::host_based::detail::fft::copy_to_complex_array(&input2_complex[0], 
input2, size); + +#ifdef VIENNACL_WITH_OPENMP + #pragma omp parallel for +#endif + for (long i2 = 0; i2 < long(size); i2++) + { + vcl_size_t i = vcl_size_t(i2); + std::complex in1 = input1_complex[i]; + std::complex in2 = input2_complex[i]; + output_complex[i] = std::complex(in1.real() * in2.real() - in1.imag() * in2.imag(), + in1.real() * in2.imag() + in1.imag() * in2.real()); + } + viennacl::linalg::host_based::detail::fft::copy_to_vector(&output_complex[0], output, size); + +} +/** + * @brief Inplace transpose of matrix + */ +template +void transpose(viennacl::matrix & input) +{ + vcl_size_t row_num = input.internal_size1() / 2; + vcl_size_t col_num = input.internal_size2() / 2; + + vcl_size_t size = row_num * col_num; + + NumericT * data = detail::extract_raw_pointer(input); + + std::vector > input_complex(size); + + viennacl::linalg::host_based::detail::fft::copy_to_complex_array(&input_complex[0], data, size); +#ifdef VIENNACL_WITH_OPENMP + #pragma omp parallel for +#endif + for (long i2 = 0; i2 < long(size); i2++) + { + vcl_size_t i = vcl_size_t(i2); + vcl_size_t row = i / col_num; + vcl_size_t col = i - row * col_num; + vcl_size_t new_pos = col * row_num + row; + + if (i < new_pos) + { + std::complex val = input_complex[i]; + input_complex[i] = input_complex[new_pos]; + input_complex[new_pos] = val; + } + } + viennacl::linalg::host_based::detail::fft::copy_to_vector(&input_complex[0], data, size); + +} + +/** + * @brief Transpose matrix + */ +template +void transpose(viennacl::matrix const & input, + viennacl::matrix & output) +{ + + vcl_size_t row_num = input.internal_size1() / 2; + vcl_size_t col_num = input.internal_size2() / 2; + vcl_size_t size = row_num * col_num; + + NumericT const * data_A = detail::extract_raw_pointer(input); + NumericT * data_B = detail::extract_raw_pointer(output); + + std::vector > input_complex(size); + viennacl::linalg::host_based::detail::fft::copy_to_complex_array(&input_complex[0], data_A, size); + + std::vector > output_complex(size); +#ifdef VIENNACL_WITH_OPENMP + #pragma omp parallel for +#endif + for (long i2 = 0; i2 < long(size); i2++) + { + vcl_size_t i = vcl_size_t(i2); + vcl_size_t row = i / col_num; + vcl_size_t col = i % col_num; + vcl_size_t new_pos = col * row_num + row; + output_complex[new_pos] = input_complex[i]; + } + viennacl::linalg::host_based::detail::fft::copy_to_vector(&output_complex[0], data_B, size); +} + +/** + * @brief Create complex vector from real vector (even elements(2*k) = real part, odd elements(2*k+1) = imaginary part) + */ +template +void real_to_complex(viennacl::vector_base const & in, + viennacl::vector_base & out, vcl_size_t size) +{ + NumericT const * data_in = detail::extract_raw_pointer(in); + NumericT * data_out = detail::extract_raw_pointer(out); + +#ifdef VIENNACL_WITH_OPENMP + #pragma omp parallel for if (size > VIENNACL_OPENMP_VECTOR_MIN_SIZE) +#endif + for (long i2 = 0; i2 < long(size); i2++) + { + vcl_size_t i = static_cast(i2); + data_out[2*i ] = data_in[i]; + data_out[2*i+1] = NumericT(0); + } +} + +/** + * @brief Create real vector from complex vector (even elements(2*k) = real part, odd elements(2*k+1) = imaginary part) + */ +template +void complex_to_real(viennacl::vector_base const & in, + viennacl::vector_base & out, vcl_size_t size) +{ + NumericT const * data_in = detail::extract_raw_pointer(in); + NumericT * data_out = detail::extract_raw_pointer(out); + +#ifdef VIENNACL_WITH_OPENMP +#pragma omp parallel for if (size > VIENNACL_OPENMP_VECTOR_MIN_SIZE) +#endif + for (long i = 0; i < 
long(size); i++) + data_out[i] = data_in[2*i]; +} + +/** + * @brief Reverse vector to opposite order and save it in input vector + */ +template +void reverse(viennacl::vector_base & in) +{ + vcl_size_t size = in.size(); + +#ifdef VIENNACL_WITH_OPENMP + #pragma omp parallel for if (size > VIENNACL_OPENMP_VECTOR_MIN_SIZE) +#endif + for (long i2 = 0; i2 < long(size); i2++) + { + vcl_size_t i = vcl_size_t(i2); + NumericT val1 = in[i]; + NumericT val2 = in[size - i - 1]; + in[i] = val2; + in[size - i - 1] = val1; + } +} + +} //namespace host_based +} //namespace linalg +} //namespace viennacl + +#endif /* FFT_OPERATIONS_HPP_ */ http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/ilu_operations.hpp ---------------------------------------------------------------------- diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/ilu_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/ilu_operations.hpp new file mode 100644 index 0000000..62c885a --- /dev/null +++ b/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/ilu_operations.hpp @@ -0,0 +1,672 @@ +#ifndef VIENNACL_LINALG_HOST_BASED_ILU_OPERATIONS_HPP_ +#define VIENNACL_LINALG_HOST_BASED_ILU_OPERATIONS_HPP_ + +/* ========================================================================= + Copyright (c) 2010-2016, Institute for Microelectronics, + Institute for Analysis and Scientific Computing, + TU Wien. + Portions of this software are copyright by UChicago Argonne, LLC. + + ----------------- + ViennaCL - The Vienna Computing Library + ----------------- + + Project Head: Karl Rupp rupp@iue.tuwien.ac.at + + (A list of authors and contributors can be found in the PDF manual) + + License: MIT (X11), see file LICENSE in the base directory +============================================================================= */ + +/** @file viennacl/linalg/host_based/ilu_operations.hpp + @brief Implementations of specialized routines for the Chow-Patel parallel ILU preconditioner using the host (OpenMP) +*/ + +#include +#include //for std::max and std::min + +#include "viennacl/forwards.h" +#include "viennacl/scalar.hpp" +#include "viennacl/tools/tools.hpp" +#include "viennacl/meta/predicate.hpp" +#include "viennacl/meta/enable_if.hpp" +#include "viennacl/traits/size.hpp" +#include "viennacl/traits/start.hpp" +#include "viennacl/linalg/host_based/common.hpp" +#include "viennacl/linalg/vector_operations.hpp" +#include "viennacl/traits/stride.hpp" + + +// Minimum vector size for using OpenMP on vector operations: +#ifndef VIENNACL_OPENMP_ILU_MIN_SIZE + #define VIENNACL_OPENMP_ILU_MIN_SIZE 5000 +#endif + +namespace viennacl +{ +namespace linalg +{ +namespace host_based +{ + +template +void extract_L(compressed_matrix const & A, + compressed_matrix & L) +{ + // L is known to have correct dimensions + + unsigned int const *A_row_buffer = detail::extract_raw_pointer(A.handle1()); + unsigned int const *A_col_buffer = detail::extract_raw_pointer(A.handle2()); + NumericT const *A_elements = detail::extract_raw_pointer(A.handle()); + + unsigned int *L_row_buffer = detail::extract_raw_pointer(L.handle1()); + + // + // Step 1: Count elements in L + // +#ifdef VIENNACL_WITH_OPENMP + #pragma omp parallel for if (A.size1() > VIENNACL_OPENMP_ILU_MIN_SIZE) +#endif + for (long row = 0; row < static_cast(A.size1()); ++row) + { + unsigned int col_begin = A_row_buffer[row]; + unsigned int col_end = A_row_buffer[row+1]; + + for (unsigned int j = col_begin; j < col_end; ++j) + { + 
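+      // count entries on or below the diagonal; only those belong to L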
unsigned int col = A_col_buffer[j]; + if (long(col) <= row) + ++L_row_buffer[row]; + } + } + + // + // Step 2: Exclusive scan on row_buffer arrays to get correct starting indices + // + viennacl::vector_base wrapped_L_row_buffer(L.handle1(), L.size1() + 1, 0, 1); + viennacl::linalg::exclusive_scan(wrapped_L_row_buffer); + L.reserve(wrapped_L_row_buffer[L.size1()], false); + + unsigned int *L_col_buffer = detail::extract_raw_pointer(L.handle2()); + NumericT *L_elements = detail::extract_raw_pointer(L.handle()); + + // + // Step 3: Write entries: + // +#ifdef VIENNACL_WITH_OPENMP + #pragma omp parallel for if (A.size1() > VIENNACL_OPENMP_ILU_MIN_SIZE) +#endif + for (long row = 0; row < static_cast(A.size1()); ++row) + { + unsigned int col_begin = A_row_buffer[row]; + unsigned int col_end = A_row_buffer[row+1]; + + unsigned int index_L = L_row_buffer[row]; + for (unsigned int j = col_begin; j < col_end; ++j) + { + unsigned int col = A_col_buffer[j]; + NumericT value = A_elements[j]; + + if (long(col) <= row) + { + L_col_buffer[index_L] = col; + L_elements[index_L] = value; + ++index_L; + } + } + } + +} // extract_L + + +/** @brief Scales the values extracted from A such that A' = DAD has unit diagonal. Updates values from A in L and U accordingly. */ +template +void icc_scale(compressed_matrix const & A, + compressed_matrix & L) +{ + viennacl::vector D(A.size1(), viennacl::traits::context(A)); + + unsigned int const *A_row_buffer = detail::extract_raw_pointer(A.handle1()); + unsigned int const *A_col_buffer = detail::extract_raw_pointer(A.handle2()); + NumericT const *A_elements = detail::extract_raw_pointer(A.handle()); + + NumericT *D_elements = detail::extract_raw_pointer(D.handle()); + + // + // Step 1: Determine D + // +#ifdef VIENNACL_WITH_OPENMP + #pragma omp parallel for if (A.size1() > VIENNACL_OPENMP_ILU_MIN_SIZE) +#endif + for (long row = 0; row < static_cast(A.size1()); ++row) + { + unsigned int col_begin = A_row_buffer[row]; + unsigned int col_end = A_row_buffer[row+1]; + + for (unsigned int j = col_begin; j < col_end; ++j) + { + unsigned int col = A_col_buffer[j]; + if (row == col) + { + D_elements[row] = NumericT(1) / std::sqrt(std::fabs(A_elements[j])); + break; + } + } + } + + // + // Step 2: Scale values in L: + // + unsigned int const *L_row_buffer = detail::extract_raw_pointer(L.handle1()); + unsigned int const *L_col_buffer = detail::extract_raw_pointer(L.handle2()); + NumericT *L_elements = detail::extract_raw_pointer(L.handle()); + +#ifdef VIENNACL_WITH_OPENMP + #pragma omp parallel for if (A.size1() > VIENNACL_OPENMP_ILU_MIN_SIZE) +#endif + for (long row = 0; row < static_cast(A.size1()); ++row) + { + unsigned int col_begin = L_row_buffer[row]; + unsigned int col_end = L_row_buffer[row+1]; + + NumericT D_row = D_elements[row]; + + for (unsigned int j = col_begin; j < col_end; ++j) + L_elements[j] *= D_row * D_elements[L_col_buffer[j]]; + } + + L.generate_row_block_information(); +} + + + +/** @brief Performs one nonlinear relaxation step in the Chow-Patel-ICC using OpenMP (cf. 
Algorithm 3 in paper, but for L rather than U) */ +template +void icc_chow_patel_sweep(compressed_matrix & L, + vector & aij_L) +{ + unsigned int const *L_row_buffer = detail::extract_raw_pointer(L.handle1()); + unsigned int const *L_col_buffer = detail::extract_raw_pointer(L.handle2()); + NumericT *L_elements = detail::extract_raw_pointer(L.handle()); + + NumericT *aij_ptr = detail::extract_raw_pointer(aij_L.handle()); + + // temporary workspace + NumericT *L_backup = (NumericT *)malloc(sizeof(NumericT) * L.nnz()); + + // backup: +#ifdef VIENNACL_WITH_OPENMP + #pragma omp parallel for if (L.nnz() > VIENNACL_OPENMP_ILU_MIN_SIZE) +#endif + for (long i = 0; i < static_cast(L.nnz()); ++i) + L_backup[i] = L_elements[i]; + + + // sweep +#ifdef VIENNACL_WITH_OPENMP + #pragma omp parallel for if (L.size1() > VIENNACL_OPENMP_ILU_MIN_SIZE) +#endif + for (long row = 0; row < static_cast(L.size1()); ++row) + { + // + // update L: + // + unsigned int row_Li_start = L_row_buffer[row]; + unsigned int row_Li_end = L_row_buffer[row + 1]; + + for (unsigned int i = row_Li_start; i < row_Li_end; ++i) + { + unsigned int col = L_col_buffer[i]; + + unsigned int row_Lj_start = L_row_buffer[col]; + unsigned int row_Lj_end = L_row_buffer[col+1]; + + // compute \sum_{k=1}^{j-1} l_ik l_jk + unsigned int index_Lj = row_Lj_start; + unsigned int col_Lj = L_col_buffer[index_Lj]; + NumericT s = aij_ptr[i]; + for (unsigned int index_Li = row_Li_start; index_Li < i; ++index_Li) + { + unsigned int col_Li = L_col_buffer[index_Li]; + + // find element in row j + while (col_Lj < col_Li) + { + ++index_Lj; + col_Lj = L_col_buffer[index_Lj]; + } + + if (col_Lj == col_Li) + s -= L_backup[index_Li] * L_backup[index_Lj]; + } + + if (row != col) + L_elements[i] = s / L_backup[row_Lj_end - 1]; // diagonal element is last in row! 
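+        // off-diagonal case: l_ij = (a_ij - \sum_k l_ik l_jk) / l_jj;
+        // the else-branch below computes the diagonal, l_ii = sqrt(a_ii - \sum_k l_ik^2).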
+ else + L_elements[i] = std::sqrt(s); + } + } + + free(L_backup); +} + + + +//////////////////////// ILU //////////////////////// + +template +void extract_LU(compressed_matrix const & A, + compressed_matrix & L, + compressed_matrix & U) +{ + // L and U are known to have correct dimensions + + unsigned int const *A_row_buffer = detail::extract_raw_pointer(A.handle1()); + unsigned int const *A_col_buffer = detail::extract_raw_pointer(A.handle2()); + NumericT const *A_elements = detail::extract_raw_pointer(A.handle()); + + unsigned int *L_row_buffer = detail::extract_raw_pointer(L.handle1()); + unsigned int *U_row_buffer = detail::extract_raw_pointer(U.handle1()); + + // + // Step 1: Count elements in L and U + // +#ifdef VIENNACL_WITH_OPENMP + #pragma omp parallel for if (A.size1() > VIENNACL_OPENMP_ILU_MIN_SIZE) +#endif + for (long row = 0; row < static_cast(A.size1()); ++row) + { + unsigned int col_begin = A_row_buffer[row]; + unsigned int col_end = A_row_buffer[row+1]; + + for (unsigned int j = col_begin; j < col_end; ++j) + { + unsigned int col = A_col_buffer[j]; + if (long(col) <= row) + ++L_row_buffer[row]; + if (long(col) >= row) + ++U_row_buffer[row]; + } + } + + // + // Step 2: Exclusive scan on row_buffer arrays to get correct starting indices + // + viennacl::vector_base wrapped_L_row_buffer(L.handle1(), L.size1() + 1, 0, 1); + viennacl::linalg::exclusive_scan(wrapped_L_row_buffer); + L.reserve(wrapped_L_row_buffer[L.size1()], false); + + viennacl::vector_base wrapped_U_row_buffer(U.handle1(), U.size1() + 1, 0, 1); + viennacl::linalg::exclusive_scan(wrapped_U_row_buffer); + U.reserve(wrapped_U_row_buffer[U.size1()], false); + + unsigned int *L_col_buffer = detail::extract_raw_pointer(L.handle2()); + NumericT *L_elements = detail::extract_raw_pointer(L.handle()); + + unsigned int *U_col_buffer = detail::extract_raw_pointer(U.handle2()); + NumericT *U_elements = detail::extract_raw_pointer(U.handle()); + + // + // Step 3: Write entries: + // +#ifdef VIENNACL_WITH_OPENMP + #pragma omp parallel for if (A.size1() > VIENNACL_OPENMP_ILU_MIN_SIZE) +#endif + for (long row = 0; row < static_cast(A.size1()); ++row) + { + unsigned int col_begin = A_row_buffer[row]; + unsigned int col_end = A_row_buffer[row+1]; + + unsigned int index_L = L_row_buffer[row]; + unsigned int index_U = U_row_buffer[row]; + for (unsigned int j = col_begin; j < col_end; ++j) + { + unsigned int col = A_col_buffer[j]; + NumericT value = A_elements[j]; + + if (long(col) <= row) + { + L_col_buffer[index_L] = col; + L_elements[index_L] = value; + ++index_L; + } + + if (long(col) >= row) + { + U_col_buffer[index_U] = col; + U_elements[index_U] = value; + ++index_U; + } + } + } + +} // extract_LU + + + +/** @brief Scales the values extracted from A such that A' = DAD has unit diagonal. Updates values from A in L and U accordingly. 
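+
+    With D_ii = 1 / sqrt(|a_ii|), the scaled matrix A' = D*A*D satisfies
+    |a'_ii| = 1. Worked instance: a_ii = 4 gives D_ii = 1/2 and
+    a'_ii = (1/2) * 4 * (1/2) = 1.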
*/ +template +void ilu_scale(compressed_matrix const & A, + compressed_matrix & L, + compressed_matrix & U) +{ + viennacl::vector D(A.size1(), viennacl::traits::context(A)); + + unsigned int const *A_row_buffer = detail::extract_raw_pointer(A.handle1()); + unsigned int const *A_col_buffer = detail::extract_raw_pointer(A.handle2()); + NumericT const *A_elements = detail::extract_raw_pointer(A.handle()); + + NumericT *D_elements = detail::extract_raw_pointer(D.handle()); + + // + // Step 1: Determine D + // +#ifdef VIENNACL_WITH_OPENMP + #pragma omp parallel for if (A.size1() > VIENNACL_OPENMP_ILU_MIN_SIZE) +#endif + for (long row = 0; row < static_cast(A.size1()); ++row) + { + unsigned int col_begin = A_row_buffer[row]; + unsigned int col_end = A_row_buffer[row+1]; + + for (unsigned int j = col_begin; j < col_end; ++j) + { + unsigned int col = A_col_buffer[j]; + if (row == col) + { + D_elements[row] = NumericT(1) / std::sqrt(std::fabs(A_elements[j])); + break; + } + } + } + + // + // Step 2: Scale values in L: + // + unsigned int const *L_row_buffer = detail::extract_raw_pointer(L.handle1()); + unsigned int const *L_col_buffer = detail::extract_raw_pointer(L.handle2()); + NumericT *L_elements = detail::extract_raw_pointer(L.handle()); + +#ifdef VIENNACL_WITH_OPENMP + #pragma omp parallel for if (A.size1() > VIENNACL_OPENMP_ILU_MIN_SIZE) +#endif + for (long row = 0; row < static_cast(A.size1()); ++row) + { + unsigned int col_begin = L_row_buffer[row]; + unsigned int col_end = L_row_buffer[row+1]; + + NumericT D_row = D_elements[row]; + + for (unsigned int j = col_begin; j < col_end; ++j) + L_elements[j] *= D_row * D_elements[L_col_buffer[j]]; + } + + // + // Step 3: Scale values in U: + // + unsigned int const *U_row_buffer = detail::extract_raw_pointer(U.handle1()); + unsigned int const *U_col_buffer = detail::extract_raw_pointer(U.handle2()); + NumericT *U_elements = detail::extract_raw_pointer(U.handle()); + +#ifdef VIENNACL_WITH_OPENMP + #pragma omp parallel for if (A.size1() > VIENNACL_OPENMP_ILU_MIN_SIZE) +#endif + for (long row = 0; row < static_cast(A.size1()); ++row) + { + unsigned int col_begin = U_row_buffer[row]; + unsigned int col_end = U_row_buffer[row+1]; + + NumericT D_row = D_elements[row]; + + for (unsigned int j = col_begin; j < col_end; ++j) + U_elements[j] *= D_row * D_elements[U_col_buffer[j]]; + } + + L.generate_row_block_information(); + // Note: block information for U will be generated after transposition + +} + +template +void ilu_transpose(compressed_matrix const & A, + compressed_matrix & B) +{ + NumericT const * A_elements = viennacl::linalg::host_based::detail::extract_raw_pointer(A.handle()); + unsigned int const * A_row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer(A.handle1()); + unsigned int const * A_col_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer(A.handle2()); + + // initialize datastructures for B: + B = compressed_matrix(A.size2(), A.size1(), A.nnz(), viennacl::traits::context(A)); + + NumericT * B_elements = viennacl::linalg::host_based::detail::extract_raw_pointer(B.handle()); + unsigned int * B_row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer(B.handle1()); + unsigned int * B_col_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer(B.handle2()); + + // prepare uninitialized B_row_buffer: + for (std::size_t i = 0; i < B.size1(); ++i) + B_row_buffer[i] = 0; + + // + // Stage 1: Compute pattern for B + // + for (std::size_t row = 0; row < A.size1(); ++row) + { + unsigned int 
row_start = A_row_buffer[row]; + unsigned int row_stop = A_row_buffer[row+1]; + + for (unsigned int nnz_index = row_start; nnz_index < row_stop; ++nnz_index) + B_row_buffer[A_col_buffer[nnz_index]] += 1; + } + + // Bring row-start array in place using exclusive-scan: + unsigned int offset = B_row_buffer[0]; + B_row_buffer[0] = 0; + for (std::size_t row = 1; row < B.size1(); ++row) + { + unsigned int tmp = B_row_buffer[row]; + B_row_buffer[row] = offset; + offset += tmp; + } + B_row_buffer[B.size1()] = offset; + + // + // Stage 2: Fill with data + // + + std::vector B_row_offsets(B.size1()); //number of elements already written per row + + for (unsigned int row = 0; row < static_cast(A.size1()); ++row) + { + //std::cout << "Row " << row << ": "; + unsigned int row_start = A_row_buffer[row]; + unsigned int row_stop = A_row_buffer[row+1]; + + for (unsigned int nnz_index = row_start; nnz_index < row_stop; ++nnz_index) + { + unsigned int col_in_A = A_col_buffer[nnz_index]; + unsigned int B_nnz_index = B_row_buffer[col_in_A] + B_row_offsets[col_in_A]; + B_col_buffer[B_nnz_index] = row; + B_elements[B_nnz_index] = A_elements[nnz_index]; + ++B_row_offsets[col_in_A]; + //B_temp.at(A_col_buffer[nnz_index])[row] = A_elements[nnz_index]; + } + } + + // Step 3: Make datastructure consistent (row blocks!) + B.generate_row_block_information(); +} + + + +/** @brief Performs one nonlinear relaxation step in the Chow-Patel-ILU using OpenMP (cf. Algorithm 2 in paper) */ +template +void ilu_chow_patel_sweep(compressed_matrix & L, + vector const & aij_L, + compressed_matrix & U_trans, + vector const & aij_U_trans) +{ + unsigned int const *L_row_buffer = detail::extract_raw_pointer(L.handle1()); + unsigned int const *L_col_buffer = detail::extract_raw_pointer(L.handle2()); + NumericT *L_elements = detail::extract_raw_pointer(L.handle()); + + NumericT const *aij_L_ptr = detail::extract_raw_pointer(aij_L.handle()); + + unsigned int const *U_row_buffer = detail::extract_raw_pointer(U_trans.handle1()); + unsigned int const *U_col_buffer = detail::extract_raw_pointer(U_trans.handle2()); + NumericT *U_elements = detail::extract_raw_pointer(U_trans.handle()); + + NumericT const *aij_U_trans_ptr = detail::extract_raw_pointer(aij_U_trans.handle()); + + // temporary workspace + NumericT *L_backup = new NumericT[L.nnz()]; + NumericT *U_backup = new NumericT[U_trans.nnz()]; + + // backup: +#ifdef VIENNACL_WITH_OPENMP + #pragma omp parallel for if (L.nnz() > VIENNACL_OPENMP_ILU_MIN_SIZE) +#endif + for (long i = 0; i < static_cast(L.nnz()); ++i) + L_backup[i] = L_elements[i]; + +#ifdef VIENNACL_WITH_OPENMP + #pragma omp parallel for if (U_trans.nnz() > VIENNACL_OPENMP_ILU_MIN_SIZE) +#endif + for (long i = 0; i < static_cast(U_trans.nnz()); ++i) + U_backup[i] = U_elements[i]; + + // sweep +#ifdef VIENNACL_WITH_OPENMP + #pragma omp parallel for if (L.size1() > VIENNACL_OPENMP_ILU_MIN_SIZE) +#endif + for (long row = 0; row < static_cast(L.size1()); ++row) + { + // + // update L: + // + unsigned int row_L_start = L_row_buffer[row]; + unsigned int row_L_end = L_row_buffer[row + 1]; + + for (unsigned int j = row_L_start; j < row_L_end; ++j) + { + unsigned int col = L_col_buffer[j]; + + if (col == row) + continue; + + unsigned int row_U_start = U_row_buffer[col]; + unsigned int row_U_end = U_row_buffer[col + 1]; + + // compute \sum_{k=1}^{j-1} l_ik u_kj + unsigned int index_U = row_U_start; + unsigned int col_U = (index_U < row_U_end) ? 
U_col_buffer[index_U] : static_cast(U_trans.size2()); + NumericT sum = 0; + for (unsigned int k = row_L_start; k < j; ++k) + { + unsigned int col_L = L_col_buffer[k]; + + // find element in U + while (col_U < col_L) + { + ++index_U; + col_U = U_col_buffer[index_U]; + } + + if (col_U == col_L) + sum += L_backup[k] * U_backup[index_U]; + } + + // update l_ij: + assert(U_col_buffer[row_U_end - 1] == col && bool("Accessing U element which is not a diagonal element!")); + L_elements[j] = (aij_L_ptr[j] - sum) / U_backup[row_U_end - 1]; // diagonal element is last entry in U + } + + + // + // update U: + // + unsigned int row_U_start = U_row_buffer[row]; + unsigned int row_U_end = U_row_buffer[row + 1]; + for (unsigned int j = row_U_start; j < row_U_end; ++j) + { + unsigned int col = U_col_buffer[j]; + + row_L_start = L_row_buffer[col]; + row_L_end = L_row_buffer[col + 1]; + + // compute \sum_{k=1}^{j-1} l_ik u_kj + unsigned int index_L = row_L_start; + unsigned int col_L = (index_L < row_L_end) ? L_col_buffer[index_L] : static_cast(L.size1()); + NumericT sum = 0; + for (unsigned int k = row_U_start; k < j; ++k) + { + unsigned int col_U = U_col_buffer[k]; + + // find element in L + while (col_L < col_U) + { + ++index_L; + col_L = L_col_buffer[index_L]; + } + + if (col_U == col_L) + sum += L_backup[index_L] * U_backup[k]; + } + + // update u_ij: + U_elements[j] = aij_U_trans_ptr[j] - sum; + } + } + + delete[] L_backup; + delete[] U_backup; +} + + +template +void ilu_form_neumann_matrix(compressed_matrix & R, + vector & diag_R) +{ + unsigned int *R_row_buffer = detail::extract_raw_pointer(R.handle1()); + unsigned int *R_col_buffer = detail::extract_raw_pointer(R.handle2()); + NumericT *R_elements = detail::extract_raw_pointer(R.handle()); + + NumericT *diag_R_ptr = detail::extract_raw_pointer(diag_R.handle()); + +#ifdef VIENNACL_WITH_OPENMP + #pragma omp parallel for if (R.size1() > VIENNACL_OPENMP_ILU_MIN_SIZE) +#endif + for (long row = 0; row < static_cast(R.size1()); ++row) + { + unsigned int col_begin = R_row_buffer[row]; + unsigned int col_end = R_row_buffer[row+1]; + + // part 1: extract diagonal entry + NumericT diag = 0; + for (unsigned int j = col_begin; j < col_end; ++j) + { + unsigned int col = R_col_buffer[j]; + if (col == row) + { + diag = R_elements[j]; + R_elements[j] = 0; // (I - D^{-1}R) + break; + } + } + diag_R_ptr[row] = diag; + + assert((diag > 0 || diag < 0) && bool("Zero diagonal detected!")); + + // part2: scale + for (unsigned int j = col_begin; j < col_end; ++j) + R_elements[j] /= -diag; + } + + //std::cout << "diag_R: " << diag_R << std::endl; +} + +} //namespace host_based +} //namespace linalg +} //namespace viennacl + + +#endif