mahout-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From apalu...@apache.org
Subject [22/51] [partial] mahout git commit: (nojira) add native-viennaCL module to codebase. closes apache/mahout#241
Date Wed, 08 Jun 2016 21:40:19 GMT
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/direct_solve.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/direct_solve.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/direct_solve.hpp
new file mode 100644
index 0000000..1d212c2
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/direct_solve.hpp
@@ -0,0 +1,307 @@
+#ifndef VIENNACL_LINALG_HOST_BASED_DIRECT_SOLVE_HPP
+#define VIENNACL_LINALG_HOST_BASED_DIRECT_SOLVE_HPP
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/host_based/direct_solve.hpp
+    @brief Implementations of dense direct triangular solvers are found here.
+*/
+
+#include "viennacl/vector.hpp"
+#include "viennacl/matrix.hpp"
+
+#include "viennacl/linalg/host_based/common.hpp"
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace host_based
+{
+
+namespace detail
+{
+  //
+  // Upper solve:
+  //
+  template<typename MatrixT1, typename MatrixT2>
+  void upper_inplace_solve_matrix(MatrixT1 & A, MatrixT2 & B, vcl_size_t A_size, vcl_size_t B_size, bool unit_diagonal)
+  {
+    typedef typename MatrixT2::value_type   value_type;
+
+    for (vcl_size_t i = 0; i < A_size; ++i)
+    {
+      vcl_size_t current_row = A_size - i - 1;
+
+      for (vcl_size_t j = current_row + 1; j < A_size; ++j)
+      {
+        value_type A_element = A(current_row, j);
+        for (vcl_size_t k=0; k < B_size; ++k)
+          B(current_row, k) -= A_element * B(j, k);
+      }
+
+      if (!unit_diagonal)
+      {
+        value_type A_diag = A(current_row, current_row);
+        for (vcl_size_t k=0; k < B_size; ++k)
+          B(current_row, k) /= A_diag;
+      }
+    }
+  }
+
+  template<typename MatrixT1, typename MatrixT2>
+  void inplace_solve_matrix(MatrixT1 & A, MatrixT2 & B, vcl_size_t A_size, vcl_size_t B_size, viennacl::linalg::unit_upper_tag)
+  {
+    upper_inplace_solve_matrix(A, B, A_size, B_size, true);
+  }
+
+  template<typename MatrixT1, typename MatrixT2>
+  void inplace_solve_matrix(MatrixT1 & A, MatrixT2 & B, vcl_size_t A_size, vcl_size_t B_size, viennacl::linalg::upper_tag)
+  {
+    upper_inplace_solve_matrix(A, B, A_size, B_size, false);
+  }
+
+  //
+  // Lower solve:
+  //
+  template<typename MatrixT1, typename MatrixT2>
+  void lower_inplace_solve_matrix(MatrixT1 & A, MatrixT2 & B, vcl_size_t A_size, vcl_size_t B_size, bool unit_diagonal)
+  {
+    typedef typename MatrixT2::value_type   value_type;
+
+    for (vcl_size_t i = 0; i < A_size; ++i)
+    {
+      for (vcl_size_t j = 0; j < i; ++j)
+      {
+        value_type A_element = A(i, j);
+        for (vcl_size_t k=0; k < B_size; ++k)
+          B(i, k) -= A_element * B(j, k);
+      }
+
+      if (!unit_diagonal)
+      {
+        value_type A_diag = A(i, i);
+        for (vcl_size_t k=0; k < B_size; ++k)
+          B(i, k) /= A_diag;
+      }
+    }
+  }
+
+  template<typename MatrixT1, typename MatrixT2>
+  void inplace_solve_matrix(MatrixT1 & A, MatrixT2 & B, vcl_size_t A_size, vcl_size_t B_size, viennacl::linalg::unit_lower_tag)
+  {
+    lower_inplace_solve_matrix(A, B, A_size, B_size, true);
+  }
+
+  template<typename MatrixT1, typename MatrixT2>
+  void inplace_solve_matrix(MatrixT1 & A, MatrixT2 & B, vcl_size_t A_size, vcl_size_t B_size, viennacl::linalg::lower_tag)
+  {
+    lower_inplace_solve_matrix(A, B, A_size, B_size, false);
+  }
+
+}
+
+//
+// Note: By convention, all size checks are performed in the calling frontend. No need to double-check here.
+//
+
+////////////////// upper triangular solver (upper_tag) //////////////////////////////////////
+/** @brief Direct inplace solver for triangular systems with multiple right hand sides, i.e. A \ B   (MATLAB notation)
+*
+* @param A        The system matrix
+* @param B        The matrix of row vectors, where the solution is directly written to
+*/
+template<typename NumericT, typename SolverTagT>
+void inplace_solve(matrix_base<NumericT> const & A,
+                   matrix_base<NumericT> & B,
+                   SolverTagT)
+{
+  typedef NumericT        value_type;
+
+  value_type const * data_A = detail::extract_raw_pointer<value_type>(A);
+  value_type       * data_B = detail::extract_raw_pointer<value_type>(B);
+
+  vcl_size_t A_start1 = viennacl::traits::start1(A);
+  vcl_size_t A_start2 = viennacl::traits::start2(A);
+  vcl_size_t A_inc1   = viennacl::traits::stride1(A);
+  vcl_size_t A_inc2   = viennacl::traits::stride2(A);
+  //vcl_size_t A_size1  = viennacl::traits::size1(A);
+  vcl_size_t A_size2  = viennacl::traits::size2(A);
+  vcl_size_t A_internal_size1  = viennacl::traits::internal_size1(A);
+  vcl_size_t A_internal_size2  = viennacl::traits::internal_size2(A);
+
+  vcl_size_t B_start1 = viennacl::traits::start1(B);
+  vcl_size_t B_start2 = viennacl::traits::start2(B);
+  vcl_size_t B_inc1   = viennacl::traits::stride1(B);
+  vcl_size_t B_inc2   = viennacl::traits::stride2(B);
+  //vcl_size_t B_size1  = viennacl::traits::size1(B);
+  vcl_size_t B_size2  = viennacl::traits::size2(B);
+  vcl_size_t B_internal_size1  = viennacl::traits::internal_size1(B);
+  vcl_size_t B_internal_size2  = viennacl::traits::internal_size2(B);
+
+
+  if (A.row_major() && B.row_major())
+  {
+    detail::matrix_array_wrapper<value_type const, row_major, false>   wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+    detail::matrix_array_wrapper<value_type,       row_major, false>   wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+
+    detail::inplace_solve_matrix(wrapper_A, wrapper_B, A_size2, B_size2, SolverTagT());
+  }
+  else if (A.row_major() && !B.row_major())
+  {
+    detail::matrix_array_wrapper<value_type const, row_major,    false>   wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+    detail::matrix_array_wrapper<value_type,       column_major, false>   wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+
+    detail::inplace_solve_matrix(wrapper_A, wrapper_B, A_size2, B_size2, SolverTagT());
+  }
+  else if (!A.row_major() && B.row_major())
+  {
+    detail::matrix_array_wrapper<value_type const, column_major, false>   wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+    detail::matrix_array_wrapper<value_type,       row_major,    false>   wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+
+    detail::inplace_solve_matrix(wrapper_A, wrapper_B, A_size2, B_size2, SolverTagT());
+  }
+  else
+  {
+    detail::matrix_array_wrapper<value_type const, column_major, false>   wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+    detail::matrix_array_wrapper<value_type,       column_major, false>   wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+
+    detail::inplace_solve_matrix(wrapper_A, wrapper_B, A_size2, B_size2, SolverTagT());
+  }
+}
+
+
+//
+//  Solve on vector
+//
+
+namespace detail
+{
+  //
+  // Upper solve:
+  //
+  template<typename MatrixT, typename VectorT>
+  void upper_inplace_solve_vector(MatrixT & A, VectorT & b, vcl_size_t A_size, bool unit_diagonal)
+  {
+    typedef typename VectorT::value_type   value_type;
+
+    for (vcl_size_t i = 0; i < A_size; ++i)
+    {
+      vcl_size_t current_row = A_size - i - 1;
+
+      for (vcl_size_t j = current_row + 1; j < A_size; ++j)
+      {
+        value_type A_element = A(current_row, j);
+        b(current_row) -= A_element * b(j);
+      }
+
+      if (!unit_diagonal)
+        b(current_row) /= A(current_row, current_row);
+    }
+  }
+
+  template<typename MatrixT, typename VectorT>
+  void inplace_solve_vector(MatrixT & A, VectorT & b, vcl_size_t A_size, viennacl::linalg::unit_upper_tag)
+  {
+    upper_inplace_solve_vector(A, b, A_size, true);
+  }
+
+  template<typename MatrixT, typename VectorT>
+  void inplace_solve_vector(MatrixT & A, VectorT & b, vcl_size_t A_size, viennacl::linalg::upper_tag)
+  {
+    upper_inplace_solve_vector(A, b, A_size, false);
+  }
+
+  //
+  // Lower solve:
+  //
+  template<typename MatrixT, typename VectorT>
+  void lower_inplace_solve_vector(MatrixT & A, VectorT & b, vcl_size_t A_size, bool unit_diagonal)
+  {
+    typedef typename VectorT::value_type   value_type;
+
+    for (vcl_size_t i = 0; i < A_size; ++i)
+    {
+      for (vcl_size_t j = 0; j < i; ++j)
+      {
+        value_type A_element = A(i, j);
+        b(i) -= A_element * b(j);
+      }
+
+      if (!unit_diagonal)
+        b(i) /= A(i, i);
+    }
+  }
+
+  template<typename MatrixT, typename VectorT>
+  void inplace_solve_vector(MatrixT & A, VectorT & b, vcl_size_t A_size, viennacl::linalg::unit_lower_tag)
+  {
+    lower_inplace_solve_vector(A, b, A_size, true);
+  }
+
+  template<typename MatrixT, typename VectorT>
+  void inplace_solve_vector(MatrixT & A, VectorT & b, vcl_size_t A_size, viennacl::linalg::lower_tag)
+  {
+    lower_inplace_solve_vector(A, b, A_size, false);
+  }
+
+}
+
+template<typename NumericT, typename SolverTagT>
+void inplace_solve(matrix_base<NumericT> const & mat,
+                   vector_base<NumericT> & vec,
+                   SolverTagT)
+{
+  typedef NumericT        value_type;
+
+  value_type const * data_A = detail::extract_raw_pointer<value_type>(mat);
+  value_type       * data_v = detail::extract_raw_pointer<value_type>(vec);
+
+  vcl_size_t A_start1 = viennacl::traits::start1(mat);
+  vcl_size_t A_start2 = viennacl::traits::start2(mat);
+  vcl_size_t A_inc1   = viennacl::traits::stride1(mat);
+  vcl_size_t A_inc2   = viennacl::traits::stride2(mat);
+  vcl_size_t A_size2  = viennacl::traits::size2(mat);
+  vcl_size_t A_internal_size1  = viennacl::traits::internal_size1(mat);
+  vcl_size_t A_internal_size2  = viennacl::traits::internal_size2(mat);
+
+  vcl_size_t start1 = viennacl::traits::start(vec);
+  vcl_size_t inc1   = viennacl::traits::stride(vec);
+
+  if (mat.row_major())
+  {
+    detail::matrix_array_wrapper<value_type const, row_major, false>   wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+    detail::vector_array_wrapper<value_type> wrapper_v(data_v, start1, inc1);
+
+    detail::inplace_solve_vector(wrapper_A, wrapper_v, A_size2, SolverTagT());
+  }
+  else
+  {
+    detail::matrix_array_wrapper<value_type const, column_major, false>   wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+    detail::vector_array_wrapper<value_type> wrapper_v(data_v, start1, inc1);
+
+    detail::inplace_solve_vector(wrapper_A, wrapper_v, A_size2, SolverTagT());
+  }
+}
+
+
+} // namespace host_based
+} // namespace linalg
+} // namespace viennacl
+
+#endif

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/fft_operations.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/fft_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/fft_operations.hpp
new file mode 100644
index 0000000..f53f8f2
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/fft_operations.hpp
@@ -0,0 +1,856 @@
+#ifndef VIENNACL_LINALG_HOST_BASED_FFT_OPERATIONS_HPP_
+#define VIENNACL_LINALG_HOST_BASED_FFT_OPERATIONS_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file  viennacl/linalg/host_based/fft_operations.hpp
+    @brief Implementations of Fast Furier Transformation using a plain single-threaded or OpenMP-enabled execution on CPU
+ */
+
+//TODO openom Conditions
+#include <viennacl/vector.hpp>
+#include <viennacl/matrix.hpp>
+
+#include "viennacl/linalg/host_based/vector_operations.hpp"
+
+#include <stdexcept>
+#include <cmath>
+#include <complex>
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace host_based
+{
+namespace detail
+{
+  namespace fft
+  {
+    const vcl_size_t MAX_LOCAL_POINTS_NUM = 512;
+
+    namespace FFT_DATA_ORDER
+    {
+      enum DATA_ORDER
+      {
+        ROW_MAJOR, COL_MAJOR
+      };
+    }
+
+    inline vcl_size_t num_bits(vcl_size_t size)
+    {
+      vcl_size_t bits_datasize = 0;
+      vcl_size_t ds = 1;
+
+      while (ds < size)
+      {
+        ds = ds << 1;
+        bits_datasize++;
+      }
+
+      return bits_datasize;
+    }
+
+    inline vcl_size_t next_power_2(vcl_size_t n)
+    {
+      n = n - 1;
+
+      vcl_size_t power = 1;
+
+      while (power < sizeof(vcl_size_t) * 8)
+      {
+        n = n | (n >> power);
+        power *= 2;
+      }
+
+      return n + 1;
+    }
+
+    inline vcl_size_t get_reorder_num(vcl_size_t v, vcl_size_t bit_size)
+    {
+      v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1);
+      v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2);
+      v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4);
+      v = ((v >> 8) & 0x00FF00FF) | ((v & 0x00FF00FF) << 8);
+      v = (v >> 16) | (v << 16);
+      v = v >> (32 - bit_size);
+      return v;
+    }
+
+    template<typename NumericT, unsigned int AlignmentV>
+    void copy_to_complex_array(std::complex<NumericT> * input_complex,
+                               viennacl::vector<NumericT, AlignmentV> const & in, vcl_size_t size)
+    {
+#ifdef VIENNACL_WITH_OPENMP
+      #pragma omp parallel for if (size > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+      for (long i2 = 0; i2 < long(size * 2); i2 += 2)
+      { //change array to complex array
+        vcl_size_t i = vcl_size_t(i2);
+        input_complex[i / 2] = std::complex<NumericT>(in[i], in[i + 1]);
+      }
+    }
+
+    template<typename NumericT>
+    void copy_to_complex_array(std::complex<NumericT> * input_complex,
+                               viennacl::vector_base<NumericT> const & in, vcl_size_t size)
+    {
+#ifdef VIENNACL_WITH_OPENMP
+      #pragma omp parallel for if (size > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+      for (long i2 = 0; i2 < long(size * 2); i2 += 2)
+      { //change array to complex array
+        vcl_size_t i = vcl_size_t(i2);
+        input_complex[i / 2] = std::complex<NumericT>(in[i], in[i + 1]);
+      }
+    }
+
+    template<typename NumericT, unsigned int AlignmentV>
+    void copy_to_vector(std::complex<NumericT> * input_complex,
+                        viennacl::vector<NumericT, AlignmentV> & in, vcl_size_t size)
+    {
+#ifdef VIENNACL_WITH_OPENMP
+      #pragma omp parallel for if (size > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+      for (long i2 = 0; i2 < long(size); i2++)
+      {
+        vcl_size_t i = vcl_size_t(i2);
+        in(i * 2)     = static_cast<NumericT>(std::real(input_complex[i]));
+        in(i * 2 + 1) = static_cast<NumericT>(std::imag(input_complex[i]));
+      }
+    }
+
+    template<typename NumericT>
+    void copy_to_complex_array(std::complex<NumericT> * input_complex,
+                               NumericT const * in, vcl_size_t size)
+    {
+#ifdef VIENNACL_WITH_OPENMP
+      #pragma omp parallel for if (size > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+      for (long i2 = 0; i2 < long(size * 2); i2 += 2)
+      { //change array to complex array
+        vcl_size_t i = vcl_size_t(i2);
+        input_complex[i / 2] = std::complex<NumericT>(in[i], in[i + 1]);
+      }
+    }
+
+    template<typename NumericT>
+    void copy_to_vector(std::complex<NumericT> * input_complex, NumericT * in, vcl_size_t size)
+    {
+#ifdef VIENNACL_WITH_OPENMP
+      #pragma omp parallel for if (size > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+      for (long i2 = 0; i2 < long(size); i2++)
+      {
+        vcl_size_t i = vcl_size_t(i2);
+        in[i * 2]     = static_cast<NumericT>(std::real(input_complex[i]));
+        in[i * 2 + 1] = static_cast<NumericT>(std::imag(input_complex[i]));
+      }
+    }
+
+    template<typename NumericT>
+    void copy_to_vector(std::complex<NumericT> * input_complex,
+                        viennacl::vector_base<NumericT> & in, vcl_size_t size)
+    {
+      std::vector<NumericT> temp(2 * size);
+#ifdef VIENNACL_WITH_OPENMP
+      #pragma omp parallel for if (size > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+      for (long i2 = 0; i2 < long(size); i2++)
+      {
+        vcl_size_t i = vcl_size_t(i2);
+        temp[i * 2]     = static_cast<NumericT>(std::real(input_complex[i]));
+        temp[i * 2 + 1] = static_cast<NumericT>(std::imag(input_complex[i]));
+      }
+      viennacl::copy(temp, in);
+    }
+
+    template<typename NumericT>
+    void zero2(NumericT *input1, NumericT *input2, vcl_size_t size)
+    {
+#ifdef VIENNACL_WITH_OPENMP
+      #pragma omp parallel for if (size > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+      for (long i2 = 0; i2 < long(size); i2++)
+      {
+        vcl_size_t i = vcl_size_t(i2);
+        input1[i] = 0;
+        input2[i] = 0;
+      }
+    }
+
+  } //namespace fft
+
+} //namespace detail
+
+/**
+ * @brief Direct algoritm kenrnel
+ */
+template<typename NumericT>
+void fft_direct(std::complex<NumericT> * input_complex, std::complex<NumericT> * output,
+                vcl_size_t size, vcl_size_t stride, vcl_size_t batch_num, NumericT sign,
+                viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
+{
+  NumericT const NUM_PI = NumericT(3.14159265358979323846);
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel
+#endif
+  for (long batch_id2 = 0; batch_id2 < long(batch_num); batch_id2++)
+  {
+    vcl_size_t batch_id = vcl_size_t(batch_id2);
+    for (vcl_size_t k = 0; k < size; k += 1)
+    {
+      std::complex<NumericT> f = 0;
+      for (vcl_size_t n = 0; n < size; n++)
+      {
+        std::complex<NumericT> input;
+        if (!data_order)
+          input = input_complex[batch_id * stride + n]; //input index here
+        else
+          input = input_complex[n * stride + batch_id];
+        NumericT arg = sign * 2 * NUM_PI * NumericT(k) / NumericT(size * n);
+        NumericT sn  = std::sin(arg);
+        NumericT cs  = std::cos(arg);
+
+        std::complex<NumericT> ex(cs, sn);
+        std::complex<NumericT> tmp(input.real() * ex.real() - input.imag() * ex.imag(),
+                                   input.real() * ex.imag() + input.imag() * ex.real());
+        f = f + tmp;
+      }
+      if (!data_order)
+        output[batch_id * stride + k] = f;   // output index here
+      else
+        output[k * stride + batch_id] = f;
+    }
+  }
+
+}
+
+/**
+ * @brief Direct 1D algorithm for computing Fourier transformation.
+ *
+ * Works on any sizes of data.
+ * Serial implementation has o(n^2) complexity
+ */
+template<typename NumericT, unsigned int AlignmentV>
+void direct(viennacl::vector<NumericT, AlignmentV> const & in,
+            viennacl::vector<NumericT, AlignmentV>       & out,
+            vcl_size_t size, vcl_size_t stride,
+            vcl_size_t batch_num, NumericT sign = NumericT(-1),
+            viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
+{
+  std::vector<std::complex<NumericT> > input_complex(size * batch_num);
+  std::vector<std::complex<NumericT> > output(size * batch_num);
+
+  viennacl::linalg::host_based::detail::fft::copy_to_complex_array(&input_complex[0], in, size * batch_num);
+
+  fft_direct(&input_complex[0], &output[0], size, stride, batch_num, sign, data_order);
+
+  viennacl::linalg::host_based::detail::fft::copy_to_vector(&output[0], out, size * batch_num);
+}
+
+/**
+ * @brief Direct 2D algorithm for computing Fourier transformation.
+ *
+ * Works on any sizes of data.
+ * Serial implementation has o(n^2) complexity
+ */
+template<typename NumericT, unsigned int AlignmentV>
+void direct(viennacl::matrix<NumericT, viennacl::row_major, AlignmentV> const & in,
+            viennacl::matrix<NumericT, viennacl::row_major, AlignmentV>       & out, vcl_size_t size,
+            vcl_size_t stride, vcl_size_t batch_num, NumericT sign = NumericT(-1),
+            viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
+{
+  vcl_size_t row_num = in.internal_size1();
+  vcl_size_t col_num = in.internal_size2() >> 1;
+
+  vcl_size_t size_mat = row_num * col_num;
+
+  std::vector<std::complex<NumericT> > input_complex(size_mat);
+  std::vector<std::complex<NumericT> > output(size_mat);
+
+  NumericT const * data_A = detail::extract_raw_pointer<NumericT>(in);
+  NumericT       * data_B = detail::extract_raw_pointer<NumericT>(out);
+
+  viennacl::linalg::host_based::detail::fft::copy_to_complex_array(&input_complex[0], data_A, size_mat);
+
+  fft_direct(&input_complex[0], &output[0], size, stride, batch_num, sign, data_order);
+
+  viennacl::linalg::host_based::detail::fft::copy_to_vector(&output[0], data_B, size_mat);
+}
+
+/*
+ * This function performs reorder of 1D input  data. Indexes are sorted in bit-reversal order.
+ * Such reordering should be done before in-place FFT.
+ */
+template<typename NumericT, unsigned int AlignmentV>
+void reorder(viennacl::vector<NumericT, AlignmentV>& in, vcl_size_t size, vcl_size_t stride,
+             vcl_size_t bits_datasize, vcl_size_t batch_num,
+             viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
+{
+  std::vector<std::complex<NumericT> > input(size * batch_num);
+  viennacl::linalg::host_based::detail::fft::copy_to_complex_array(&input[0], in, size * batch_num);
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for
+#endif
+  for (long batch_id2 = 0; batch_id2 < long(batch_num); batch_id2++)
+  {
+    vcl_size_t batch_id = vcl_size_t(batch_id2);
+    for (vcl_size_t i = 0; i < size; i++)
+    {
+      vcl_size_t v = viennacl::linalg::host_based::detail::fft::get_reorder_num(i, bits_datasize);
+      if (i < v)
+      {
+        if (!data_order)
+        {
+          std::complex<NumericT> tmp   = input[batch_id * stride + i]; // index
+          input[batch_id * stride + i] = input[batch_id * stride + v]; //index
+          input[batch_id * stride + v] = tmp;      //index
+        }
+        else
+        {
+          std::complex<NumericT> tmp   = input[i * stride + batch_id]; // index
+          input[i * stride + batch_id] = input[v * stride + batch_id]; //index
+          input[v * stride + batch_id] = tmp;      //index
+        }
+      }
+    }
+  }
+  viennacl::linalg::host_based::detail::fft::copy_to_vector(&input[0], in, size * batch_num);
+}
+
+/*
+ * This function performs reorder of 2D input  data. Indexes are sorted in bit-reversal order.
+ * Such reordering should be done before in-place FFT.
+ */
+template<typename NumericT, unsigned int AlignmentV>
+void reorder(viennacl::matrix<NumericT, viennacl::row_major, AlignmentV>& in,
+             vcl_size_t size, vcl_size_t stride, vcl_size_t bits_datasize, vcl_size_t batch_num,
+             viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
+{
+
+  NumericT * data = detail::extract_raw_pointer<NumericT>(in);
+  vcl_size_t row_num = in.internal_size1();
+  vcl_size_t col_num = in.internal_size2() >> 1;
+  vcl_size_t size_mat = row_num * col_num;
+
+  std::vector<std::complex<NumericT> > input(size_mat);
+
+  viennacl::linalg::host_based::detail::fft::copy_to_complex_array(&input[0], data, size_mat);
+
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for
+#endif
+  for (long batch_id2 = 0; batch_id2 < long(batch_num); batch_id2++)
+  {
+    vcl_size_t batch_id = vcl_size_t(batch_id2);
+    for (vcl_size_t i = 0; i < size; i++)
+    {
+      vcl_size_t v = viennacl::linalg::host_based::detail::fft::get_reorder_num(i, bits_datasize);
+      if (i < v)
+      {
+        if (!data_order)
+        {
+          std::complex<NumericT> tmp   = input[batch_id * stride + i]; // index
+          input[batch_id * stride + i] = input[batch_id * stride + v]; //index
+          input[batch_id * stride + v] = tmp;      //index
+        } else
+        {
+          std::complex<NumericT> tmp   = input[i * stride + batch_id]; // index
+          input[i * stride + batch_id] = input[v * stride + batch_id]; //index
+          input[v * stride + batch_id] = tmp;      //index
+        }
+      }
+    }
+  }
+  viennacl::linalg::host_based::detail::fft::copy_to_vector(&input[0], data, size_mat);
+}
+
+/**
+ * @brief Radix-2 algorithm for computing Fourier transformation.
+ * Kernel for computing smaller amount of data
+ */
+template<typename NumericT>
+void fft_radix2(std::complex<NumericT> * input_complex, vcl_size_t batch_num,
+                vcl_size_t bit_size, vcl_size_t size, vcl_size_t stride, NumericT sign,
+                viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
+{
+  NumericT const NUM_PI = NumericT(3.14159265358979323846);
+
+  for (vcl_size_t step = 0; step < bit_size; step++)
+  {
+    vcl_size_t ss = 1 << step;
+    vcl_size_t half_size = size >> 1;
+
+#ifdef VIENNACL_WITH_OPENMP
+    #pragma omp parallel for
+#endif
+    for (long batch_id2 = 0; batch_id2 < long(batch_num); batch_id2++)
+    {
+      vcl_size_t batch_id = vcl_size_t(batch_id2);
+      for (vcl_size_t tid = 0; tid < half_size; tid++)
+      {
+        vcl_size_t group = (tid & (ss - 1));
+        vcl_size_t pos = ((tid >> step) << (step + 1)) + group;
+        std::complex<NumericT> in1;
+        std::complex<NumericT> in2;
+        vcl_size_t offset;
+        if (!data_order)
+        {
+          offset = batch_id * stride + pos;
+          in1 = input_complex[offset];
+          in2 = input_complex[offset + ss];
+        }
+        else
+        {
+          offset = pos * stride + batch_id;
+          in1 = input_complex[offset];
+          in2 = input_complex[offset + ss * stride];
+        }
+        NumericT arg = NumericT(group) * sign * NUM_PI / NumericT(ss);
+        NumericT sn = std::sin(arg);
+        NumericT cs = std::cos(arg);
+        std::complex<NumericT> ex(cs, sn);
+        std::complex<NumericT> tmp(in2.real() * ex.real() - in2.imag() * ex.imag(),
+                                   in2.real() * ex.imag() + in2.imag() * ex.real());
+        if (!data_order)
+          input_complex[offset + ss] = in1 - tmp;
+        else
+          input_complex[offset + ss * stride] = in1 - tmp;
+        input_complex[offset] = in1 + tmp;
+      }
+    }
+  }
+
+}
+
+/**
+ * @brief Radix-2 algorithm for computing Fourier transformation.
+ * Kernel for computing bigger amount of data
+ */
+template<typename NumericT>
+void fft_radix2_local(std::complex<NumericT> * input_complex,
+                      std::complex<NumericT> * lcl_input, vcl_size_t batch_num, vcl_size_t bit_size,
+                      vcl_size_t size, vcl_size_t stride, NumericT sign,
+                      viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
+{
+  NumericT const NUM_PI = NumericT(3.14159265358979323846);
+
+  for (vcl_size_t batch_id = 0; batch_id < batch_num; batch_id++)
+  {
+#ifdef VIENNACL_WITH_OPENMP
+    #pragma omp parallel for
+#endif
+    for (long p2 = 0; p2 < long(size); p2 += 1)
+    {
+      vcl_size_t p = vcl_size_t(p2);
+      vcl_size_t v = viennacl::linalg::host_based::detail::fft::get_reorder_num(p, bit_size);
+
+      if (!data_order)
+        lcl_input[v] = input_complex[batch_id * stride + p]; //index
+      else
+        lcl_input[v] = input_complex[p * stride + batch_id];
+    }
+
+    for (vcl_size_t s = 0; s < bit_size; s++)
+    {
+      vcl_size_t ss = 1 << s;
+#ifdef VIENNACL_WITH_OPENMP
+      #pragma omp parallel for
+#endif
+      for (long tid2 = 0; tid2 < long(size)/2; tid2++)
+      {
+        vcl_size_t tid = vcl_size_t(tid2);
+        vcl_size_t group = (tid & (ss - 1));
+        vcl_size_t pos = ((tid >> s) << (s + 1)) + group;
+
+        std::complex<NumericT> in1 = lcl_input[pos];
+        std::complex<NumericT> in2 = lcl_input[pos + ss];
+
+        NumericT arg = NumericT(group) * sign * NUM_PI / NumericT(ss);
+
+        NumericT sn = std::sin(arg);
+        NumericT cs = std::cos(arg);
+        std::complex<NumericT> ex(cs, sn);
+
+        std::complex<NumericT> tmp(in2.real() * ex.real() - in2.imag() * ex.imag(),
+                                   in2.real() * ex.imag() + in2.imag() * ex.real());
+
+        lcl_input[pos + ss] = in1 - tmp;
+        lcl_input[pos] = in1 + tmp;
+      }
+
+    }
+#ifdef VIENNACL_WITH_OPENMP
+    #pragma omp parallel for
+#endif
+    //copy local array back to global memory
+    for (long p2 = 0; p2 < long(size); p2 += 1)
+    {
+      vcl_size_t p = vcl_size_t(p2);
+      if (!data_order)
+        input_complex[batch_id * stride + p] = lcl_input[p];
+      else
+        input_complex[p * stride + batch_id] = lcl_input[p];
+
+    }
+
+  }
+
+}
+
+/**
+ * @brief Radix-2 1D algorithm for computing Fourier transformation.
+ *
+ * Works only on power-of-two sizes of data.
+ * Serial implementation has o(n * lg n) complexity.
+ * This is a Cooley-Tukey algorithm
+ */
+template<typename NumericT, unsigned int AlignmentV>
+void radix2(viennacl::vector<NumericT, AlignmentV>& in, vcl_size_t size, vcl_size_t stride,
+            vcl_size_t batch_num, NumericT sign = NumericT(-1),
+            viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
+{
+
+  vcl_size_t bit_size = viennacl::linalg::host_based::detail::fft::num_bits(size);
+
+  std::vector<std::complex<NumericT> > input_complex(size * batch_num);
+  std::vector<std::complex<NumericT> > lcl_input(size * batch_num);
+  viennacl::linalg::host_based::detail::fft::copy_to_complex_array(&input_complex[0], in, size * batch_num);
+
+  if (size <= viennacl::linalg::host_based::detail::fft::MAX_LOCAL_POINTS_NUM)
+  {
+    viennacl::linalg::host_based::fft_radix2_local(&input_complex[0], &lcl_input[0], batch_num, bit_size, size, stride, sign, data_order);
+  }
+  else
+  {
+    viennacl::linalg::host_based::reorder<NumericT>(in, size, stride, bit_size, batch_num, data_order);
+    viennacl::linalg::host_based::detail::fft::copy_to_complex_array(&input_complex[0], in, size * batch_num);
+    viennacl::linalg::host_based::fft_radix2(&input_complex[0], batch_num, bit_size, size, stride, sign, data_order);
+  }
+
+  viennacl::linalg::host_based::detail::fft::copy_to_vector(&input_complex[0], in, size * batch_num);
+}
+
+/**
+ * @brief Radix-2 2D algorithm for computing Fourier transformation.
+ *
+ * Works only on power-of-two sizes of data.
+ * Serial implementation has o(n * lg n) complexity.
+ * This is a Cooley-Tukey algorithm
+ */
+template<typename NumericT, unsigned int AlignmentV>
+void radix2(viennacl::matrix<NumericT, viennacl::row_major, AlignmentV>& in, vcl_size_t size,
+            vcl_size_t stride, vcl_size_t batch_num, NumericT sign = NumericT(-1),
+            viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
+{
+
+  vcl_size_t bit_size = viennacl::linalg::host_based::detail::fft::num_bits(size);
+
+  NumericT * data = detail::extract_raw_pointer<NumericT>(in);
+
+  vcl_size_t row_num = in.internal_size1();
+  vcl_size_t col_num = in.internal_size2() >> 1;
+  vcl_size_t size_mat = row_num * col_num;
+
+  std::vector<std::complex<NumericT> > input_complex(size_mat);
+
+  viennacl::linalg::host_based::detail::fft::copy_to_complex_array(&input_complex[0], data, size_mat);
+  if (size <= viennacl::linalg::host_based::detail::fft::MAX_LOCAL_POINTS_NUM)
+  {
+    //std::cout<<bit_size<<","<<size<<","<<stride<<","<<batch_num<<","<<size<<","<<sign<<","<<data_order<<std::endl;
+    std::vector<std::complex<NumericT> > lcl_input(size_mat);
+    viennacl::linalg::host_based::fft_radix2_local(&input_complex[0], &lcl_input[0], batch_num, bit_size, size, stride, sign, data_order);
+  }
+  else
+  {
+    viennacl::linalg::host_based::reorder<NumericT>(in, size, stride, bit_size, batch_num, data_order);
+    viennacl::linalg::host_based::detail::fft::copy_to_complex_array(&input_complex[0], data, size_mat);
+    viennacl::linalg::host_based::fft_radix2(&input_complex[0], batch_num, bit_size, size, stride, sign, data_order);
+  }
+
+  viennacl::linalg::host_based::detail::fft::copy_to_vector(&input_complex[0], data, size_mat);
+
+}
+
+/**
+ * @brief Bluestein's algorithm for computing Fourier transformation.
+ *
+ * Currently,  Works only for sizes of input data which less than 2^16.
+ * Uses a lot of additional memory, but should be fast for any size of data.
+ * Serial implementation has something about o(n * lg n) complexity
+ */
+template<typename NumericT, unsigned int AlignmentV>
+void bluestein(viennacl::vector<NumericT, AlignmentV>& in, viennacl::vector<NumericT, AlignmentV>& out, vcl_size_t /*batch_num*/)
+{
+
+  vcl_size_t size = in.size() >> 1;
+  vcl_size_t ext_size = viennacl::linalg::host_based::detail::fft::next_power_2(2 * size - 1);
+
+  viennacl::vector<NumericT, AlignmentV> A(ext_size << 1);
+  viennacl::vector<NumericT, AlignmentV> B(ext_size << 1);
+  viennacl::vector<NumericT, AlignmentV> Z(ext_size << 1);
+
+  std::vector<std::complex<NumericT> > input_complex(size);
+  std::vector<std::complex<NumericT> > output_complex(size);
+
+  std::vector<std::complex<NumericT> > A_complex(ext_size);
+  std::vector<std::complex<NumericT> > B_complex(ext_size);
+  std::vector<std::complex<NumericT> > Z_complex(ext_size);
+
+  viennacl::linalg::host_based::detail::fft::copy_to_complex_array(&input_complex[0], in, size);
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for
+#endif
+  for (long i2 = 0; i2 < long(ext_size); i2++)
+  {
+    vcl_size_t i = vcl_size_t(i2);
+    A_complex[i] = 0;
+    B_complex[i] = 0;
+  }
+
+  vcl_size_t double_size = size << 1;
+
+  NumericT const NUM_PI = NumericT(3.14159265358979323846);
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for
+#endif
+  for (long i2 = 0; i2 < long(size); i2++)
+  {
+    vcl_size_t i = vcl_size_t(i2);
+    vcl_size_t rm = i * i % (double_size);
+    NumericT angle = NumericT(rm) / NumericT(size) * NumericT(NUM_PI);
+
+    NumericT sn_a = std::sin(-angle);
+    NumericT cs_a = std::cos(-angle);
+
+    std::complex<NumericT> a_i(cs_a, sn_a);
+    std::complex<NumericT> b_i(cs_a, -sn_a);
+
+    A_complex[i] = std::complex<NumericT>(input_complex[i].real() * a_i.real() - input_complex[i].imag() * a_i.imag(),
+                                          input_complex[i].real() * a_i.imag() + input_complex[i].imag() * a_i.real());
+    B_complex[i] = b_i;
+
+    // very bad instruction, to be fixed
+    if (i)
+      B_complex[ext_size - i] = b_i;
+  }
+
+  viennacl::linalg::host_based::detail::fft::copy_to_vector(&input_complex[0], in, size);
+  viennacl::linalg::host_based::detail::fft::copy_to_vector(&A_complex[0], A, ext_size);
+  viennacl::linalg::host_based::detail::fft::copy_to_vector(&B_complex[0], B, ext_size);
+
+  viennacl::linalg::convolve_i(A, B, Z);
+
+  viennacl::linalg::host_based::detail::fft::copy_to_complex_array(&Z_complex[0], Z, ext_size);
+
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for
+#endif
+  for (long i2 = 0; i2 < long(size); i2++)
+  {
+    vcl_size_t i = vcl_size_t(i2);
+    vcl_size_t rm = i * i % (double_size);
+    NumericT angle = NumericT(rm) / NumericT(size) * NumericT(-NUM_PI);
+    NumericT sn_a = std::sin(angle);
+    NumericT cs_a = std::cos(angle);
+    std::complex<NumericT> b_i(cs_a, sn_a);
+    output_complex[i] = std::complex<NumericT>(Z_complex[i].real() * b_i.real() - Z_complex[i].imag() * b_i.imag(),
+                                               Z_complex[i].real() * b_i.imag() + Z_complex[i].imag() * b_i.real());
+  }
+  viennacl::linalg::host_based::detail::fft::copy_to_vector(&output_complex[0], out, size);
+
+}
+
+/**
+ * @brief Normalize vector with his own size
+ */
+template<typename NumericT, unsigned int AlignmentV>
+void normalize(viennacl::vector<NumericT, AlignmentV> & input)
+{
+  vcl_size_t size = input.size() >> 1;
+  NumericT norm_factor = static_cast<NumericT>(size);
+  for (vcl_size_t i = 0; i < size * 2; i++)
+    input[i] /= norm_factor;
+
+}
+
+/**
+ * @brief Complex multiplikation of two vectors
+ */
+template<typename NumericT, unsigned int AlignmentV>
+void multiply_complex(viennacl::vector<NumericT, AlignmentV> const & input1,
+                      viennacl::vector<NumericT, AlignmentV> const & input2,
+                      viennacl::vector<NumericT, AlignmentV> & output)
+{
+  vcl_size_t size = input1.size() >> 1;
+
+  std::vector<std::complex<NumericT> > input1_complex(size);
+  std::vector<std::complex<NumericT> > input2_complex(size);
+  std::vector<std::complex<NumericT> > output_complex(size);
+  viennacl::linalg::host_based::detail::fft::copy_to_complex_array(&input1_complex[0], input1, size);
+  viennacl::linalg::host_based::detail::fft::copy_to_complex_array(&input2_complex[0], input2, size);
+
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for
+#endif
+  for (long i2 = 0; i2 < long(size); i2++)
+  {
+    vcl_size_t i = vcl_size_t(i2);
+    std::complex<NumericT> in1 = input1_complex[i];
+    std::complex<NumericT> in2 = input2_complex[i];
+    output_complex[i] = std::complex<NumericT>(in1.real() * in2.real() - in1.imag() * in2.imag(),
+                                               in1.real() * in2.imag() + in1.imag() * in2.real());
+  }
+  viennacl::linalg::host_based::detail::fft::copy_to_vector(&output_complex[0], output, size);
+
+}
+/**
+ * @brief Inplace transpose of matrix
+ */
+template<typename NumericT, unsigned int AlignmentV>
+void transpose(viennacl::matrix<NumericT, viennacl::row_major, AlignmentV> & input)
+{
+  vcl_size_t row_num = input.internal_size1() / 2;
+  vcl_size_t col_num = input.internal_size2() / 2;
+
+  vcl_size_t size = row_num * col_num;
+
+  NumericT * data = detail::extract_raw_pointer<NumericT>(input);
+
+  std::vector<std::complex<NumericT> > input_complex(size);
+
+  viennacl::linalg::host_based::detail::fft::copy_to_complex_array(&input_complex[0], data, size);
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for
+#endif
+  for (long i2 = 0; i2 < long(size); i2++)
+  {
+    vcl_size_t i = vcl_size_t(i2);
+    vcl_size_t row = i / col_num;
+    vcl_size_t col = i - row * col_num;
+    vcl_size_t new_pos = col * row_num + row;
+
+    if (i < new_pos)
+    {
+      std::complex<NumericT> val = input_complex[i];
+      input_complex[i] = input_complex[new_pos];
+      input_complex[new_pos] = val;
+    }
+  }
+  viennacl::linalg::host_based::detail::fft::copy_to_vector(&input_complex[0], data, size);
+
+}
+
+/**
+ * @brief Transpose matrix
+ */
+template<typename NumericT, unsigned int AlignmentV>
+void transpose(viennacl::matrix<NumericT, viennacl::row_major, AlignmentV> const & input,
+               viennacl::matrix<NumericT, viennacl::row_major, AlignmentV>       & output)
+{
+
+  vcl_size_t row_num = input.internal_size1() / 2;
+  vcl_size_t col_num = input.internal_size2() / 2;
+  vcl_size_t size = row_num * col_num;
+
+  NumericT const * data_A = detail::extract_raw_pointer<NumericT>(input);
+  NumericT       * data_B = detail::extract_raw_pointer<NumericT>(output);
+
+  std::vector<std::complex<NumericT> > input_complex(size);
+  viennacl::linalg::host_based::detail::fft::copy_to_complex_array(&input_complex[0], data_A, size);
+
+  std::vector<std::complex<NumericT> > output_complex(size);
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for
+#endif
+  for (long i2 = 0; i2 < long(size); i2++)
+  {
+    vcl_size_t i = vcl_size_t(i2);
+    vcl_size_t row = i / col_num;
+    vcl_size_t col = i % col_num;
+    vcl_size_t new_pos = col * row_num + row;
+    output_complex[new_pos] = input_complex[i];
+  }
+  viennacl::linalg::host_based::detail::fft::copy_to_vector(&output_complex[0], data_B, size);
+}
+
+/**
+ * @brief Create complex vector from real vector (even elements(2*k) = real part, odd elements(2*k+1) = imaginary part)
+ */
+template<typename NumericT>
+void real_to_complex(viennacl::vector_base<NumericT> const & in,
+                     viennacl::vector_base<NumericT>       & out, vcl_size_t size)
+{
+  NumericT const * data_in  = detail::extract_raw_pointer<NumericT>(in);
+  NumericT       * data_out = detail::extract_raw_pointer<NumericT>(out);
+
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for if (size > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+  for (long i2 = 0; i2 < long(size); i2++)
+  {
+    vcl_size_t i = static_cast<vcl_size_t>(i2);
+    data_out[2*i  ] = data_in[i];
+    data_out[2*i+1] = NumericT(0);
+  }
+}
+
+/**
+ * @brief Create real vector from complex vector (even elements(2*k) = real part, odd elements(2*k+1) = imaginary part)
+ */
+template<typename NumericT>
+void complex_to_real(viennacl::vector_base<NumericT> const & in,
+                     viennacl::vector_base<NumericT>       & out, vcl_size_t size)
+{
+  NumericT const * data_in  = detail::extract_raw_pointer<NumericT>(in);
+  NumericT       * data_out = detail::extract_raw_pointer<NumericT>(out);
+
+#ifdef VIENNACL_WITH_OPENMP
+#pragma omp parallel for if (size > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+  for (long i = 0; i < long(size); i++)
+    data_out[i] = data_in[2*i];
+}
+
+/**
+ * @brief Reverse vector to opposite order and save it in input vector
+ */
+template<typename NumericT>
+void reverse(viennacl::vector_base<NumericT> & in)
+{
+  vcl_size_t size = in.size();
+
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for if (size > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+  for (long i2 = 0; i2 < long(size); i2++)
+  {
+    vcl_size_t i = vcl_size_t(i2);
+    NumericT val1 = in[i];
+    NumericT val2 = in[size - i - 1];
+    in[i] = val2;
+    in[size - i - 1] = val1;
+  }
+}
+
+}      //namespace host_based
+}      //namespace linalg
+}      //namespace viennacl
+
+#endif /* FFT_OPERATIONS_HPP_ */

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/ilu_operations.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/ilu_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/ilu_operations.hpp
new file mode 100644
index 0000000..62c885a
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/ilu_operations.hpp
@@ -0,0 +1,672 @@
+#ifndef VIENNACL_LINALG_HOST_BASED_ILU_OPERATIONS_HPP_
+#define VIENNACL_LINALG_HOST_BASED_ILU_OPERATIONS_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/host_based/ilu_operations.hpp
+    @brief Implementations of specialized routines for the Chow-Patel parallel ILU preconditioner using the host (OpenMP)
+*/
+
+#include <cmath>
+#include <algorithm>  //for std::max and std::min
+
+#include "viennacl/forwards.h"
+#include "viennacl/scalar.hpp"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/meta/predicate.hpp"
+#include "viennacl/meta/enable_if.hpp"
+#include "viennacl/traits/size.hpp"
+#include "viennacl/traits/start.hpp"
+#include "viennacl/linalg/host_based/common.hpp"
+#include "viennacl/linalg/vector_operations.hpp"
+#include "viennacl/traits/stride.hpp"
+
+
+// Minimum vector size for using OpenMP on vector operations:
+#ifndef VIENNACL_OPENMP_ILU_MIN_SIZE
+  #define VIENNACL_OPENMP_ILU_MIN_SIZE  5000
+#endif
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace host_based
+{
+
+template<typename NumericT>
+void extract_L(compressed_matrix<NumericT> const & A,
+               compressed_matrix<NumericT>       & L)
+{
+  // L is known to have correct dimensions
+
+  unsigned int const *A_row_buffer = detail::extract_raw_pointer<unsigned int>(A.handle1());
+  unsigned int const *A_col_buffer = detail::extract_raw_pointer<unsigned int>(A.handle2());
+  NumericT     const *A_elements   = detail::extract_raw_pointer<NumericT>(A.handle());
+
+  unsigned int       *L_row_buffer = detail::extract_raw_pointer<unsigned int>(L.handle1());
+
+  //
+  // Step 1: Count elements in L
+  //
+#ifdef VIENNACL_WITH_OPENMP
+    #pragma omp parallel for if (A.size1() > VIENNACL_OPENMP_ILU_MIN_SIZE)
+#endif
+  for (long row = 0; row < static_cast<long>(A.size1()); ++row)
+  {
+    unsigned int col_begin = A_row_buffer[row];
+    unsigned int col_end   = A_row_buffer[row+1];
+
+    for (unsigned int j = col_begin; j < col_end; ++j)
+    {
+      unsigned int col = A_col_buffer[j];
+      if (long(col) <= row)
+        ++L_row_buffer[row];
+    }
+  }
+
+  //
+  // Step 2: Exclusive scan on row_buffer arrays to get correct starting indices
+  //
+  viennacl::vector_base<unsigned int> wrapped_L_row_buffer(L.handle1(), L.size1() + 1, 0, 1);
+  viennacl::linalg::exclusive_scan(wrapped_L_row_buffer);
+  L.reserve(wrapped_L_row_buffer[L.size1()], false);
+
+  unsigned int       *L_col_buffer = detail::extract_raw_pointer<unsigned int>(L.handle2());
+  NumericT           *L_elements   = detail::extract_raw_pointer<NumericT>(L.handle());
+
+  //
+  // Step 3: Write entries:
+  //
+#ifdef VIENNACL_WITH_OPENMP
+    #pragma omp parallel for if (A.size1() > VIENNACL_OPENMP_ILU_MIN_SIZE)
+#endif
+  for (long row = 0; row < static_cast<long>(A.size1()); ++row)
+  {
+    unsigned int col_begin = A_row_buffer[row];
+    unsigned int col_end   = A_row_buffer[row+1];
+
+    unsigned int index_L = L_row_buffer[row];
+    for (unsigned int j = col_begin; j < col_end; ++j)
+    {
+      unsigned int col = A_col_buffer[j];
+      NumericT value = A_elements[j];
+
+      if (long(col) <= row)
+      {
+        L_col_buffer[index_L] = col;
+        L_elements[index_L]   = value;
+        ++index_L;
+      }
+    }
+  }
+
+} // extract_L
+
+
+/** @brief Scales the values extracted from A such that A' = DAD has unit diagonal. Updates values from A in L and U accordingly. */
+template<typename NumericT>
+void icc_scale(compressed_matrix<NumericT> const & A,
+               compressed_matrix<NumericT>       & L)
+{
+  viennacl::vector<NumericT> D(A.size1(), viennacl::traits::context(A));
+
+  unsigned int const *A_row_buffer = detail::extract_raw_pointer<unsigned int>(A.handle1());
+  unsigned int const *A_col_buffer = detail::extract_raw_pointer<unsigned int>(A.handle2());
+  NumericT     const *A_elements   = detail::extract_raw_pointer<NumericT>(A.handle());
+
+  NumericT           *D_elements   = detail::extract_raw_pointer<NumericT>(D.handle());
+
+  //
+  // Step 1: Determine D
+  //
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for if (A.size1() > VIENNACL_OPENMP_ILU_MIN_SIZE)
+#endif
+  for (long row = 0; row < static_cast<long>(A.size1()); ++row)
+  {
+    unsigned int col_begin = A_row_buffer[row];
+    unsigned int col_end   = A_row_buffer[row+1];
+
+    for (unsigned int j = col_begin; j < col_end; ++j)
+    {
+      unsigned int col = A_col_buffer[j];
+      if (row == col)
+      {
+        D_elements[row] = NumericT(1) / std::sqrt(std::fabs(A_elements[j]));
+        break;
+      }
+    }
+  }
+
+  //
+  // Step 2: Scale values in L:
+  //
+  unsigned int const *L_row_buffer = detail::extract_raw_pointer<unsigned int>(L.handle1());
+  unsigned int const *L_col_buffer = detail::extract_raw_pointer<unsigned int>(L.handle2());
+  NumericT           *L_elements   = detail::extract_raw_pointer<NumericT>(L.handle());
+
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for if (A.size1() > VIENNACL_OPENMP_ILU_MIN_SIZE)
+#endif
+  for (long row = 0; row < static_cast<long>(A.size1()); ++row)
+  {
+    unsigned int col_begin = L_row_buffer[row];
+    unsigned int col_end   = L_row_buffer[row+1];
+
+    NumericT D_row = D_elements[row];
+
+    for (unsigned int j = col_begin; j < col_end; ++j)
+      L_elements[j] *= D_row * D_elements[L_col_buffer[j]];
+  }
+
+  L.generate_row_block_information();
+}
+
+
+
+/** @brief Performs one nonlinear relaxation step in the Chow-Patel-ICC using OpenMP (cf. Algorithm 3 in paper, but for L rather than U) */
+template<typename NumericT>
+void icc_chow_patel_sweep(compressed_matrix<NumericT> & L,
+                          vector<NumericT>            & aij_L)
+{
+  unsigned int const *L_row_buffer = detail::extract_raw_pointer<unsigned int>(L.handle1());
+  unsigned int const *L_col_buffer = detail::extract_raw_pointer<unsigned int>(L.handle2());
+  NumericT           *L_elements   = detail::extract_raw_pointer<NumericT>(L.handle());
+
+  NumericT           *aij_ptr   = detail::extract_raw_pointer<NumericT>(aij_L.handle());
+
+  // temporary workspace
+  NumericT *L_backup = (NumericT *)malloc(sizeof(NumericT) * L.nnz());
+
+  // backup:
+#ifdef VIENNACL_WITH_OPENMP
+    #pragma omp parallel for if (L.nnz() > VIENNACL_OPENMP_ILU_MIN_SIZE)
+#endif
+  for (long i = 0; i < static_cast<long>(L.nnz()); ++i)
+    L_backup[i] = L_elements[i];
+
+
+  // sweep
+#ifdef VIENNACL_WITH_OPENMP
+    #pragma omp parallel for if (L.size1() > VIENNACL_OPENMP_ILU_MIN_SIZE)
+#endif
+  for (long row = 0; row < static_cast<long>(L.size1()); ++row)
+  {
+    //
+    // update L:
+    //
+    unsigned int row_Li_start = L_row_buffer[row];
+    unsigned int row_Li_end   = L_row_buffer[row + 1];
+
+    for (unsigned int i = row_Li_start; i < row_Li_end; ++i)
+    {
+      unsigned int col = L_col_buffer[i];
+
+      unsigned int row_Lj_start = L_row_buffer[col];
+      unsigned int row_Lj_end   = L_row_buffer[col+1];
+
+      // compute \sum_{k=1}^{j-1} l_ik l_jk
+      unsigned int index_Lj = row_Lj_start;
+      unsigned int col_Lj = L_col_buffer[index_Lj];
+      NumericT s = aij_ptr[i];
+      for (unsigned int index_Li = row_Li_start; index_Li < i; ++index_Li)
+      {
+        unsigned int col_Li = L_col_buffer[index_Li];
+
+        // find element in row j
+        while (col_Lj < col_Li)
+        {
+          ++index_Lj;
+          col_Lj = L_col_buffer[index_Lj];
+        }
+
+        if (col_Lj == col_Li)
+          s -= L_backup[index_Li] * L_backup[index_Lj];
+      }
+
+      if (row != col)
+        L_elements[i] = s / L_backup[row_Lj_end - 1]; // diagonal element is last in row!
+      else
+        L_elements[i] = std::sqrt(s);
+    }
+  }
+
+  free(L_backup);
+}
+
+
+
+//////////////////////// ILU ////////////////////////
+
+template<typename NumericT>
+void extract_LU(compressed_matrix<NumericT> const & A,
+                compressed_matrix<NumericT>       & L,
+                compressed_matrix<NumericT>       & U)
+{
+  // L and U are known to have correct dimensions
+
+  unsigned int const *A_row_buffer = detail::extract_raw_pointer<unsigned int>(A.handle1());
+  unsigned int const *A_col_buffer = detail::extract_raw_pointer<unsigned int>(A.handle2());
+  NumericT     const *A_elements   = detail::extract_raw_pointer<NumericT>(A.handle());
+
+  unsigned int       *L_row_buffer = detail::extract_raw_pointer<unsigned int>(L.handle1());
+  unsigned int       *U_row_buffer = detail::extract_raw_pointer<unsigned int>(U.handle1());
+
+  //
+  // Step 1: Count elements in L and U
+  //
+#ifdef VIENNACL_WITH_OPENMP
+    #pragma omp parallel for if (A.size1() > VIENNACL_OPENMP_ILU_MIN_SIZE)
+#endif
+  for (long row = 0; row < static_cast<long>(A.size1()); ++row)
+  {
+    unsigned int col_begin = A_row_buffer[row];
+    unsigned int col_end   = A_row_buffer[row+1];
+
+    for (unsigned int j = col_begin; j < col_end; ++j)
+    {
+      unsigned int col = A_col_buffer[j];
+      if (long(col) <= row)
+        ++L_row_buffer[row];
+      if (long(col) >= row)
+        ++U_row_buffer[row];
+    }
+  }
+
+  //
+  // Step 2: Exclusive scan on row_buffer arrays to get correct starting indices
+  //
+  viennacl::vector_base<unsigned int> wrapped_L_row_buffer(L.handle1(), L.size1() + 1, 0, 1);
+  viennacl::linalg::exclusive_scan(wrapped_L_row_buffer);
+  L.reserve(wrapped_L_row_buffer[L.size1()], false);
+
+  viennacl::vector_base<unsigned int> wrapped_U_row_buffer(U.handle1(), U.size1() + 1, 0, 1);
+  viennacl::linalg::exclusive_scan(wrapped_U_row_buffer);
+  U.reserve(wrapped_U_row_buffer[U.size1()], false);
+
+  unsigned int       *L_col_buffer = detail::extract_raw_pointer<unsigned int>(L.handle2());
+  NumericT           *L_elements   = detail::extract_raw_pointer<NumericT>(L.handle());
+
+  unsigned int       *U_col_buffer = detail::extract_raw_pointer<unsigned int>(U.handle2());
+  NumericT           *U_elements   = detail::extract_raw_pointer<NumericT>(U.handle());
+
+  //
+  // Step 3: Write entries:
+  //
+#ifdef VIENNACL_WITH_OPENMP
+    #pragma omp parallel for if (A.size1() > VIENNACL_OPENMP_ILU_MIN_SIZE)
+#endif
+  for (long row = 0; row < static_cast<long>(A.size1()); ++row)
+  {
+    unsigned int col_begin = A_row_buffer[row];
+    unsigned int col_end   = A_row_buffer[row+1];
+
+    unsigned int index_L = L_row_buffer[row];
+    unsigned int index_U = U_row_buffer[row];
+    for (unsigned int j = col_begin; j < col_end; ++j)
+    {
+      unsigned int col = A_col_buffer[j];
+      NumericT value = A_elements[j];
+
+      if (long(col) <= row)
+      {
+        L_col_buffer[index_L] = col;
+        L_elements[index_L]   = value;
+        ++index_L;
+      }
+
+      if (long(col) >= row)
+      {
+        U_col_buffer[index_U] = col;
+        U_elements[index_U]   = value;
+        ++index_U;
+      }
+    }
+  }
+
+} // extract_LU
+
+
+
+/** @brief Scales the values extracted from A such that A' = DAD has unit diagonal. Updates values from A in L and U accordingly. */
+template<typename NumericT>
+void ilu_scale(compressed_matrix<NumericT> const & A,
+               compressed_matrix<NumericT>       & L,
+               compressed_matrix<NumericT>       & U)
+{
+  viennacl::vector<NumericT> D(A.size1(), viennacl::traits::context(A));
+
+  unsigned int const *A_row_buffer = detail::extract_raw_pointer<unsigned int>(A.handle1());
+  unsigned int const *A_col_buffer = detail::extract_raw_pointer<unsigned int>(A.handle2());
+  NumericT     const *A_elements   = detail::extract_raw_pointer<NumericT>(A.handle());
+
+  NumericT           *D_elements   = detail::extract_raw_pointer<NumericT>(D.handle());
+
+  //
+  // Step 1: Determine D
+  //
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for if (A.size1() > VIENNACL_OPENMP_ILU_MIN_SIZE)
+#endif
+  for (long row = 0; row < static_cast<long>(A.size1()); ++row)
+  {
+    unsigned int col_begin = A_row_buffer[row];
+    unsigned int col_end   = A_row_buffer[row+1];
+
+    for (unsigned int j = col_begin; j < col_end; ++j)
+    {
+      unsigned int col = A_col_buffer[j];
+      if (row == col)
+      {
+        D_elements[row] = NumericT(1) / std::sqrt(std::fabs(A_elements[j]));
+        break;
+      }
+    }
+  }
+
+  //
+  // Step 2: Scale values in L:
+  //
+  unsigned int const *L_row_buffer = detail::extract_raw_pointer<unsigned int>(L.handle1());
+  unsigned int const *L_col_buffer = detail::extract_raw_pointer<unsigned int>(L.handle2());
+  NumericT           *L_elements   = detail::extract_raw_pointer<NumericT>(L.handle());
+
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for if (A.size1() > VIENNACL_OPENMP_ILU_MIN_SIZE)
+#endif
+  for (long row = 0; row < static_cast<long>(A.size1()); ++row)
+  {
+    unsigned int col_begin = L_row_buffer[row];
+    unsigned int col_end   = L_row_buffer[row+1];
+
+    NumericT D_row = D_elements[row];
+
+    for (unsigned int j = col_begin; j < col_end; ++j)
+      L_elements[j] *= D_row * D_elements[L_col_buffer[j]];
+  }
+
+  //
+  // Step 3: Scale values in U:
+  //
+  unsigned int const *U_row_buffer = detail::extract_raw_pointer<unsigned int>(U.handle1());
+  unsigned int const *U_col_buffer = detail::extract_raw_pointer<unsigned int>(U.handle2());
+  NumericT           *U_elements   = detail::extract_raw_pointer<NumericT>(U.handle());
+
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for if (A.size1() > VIENNACL_OPENMP_ILU_MIN_SIZE)
+#endif
+  for (long row = 0; row < static_cast<long>(A.size1()); ++row)
+  {
+    unsigned int col_begin = U_row_buffer[row];
+    unsigned int col_end   = U_row_buffer[row+1];
+
+    NumericT D_row = D_elements[row];
+
+    for (unsigned int j = col_begin; j < col_end; ++j)
+      U_elements[j] *= D_row * D_elements[U_col_buffer[j]];
+  }
+
+  L.generate_row_block_information();
+  // Note: block information for U will be generated after transposition
+
+}
+
+template<typename NumericT>
+void ilu_transpose(compressed_matrix<NumericT> const & A,
+                   compressed_matrix<NumericT>       & B)
+{
+  NumericT     const * A_elements   = viennacl::linalg::host_based::detail::extract_raw_pointer<NumericT>(A.handle());
+  unsigned int const * A_row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(A.handle1());
+  unsigned int const * A_col_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(A.handle2());
+
+  // initialize datastructures for B:
+  B = compressed_matrix<NumericT>(A.size2(), A.size1(), A.nnz(), viennacl::traits::context(A));
+
+  NumericT     * B_elements   = viennacl::linalg::host_based::detail::extract_raw_pointer<NumericT>(B.handle());
+  unsigned int * B_row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(B.handle1());
+  unsigned int * B_col_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(B.handle2());
+
+  // prepare uninitialized B_row_buffer:
+  for (std::size_t i = 0; i < B.size1(); ++i)
+    B_row_buffer[i] = 0;
+
+  //
+  // Stage 1: Compute pattern for B
+  //
+  for (std::size_t row = 0; row < A.size1(); ++row)
+  {
+    unsigned int row_start = A_row_buffer[row];
+    unsigned int row_stop  = A_row_buffer[row+1];
+
+    for (unsigned int nnz_index = row_start; nnz_index < row_stop; ++nnz_index)
+      B_row_buffer[A_col_buffer[nnz_index]] += 1;
+  }
+
+  // Bring row-start array in place using exclusive-scan:
+  unsigned int offset = B_row_buffer[0];
+  B_row_buffer[0] = 0;
+  for (std::size_t row = 1; row < B.size1(); ++row)
+  {
+    unsigned int tmp = B_row_buffer[row];
+    B_row_buffer[row] = offset;
+    offset += tmp;
+  }
+  B_row_buffer[B.size1()] = offset;
+
+  //
+  // Stage 2: Fill with data
+  //
+
+  std::vector<unsigned int> B_row_offsets(B.size1()); //number of elements already written per row
+
+  for (unsigned int row = 0; row < static_cast<unsigned int>(A.size1()); ++row)
+  {
+    //std::cout << "Row " << row << ": ";
+    unsigned int row_start = A_row_buffer[row];
+    unsigned int row_stop  = A_row_buffer[row+1];
+
+    for (unsigned int nnz_index = row_start; nnz_index < row_stop; ++nnz_index)
+    {
+      unsigned int col_in_A = A_col_buffer[nnz_index];
+      unsigned int B_nnz_index = B_row_buffer[col_in_A] + B_row_offsets[col_in_A];
+      B_col_buffer[B_nnz_index] = row;
+      B_elements[B_nnz_index] = A_elements[nnz_index];
+      ++B_row_offsets[col_in_A];
+      //B_temp.at(A_col_buffer[nnz_index])[row] = A_elements[nnz_index];
+    }
+  }
+
+  // Step 3: Make datastructure consistent (row blocks!)
+  B.generate_row_block_information();
+}
+
+
+
+/** @brief Performs one nonlinear relaxation step in the Chow-Patel-ILU using OpenMP (cf. Algorithm 2 in paper) */
+template<typename NumericT>
+void ilu_chow_patel_sweep(compressed_matrix<NumericT>       & L,
+                          vector<NumericT>            const & aij_L,
+                          compressed_matrix<NumericT>       & U_trans,
+                          vector<NumericT>            const & aij_U_trans)
+{
+  unsigned int const *L_row_buffer = detail::extract_raw_pointer<unsigned int>(L.handle1());
+  unsigned int const *L_col_buffer = detail::extract_raw_pointer<unsigned int>(L.handle2());
+  NumericT           *L_elements   = detail::extract_raw_pointer<NumericT>(L.handle());
+
+  NumericT     const *aij_L_ptr    = detail::extract_raw_pointer<NumericT>(aij_L.handle());
+
+  unsigned int const *U_row_buffer = detail::extract_raw_pointer<unsigned int>(U_trans.handle1());
+  unsigned int const *U_col_buffer = detail::extract_raw_pointer<unsigned int>(U_trans.handle2());
+  NumericT           *U_elements   = detail::extract_raw_pointer<NumericT>(U_trans.handle());
+
+  NumericT     const *aij_U_trans_ptr = detail::extract_raw_pointer<NumericT>(aij_U_trans.handle());
+
+  // temporary workspace
+  NumericT *L_backup = new NumericT[L.nnz()];
+  NumericT *U_backup = new NumericT[U_trans.nnz()];
+
+  // backup:
+#ifdef VIENNACL_WITH_OPENMP
+    #pragma omp parallel for if (L.nnz() > VIENNACL_OPENMP_ILU_MIN_SIZE)
+#endif
+  for (long i = 0; i < static_cast<long>(L.nnz()); ++i)
+    L_backup[i] = L_elements[i];
+
+#ifdef VIENNACL_WITH_OPENMP
+    #pragma omp parallel for if (U_trans.nnz() > VIENNACL_OPENMP_ILU_MIN_SIZE)
+#endif
+  for (long i = 0; i < static_cast<long>(U_trans.nnz()); ++i)
+    U_backup[i] = U_elements[i];
+
+  // sweep
+#ifdef VIENNACL_WITH_OPENMP
+    #pragma omp parallel for if (L.size1() > VIENNACL_OPENMP_ILU_MIN_SIZE)
+#endif
+  for (long row = 0; row < static_cast<long>(L.size1()); ++row)
+  {
+    //
+    // update L:
+    //
+    unsigned int row_L_start = L_row_buffer[row];
+    unsigned int row_L_end   = L_row_buffer[row + 1];
+
+    for (unsigned int j = row_L_start; j < row_L_end; ++j)
+    {
+      unsigned int col = L_col_buffer[j];
+
+      if (col == row)
+        continue;
+
+      unsigned int row_U_start = U_row_buffer[col];
+      unsigned int row_U_end   = U_row_buffer[col + 1];
+
+      // compute \sum_{k=1}^{j-1} l_ik u_kj
+      unsigned int index_U = row_U_start;
+      unsigned int col_U = (index_U < row_U_end) ? U_col_buffer[index_U] : static_cast<unsigned int>(U_trans.size2());
+      NumericT sum = 0;
+      for (unsigned int k = row_L_start; k < j; ++k)
+      {
+        unsigned int col_L = L_col_buffer[k];
+
+        // find element in U
+        while (col_U < col_L)
+        {
+          ++index_U;
+          col_U = U_col_buffer[index_U];
+        }
+
+        if (col_U == col_L)
+          sum += L_backup[k] * U_backup[index_U];
+      }
+
+      // update l_ij:
+      assert(U_col_buffer[row_U_end - 1] == col && bool("Accessing U element which is not a diagonal element!"));
+      L_elements[j] = (aij_L_ptr[j] - sum) / U_backup[row_U_end - 1];  // diagonal element is last entry in U
+    }
+
+
+    //
+    // update U:
+    //
+    unsigned int row_U_start = U_row_buffer[row];
+    unsigned int row_U_end   = U_row_buffer[row + 1];
+    for (unsigned int j = row_U_start; j < row_U_end; ++j)
+    {
+      unsigned int col = U_col_buffer[j];
+
+      row_L_start = L_row_buffer[col];
+      row_L_end   = L_row_buffer[col + 1];
+
+      // compute \sum_{k=1}^{j-1} l_ik u_kj
+      unsigned int index_L = row_L_start;
+      unsigned int col_L = (index_L < row_L_end) ? L_col_buffer[index_L] : static_cast<unsigned int>(L.size1());
+      NumericT sum = 0;
+      for (unsigned int k = row_U_start; k < j; ++k)
+      {
+        unsigned int col_U = U_col_buffer[k];
+
+        // find element in L
+        while (col_L < col_U)
+        {
+          ++index_L;
+          col_L = L_col_buffer[index_L];
+        }
+
+        if (col_U == col_L)
+          sum += L_backup[index_L] * U_backup[k];
+      }
+
+      // update u_ij:
+      U_elements[j] = aij_U_trans_ptr[j] - sum;
+    }
+  }
+
+  delete[] L_backup;
+  delete[] U_backup;
+}
+
+
+template<typename NumericT>
+void ilu_form_neumann_matrix(compressed_matrix<NumericT> & R,
+                             vector<NumericT> & diag_R)
+{
+  unsigned int *R_row_buffer = detail::extract_raw_pointer<unsigned int>(R.handle1());
+  unsigned int *R_col_buffer = detail::extract_raw_pointer<unsigned int>(R.handle2());
+  NumericT     *R_elements   = detail::extract_raw_pointer<NumericT>(R.handle());
+
+  NumericT     *diag_R_ptr   = detail::extract_raw_pointer<NumericT>(diag_R.handle());
+
+#ifdef VIENNACL_WITH_OPENMP
+    #pragma omp parallel for if (R.size1() > VIENNACL_OPENMP_ILU_MIN_SIZE)
+#endif
+  for (long row = 0; row < static_cast<long>(R.size1()); ++row)
+  {
+    unsigned int col_begin = R_row_buffer[row];
+    unsigned int col_end   = R_row_buffer[row+1];
+
+    // part 1: extract diagonal entry
+    NumericT diag = 0;
+    for (unsigned int j = col_begin; j < col_end; ++j)
+    {
+      unsigned int col = R_col_buffer[j];
+      if (col == row)
+      {
+        diag = R_elements[j];
+        R_elements[j] = 0; // (I - D^{-1}R)
+        break;
+      }
+    }
+    diag_R_ptr[row] = diag;
+
+    assert((diag > 0 || diag < 0) && bool("Zero diagonal detected!"));
+
+    // part2: scale
+    for (unsigned int j = col_begin; j < col_end; ++j)
+      R_elements[j] /= -diag;
+  }
+
+  //std::cout << "diag_R: " << diag_R << std::endl;
+}
+
+} //namespace host_based
+} //namespace linalg
+} //namespace viennacl
+
+
+#endif


Mime
View raw message