mahout-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From apalu...@apache.org
Subject [35/51] [partial] mahout git commit: Revert "(nojira) add native-viennaCL module to codebase. closes apache/mahout#241"
Date Fri, 10 Jun 2016 16:52:40 GMT
http://git-wip-us.apache.org/repos/asf/mahout/blob/7ae549fa/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/matrix_operations.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/matrix_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/matrix_operations.hpp
deleted file mode 100644
index 912d24d..0000000
--- a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/matrix_operations.hpp
+++ /dev/null
@@ -1,2725 +0,0 @@
-#ifndef VIENNACL_LINALG_CUDA_MATRIX_OPERATIONS_HPP_
-#define VIENNACL_LINALG_CUDA_MATRIX_OPERATIONS_HPP_
-
-/* =========================================================================
-   Copyright (c) 2010-2016, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-   Portions of this software are copyright by UChicago Argonne, LLC.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
-
-   (A list of authors and contributors can be found in the manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= */
-
-/** @file  viennacl/linalg/cuda/matrix_operations.hpp
-    @brief Implementations of dense matrix related operations, including matrix-vector products, using CUDA.
-*/
-
-#include "viennacl/forwards.h"
-#include "viennacl/scalar.hpp"
-#include "viennacl/vector.hpp"
-#include "viennacl/vector_proxy.hpp"
-#include "viennacl/tools/tools.hpp"
-#include "viennacl/meta/enable_if.hpp"
-#include "viennacl/meta/predicate.hpp"
-#include "viennacl/meta/result_of.hpp"
-#include "viennacl/traits/size.hpp"
-#include "viennacl/traits/start.hpp"
-#include "viennacl/traits/handle.hpp"
-#include "viennacl/traits/stride.hpp"
-
-#include "viennacl/linalg/cuda/common.hpp"
-
-#include "viennacl/linalg/cuda/vector_operations.hpp"
-#include "viennacl/linalg/cuda/matrix_operations_row.hpp"
-#include "viennacl/linalg/cuda/matrix_operations_col.hpp"
-#include "viennacl/linalg/cuda/matrix_operations_prod.hpp"
-#include "viennacl/linalg/cuda/matrix_operations_prod.hpp"
-
-namespace viennacl
-{
-namespace linalg
-{
-namespace cuda
-{
-//
-// Introductory note: By convention, all dimensions are already checked in the dispatcher frontend. No need to double-check again in here!
-//
-
-template<typename DestNumericT, typename SrcNumericT>
-void convert(matrix_base<DestNumericT> & mat1, matrix_base<SrcNumericT> const & mat2)
-{
-  assert(mat1.row_major() == mat2.row_major() && bool("Addition/subtraction on mixed matrix layouts not supported yet!"));
-
-  if (mat1.row_major())
-  {
-    convert_row_kernel<<<128, 128>>>(viennacl::cuda_arg(mat1),
-                                    static_cast<unsigned int>(viennacl::traits::start1(mat1)),           static_cast<unsigned int>(viennacl::traits::start2(mat1)),
-                                    static_cast<unsigned int>(viennacl::traits::stride1(mat1)),          static_cast<unsigned int>(viennacl::traits::stride2(mat1)),
-                                    static_cast<unsigned int>(viennacl::traits::size1(mat1)),            static_cast<unsigned int>(viennacl::traits::size2(mat1)),
-                                    static_cast<unsigned int>(viennacl::traits::internal_size1(mat1)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat1)),
-
-                                    viennacl::cuda_arg(mat2),
-                                    static_cast<unsigned int>(viennacl::traits::start1(mat2)),           static_cast<unsigned int>(viennacl::traits::start2(mat2)),
-                                    static_cast<unsigned int>(viennacl::traits::stride1(mat2)),          static_cast<unsigned int>(viennacl::traits::stride2(mat2)),
-                                    static_cast<unsigned int>(viennacl::traits::internal_size1(mat2)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat2))
-                                  );
-    VIENNACL_CUDA_LAST_ERROR_CHECK("convert_row_kernel");
-  }
-  else
-  {
-    convert_col_kernel<<<128, 128>>>(viennacl::cuda_arg(mat1),
-                                    static_cast<unsigned int>(viennacl::traits::start1(mat1)),           static_cast<unsigned int>(viennacl::traits::start2(mat1)),
-                                    static_cast<unsigned int>(viennacl::traits::stride1(mat1)),          static_cast<unsigned int>(viennacl::traits::stride2(mat1)),
-                                    static_cast<unsigned int>(viennacl::traits::size1(mat1)),            static_cast<unsigned int>(viennacl::traits::size2(mat1)),
-                                    static_cast<unsigned int>(viennacl::traits::internal_size1(mat1)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat1)),
-
-                                    viennacl::cuda_arg(mat2),
-                                    static_cast<unsigned int>(viennacl::traits::start1(mat2)),           static_cast<unsigned int>(viennacl::traits::start2(mat2)),
-                                    static_cast<unsigned int>(viennacl::traits::stride1(mat2)),          static_cast<unsigned int>(viennacl::traits::stride2(mat2)),
-                                    static_cast<unsigned int>(viennacl::traits::internal_size1(mat2)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat2))
-                                  );
-    VIENNACL_CUDA_LAST_ERROR_CHECK("convert_col_kernel");
-  }
-}
-
-template<typename NumericT, typename SizeT, typename DistanceT>
-void trans(matrix_expression<const matrix_base<NumericT, SizeT, DistanceT>,const matrix_base<NumericT, SizeT, DistanceT>, op_trans> const & proxy,
-           matrix_base<NumericT> & temp_trans)
-{
-  trans_kernel<<<128,128>>>(viennacl::cuda_arg(proxy.lhs()),
-                            static_cast<unsigned int>(proxy.lhs().start1()),          static_cast<unsigned int>(proxy.lhs().start2()),
-                            static_cast<unsigned int>(proxy.lhs().internal_size1()),  static_cast<unsigned int>(proxy.lhs().internal_size2()),
-                            static_cast<unsigned int>(proxy.lhs().size1()),           static_cast<unsigned int>(proxy.lhs().size2()),
-                            static_cast<unsigned int>(proxy.lhs().stride1()),         static_cast<unsigned int>(proxy.lhs().stride2()),
-
-                            viennacl::cuda_arg(temp_trans),
-                            static_cast<unsigned int>(temp_trans.start1()),            static_cast<unsigned int>(temp_trans.start2()),
-                            static_cast<unsigned int>(temp_trans.internal_size1()),    static_cast<unsigned int>(temp_trans.internal_size2()),
-                            static_cast<unsigned int>(temp_trans.stride1()),           static_cast<unsigned int>(temp_trans.stride2()),
-                            static_cast<bool>(proxy.lhs().row_major()));
-  VIENNACL_CUDA_LAST_ERROR_CHECK("trans_kernel");
-}
-
-
-template<typename NumericT, typename ScalarT>
-void am(matrix_base<NumericT> & mat1,
-        matrix_base<NumericT> const & mat2, ScalarT const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha)
-{
-  assert(mat1.row_major() == mat2.row_major() && bool("Addition/subtraction on mixed matrix layouts not supported yet!"));
-
-  typedef NumericT        value_type;
-
-  unsigned int options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);
-
-  value_type temporary_alpha = 0;
-  if (viennacl::is_cpu_scalar<ScalarT>::value)
-    temporary_alpha = alpha;
-
-  if (mat1.row_major())
-  {
-    am_row_kernel<<<128, 128>>>(viennacl::cuda_arg(mat1),
-                                static_cast<unsigned int>(viennacl::traits::start1(mat1)),           static_cast<unsigned int>(viennacl::traits::start2(mat1)),
-                                static_cast<unsigned int>(viennacl::traits::stride1(mat1)),          static_cast<unsigned int>(viennacl::traits::stride2(mat1)),
-                                static_cast<unsigned int>(viennacl::traits::size1(mat1)),            static_cast<unsigned int>(viennacl::traits::size2(mat1)),
-                                static_cast<unsigned int>(viennacl::traits::internal_size1(mat1)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat1)),
-
-                                viennacl::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)),
-                                options_alpha,
-                                viennacl::cuda_arg(mat2),
-                                static_cast<unsigned int>(viennacl::traits::start1(mat2)),           static_cast<unsigned int>(viennacl::traits::start2(mat2)),
-                                static_cast<unsigned int>(viennacl::traits::stride1(mat2)),          static_cast<unsigned int>(viennacl::traits::stride2(mat2)),
-                                static_cast<unsigned int>(viennacl::traits::internal_size1(mat2)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat2))
-                              );
-    VIENNACL_CUDA_LAST_ERROR_CHECK("am_row_kernel");
-  }
-  else
-  {
-    am_col_kernel<<<128, 128>>>(viennacl::cuda_arg(mat1),
-                                static_cast<unsigned int>(viennacl::traits::start1(mat1)),           static_cast<unsigned int>(viennacl::traits::start2(mat1)),
-                                static_cast<unsigned int>(viennacl::traits::stride1(mat1)),          static_cast<unsigned int>(viennacl::traits::stride2(mat1)),
-                                static_cast<unsigned int>(viennacl::traits::size1(mat1)),            static_cast<unsigned int>(viennacl::traits::size2(mat1)),
-                                static_cast<unsigned int>(viennacl::traits::internal_size1(mat1)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat1)),
-
-                                viennacl::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)),
-                                options_alpha,
-                                viennacl::cuda_arg(mat2),
-                                static_cast<unsigned int>(viennacl::traits::start1(mat2)),           static_cast<unsigned int>(viennacl::traits::start2(mat2)),
-                                static_cast<unsigned int>(viennacl::traits::stride1(mat2)),          static_cast<unsigned int>(viennacl::traits::stride2(mat2)),
-                                static_cast<unsigned int>(viennacl::traits::internal_size1(mat2)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat2))
-                              );
-    VIENNACL_CUDA_LAST_ERROR_CHECK("am_col_kernel");
-  }
-}
-
-
-template<typename NumericT, typename ScalarT1, typename ScalarT2>
-void ambm(matrix_base<NumericT> & mat1,
-          matrix_base<NumericT> const & mat2, ScalarT1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha,
-          matrix_base<NumericT> const & mat3, ScalarT2 const & beta,  vcl_size_t len_beta,  bool reciprocal_beta,  bool flip_sign_beta)
-{
-  assert(mat1.row_major() == mat2.row_major() && mat1.row_major() == mat3.row_major() && bool("Addition/subtraction on mixed matrix layouts not supported yet!"));
-
-  typedef NumericT        value_type;
-
-  unsigned int options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);
-
-  value_type temporary_alpha = 0;
-  if (viennacl::is_cpu_scalar<ScalarT1>::value)
-    temporary_alpha = alpha;
-
-
-  unsigned int options_beta  = detail::make_options(len_beta,  reciprocal_beta,  flip_sign_beta);
-
-  value_type temporary_beta = 0;
-  if (viennacl::is_cpu_scalar<ScalarT2>::value)
-    temporary_beta = beta;
-
-
-  if (mat1.row_major())
-  {
-    ambm_row_kernel<<<128, 128>>>(viennacl::cuda_arg(mat1),
-                                  static_cast<unsigned int>(viennacl::traits::start1(mat1)),           static_cast<unsigned int>(viennacl::traits::start2(mat1)),
-                                  static_cast<unsigned int>(viennacl::traits::stride1(mat1)),          static_cast<unsigned int>(viennacl::traits::stride2(mat1)),
-                                  static_cast<unsigned int>(viennacl::traits::size1(mat1)),            static_cast<unsigned int>(viennacl::traits::size2(mat1)),
-                                  static_cast<unsigned int>(viennacl::traits::internal_size1(mat1)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat1)),
-
-                                  viennacl::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)),
-                                  options_alpha,
-                                  viennacl::cuda_arg(mat2),
-                                  static_cast<unsigned int>(viennacl::traits::start1(mat2)),           static_cast<unsigned int>(viennacl::traits::start2(mat2)),
-                                  static_cast<unsigned int>(viennacl::traits::stride1(mat2)),          static_cast<unsigned int>(viennacl::traits::stride2(mat2)),
-                                  static_cast<unsigned int>(viennacl::traits::internal_size1(mat2)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat2)),
-
-                                  viennacl::cuda_arg<value_type>(detail::arg_reference(beta, temporary_beta)),
-                                  options_beta,
-                                  viennacl::cuda_arg(mat3),
-                                  static_cast<unsigned int>(viennacl::traits::start1(mat3)),           static_cast<unsigned int>(viennacl::traits::start2(mat3)),
-                                  static_cast<unsigned int>(viennacl::traits::stride1(mat3)),          static_cast<unsigned int>(viennacl::traits::stride2(mat3)),
-                                  static_cast<unsigned int>(viennacl::traits::internal_size1(mat3)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat3))
-                                );
-    VIENNACL_CUDA_LAST_ERROR_CHECK("ambm_row_kernel");
-  }
-  else
-  {
-    ambm_col_kernel<<<128, 128>>>(viennacl::cuda_arg(mat1),
-                                  static_cast<unsigned int>(viennacl::traits::start1(mat1)),           static_cast<unsigned int>(viennacl::traits::start2(mat1)),
-                                  static_cast<unsigned int>(viennacl::traits::stride1(mat1)),          static_cast<unsigned int>(viennacl::traits::stride2(mat1)),
-                                  static_cast<unsigned int>(viennacl::traits::size1(mat1)),            static_cast<unsigned int>(viennacl::traits::size2(mat1)),
-                                  static_cast<unsigned int>(viennacl::traits::internal_size1(mat1)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat1)),
-
-                                  viennacl::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)),
-                                  options_alpha,
-                                  viennacl::cuda_arg(mat2),
-                                  static_cast<unsigned int>(viennacl::traits::start1(mat2)),           static_cast<unsigned int>(viennacl::traits::start2(mat2)),
-                                  static_cast<unsigned int>(viennacl::traits::stride1(mat2)),          static_cast<unsigned int>(viennacl::traits::stride2(mat2)),
-                                  static_cast<unsigned int>(viennacl::traits::internal_size1(mat2)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat2)),
-
-                                  viennacl::cuda_arg<value_type>(detail::arg_reference(beta, temporary_beta)),
-                                  options_beta,
-                                  viennacl::cuda_arg(mat3),
-                                  static_cast<unsigned int>(viennacl::traits::start1(mat3)),           static_cast<unsigned int>(viennacl::traits::start2(mat3)),
-                                  static_cast<unsigned int>(viennacl::traits::stride1(mat3)),          static_cast<unsigned int>(viennacl::traits::stride2(mat3)),
-                                  static_cast<unsigned int>(viennacl::traits::internal_size1(mat3)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat3))
-                                );
-    VIENNACL_CUDA_LAST_ERROR_CHECK("ambm_col_kernel");
-  }
-
-}
-
-
-template<typename NumericT, typename ScalarT1, typename ScalarT2>
-void ambm_m(matrix_base<NumericT> & mat1,
-            matrix_base<NumericT> const & mat2, ScalarT1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha,
-            matrix_base<NumericT> const & mat3, ScalarT2 const & beta,  vcl_size_t len_beta,  bool reciprocal_beta,  bool flip_sign_beta)
-{
-  assert(mat1.row_major() == mat2.row_major() && mat1.row_major() == mat3.row_major() && bool("Addition/subtraction on mixed matrix layouts not supported yet!"));
-
-  typedef NumericT        value_type;
-
-  unsigned int options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);
-
-  value_type temporary_alpha = 0;
-  if (viennacl::is_cpu_scalar<ScalarT1>::value)
-    temporary_alpha = alpha;
-
-
-  unsigned int options_beta  = detail::make_options(len_beta,  reciprocal_beta,  flip_sign_beta);
-
-  value_type temporary_beta = 0;
-  if (viennacl::is_cpu_scalar<ScalarT2>::value)
-    temporary_beta = beta;
-
-
-  if (mat1.row_major())
-  {
-    ambm_m_row_kernel<<<128, 128>>>(viennacl::cuda_arg(mat1),
-                                    static_cast<unsigned int>(viennacl::traits::start1(mat1)),           static_cast<unsigned int>(viennacl::traits::start2(mat1)),
-                                    static_cast<unsigned int>(viennacl::traits::stride1(mat1)),          static_cast<unsigned int>(viennacl::traits::stride2(mat1)),
-                                    static_cast<unsigned int>(viennacl::traits::size1(mat1)),            static_cast<unsigned int>(viennacl::traits::size2(mat1)),
-                                    static_cast<unsigned int>(viennacl::traits::internal_size1(mat1)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat1)),
-
-                                    viennacl::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)),
-                                    options_alpha,
-                                    viennacl::cuda_arg(mat2),
-                                    static_cast<unsigned int>(viennacl::traits::start1(mat2)),           static_cast<unsigned int>(viennacl::traits::start2(mat2)),
-                                    static_cast<unsigned int>(viennacl::traits::stride1(mat2)),          static_cast<unsigned int>(viennacl::traits::stride2(mat2)),
-                                    static_cast<unsigned int>(viennacl::traits::internal_size1(mat2)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat2)),
-
-                                    viennacl::cuda_arg<value_type>(detail::arg_reference(beta, temporary_beta)),
-                                    options_beta,
-                                    viennacl::cuda_arg(mat3),
-                                    static_cast<unsigned int>(viennacl::traits::start1(mat3)),           static_cast<unsigned int>(viennacl::traits::start2(mat3)),
-                                    static_cast<unsigned int>(viennacl::traits::stride1(mat3)),          static_cast<unsigned int>(viennacl::traits::stride2(mat3)),
-                                    static_cast<unsigned int>(viennacl::traits::internal_size1(mat3)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat3))
-                                  );
-    VIENNACL_CUDA_LAST_ERROR_CHECK("ambm_m_row_kernel");
-  }
-  else
-  {
-    ambm_m_col_kernel<<<128, 128>>>(viennacl::cuda_arg(mat1),
-                                    static_cast<unsigned int>(viennacl::traits::start1(mat1)),           static_cast<unsigned int>(viennacl::traits::start2(mat1)),
-                                    static_cast<unsigned int>(viennacl::traits::stride1(mat1)),          static_cast<unsigned int>(viennacl::traits::stride2(mat1)),
-                                    static_cast<unsigned int>(viennacl::traits::size1(mat1)),            static_cast<unsigned int>(viennacl::traits::size2(mat1)),
-                                    static_cast<unsigned int>(viennacl::traits::internal_size1(mat1)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat1)),
-
-                                    viennacl::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)),
-                                    options_alpha,
-                                    viennacl::cuda_arg(mat2),
-                                    static_cast<unsigned int>(viennacl::traits::start1(mat2)),           static_cast<unsigned int>(viennacl::traits::start2(mat2)),
-                                    static_cast<unsigned int>(viennacl::traits::stride1(mat2)),          static_cast<unsigned int>(viennacl::traits::stride2(mat2)),
-                                    static_cast<unsigned int>(viennacl::traits::internal_size1(mat2)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat2)),
-
-                                    viennacl::cuda_arg<value_type>(detail::arg_reference(beta, temporary_beta)),
-                                    options_beta,
-                                    viennacl::cuda_arg(mat3),
-                                    static_cast<unsigned int>(viennacl::traits::start1(mat3)),           static_cast<unsigned int>(viennacl::traits::start2(mat3)),
-                                    static_cast<unsigned int>(viennacl::traits::stride1(mat3)),          static_cast<unsigned int>(viennacl::traits::stride2(mat3)),
-                                    static_cast<unsigned int>(viennacl::traits::internal_size1(mat3)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat3))
-                                  );
-    VIENNACL_CUDA_LAST_ERROR_CHECK("ambm_m_col_kernel");
-  }
-
-}
-
-
-
-
-template<typename NumericT>
-void matrix_assign(matrix_base<NumericT> & mat, NumericT s, bool clear = false)
-{
-  typedef NumericT        value_type;
-  value_type alpha = s;
-
-  unsigned int s1  = clear ? viennacl::traits::internal_size1(mat) : viennacl::traits::size1(mat);
-  unsigned int s2  = clear ? viennacl::traits::internal_size2(mat) : viennacl::traits::size2(mat);
-
-  if (mat.row_major())
-  {
-
-    matrix_row_assign_kernel<<<128, 128>>>(viennacl::cuda_arg(mat),
-                                           static_cast<unsigned int>(viennacl::traits::start1(mat)),           static_cast<unsigned int>(viennacl::traits::start2(mat)),
-                                           static_cast<unsigned int>(viennacl::traits::stride1(mat)),          static_cast<unsigned int>(viennacl::traits::stride2(mat)),
-                                           s1,                                                                 s2,
-                                           static_cast<unsigned int>(viennacl::traits::internal_size1(mat)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat)),
-                                           alpha);
-    VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_assign_kernel");
-  }
-  else
-  {
-    matrix_col_assign_kernel<<<128, 128>>>(viennacl::cuda_arg(mat),
-                                            static_cast<unsigned int>(viennacl::traits::start1(mat)),           static_cast<unsigned int>(viennacl::traits::start2(mat)),
-                                            static_cast<unsigned int>(viennacl::traits::stride1(mat)),          static_cast<unsigned int>(viennacl::traits::stride2(mat)),
-                                            s1,                                                                 s2,
-                                            static_cast<unsigned int>(viennacl::traits::internal_size1(mat)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat)),
-                                            alpha);
-    VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_assign_kernel");
-  }
-}
-
-template<typename NumericT>
-void matrix_diagonal_assign(matrix_base<NumericT> & mat, NumericT s)
-{
-  typedef NumericT        value_type;
-  value_type alpha = s;
-
-  if (mat.row_major())
-  {
-    matrix_row_diagonal_assign_kernel<<<128, 128>>>(viennacl::cuda_arg(mat),
-                                                    static_cast<unsigned int>(viennacl::traits::start1(mat)),           static_cast<unsigned int>(viennacl::traits::start2(mat)),
-                                                    static_cast<unsigned int>(viennacl::traits::stride1(mat)),          static_cast<unsigned int>(viennacl::traits::stride2(mat)),
-                                                    static_cast<unsigned int>(viennacl::traits::size1(mat)),            static_cast<unsigned int>(viennacl::traits::size2(mat)),
-                                                    static_cast<unsigned int>(viennacl::traits::internal_size1(mat)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat)),
-                                                    alpha);
-    VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_diagonal_assign_kernel");
-  }
-  else
-  {
-    matrix_col_diagonal_assign_kernel<<<128, 128>>>(viennacl::cuda_arg(mat),
-                                                    static_cast<unsigned int>(viennacl::traits::start1(mat)),           static_cast<unsigned int>(viennacl::traits::start2(mat)),
-                                                    static_cast<unsigned int>(viennacl::traits::stride1(mat)),          static_cast<unsigned int>(viennacl::traits::stride2(mat)),
-                                                    static_cast<unsigned int>(viennacl::traits::size1(mat)),            static_cast<unsigned int>(viennacl::traits::size2(mat)),
-                                                    static_cast<unsigned int>(viennacl::traits::internal_size1(mat)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat)),
-                                                    alpha);
-    VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_diagonal_assign_kernel");
-  }
-}
-
-
-template<typename NumericT>
-void matrix_diag_from_vector(const vector_base<NumericT> & vec, int k, matrix_base<NumericT> & mat)
-{
-  typedef NumericT        value_type;
-
-  // Step 1: assign zero matrix:
-  matrix_assign(mat, NumericT(0));
-
-  // Step 2: Assign diagonal:
-  unsigned int options_alpha = 0;
-
-  vcl_size_t mat_start = 0;
-  vcl_size_t mat_stride = 0;
-  vcl_size_t mat_size = viennacl::traits::size(vec);
-  if (mat.row_major())
-  {
-    vcl_size_t first_row_index = 0;
-    vcl_size_t first_col_index = 0;
-    if (k < 0)
-      first_row_index = vcl_size_t(-k);
-    else
-      first_col_index = vcl_size_t(k);
-    mat_start  =  (viennacl::traits::start1(mat) + first_row_index * viennacl::traits::stride1(mat)) * viennacl::traits::internal_size2(mat)
-                 + viennacl::traits::start2(mat) + first_col_index * viennacl::traits::stride2(mat);
-    mat_stride = viennacl::traits::stride1(mat) * viennacl::traits::internal_size2(mat) + viennacl::traits::stride2(mat);
-  }
-  else
-  {
-    vcl_size_t first_row_index = 0;
-    vcl_size_t first_col_index = 0;
-    if (k < 0)
-      first_row_index = vcl_size_t(-k);
-    else
-      first_col_index = vcl_size_t(k);
-    mat_start  =    viennacl::traits::start1(mat) + first_row_index * viennacl::traits::stride1(mat)
-                 + (viennacl::traits::start2(mat) + first_col_index * viennacl::traits::stride2(mat)) * viennacl::traits::internal_size1(mat);
-    mat_stride = viennacl::traits::stride2(mat) * viennacl::traits::internal_size1(mat) + viennacl::traits::stride1(mat);
-  }
-
-  av_kernel<<<128, 128>>>(viennacl::cuda_arg(mat),
-                          static_cast<unsigned int>(mat_start),
-                          static_cast<unsigned int>(mat_stride),
-                          static_cast<unsigned int>(mat_size),
-
-                          viennacl::cuda_arg<value_type>(NumericT(1)),
-                          options_alpha,
-                          viennacl::cuda_arg(vec),
-                          static_cast<unsigned int>(viennacl::traits::start(vec)),
-                          static_cast<unsigned int>(viennacl::traits::stride(vec)) );
-  VIENNACL_CUDA_LAST_ERROR_CHECK("av_kernel");
-}
-
-template<typename NumericT>
-void matrix_diag_to_vector(matrix_base<NumericT> const & mat, int k, vector_base<NumericT> & vec)
-{
-  typedef NumericT        value_type;
-
-  unsigned int options_alpha = 0;
-
-  vcl_size_t mat_start = 0;
-  vcl_size_t mat_stride = 0;
-  if (mat.row_major())
-  {
-    vcl_size_t first_row_index = 0;
-    vcl_size_t first_col_index = 0;
-    if (k < 0)
-      first_row_index = vcl_size_t(-k);
-    else
-      first_col_index = vcl_size_t(k);
-    mat_start  =  (viennacl::traits::start1(mat) + first_row_index * viennacl::traits::stride1(mat)) * viennacl::traits::internal_size2(mat)
-                 + viennacl::traits::start2(mat) + first_col_index * viennacl::traits::stride2(mat);
-    mat_stride = viennacl::traits::stride1(mat) * viennacl::traits::internal_size2(mat) + viennacl::traits::stride2(mat);
-  }
-  else
-  {
-    vcl_size_t first_row_index = 0;
-    vcl_size_t first_col_index = 0;
-    if (k < 0)
-      first_row_index = vcl_size_t(-k);
-    else
-      first_col_index = vcl_size_t(k);
-    mat_start  =    viennacl::traits::start1(mat) + first_row_index * viennacl::traits::stride1(mat)
-                 + (viennacl::traits::start2(mat) + first_col_index * viennacl::traits::stride2(mat)) * viennacl::traits::internal_size1(mat);
-    mat_stride = viennacl::traits::stride2(mat) * viennacl::traits::internal_size1(mat) + viennacl::traits::stride1(mat);
-  }
-
-  av_kernel<<<128, 128>>>(viennacl::cuda_arg(vec),
-                          static_cast<unsigned int>(viennacl::traits::start(vec)),
-                          static_cast<unsigned int>(viennacl::traits::stride(vec)),
-                          static_cast<unsigned int>(viennacl::traits::size(vec)),
-
-                          viennacl::cuda_arg<value_type>(NumericT(1)),
-                          options_alpha,
-                          viennacl::cuda_arg(mat),
-                          static_cast<unsigned int>(mat_start),
-                          static_cast<unsigned int>(mat_stride));
-  VIENNACL_CUDA_LAST_ERROR_CHECK("av_kernel");
-}
-
-template<typename NumericT>
-void matrix_row(matrix_base<NumericT> const & mat, unsigned int i, vector_base<NumericT> & vec)
-{
-  typedef NumericT        value_type;
-
-  unsigned int options_alpha = 0;
-
-  vcl_size_t mat_start = 0;
-  vcl_size_t mat_stride = 0;
-  if (mat.row_major())
-  {
-    mat_start  = (viennacl::traits::start1(mat) + i * viennacl::traits::stride1(mat)) * viennacl::traits::internal_size2(mat) + viennacl::traits::start2(mat);
-    mat_stride = viennacl::traits::stride2(mat);
-  }
-  else
-  {
-    mat_start  = viennacl::traits::start1(mat) + i * viennacl::traits::stride1(mat) + viennacl::traits::start2(mat) * viennacl::traits::internal_size1(mat);
-    mat_stride = viennacl::traits::stride2(mat) * viennacl::traits::internal_size1(mat);
-  }
-
-  av_kernel<<<128, 128>>>(viennacl::cuda_arg(vec),
-                          static_cast<unsigned int>(viennacl::traits::start(vec)),
-                          static_cast<unsigned int>(viennacl::traits::stride(vec)),
-                          static_cast<unsigned int>(viennacl::traits::size(vec)),
-
-                          viennacl::cuda_arg<value_type>(NumericT(1)),
-                          options_alpha,
-                          viennacl::cuda_arg(mat),
-                          static_cast<unsigned int>(mat_start),
-                          static_cast<unsigned int>(mat_stride));
-  VIENNACL_CUDA_LAST_ERROR_CHECK("av_kernel");
-}
-
-template<typename NumericT>
-void matrix_column(const matrix_base<NumericT> & mat, unsigned int j, vector_base<NumericT> & vec)
-{
-  typedef NumericT        value_type;
-
-  unsigned int options_alpha = 0;
-
-  vcl_size_t mat_start = 0;
-  vcl_size_t mat_stride = 0;
-  if (mat.row_major())
-  {
-    mat_start  = viennacl::traits::start1(mat) * viennacl::traits::internal_size2(mat) + viennacl::traits::start2(mat) + j * viennacl::traits::stride2(mat);
-    mat_stride = viennacl::traits::stride2(mat) * viennacl::traits::internal_size2(mat);
-  }
-  else
-  {
-    mat_start  = viennacl::traits::start1(mat) + (viennacl::traits::start2(mat) + j * viennacl::traits::stride2(mat)) * viennacl::traits::internal_size1(mat);
-    mat_stride = viennacl::traits::stride2(mat);
-  }
-
-  av_kernel<<<128, 128>>>(viennacl::cuda_arg(vec),
-                          static_cast<unsigned int>(viennacl::traits::start(vec)),
-                          static_cast<unsigned int>(viennacl::traits::stride(vec)),
-                          static_cast<unsigned int>(viennacl::traits::size(vec)),
-
-                          viennacl::cuda_arg<value_type>(NumericT(1)),
-                          options_alpha,
-                          viennacl::cuda_arg(mat),
-                          static_cast<unsigned int>(mat_start),
-                          static_cast<unsigned int>(mat_stride));
-  VIENNACL_CUDA_LAST_ERROR_CHECK("av_kernel");
-}
-
-
-//
-/////////////////////////   binary element-wise operations    /////////////////////////////////
-//
-
-
-template<typename NumericT, typename SizeT, typename OpT>
-void element_op(matrix_base<NumericT, SizeT> & A,
-                matrix_expression<const matrix_base<NumericT, SizeT>, const matrix_base<NumericT, SizeT>, op_element_binary<OpT> > const & proxy)
-{
-  assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
-
-  typedef NumericT      value_type;
-
-  unsigned int op_type = 2; //0: product, 1: division, 2: power
-  if (viennacl::is_division<OpT>::value)
-    op_type = 1;
-  else if (viennacl::is_product<OpT>::value)
-    op_type = 0;
-
-  if (A.row_major())
-  {
-    element_op_int_row_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
-                                        static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
-                                        static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
-                                        static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
-                                        static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
-
-                                        viennacl::cuda_arg(proxy.lhs()),
-                                        static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
-                                        static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
-                                        static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs())),
-
-                                        viennacl::cuda_arg(proxy.rhs()),
-                                        static_cast<unsigned int>(viennacl::traits::start1(proxy.rhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.rhs())),
-                                        static_cast<unsigned int>(viennacl::traits::stride1(proxy.rhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.rhs())),
-                                        static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.rhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.rhs())),
-
-                                        op_type
-                                      );
-    VIENNACL_CUDA_LAST_ERROR_CHECK("element_op_row_kernel");
-  }
-  else
-  {
-    element_op_int_col_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
-                                        static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
-                                        static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
-                                        static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
-                                        static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
-
-                                        viennacl::cuda_arg(proxy.lhs()),
-                                        static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
-                                        static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
-                                        static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs())),
-
-                                        viennacl::cuda_arg(proxy.rhs()),
-                                        static_cast<unsigned int>(viennacl::traits::start1(proxy.rhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.rhs())),
-                                        static_cast<unsigned int>(viennacl::traits::stride1(proxy.rhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.rhs())),
-                                        static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.rhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.rhs())),
-
-                                        op_type
-                                      );
-    VIENNACL_CUDA_LAST_ERROR_CHECK("element_op_col_kernel");
-  }
-}
-
-template<typename SizeT, typename OpT>
-void element_op(matrix_base<float, SizeT> & A,
-                matrix_expression<const matrix_base<float, SizeT>, const matrix_base<float, SizeT>, op_element_binary<OpT> > const & proxy)
-{
-  assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
-
-  typedef float        value_type;
-
-  unsigned int op_type = 2; //0: product, 1: division, 2: power
-  if (viennacl::is_division<OpT>::value)
-    op_type = 1;
-  else if (viennacl::is_product<OpT>::value)
-    op_type = 0;
-
-  if (A.row_major())
-  {
-    element_op_row_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
-                                        static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
-                                        static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
-                                        static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
-                                        static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
-
-                                        viennacl::cuda_arg(proxy.lhs()),
-                                        static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
-                                        static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
-                                        static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs())),
-
-                                        viennacl::cuda_arg(proxy.rhs()),
-                                        static_cast<unsigned int>(viennacl::traits::start1(proxy.rhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.rhs())),
-                                        static_cast<unsigned int>(viennacl::traits::stride1(proxy.rhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.rhs())),
-                                        static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.rhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.rhs())),
-
-                                        op_type
-                                      );
-    VIENNACL_CUDA_LAST_ERROR_CHECK("element_op_row_kernel");
-  }
-  else
-  {
-    element_op_col_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
-                                        static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
-                                        static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
-                                        static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
-                                        static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
-
-                                        viennacl::cuda_arg(proxy.lhs()),
-                                        static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
-                                        static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
-                                        static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs())),
-
-                                        viennacl::cuda_arg(proxy.rhs()),
-                                        static_cast<unsigned int>(viennacl::traits::start1(proxy.rhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.rhs())),
-                                        static_cast<unsigned int>(viennacl::traits::stride1(proxy.rhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.rhs())),
-                                        static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.rhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.rhs())),
-
-                                        op_type
-                                      );
-    VIENNACL_CUDA_LAST_ERROR_CHECK("element_op_col_kernel");
-  }
-}
-
-template<typename SizeT, typename OpT>
-void element_op(matrix_base<double, SizeT> & A,
-                matrix_expression<const matrix_base<double, SizeT>, const matrix_base<double, SizeT>, op_element_binary<OpT> > const & proxy)
-{
-  assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
-
-  typedef double        value_type;
-
-  unsigned int op_type = 2; //0: product, 1: division, 2: power
-  if (viennacl::is_division<OpT>::value)
-    op_type = 1;
-  else if (viennacl::is_product<OpT>::value)
-    op_type = 0;
-
-  if (A.row_major())
-  {
-    element_op_row_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
-                                        static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
-                                        static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
-                                        static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
-                                        static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
-
-                                        viennacl::cuda_arg(proxy.lhs()),
-                                        static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
-                                        static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
-                                        static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs())),
-
-                                        viennacl::cuda_arg(proxy.rhs()),
-                                        static_cast<unsigned int>(viennacl::traits::start1(proxy.rhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.rhs())),
-                                        static_cast<unsigned int>(viennacl::traits::stride1(proxy.rhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.rhs())),
-                                        static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.rhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.rhs())),
-
-                                        op_type
-                                      );
-    VIENNACL_CUDA_LAST_ERROR_CHECK("element_op_row_kernel");
-  }
-  else
-  {
-    element_op_col_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
-                                        static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
-                                        static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
-                                        static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
-                                        static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
-
-                                        viennacl::cuda_arg(proxy.lhs()),
-                                        static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
-                                        static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
-                                        static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs())),
-
-                                        viennacl::cuda_arg(proxy.rhs()),
-                                        static_cast<unsigned int>(viennacl::traits::start1(proxy.rhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.rhs())),
-                                        static_cast<unsigned int>(viennacl::traits::stride1(proxy.rhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.rhs())),
-                                        static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.rhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.rhs())),
-
-                                        op_type
-                                      );
-    VIENNACL_CUDA_LAST_ERROR_CHECK("element_op_col_kernel");
-  }
-}
-
-//
-/////////////////////////   unary element-wise operations    /////////////////////////////////
-//
-
-// Note: Due to CUDA vs C-proprocessor interference (concatenation seems to be broken in at least CUDA 4.2),
-//       we could not find a more 'automatic' way of generating the overloads below...
-
-// abs
-template<typename NumericT>
-void element_op(matrix_base<NumericT> & A,
-                matrix_expression<const matrix_base<NumericT>, const matrix_base<NumericT>, op_element_unary<op_abs> > const & proxy)
-{
-  assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
-
-  typedef NumericT value_type;
-
-  if (A.row_major())
-  {
-    matrix_row_element_abs_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
-      static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
-      static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
-      static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
-      static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
-
-      viennacl::cuda_arg(proxy.lhs()),
-      static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
-      static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
-      static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
-    );
-    VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_abs_kernel");
-  }
-  else
-  {
-    matrix_col_element_abs_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
-      static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
-      static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
-      static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
-      static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
-
-      viennacl::cuda_arg(proxy.lhs()),
-      static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
-      static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
-      static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
-    );
-    VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_abs_kernel");
-  }
-}
-
-
-// acos
-template<typename NumericT>
-void element_op(matrix_base<NumericT> & A,
-                matrix_expression<const matrix_base<NumericT>, const matrix_base<NumericT>, op_element_unary<op_acos> > const & proxy)
-{
-  assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
-
-  typedef NumericT    value_type;
-
-  if (A.row_major())
-  {
-    matrix_row_element_acos_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
-     static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
-     static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
-     static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
-     static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
-
-     viennacl::cuda_arg(proxy.lhs()),
-     static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
-     static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
-     static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
-    );
-    VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_acos_kernel");
-  }
-  else
-  {
-    matrix_col_element_acos_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
-     static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
-     static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
-     static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
-     static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
-
-     viennacl::cuda_arg(proxy.lhs()),
-     static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
-     static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
-     static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
-    );
-    VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_acos_kernel");
-  }
-}
-
-
-// asin
-template<typename NumericT>
-void element_op(matrix_base<NumericT> & A,
-                matrix_expression<const matrix_base<NumericT>, const matrix_base<NumericT>, op_element_unary<op_asin> > const & proxy)
-{
-  assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
-
-  typedef NumericT    value_type;
-
-  if (A.row_major())
-  {
-    matrix_row_element_asin_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
-     static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
-     static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
-     static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
-     static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
-
-     viennacl::cuda_arg(proxy.lhs()),
-     static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
-     static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
-     static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
-    );
-    VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_asin_kernel");
-  }
-  else
-  {
-    matrix_col_element_asin_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
-     static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
-     static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
-     static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
-     static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
-
-     viennacl::cuda_arg(proxy.lhs()),
-     static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
-     static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
-     static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
-    );
-    VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_sin_kernel");
-  }
-}
-
-
-// atan
-template<typename NumericT>
-void element_op(matrix_base<NumericT> & A,
-                matrix_expression<const matrix_base<NumericT>, const matrix_base<NumericT>, op_element_unary<op_atan> > const & proxy)
-{
-  assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
-
-  typedef NumericT   value_type;
-
-  if (A.row_major())
-  {
-    matrix_row_element_atan_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
-     static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
-     static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
-     static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
-     static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
-
-     viennacl::cuda_arg(proxy.lhs()),
-     static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
-     static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
-     static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
-    );
-    VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_atan_kernel");
-  }
-  else
-  {
-    matrix_col_element_atan_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
-     static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
-     static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
-     static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
-     static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
-
-     viennacl::cuda_arg(proxy.lhs()),
-     static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
-     static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
-     static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
-    );
-    VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_atan_kernel");
-  }
-}
-
-
-// ceil
-template<typename NumericT>
-void element_op(matrix_base<NumericT> & A,
-                matrix_expression<const matrix_base<NumericT>, const matrix_base<NumericT>, op_element_unary<op_ceil> > const & proxy)
-{
-  assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
-
-  typedef NumericT   value_type;
-
-  if (A.row_major())
-  {
-    matrix_row_element_ceil_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
-     static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
-     static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
-     static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
-     static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
-
-     viennacl::cuda_arg(proxy.lhs()),
-     static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
-     static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
-     static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
-    );
-    VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_ceil_kernel");
-  }
-  else
-  {
-    matrix_col_element_ceil_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
-     static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
-     static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
-     static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
-     static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
-
-     viennacl::cuda_arg(proxy.lhs()),
-     static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
-     static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
-     static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
-    );
-    VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_ceil_kernel");
-  }
-}
-
-
-// cos
-template<typename NumericT>
-void element_op(matrix_base<NumericT> & A,
-                matrix_expression<const matrix_base<NumericT>, const matrix_base<NumericT>, op_element_unary<op_cos> > const & proxy)
-{
-  assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
-
-  typedef NumericT   value_type;
-
-  if (A.row_major())
-  {
-    matrix_row_element_cos_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
-      static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
-      static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
-      static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
-      static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
-
-      viennacl::cuda_arg(proxy.lhs()),
-      static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
-      static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
-      static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
-    );
-    VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_cos_kernel");
-  }
-  else
-  {
-    matrix_col_element_cos_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
-      static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
-      static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
-      static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
-      static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
-
-      viennacl::cuda_arg(proxy.lhs()),
-      static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
-      static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
-      static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
-    );
-    VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_cos_kernel");
-  }
-}
-
-
-// cosh
-template<typename NumericT>
-void element_op(matrix_base<NumericT> & A,
-                matrix_expression<const matrix_base<NumericT>, const matrix_base<NumericT>, op_element_unary<op_cosh> > const & proxy)
-{
-  assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
-
-  typedef NumericT  value_type;
-
-  if (A.row_major())
-  {
-    matrix_row_element_cosh_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
-     static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
-     static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
-     static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
-     static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
-
-     viennacl::cuda_arg(proxy.lhs()),
-     static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
-     static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
-     static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
-    );
-    VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_cosh_kernel");
-  }
-  else
-  {
-    matrix_col_element_cosh_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
-     static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
-     static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
-     static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
-     static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
-
-     viennacl::cuda_arg(proxy.lhs()),
-     static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
-     static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
-     static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
-    );
-    VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_cosh_kernel");
-  }
-}
-
-
-// exp
-template<typename NumericT>
-void element_op(matrix_base<NumericT> & A,
-                matrix_expression<const matrix_base<NumericT>, const matrix_base<NumericT>, op_element_unary<op_exp> > const & proxy)
-{
-  assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
-
-  typedef NumericT  value_type;
-
-  if (A.row_major())
-  {
-    matrix_row_element_exp_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
-      static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
-      static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
-      static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
-      static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
-
-      viennacl::cuda_arg(proxy.lhs()),
-      static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
-      static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
-      static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
-    );
-    VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_exp_kernel");
-  }
-  else
-  {
-    matrix_col_element_exp_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
-      static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
-      static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
-      static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
-      static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
-
-      viennacl::cuda_arg(proxy.lhs()),
-      static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
-      static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
-      static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
-    );
-    VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_exp_kernel");
-  }
-}
-
-
-// fabs
-template<typename NumericT>
-void element_op(matrix_base<NumericT> & A,
-                matrix_expression<const matrix_base<NumericT>, const matrix_base<NumericT>, op_element_unary<op_fabs> > const & proxy)
-{
-  assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
-
-  typedef NumericT   value_type;
-
-  if (A.row_major())
-  {
-    matrix_row_element_fabs_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
-     static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
-     static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
-     static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
-     static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
-
-     viennacl::cuda_arg(proxy.lhs()),
-     static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
-     static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
-     static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
-    );
-    VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_fabs_kernel");
-  }
-  else
-  {
-    matrix_col_element_fabs_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
-     static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
-     static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
-     static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
-     static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
-
-     viennacl::cuda_arg(proxy.lhs()),
-     static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
-     static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
-     static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
-    );
-    VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_fabs_kernel");
-  }
-}
-
-
-// floor
-template<typename NumericT>
-void element_op(matrix_base<NumericT> & A,
-                matrix_expression<const matrix_base<NumericT>, const matrix_base<NumericT>, op_element_unary<op_floor> > const & proxy)
-{
-  assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
-
-  typedef NumericT    value_type;
-
-  if (A.row_major())
-  {
-    matrix_row_element_floor_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
-      static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
-      static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
-      static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
-      static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
-
-      viennacl::cuda_arg(proxy.lhs()),
-      static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
-      static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
-      static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
-    );
-    VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_floor_kernel");
-  }
-  else
-  {
-    matrix_col_element_floor_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
-      static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
-      static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
-      static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
-      static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
-
-      viennacl::cuda_arg(proxy.lhs()),
-      static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
-      static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
-      static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
-    );
-    VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_floor_kernel");
-  }
-}
-
-
-// log
-template<typename NumericT>
-void element_op(matrix_base<NumericT> & A,
-                matrix_expression<const matrix_base<NumericT>, const matrix_base<NumericT>, op_element_unary<op_log> > const & proxy)
-{
-  assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
-
-  typedef NumericT  value_type;
-
-  if (A.row_major())
-  {
-    matrix_row_element_log_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
-      static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
-      static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
-      static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
-      static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
-
-      viennacl::cuda_arg(proxy.lhs()),
-      static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
-      static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
-      static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
-    );
-    VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_log_kernel");
-  }
-  else
-  {
-    matrix_col_element_log_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
-      static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
-      static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
-      static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
-      static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
-
-      viennacl::cuda_arg(proxy.lhs()),
-      static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
-      static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
-      static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
-    );
-    VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_log_kernel");
-  }
-}
-
-
-// log10
-template<typename NumericT>
-void element_op(matrix_base<NumericT> & A,
-                matrix_expression<const matrix_base<NumericT>, const matrix_base<NumericT>, op_element_unary<op_log10> > const & proxy)
-{
-  assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
-
-  typedef NumericT   value_type;
-
-  if (A.row_major())
-  {
-    matrix_row_element_log10_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
-      static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
-      static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
-      static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
-      static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
-
-      viennacl::cuda_arg(proxy.lhs()),
-      static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
-      static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
-      static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
-    );
-    VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_log10_kernel");
-  }
-  else
-  {
-    matrix_col_element_log10_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
-      static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
-      static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
-      static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
-      static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
-
-      viennacl::cuda_arg(proxy.lhs()),
-      static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
-      static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
-      static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
-    );
-    VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_log10_kernel");
-  }
-}
-
-
-// sin
-template<typename NumericT>
-void element_op(matrix_base<NumericT> & A,
-                matrix_expression<const matrix_base<NumericT>, const matrix_base<NumericT>, op_element_unary<op_sin> > const & proxy)
-{
-  assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
-
-  typedef NumericT  value_type;
-
-  if (A.row_major())
-  {
-    matrix_row_element_sin_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
-      static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
-      static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
-      static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
-      static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
-
-      viennacl::cuda_arg(proxy.lhs()),
-      static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
-      static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
-      static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
-    );
-    VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_sin_kernel");
-  }
-  else
-  {
-    matrix_col_element_sin_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
-      static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
-      static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
-      static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
-      static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
-
-      viennacl::cuda_arg(proxy.lhs()),
-      static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
-      static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
-      static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
-    );
-    VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_sin_kernel");
-  }
-}
-
-
-// sinh
-template<typename NumericT>
-void element_op(matrix_base<NumericT> & A,
-                matrix_expression<const matrix_base<NumericT>, const matrix_base<NumericT>, op_element_unary<op_sinh> > const & proxy)
-{
-  assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
-
-  typedef NumericT   value_type;
-
-  if (A.row_major())
-  {
-    matrix_row_element_sinh_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
-     static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
-     static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
-     static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
-     static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
-
-     viennacl::cuda_arg(proxy.lhs()),
-     static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
-     static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
-     static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
-    );
-    VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_sinh_kernel");
-  }
-  else
-  {
-    matrix_col_element_sinh_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
-     static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
-     static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
-     static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
-     static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
-
-     viennacl::cuda_arg(proxy.lhs()),
-     static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
-     static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
-     static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
-    );
-    VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_sinh_kernel");
-  }
-}
-
-
-// sqrt
-template<typename NumericT>
-void element_op(matrix_base<NumericT> & A,
-                matrix_expression<const matrix_base<NumericT>, const matrix_base<NumericT>, op_element_unary<op_sqrt> > const & proxy)
-{
-  assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
-
-  typedef NumericT   value_type;
-
-  if (A.row_major())
-  {
-    matrix_row_element_sqrt_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
-     static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
-     static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
-     static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
-     static_cast<unsign

<TRUNCATED>

Mime
View raw message