singa-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From wang...@apache.org
Subject [3/3] incubator-singa git commit: SINGA-301 Hardcode OpenCL source code as static strings
Date Tue, 21 Feb 2017 06:50:39 GMT
SINGA-301 Hardcode OpenCL source code as static strings

Merge all xxx_str.cpp file into a single file in src/core/device/opencl_func.h.
Replace the global string for opencl functions with global constant string in singa::opencl
Add license headers for new files.

The python script should only be executed if .cl files are updated or
new .cl file is added.


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/f6cf8f5d
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/f6cf8f5d
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/f6cf8f5d

Branch: refs/heads/master
Commit: f6cf8f5d6384452e03ab03ffa65d8ec8aa87d669
Parents: 8cc453a
Author: wangwei <wangwei@comp.nus.edu.sg>
Authored: Tue Feb 21 06:37:33 2017 +0000
Committer: Wei Wang <wangwei@comp.nus.edu.sg>
Committed: Tue Feb 21 14:48:43 2017 +0800

----------------------------------------------------------------------
 clsrc_to_str.py                       | 37 ----------------
 include/singa/core/device.h           |  5 ---
 src/core/device/opencl_device.cc      |  9 ++--
 src/core/device/opencl_func.h         | 26 +++++++++++
 src/core/tensor/distribution_str.cpp  |  9 ----
 src/core/tensor/tensor_math_opencl.h  | 28 ++++++------
 src/core/tensor/tensormath_str.cpp    |  9 ----
 src/model/layer/im2col_str.cpp        |  9 ----
 src/model/layer/opencl_convolution.cc |  4 +-
 src/model/layer/opencl_pooling.cc     | 14 +++---
 src/model/layer/pooling_str.cpp       |  9 ----
 tool/opencl/clsrc_to_str.py           | 71 ++++++++++++++++++++++++++++++
 12 files changed, 125 insertions(+), 105 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/f6cf8f5d/clsrc_to_str.py
----------------------------------------------------------------------
diff --git a/clsrc_to_str.py b/clsrc_to_str.py
deleted file mode 100755
index dcd5274..0000000
--- a/clsrc_to_str.py
+++ /dev/null
@@ -1,37 +0,0 @@
-#!/usr/bin/python
-
-import os.path
-
-distribution = "./src/core/tensor/distribution.cl"
-tensormath = "./src/core/tensor/tensor_math_opencl.cl"
-im2col = "./src/model/layer/im2col.cl"
-pooling = "./src/model/layer/pooling.cl"
-
-files = {"distribution_str" : distribution,
-		 "tensormath_str" : tensormath,
-		 "im2col_str" : im2col,
-		 "pooling_str" : pooling}
-
-if __name__ == "__main__":
-
-	for name, path in files.items():
-		with open(path, 'r') as file:
-			src = file.read()
-		src = repr(src)
-		src = src[1:-1]
-		src = src.replace('\"', '\\"') # Escape double quotes
-		src = src.replace('\\t', '') # Strip out tabs
-		
-		fullpath = os.path.dirname(path)
-		fullpath = fullpath + "/" + name + ".cpp"
-		
-		with open(fullpath, 'w') as file:
-			file.write("// This file is auto-generated, do not edit manually.\n")
-			file.write("// If any error occurs during compilation, please refer to clsrc_to_str.py\n")
-			file.write("#include <string>\n\n")
-			file.write("namespace singa {\n\n")
-			file.write("std::string " + name + " = \"")
-			file.write(src)
-			file.write("\";")
-			file.write("\n\n} // namespace singa")
-			file.close()

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/f6cf8f5d/include/singa/core/device.h
----------------------------------------------------------------------
diff --git a/include/singa/core/device.h b/include/singa/core/device.h
index 8823b36..1a960d8 100644
--- a/include/singa/core/device.h
+++ b/include/singa/core/device.h
@@ -202,11 +202,6 @@ class CudaGPU : public Device {
 
 #ifdef USE_OPENCL
 
-extern std::string distribution_str;
-extern std::string tensormath_str;
-extern std::string im2col_str;
-extern std::string pooling_str;
-
 // Implement Device using OpenCL libs.
 class OpenclDevice : public singa::Device {
 public:

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/f6cf8f5d/src/core/device/opencl_device.cc
----------------------------------------------------------------------
diff --git a/src/core/device/opencl_device.cc b/src/core/device/opencl_device.cc
index 02d33ad..b36a1da 100644
--- a/src/core/device/opencl_device.cc
+++ b/src/core/device/opencl_device.cc
@@ -25,6 +25,7 @@
 #include "singa/core/device.h"
 #include "singa/utils/tinydir.h"
 #include "singa/utils/opencl_utils.h"
+#include "./opencl_func.h"
 
 #ifdef USE_OPENCL
 
@@ -91,10 +92,10 @@ void OpenclDevice::CopyDataToFrom(Block* dst, Block* src, size_t nBytes,
 
 
 void OpenclDevice::BuildPrograms() {
-  ocl::current_context().add_program(distribution_str, "distribution.cl");
-  ocl::current_context().add_program(tensormath_str, "tensor_math_opencl.cl");
-  ocl::current_context().add_program(im2col_str, "im2col.cl");
-  ocl::current_context().add_program(pooling_str, "pooling.cl");
+  ocl::current_context().add_program(opencl::distribution_str, "opencl_distribution");
+  ocl::current_context().add_program(opencl::tensormath_str, "opencl_tensor_math");
+  ocl::current_context().add_program(opencl::im2col_str, "opencl_im2col");
+  ocl::current_context().add_program(opencl::pooling_str, "opencl_pooling");
 }
 
 

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/f6cf8f5d/src/core/device/opencl_func.h
----------------------------------------------------------------------
diff --git a/src/core/device/opencl_func.h b/src/core/device/opencl_func.h
new file mode 100644
index 0000000..97ef2ec
--- /dev/null
+++ b/src/core/device/opencl_func.h
@@ -0,0 +1,26 @@
+// This file is auto-generated by tool/opencl/clsrc_to_str, do not edit manually.
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <string>
+
+namespace singa {
+ namespace opencl {
+const std::string im2col_str = "// This file is modified from the file located at\n// https://github.com/BVLC/caffe/blob/opencl/src/caffe/greentea/cl_kernels/im2col.cl\n// and is covered under the BSD 2-Clause License, as indicated in the LICENSE\n// file at the root of this repository.\n\n__kernel void im2col(const int n, __global const float* data_im,\n                     const int data_im_off,\n                     const int height, const int width,\n                     const int kernel_h, const int kernel_w,\n                     const int pad_h, const int pad_w,\n                     const int stride_h, const int stride_w,\n                     const int dilation_h, const int dilation_w,\n                     const int height_col, const int width_col,\n                     __global float* data_col, const int data_col_off) {\n\n  for (int index = get_global_id(0); index < n;\n      index += get_global_size(0)) {\n    const int h_index = index / width_col;\n    const int h_col 
 = h_index % height_col;\n    const int w_col = index % width_col;\n    const int c_im = h_index / height_col;\n    const int c_col = c_im * kernel_h * kernel_w;\n    const int h_offset = h_col * stride_h - pad_h;\n    const int w_offset = w_col * stride_w - pad_w;\n    \n    __global float* data_col_ptr = data_col + data_col_off;\n    data_col_ptr += (c_col * height_col + h_col) * width_col + w_col;\n    __global const float* data_im_ptr = data_im + data_im_off;\n    data_im_ptr += (c_im * height + h_offset) * width + w_offset;\n    \n    for (int i = 0; i < kernel_h; ++i) {\n      for (int j = 0; j < kernel_w; ++j) {\n        int h_im = h_offset + i * dilation_h;\n        int w_im = w_offset + j * dilation_w;\n        *data_col_ptr =\n            (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) ?\n                data_im_ptr[i * dilation_h * width + j * dilation_w] : 0;\n        data_col_ptr += height_col * width_col;\n      }\n    }\n  }\n}\n\n__kernel void col2im(const i
 nt n, __global const float* data_col,\n                     const int data_col_off, const int channels,\n                     const int height, const int width,\n                     const int kernel_h, const int kernel_w,\n                     const int pad_h, const int pad_w,\n                     const int stride_h, const int stride_w,\n                     const int dilation_h, const int dilation_w,\n                     const int height_col, const int width_col,\n                     __global float* data_im, const int data_im_off) {\n\n  for (int index = get_global_id(0); index < n; index += get_global_size(0)) {\n    float val = 0;\n    const int w_im = index % width + pad_w;\n    const int h_im = (index / width) % height + pad_h;\n    const int c_im = index / (width * height);\n    int kernel_extent_w = (kernel_w - 1) * dilation_w + 1;\n    int kernel_extent_h = (kernel_h - 1) * dilation_h + 1;\n    // compute the start and end of the output\n    const int w_col_start =\n    
     (w_im < kernel_extent_w) ? 0 : (w_im - kernel_extent_w) / stride_w + 1;\n    const int w_col_end = min(w_im / stride_w + 1, width_col);\n    const int h_col_start =\n        (h_im < kernel_extent_h) ? 0 : (h_im - kernel_extent_h) / stride_h + 1;\n    const int h_col_end = min(h_im / stride_h + 1, height_col);\n    \n    // TODO: use LCM of stride and dilation to avoid unnecessary loops\n    for (int h_col = h_col_start; h_col < h_col_end; h_col += 1) {\n      for (int w_col = w_col_start; w_col < w_col_end; w_col += 1) {\n        int h_k = (h_im - h_col * stride_h);\n        int w_k = (w_im - w_col * stride_w);\n        if (h_k % dilation_h == 0 && w_k % dilation_w == 0) {\n          h_k /= dilation_h;\n          w_k /= dilation_w;\n          int data_col_index = (((c_im * kernel_h + h_k) * kernel_w + w_k) *\n                                height_col + h_col) * width_col + w_col;\n          val += data_col[data_col_off + data_col_index];\n        }\n      }\n    }\n    data_im[
 data_im_off + index] = val;\n  }\n}\n";const std::string pooling_str = "// This file is modified from the file located at\n// https://github.com/BVLC/caffe/blob/opencl/src/caffe/greentea/cl_kernels/pooling.cl\n// and is covered under the BSD 2-Clause License, as indicated in the LICENSE\n// file at the root of this repository.\n\n__kernel void max_pool_forward(\n    const int nthreads, __global const float* bottom, const int channels, \n    const int height, const int width,\n    const int pooled_h, const int pooled_w,\n    const int kernel_h, const int kernel_w,\n    const int stride_h, const int stride_w,\n    const int pad_h, const int pad_w,\n    __global float* top, __global float* mask) {\n\n//  printf(\"%d \", get_global_size(0));\n  for (int i = get_global_id(0); i < nthreads; i += get_global_size(0)) {\n    const int pw = i % pooled_w;\n    const int ph = (i / pooled_w) % pooled_h;\n    const int c = (i / pooled_w / pooled_h) % channels;\n    const int n = i / pooled_w / po
 oled_h / channels;\n    \n    int hstart = ph * stride_h - pad_h;\n    int wstart = pw * stride_w - pad_w;\n    const int hend = min(hstart + kernel_h, height);\n    const int wend = min(wstart + kernel_w, width);\n    hstart = max(hstart, (int)0);\n    wstart = max(wstart, (int)0);\n    \n    float maxval = -FLT_MAX;\n    int maxidx = -1;\n    __global const float* bottom_slice = bottom + (n * channels + c) * height * width;\n    for (int h = hstart; h < hend; ++h) {\n      for (int w = wstart; w < wend; ++w) {\n        const int index = h * width + w;\n        if (bottom_slice[index] > maxval) {\n          maxidx = index;\n          maxval = bottom_slice[maxidx];\n        }\n      }\n    }\n    top[i] = maxval;\n    mask[i] = (float)maxidx;\n  }\n}\n\n__kernel void ave_pool_forward(\n    const int nthreads, __global const float* const bottom, const int channels, \n    const int height, const int width,\n    const int pooled_h, const int pooled_w,\n    const int kernel_h, const int
  kernel_w,\n    const int stride_h, const int stride_w, \n    const int pad_h, const int pad_w, __global float* top) {\n    \n  for (int i = get_global_id(0); i < nthreads; i += get_global_size(0)) {\n    const int pw = i % pooled_w;\n    const int ph = (i / pooled_w) % pooled_h;\n    const int c = (i / pooled_w / pooled_h) % channels;\n    const int n = i / pooled_w / pooled_h / channels;\n    int hstart = ph * stride_h - pad_h;\n    int wstart = pw * stride_w - pad_w;\n    int hend = min(hstart + kernel_h, height + pad_h);\n    int wend = min(wstart + kernel_w, width + pad_w);\n    const int pool_size = (hend - hstart) * (wend - wstart);\n    hstart = max(hstart, (int)0);\n    wstart = max(wstart, (int)0);\n    hend = min(hend, height);\n    wend = min(wend, width);\n    float aveval = 0;\n    __global const float* bottom_slice = bottom + (n * channels + c) * height * width;\n    for (int h = hstart; h < hend; ++h) {\n      for (int w = wstart; w < wend; ++w) {\n        aveval += 
 bottom_slice[h * width + w];\n      }\n    }\n    top[i] = aveval / pool_size;\n  }\n}\n\n__kernel void sto_pool_forward_train(\n    const int nthreads, __global const float* bottom,\n    const int channels, const int height, const int width,\n    const int pooled_h, const int pooled_w, const int kernel_h,\n    const int kernel_w, const int stride_h, const int stride_w,\n    __global float* rand_idx, __global float* top) {\n    \n  for (int i = get_global_id(0); i < nthreads; i += get_global_size(0)) {\n    const int pw = i % pooled_w;\n    const int ph = (i / pooled_w) % pooled_h;\n    const int c = (i / pooled_w / pooled_h) % channels;\n    const int n = i / pooled_w / pooled_h / channels;\n    \n    const int hstart = ph * stride_h;\n    const int hend = min(hstart + kernel_h, height);\n    const int wstart = pw * stride_w;\n    const int wend = min(wstart + kernel_w, width);\n    float cumsum = 0.;\n    __global const float* bottom_slice = bottom + (n * channels + c) * height * 
 width;\n    // First pass: get sum\n    for (int h = hstart; h < hend; ++h) {\n      for (int w = wstart; w < wend; ++w) {\n        cumsum += bottom_slice[h * width + w];\n      }\n    }\n    const float thres = rand_idx[i] * cumsum;\n    // Second pass: get value, and set i.\n    cumsum = 0;\n    for (int h = hstart; h < hend; ++h) {\n      for (int w = wstart; w < wend; ++w) {\n        cumsum += bottom_slice[h * width + w];\n        if (cumsum >= thres) {\n          rand_idx[i] = ((n * channels + c) * height + h) * width + w;\n          top[i] = bottom_slice[h * width + w];\n          h = hend;\n          w = wend;\n        }\n      }\n    }\n  }\n}\n\n__kernel void sto_pool_forward_test(\n    const int nthreads, __global const float* const bottom, const int channels, \n    const int height, const int width,\n    const int pooled_h, const int pooled_w, \n    const int kernel_h, const int kernel_w, \n    const int stride_h, const int stride_w,\n    __global float* top) {\n    \n  f
 or (int i = get_global_id(0); i < nthreads; i += get_global_size(0)) {\n    const int pw = i % pooled_w;\n    const int ph = (i / pooled_w) % pooled_h;\n    const int c = (i / pooled_w / pooled_h) % channels;\n    const int n = i / pooled_w / pooled_h / channels;\n    \n    const int hstart = ph * stride_h;\n    const int hend = min(hstart + kernel_h, height);\n    const int wstart = pw * stride_w;\n    const int wend = min(wstart + kernel_w, width);\n    // We set cumsum to be 0 to avoid divide-by-zero problems\n    float cumsum = FLT_MIN;\n    float cumvalues = 0.;\n    __global const float* bottom_slice = bottom + (n * channels + c) * height * width;\n    // First pass: get sum\n    for (int h = hstart; h < hend; ++h) {\n      for (int w = wstart; w < wend; ++w) {\n        cumsum += bottom_slice[h * width + w];\n        cumvalues += bottom_slice[h * width + w] * bottom_slice[h * width + w];\n      }\n    }\n    top[i] = cumvalues / cumsum;\n  }\n}\n\n__kernel void max_pool_backwa
 rd(const int nthreads,\n                                __global const float* top_diff,\n                                __global const float* mask,\n                                const int channels,\n                                const int height, const int width,\n                                const int pooled_h, const int pooled_w,\n                                const int kernel_h, const int kernel_w,\n                                const int stride_h, const int stride_w,\n                                const int pad_h, const int pad_w,\n                                __global float* bottom_diff) {\n  for (int i = get_global_id(0); i < nthreads; i += get_global_size(0)) {\n    // find out the local i\n    // find out the local offset\n    const int w = i % width;\n    const int h = (i / width) % height;\n    const int c = (i / width / height) % channels;\n    const int n = i / width / height / channels;\n    \n    const int phstart =\n        (h + pad_h < kernel_h) ? 0
  : (h + pad_h - kernel_h) / stride_h + 1;\n    const int phend = min((h + pad_h) / stride_h + 1, pooled_h);\n    const int pwstart =\n        (w + pad_w < kernel_w) ? 0 : (w + pad_w - kernel_w) / stride_w + 1;\n    const int pwend = min((w + pad_w) / stride_w + 1, pooled_w);\n    float gradient = 0.0f;\n    const int offset = (n * channels + c) * pooled_h * pooled_w;\n    __global const float* top_diff_slice = top_diff + offset;\n    __global const float* mask_slice = mask + offset;\n    for (int ph = phstart; ph < phend; ++ph) {\n      for (int pw = pwstart; pw < pwend; ++pw) {\n        if (mask_slice[ph * pooled_w + pw] == (float)(h * width + w)) {\n          gradient += top_diff_slice[ph * pooled_w + pw];\n        }\n      }\n    }\n    bottom_diff[i] = gradient;\n  }\n}\n\n__kernel void ave_pool_backward(const int nthreads,\n                                __global const float* top_diff,\n                                const int channels,\n                                const 
 int height, const int width,\n                                const int pooled_h, const int pooled_w,\n                                const int kernel_h, const int kernel_w,\n                                const int stride_h, const int stride_w,\n                                const int pad_h, const int pad_w,\n                                __global float* bottom_diff) {\n  for (int i = get_global_id(0); i < nthreads; i += get_global_size(0)) {\n    // find out the local i\n    // find out the local offset\n    const int w = i % width + pad_w;\n    const int h = (i / width) % height + pad_h;\n    const int c = (i / width / height) % channels;\n    const int n = i / width / height / channels;\n    \n    const int phstart = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1;\n    const int phend = min(h / stride_h + 1, pooled_h);\n    const int pwstart = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1;\n    const int pwend = min(w / stride_w + 1, pooled_w);\n    float gradient
  = 0.0;\n    __global const float* const top_diff_slice = top_diff + (n * channels + c) * pooled_h * pooled_w;\n    for (int ph = phstart; ph < phend; ++ph) {\n      for (int pw = pwstart; pw < pwend; ++pw) {\n        // figure out the pooling size\n        int hstart = ph * stride_h - pad_h;\n        int wstart = pw * stride_w - pad_w;\n        int hend = min(hstart + kernel_h, height + pad_h);\n        int wend = min(wstart + kernel_w, width + pad_w);\n        int pool_size = (hend - hstart) * (wend - wstart);\n        gradient += top_diff_slice[ph * pooled_w + pw] / pool_size;\n      }\n    }\n    bottom_diff[i] = gradient;\n  }\n}\n\n__kernel void sto_pool_backward(\n    const int nthreads, __global const float* rand_idx,\n    __global const float* const top_diff, const int channels,\n    const int height, const int width,\n    const int pooled_h, const int pooled_w,\n    const int kernel_h, const int kernel_w,\n    const int stride_h, const int stride_w,\n    __global float* bo
 ttom_diff) {\n\n  for (int i = get_global_id(0); i < nthreads; i += get_global_size(0)) {\n    // find out the local i\n    // find out the local offset\n    const int w = i % width;\n    const int h = (i / width) % height;\n    const int c = (i / width / height) % channels;\n    const int n = i / width / height / channels;\n    \n    const int phstart = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1;\n    const int phend = min(h / stride_h + 1, pooled_h);\n    const int pwstart = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1;\n    const int pwend = min(w / stride_w + 1, pooled_w);\n    float gradient = 0.0;\n    __global const float* rand_idx_slice = rand_idx + (n * channels + c) * pooled_h * pooled_w;\n    __global const float* top_diff_slice = top_diff + (n * channels + c) * pooled_h * pooled_w;\n    for (int ph = phstart; ph < phend; ++ph) {\n      for (int pw = pwstart; pw < pwend; ++pw) {\n        gradient += top_diff_slice[ph * pooled_w + pw]\n            * (i == (in
 t) (rand_idx_slice[ph * pooled_w + pw])?1.0:0.0);\n      }\n    }\n    bottom_diff[i] = gradient;\n  }\n}\n\n";const std::string distribution_str = "// This code is adapted from https://github.com/amd/OpenCL-caffe/blob/stable/src/caffe/ocl/random.cl\n\n//Note: random generator has two parts\n//first part: the open sourced threefy random generator kernel from DE Shaw Research\n//second part. we wrap the kernel up to generate uniform, bernoulli and gaussion distribution generators.\n\n//begin: the open sourced random generator from DE Shaw Research\n//https://www.deshawresearch.com/resources_random123.html\ntypedef uint uint32_t;\n\nstruct r123array4x32 {\n  uint32_t v[4];\n};\n\nenum r123_enum_threefry32x4 {\n  R_32x4_0_0 = 10,\n  R_32x4_0_1 = 26,\n  R_32x4_1_0 = 11,\n  R_32x4_1_1 = 21,\n  R_32x4_2_0 = 13,\n  R_32x4_2_1 = 27,\n  R_32x4_3_0 = 23,\n  R_32x4_3_1 = 5,\n  R_32x4_4_0 = 6,\n  R_32x4_4_1 = 20,\n  R_32x4_5_0 = 17,\n  R_32x4_5_1 = 11,\n  R_32x4_6_0 = 25,\n  R_32x4_6_1 = 10,\n 
  R_32x4_7_0 = 18,\n  R_32x4_7_1 = 20\n};\n\ninline uint32_t RotL_32(uint32_t x, unsigned int N) {\n  return (x << (N & 31)) | (x >> ((32 - N) & 31));\n}\n\ntypedef struct r123array4x32 threefry4x32_ctr_t;\ntypedef struct r123array4x32 threefry4x32_key_t;\ntypedef struct r123array4x32 threefry4x32_ukey_t;\n\ninline threefry4x32_ctr_t threefry4x32_R(unsigned int Nrounds, threefry4x32_ctr_t in, threefry4x32_key_t k) {\n  threefry4x32_ctr_t X;\n  uint32_t ks[4 + 1];\n  int i;\n  ks[4] = 0x1BD11BDA;\n\n  {\n    ks[0] = k.v[0];\n    X.v[0] = in.v[0];\n    ks[4] ^= k.v[0];\n\n    ks[1] = k.v[1];\n    X.v[1] = in.v[1];\n    ks[4] ^= k.v[1];\n\n    ks[2] = k.v[2];\n    X.v[2] = in.v[2];\n    ks[4] ^= k.v[2];\n\n    ks[3] = k.v[3];\n    X.v[3] = in.v[3];\n    ks[4] ^= k.v[3];\n  }\n\n  X.v[0] += ks[0];\n  X.v[1] += ks[1];\n  X.v[2] += ks[2];\n  X.v[3] += ks[3];\n\n  if (Nrounds > 0) {\n    X.v[0] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_0_0);\n    X.v[1] ^= X.v[0];\n    X.v[2] += X.v[3
 ];\n    X.v[3] = RotL_32(X.v[3], R_32x4_0_1);\n    X.v[3] ^= X.v[2];\n  }\n\n  if (Nrounds > 1) {\n    X.v[0] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_1_0);\n    X.v[3] ^= X.v[0];\n    X.v[2] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_1_1);\n    X.v[1] ^= X.v[2];\n  }\n\n  if (Nrounds > 2) {\n    X.v[0] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_2_0);\n    X.v[1] ^= X.v[0];\n    X.v[2] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_2_1);\n    X.v[3] ^= X.v[2];\n  }\n\n  if (Nrounds > 3) {\n    X.v[0] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_3_0);\n    X.v[3] ^= X.v[0];\n    X.v[2] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_3_1);\n    X.v[1] ^= X.v[2];\n  }\n\n  if (Nrounds > 3) {\n    X.v[0] += ks[1];\n    X.v[1] += ks[2];\n    X.v[2] += ks[3];\n    X.v[3] += ks[4];\n    X.v[4 - 1] += 1;\n  }\n\n  if (Nrounds > 4) {\n    X.v[0] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_4_0);\n    X.v[1] ^= X.v[0];\n    X.v[2] += X.v[3];\n    X.v[3] = RotL_32(X.v[3
 ], R_32x4_4_1);\n    X.v[3] ^= X.v[2];\n  }\n\n  if (Nrounds > 5) {\n    X.v[0] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_5_0);\n    X.v[3] ^= X.v[0];\n    X.v[2] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_5_1);\n    X.v[1] ^= X.v[2];\n  }\n\n  if (Nrounds > 6) {\n    X.v[0] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_6_0);\n    X.v[1] ^= X.v[0];\n    X.v[2] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_6_1);\n    X.v[3] ^= X.v[2];\n  }\n\n  if (Nrounds > 7) {\n    X.v[0] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_7_0);\n    X.v[3] ^= X.v[0];\n    X.v[2] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_7_1);\n    X.v[1] ^= X.v[2];\n  }\n\n  if (Nrounds > 7) {\n    X.v[0] += ks[2];\n    X.v[1] += ks[3];\n    X.v[2] += ks[4];\n    X.v[3] += ks[0];\n    X.v[4 - 1] += 2;\n  }\n\n  if (Nrounds > 8) {\n    X.v[0] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_0_0);\n    X.v[1] ^= X.v[0];\n    X.v[2] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_0_1);\n    X.v[3] ^=
  X.v[2];\n  }\n\n  if (Nrounds > 9) {\n    X.v[0] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_1_0);\n    X.v[3] ^= X.v[0];\n    X.v[2] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_1_1);\n    X.v[1] ^= X.v[2];\n  }\n\n  if (Nrounds > 10) {\n    X.v[0] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_2_0);\n    X.v[1] ^= X.v[0];\n    X.v[2] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_2_1);\n    X.v[3] ^= X.v[2];\n  }\n\n  if (Nrounds > 11) {\n    X.v[0] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_3_0);\n    X.v[3] ^= X.v[0];\n    X.v[2] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_3_1);\n    X.v[1] ^= X.v[2];\n  }\n\n  if (Nrounds > 11) {\n    X.v[0] += ks[3];\n    X.v[1] += ks[4];\n    X.v[2] += ks[0];\n    X.v[3] += ks[1];\n    X.v[4 - 1] += 3;\n  }\n\n  if (Nrounds > 12) {\n    X.v[0] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_4_0);\n    X.v[1] ^= X.v[0];\n    X.v[2] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_4_1);\n    X.v[3] ^= X.v[2];\n  }\n\n  if (Nro
 unds > 13) {\n    X.v[0] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_5_0);\n    X.v[3] ^= X.v[0];\n    X.v[2] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_5_1);\n    X.v[1] ^= X.v[2];\n  }\n\n  if (Nrounds > 14) {\n    X.v[0] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_6_0);\n    X.v[1] ^= X.v[0];\n    X.v[2] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_6_1);\n    X.v[3] ^= X.v[2];\n  }\n\n  if (Nrounds > 15) {\n    X.v[0] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_7_0);\n    X.v[3] ^= X.v[0];\n    X.v[2] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_7_1);\n    X.v[1] ^= X.v[2];\n  }\n\n  if (Nrounds > 15) {\n    X.v[0] += ks[4];\n    X.v[1] += ks[0];\n    X.v[2] += ks[1];\n    X.v[3] += ks[2];\n    X.v[4 - 1] += 4;\n  }\n\n  if (Nrounds > 16) {\n    X.v[0] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_0_0);\n    X.v[1] ^= X.v[0];\n    X.v[2] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_0_1);\n    X.v[3] ^= X.v[2];\n  }\n\n  if (Nrounds > 17) {\n    X.v[0] 
 += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_1_0);\n    X.v[3] ^= X.v[0];\n    X.v[2] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_1_1);\n    X.v[1] ^= X.v[2];\n  }\n\n  if (Nrounds > 18) {\n    X.v[0] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_2_0);\n    X.v[1] ^= X.v[0];\n    X.v[2] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_2_1);\n    X.v[3] ^= X.v[2];\n  }\n\n  if (Nrounds > 19) {\n    X.v[0] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_3_0);\n    X.v[3] ^= X.v[0];\n    X.v[2] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_3_1);\n    X.v[1] ^= X.v[2];\n  }\n\n  if (Nrounds > 19) {\n    X.v[0] += ks[0];\n    X.v[1] += ks[1];\n    X.v[2] += ks[2];\n    X.v[3] += ks[3];\n    X.v[4 - 1] += 5;\n  }\n\n  if (Nrounds > 20) {\n    X.v[0] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_4_0);\n    X.v[1] ^= X.v[0];\n    X.v[2] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_4_1);\n    X.v[3] ^= X.v[2];\n  }\n\n  if (Nrounds > 21) {\n    X.v[0] += X.v[3];\n    X.v[3] = 
 RotL_32(X.v[3], R_32x4_5_0);\n    X.v[3] ^= X.v[0];\n    X.v[2] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_5_1);\n    X.v[1] ^= X.v[2];\n  }\n\n  if (Nrounds > 22) {\n    X.v[0] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_6_0);\n    X.v[1] ^= X.v[0];\n    X.v[2] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_6_1);\n    X.v[3] ^= X.v[2];\n  }\n\n  if (Nrounds > 23) {\n    X.v[0] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_7_0);\n    X.v[3] ^= X.v[0];\n    X.v[2] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_7_1);\n    X.v[1] ^= X.v[2];\n  }\n\n  if (Nrounds > 23) {\n    X.v[0] += ks[1];\n    X.v[1] += ks[2];\n    X.v[2] += ks[3];\n    X.v[3] += ks[4];\n    X.v[4 - 1] += 6;\n  }\n\n  if (Nrounds > 24) {\n    X.v[0] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_0_0);\n    X.v[1] ^= X.v[0];\n    X.v[2] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_0_1);\n    X.v[3] ^= X.v[2];\n  }\n\n  if (Nrounds > 25) {\n    X.v[0] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_1_
 0);\n    X.v[3] ^= X.v[0];\n    X.v[2] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_1_1);\n    X.v[1] ^= X.v[2];\n  }\n\n  if (Nrounds > 26) {\n    X.v[0] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_2_0);\n    X.v[1] ^= X.v[0];\n    X.v[2] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_2_1);\n    X.v[3] ^= X.v[2];\n  }\n\n  if (Nrounds > 27) {\n    X.v[0] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_3_0);\n    X.v[3] ^= X.v[0];\n    X.v[2] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_3_1);\n    X.v[1] ^= X.v[2];\n  }\n\n  if (Nrounds > 27) {\n    X.v[0] += ks[2];\n    X.v[1] += ks[3];\n    X.v[2] += ks[4];\n    X.v[3] += ks[0];\n    X.v[4 - 1] += 7;\n  }\n\n  if (Nrounds > 28) {\n    X.v[0] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_4_0);\n    X.v[1] ^= X.v[0];\n    X.v[2] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_4_1);\n    X.v[3] ^= X.v[2];\n  }\n\n  if (Nrounds > 29) {\n    X.v[0] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_5_0);\n    X.v[3] ^= X.v[0]
 ;\n    X.v[2] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_5_1);\n    X.v[1] ^= X.v[2];\n  }\n\n  if (Nrounds > 30) {\n    X.v[0] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_6_0);\n    X.v[1] ^= X.v[0];\n    X.v[2] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_6_1);\n    X.v[3] ^= X.v[2];\n  }\n\n  if (Nrounds > 31) {\n    X.v[0] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_7_0);\n    X.v[3] ^= X.v[0];\n    X.v[2] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_7_1);\n    X.v[1] ^= X.v[2];\n  }\n\n  if (Nrounds > 31) {\n    X.v[0] += ks[3];\n    X.v[1] += ks[4];\n    X.v[2] += ks[0];\n    X.v[3] += ks[1];\n    X.v[4 - 1] += 8;\n  }\n\n  if (Nrounds > 32) {\n    X.v[0] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_0_0);\n    X.v[1] ^= X.v[0];\n    X.v[2] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_0_1);\n    X.v[3] ^= X.v[2];\n  }\n\n  if (Nrounds > 33) {\n    X.v[0] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_1_0);\n    X.v[3] ^= X.v[0];\n    X.v[2] += X.v[1];\
 n    X.v[1] = RotL_32(X.v[1], R_32x4_1_1);\n    X.v[1] ^= X.v[2];\n  }\n\n  if (Nrounds > 34) {\n    X.v[0] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_2_0);\n    X.v[1] ^= X.v[0];\n    X.v[2] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_2_1);\n    X.v[3] ^= X.v[2];\n  }\n\n  if (Nrounds > 35) {\n    X.v[0] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_3_0);\n    X.v[3] ^= X.v[0];\n    X.v[2] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_3_1);\n    X.v[1] ^= X.v[2];\n  }\n\n  if (Nrounds > 35) {\n    X.v[0] += ks[4];\n    X.v[1] += ks[0];\n    X.v[2] += ks[1];\n    X.v[3] += ks[2];\n    X.v[4 - 1] += 9;\n  }\n\n  if (Nrounds > 36) {\n    X.v[0] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_4_0);\n    X.v[1] ^= X.v[0];\n    X.v[2] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_4_1);\n    X.v[3] ^= X.v[2];\n  }\n\n  if (Nrounds > 37) {\n    X.v[0] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_5_0);\n    X.v[3] ^= X.v[0];\n    X.v[2] += X.v[1];\n    X.v[1] = RotL_32(X.v
 [1], R_32x4_5_1);\n    X.v[1] ^= X.v[2];\n  }\n\n  if (Nrounds > 38) {\n    X.v[0] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_6_0);\n    X.v[1] ^= X.v[0];\n    X.v[2] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_6_1);\n    X.v[3] ^= X.v[2];\n  }\n\n  if (Nrounds > 39) {\n    X.v[0] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_7_0);\n    X.v[3] ^= X.v[0];\n    X.v[2] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_7_1);\n    X.v[1] ^= X.v[2];\n  }\n\n  if (Nrounds > 39) {\n    X.v[0] += ks[0];\n    X.v[1] += ks[1];\n    X.v[2] += ks[2];\n    X.v[3] += ks[3];\n    X.v[4 - 1] += 10;\n  }\n\n  if (Nrounds > 40) {\n    X.v[0] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_0_0);\n    X.v[1] ^= X.v[0];\n    X.v[2] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_0_1);\n    X.v[3] ^= X.v[2];\n  }\n\n  if (Nrounds > 41) {\n    X.v[0] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_1_0);\n    X.v[3] ^= X.v[0];\n    X.v[2] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_1_1);\n    X
 .v[1] ^= X.v[2];\n  }\n  if (Nrounds > 42) {\n    X.v[0] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_2_0);\n    X.v[1] ^= X.v[0];\n    X.v[2] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_2_1);\n    X.v[3] ^= X.v[2];\n  }\n\n  if (Nrounds > 43) {\n    X.v[0] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_3_0);\n    X.v[3] ^= X.v[0];\n    X.v[2] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_3_1);\n    X.v[1] ^= X.v[2];\n  }\n\n  if (Nrounds > 43) {\n    X.v[0] += ks[1];\n    X.v[1] += ks[2];\n    X.v[2] += ks[3];\n    X.v[3] += ks[4];\n    X.v[4 - 1] += 11;\n  }\n\n  if (Nrounds > 44) {\n    X.v[0] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_4_0);\n    X.v[1] ^= X.v[0];\n    X.v[2] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_4_1);\n    X.v[3] ^= X.v[2];\n  }\n\n  if (Nrounds > 45) {\n    X.v[0] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_5_0);\n    X.v[3] ^= X.v[0];\n    X.v[2] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_5_1);\n    X.v[1] ^= X.v[2];\n  }\n\n 
  if (Nrounds > 46) {\n    X.v[0] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_6_0);\n    X.v[1] ^= X.v[0];\n    X.v[2] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_6_1);\n    X.v[3] ^= X.v[2];\n  }\n\n  if (Nrounds > 47) {\n    X.v[0] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_7_0);\n    X.v[3] ^= X.v[0];\n    X.v[2] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_7_1);\n    X.v[1] ^= X.v[2];\n  }\n\n  if (Nrounds > 47) {\n    X.v[0] += ks[2];\n    X.v[1] += ks[3];\n    X.v[2] += ks[4];\n    X.v[3] += ks[0];\n    X.v[4 - 1] += 12;\n  }\n\n  if (Nrounds > 48) {\n    X.v[0] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_0_0);\n    X.v[1] ^= X.v[0];\n    X.v[2] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_0_1);\n    X.v[3] ^= X.v[2];\n  }\n\n  if (Nrounds > 49) {\n    X.v[0] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_1_0);\n    X.v[3] ^= X.v[0];\n    X.v[2] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_1_1);\n    X.v[1] ^= X.v[2];\n  }\n\n  if (Nrounds > 50) {\n  
   X.v[0] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_2_0);\n    X.v[1] ^= X.v[0];\n    X.v[2] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_2_1);\n    X.v[3] ^= X.v[2];\n  }\n\n  if (Nrounds > 51) {\n    X.v[0] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_3_0);\n    X.v[3] ^= X.v[0];\n    X.v[2] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_3_1);\n    X.v[1] ^= X.v[2];\n  }\n\n  if (Nrounds > 51) {\n    X.v[0] += ks[3];\n    X.v[1] += ks[4];\n    X.v[2] += ks[0];\n    X.v[3] += ks[1];\n    X.v[4 - 1] += 13;\n  }\n\n  if (Nrounds > 52) {\n    X.v[0] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_4_0);\n    X.v[1] ^= X.v[0];\n    X.v[2] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_4_1);\n    X.v[3] ^= X.v[2];\n  }\n\n  if (Nrounds > 53) {\n    X.v[0] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_5_0);\n    X.v[3] ^= X.v[0];\n    X.v[2] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_5_1);\n    X.v[1] ^= X.v[2];\n  }\n\n  if (Nrounds > 54) {\n    X.v[0] += X.v[1];\n   
  X.v[1] = RotL_32(X.v[1], R_32x4_6_0);\n    X.v[1] ^= X.v[0];\n    X.v[2] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_6_1);\n    X.v[3] ^= X.v[2];\n  }\n\n  if (Nrounds > 55) {\n    X.v[0] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_7_0);\n    X.v[3] ^= X.v[0];\n    X.v[2] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_7_1);\n    X.v[1] ^= X.v[2];\n  }\n\n  if (Nrounds > 55) {\n    X.v[0] += ks[4];\n    X.v[1] += ks[0];\n    X.v[2] += ks[1];\n    X.v[3] += ks[2];\n    X.v[4 - 1] += 14;\n  }\n\n  if (Nrounds > 56) {\n    X.v[0] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_0_0);\n    X.v[1] ^= X.v[0];\n    X.v[2] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_0_1);\n    X.v[3] ^= X.v[2];\n  }\n\n  if (Nrounds > 57) {\n    X.v[0] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_1_0);\n    X.v[3] ^= X.v[0];\n    X.v[2] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_1_1);\n    X.v[1] ^= X.v[2];\n  }\n\n  if (Nrounds > 58) {\n    X.v[0] += X.v[1];\n    X.v[1] = RotL_32(X.v[1]
 , R_32x4_2_0);\n    X.v[1] ^= X.v[0];\n    X.v[2] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_2_1);\n    X.v[3] ^= X.v[2];\n  }\n\n  if (Nrounds > 59) {\n    X.v[0] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_3_0);\n    X.v[3] ^= X.v[0];\n    X.v[2] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_3_1);\n    X.v[1] ^= X.v[2];\n  }\n\n  if (Nrounds > 59) {\n    X.v[0] += ks[0];\n    X.v[1] += ks[1];\n    X.v[2] += ks[2];\n    X.v[3] += ks[3];\n    X.v[4 - 1] += 15;\n  }\n\n  if (Nrounds > 60) {\n    X.v[0] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_4_0);\n    X.v[1] ^= X.v[0];\n    X.v[2] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_4_1);\n    X.v[3] ^= X.v[2];\n  }\n\n  if (Nrounds > 61) {\n    X.v[0] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_5_0);\n    X.v[3] ^= X.v[0];\n    X.v[2] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_5_1);\n    X.v[1] ^= X.v[2];\n  }\n\n  if (Nrounds > 62) {\n    X.v[0] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_6_0);\n    X.v[
 1] ^= X.v[0];\n    X.v[2] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_6_1);\n    X.v[3] ^= X.v[2];\n  }\n\n  if (Nrounds > 63) {\n    X.v[0] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_7_0);\n    X.v[3] ^= X.v[0];\n    X.v[2] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_7_1);\n    X.v[1] ^= X.v[2];\n  }\n\n  if (Nrounds > 63) {\n    X.v[0] += ks[1];\n    X.v[1] += ks[2];\n    X.v[2] += ks[3];\n    X.v[3] += ks[4];\n    X.v[4 - 1] += 16;\n  }\n\n  if (Nrounds > 64) {\n    X.v[0] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_0_0);\n    X.v[1] ^= X.v[0];\n    X.v[2] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_0_1);\n    X.v[3] ^= X.v[2];\n  }\n\n  if (Nrounds > 65) {\n    X.v[0] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_1_0);\n    X.v[3] ^= X.v[0];\n    X.v[2] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_1_1);\n    X.v[1] ^= X.v[2];\n  }\n\n  if (Nrounds > 66) {\n    X.v[0] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_2_0);\n    X.v[1] ^= X.v[0];\n    X.v[2
 ] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_2_1);\n    X.v[3] ^= X.v[2];\n  }\n\n  if (Nrounds > 67) {\n    X.v[0] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_3_0);\n    X.v[3] ^= X.v[0];\n    X.v[2] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_3_1);\n    X.v[1] ^= X.v[2];\n  }\n\n  if (Nrounds > 67) {\n    X.v[0] += ks[2];\n    X.v[1] += ks[3];\n    X.v[2] += ks[4];\n    X.v[3] += ks[0];\n    X.v[4 - 1] += 17;\n  }\n\n  if (Nrounds > 68) {\n    X.v[0] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_4_0);\n    X.v[1] ^= X.v[0];\n    X.v[2] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_4_1);\n    X.v[3] ^= X.v[2];\n  }\n\n  if (Nrounds > 69) {\n    X.v[0] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_5_0);\n    X.v[3] ^= X.v[0];\n    X.v[2] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_5_1);\n    X.v[1] ^= X.v[2];\n  }\n\n  if (Nrounds > 70) {\n    X.v[0] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_6_0);\n    X.v[1] ^= X.v[0];\n    X.v[2] += X.v[3];\n    X.v[3]
  = RotL_32(X.v[3], R_32x4_6_1);\n    X.v[3] ^= X.v[2];\n  }\n\n  if (Nrounds > 71) {\n    X.v[0] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_7_0);\n    X.v[3] ^= X.v[0];\n    X.v[2] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_7_1);\n    X.v[1] ^= X.v[2];\n  }\n\n  if (Nrounds > 71) {\n    X.v[0] += ks[3];\n    X.v[1] += ks[4];\n    X.v[2] += ks[0];\n    X.v[3] += ks[1];\n    X.v[4 - 1] += 18;\n  }\n  return X;\n}\n//end: the open sourced random generator from DE Shaw Research\n\n// **************************\n// BERNOULLI DISTRIBUTION\n// **************************\n\n__kernel void PRNG_threefry4x32_bernoulli(\n__global float4 *randomnumber,\nthreefry4x32_ctr_t ctr_i,\nfloat inf, float sup,\nfloat threshold,\nuint nrounds, uint numrandom) {\n\n  size_t gdx = get_global_id(0);\n\n  uint maxUint = 0;\n  maxUint--;\n  float r = (float)maxUint;\n\n  threefry4x32_ctr_t ctr = ctr_i;\n  threefry4x32_ukey_t ukey;\n\n  ukey.v[0] = ukey.v[1] = ukey.v[2] = ukey.v[3] = gdx;\n\n  threefr
 y4x32_ctr_t random4;\n\n  if ( gdx < numrandom ) {\n    random4 = threefry4x32_R(nrounds, ctr, ukey);\n    float4 frnd;\n    frnd.x = ( (((float)random4.v[0]) / r) * (sup - inf) + inf ) < threshold ? 1.0f : 0.0f;\n    frnd.y = ( (((float)random4.v[1]) / r) * (sup - inf) + inf ) < threshold ? 1.0f : 0.0f;\n    frnd.z = ( (((float)random4.v[2]) / r) * (sup - inf) + inf ) < threshold ? 1.0f : 0.0f;\n    frnd.w = ( (((float)random4.v[3]) / r) * (sup - inf) + inf ) < threshold ? 1.0f : 0.0f;\n    randomnumber[gdx] = frnd;\n  }\n}\n\n// **************************\n// UNIFORM DISTRIBUTION (float)\n// **************************\n\n__kernel void PRNG_threefry4x32_uniform(\n__global float4 *randomnumber,\nthreefry4x32_ctr_t ctr_i,\nfloat inf, float sup,\nuint nrounds, uint numrandom) {\n\n  size_t gdx = get_global_id(0);\n\n  uint maxUint = 0;\n  maxUint--;\n  float r = (float)maxUint;\n\n  threefry4x32_ctr_t ctr = ctr_i;\n  threefry4x32_ukey_t ukey;\n\n  ukey.v[0] = ukey.v[1] = ukey.v[2] = u
 key.v[3] = gdx;\n\n  threefry4x32_ctr_t random4;\n\n  if ( gdx < numrandom ) {\n    random4 = threefry4x32_R(nrounds, ctr, ukey);\n    float4 frnd;\n    frnd.x = ( (((float)random4.v[0]) / r) * (sup - inf) + inf );\n    frnd.y = ( (((float)random4.v[1]) / r) * (sup - inf) + inf );\n    frnd.z = ( (((float)random4.v[2]) / r) * (sup - inf) + inf );\n    frnd.w = ( (((float)random4.v[3]) / r) * (sup - inf) + inf );\n    randomnumber[gdx] = frnd;\n  }\n}\n\n// **************************\n// UNIFORM DISTRIBUTION (uint)\n// **************************\n\n__kernel void PRNG_threefry4x32_uint_uniform(\n__global uint4 *randomnumber,\nthreefry4x32_ctr_t ctr_i,\nuint inf, uint sup,\nuint nrounds, uint numrandom) {\n\n  size_t gdx = get_global_id(0);\n\n  threefry4x32_ctr_t ctr = ctr_i;\n  threefry4x32_ukey_t ukey;\n\n  ukey.v[0] = ukey.v[1] = ukey.v[2] = ukey.v[3] = gdx;\n\n  threefry4x32_ctr_t random4;\n\n  if ( gdx < numrandom ) {\n    random4 = threefry4x32_R(nrounds, ctr, ukey);\n    uint4 
 frnd;\n    frnd.x = random4.v[0] % (sup - inf) + inf;\n    frnd.y = random4.v[1] % (sup - inf) + inf;\n    frnd.z = random4.v[2] % (sup - inf) + inf;\n    frnd.w = random4.v[3] % (sup - inf) + inf;\n    randomnumber[gdx] = frnd;\n  }\n}\n\n// **************************\n// GAUSSIAN DISTRIBUTION\n// **************************\n\n__kernel void PRNG_threefry4x32_gaussian(\n__global float4 *randomnumber,\nthreefry4x32_ctr_t ctr_i,\nfloat E, float V,\nuint nrounds, uint numrandom) {\n\n  size_t gdx = get_global_id(0);\n\n  uint maxUint = 0;\n  maxUint--;\n  float r = (float)maxUint;\n\n  threefry4x32_ctr_t ctr = ctr_i;\n  threefry4x32_ukey_t ukey1, ukey2;\n\n  ukey1.v[0] = ukey2.v[1] = ukey1.v[2] = ukey2.v[3] = gdx;\n  ukey2.v[0] = ukey1.v[1] = ukey2.v[2] = ukey1.v[3] = 0;\n\n  threefry4x32_ctr_t random1, random2;\n\n  if ( gdx < numrandom ) {\n    random1 = threefry4x32_R(nrounds, ctr, ukey1);\n    random2 = threefry4x32_R(nrounds, ctr, ukey2);\n    float4 frnd1;\n\n    float r1 = (((fl
 oat)random1.v[0]) / r); // generate a random sequence of uniform distribution\n    float r2 = (((float)random2.v[0]) / r);\n    float r3 = (((float)random1.v[1]) / r);\n    float r4 = (((float)random2.v[1]) / r);\n    float r5 = (((float)random1.v[2]) / r);\n    float r6 = (((float)random2.v[2]) / r);\n    float r7 = (((float)random1.v[3]) / r);\n    float r8 = (((float)random2.v[3]) / r);\n\n    if(r2 == 0 || r4 == 0 || r6 == 0 || r8 == 0) {\n      r2 += 0.0001;\n      r4 += 0.0001;\n      r6 += 0.0001;\n      r8 += 0.0001;\n    }\n\n    frnd1.x = cos(2*M_PI*r1)*sqrt(-2.0*log(r2)) * V + E;// return a pseudo sequence of normal distribution using two above uniform noise data\n    //frnd2.x = sin(2*M_PI*r1)*sqrt(-2.0*log(r2));      // return the quadrature counterpart of the foregoing pseudo normal distribution sequence\n    frnd1.y = cos(2*M_PI*r3)*sqrt(-2.0*log(r4)) * V + E;// return a pseudo sequence of normal distribution using two above uniform noise data\n    //frnd2.y = sin(2*M
 _PI*r3)*sqrt(-2.0*log(r4));      // return the quadrature counterpart of the foregoing pseudo normal distribution sequence\n    frnd1.z = cos(2*M_PI*r5)*sqrt(-2.0*log(r6)) * V + E;// return a pseudo sequence of normal distribution using two above uniform noise data\n    //frnd2.z = sin(2*M_PI*r5)*sqrt(-2.0*log(r6));      // return the quadrature counterpart of the foregoing pseudo normal distribution sequence\n    frnd1.w = cos(2*M_PI*r7)*sqrt(-2.0*log(r8)) * V + E;// return a pseudo sequence of normal distribution using two above uniform noise data\n    //frnd2.w = sin(2*M_PI*r7)*sqrt(-2.0*log(r8));      // return the quadrature counterpart of the foregoing pseudo normal distribution sequence\n\n    randomnumber[gdx] = frnd1;\n  }\n}\n";const std::string tensormath_str = "/**\n * Licensed to the Apache Software Foundation (ASF) under one\n * or more contributor license agreements.  See the NOTICE file\n * distributed with this work for additional information\n * regarding copyright
  ownership.  The ASF licenses this file\n * to you under the Apache License, Version 2.0 (the\n * \"License\"); you may not use this file except in compliance\n * with the License.  You may obtain a copy of the License at\n *\n *     http://www.apache.org/licenses/LICENSE-2.0\n *\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an \"AS IS\" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n */\n\n// **************************************\n// Element-wise functions\n// **************************************\n\n// Sum is basically reduction.\n// This reduction code is serial reduction modified from AMD\'s example.\n// http://developer.amd.com/resources/documentation-articles/articles-whitepapers/opencl-optimization-case-study-simple-reductions/\n__kernel\nvoid clkernel_fa
 bs(const int num, __global const float* in, __global float* out) {\n  const int i = get_global_id(0);\n  if (i >= num) return;\n  out[i] = fabs(in[i]);\n}\n\n__kernel\nvoid clkernel_add_scalar(const int num, float x, __global const float* in, __global float* out) {\n  const int i = get_global_id(0);\n  if (i >= num) return;\n  out[i] = in[i] + x;\n}\n\n__kernel\nvoid clkernel_add(const int num, __global const float* in1, __global const float* in2,\n  __global float* out) {\n  const int i = get_global_id(0);\n  if (i >= num) return;\n  out[i] = in1[i] + in2[i];\n}\n\n__kernel\nvoid clkernel_clamp(const int num, float low, float high, __global const float* in,\n  __global float* out) {\n  const int i = get_global_id(0);\n  if (i >= num) return;\n  out[i] = clamp(in[i], low, high);\n}\n\n__kernel\nvoid clkernel_divide_scalar_matx(const int num, __global const float* in1, const float x,\n  __global float* out) {\n  const int i = get_global_id(0);\n  if (i >= num) return;\n  out[i] = in1
 [i] / x;\n}\n\n__kernel\nvoid clkernel_divide_scalar_xmat(const int num, const float x, __global const float* in1,\n  __global float* out) {\n  const int i = get_global_id(0);\n  if (i >= num) return;\n  out[i] = x / in1[i];\n}\n\n__kernel\nvoid clkernel_divide(const int num, __global const float* in1, __global const float* in2,\n  __global float* out) {\n  const int i = get_global_id(0);\n  if (i >= num) return;\n  out[i] = in1[i] / in2[i];\n}\n\n__kernel\nvoid clkernel_eltmult_scalar(const int num, const float x, __global const float* in,\n  __global float* out) {\n  const int i = get_global_id(0);\n  if (i >= num) return;\n  out[i] = in[i] * x;\n}\n\n__kernel\nvoid clkernel_eltmult(const int num, __global const float* in1, __global const float* in2,\n  __global float* out) {\n  const int i = get_global_id(0);\n  if (i >= num) return;\n  out[i] = in1[i] * in2[i];\n}\n\n__kernel\nvoid clkernel_exp(const int num, __global const float* in, __global float* out) {\n  const int i = get_
 global_id(0);\n  if (i >= num) return;\n  out[i] = exp(in[i]);\n}\n\n__kernel\nvoid clkernel_le(const int num, __global const float* in, const float x,\n  __global float* out) {\n  const int i = get_global_id(0);\n  if (i >= num) return;\n  out[i] = (in[i] <= x) ? 1.0f : 0.0f;\n}\n\n__kernel\nvoid clkernel_log(const int num, __global const float* in, __global float* out) {\n  const int i = get_global_id(0);\n  if (i >= num) return;\n  out[i] = log(in[i]);\n}\n\n__kernel\nvoid clkernel_lt(const int num, __global const float* in, const float x,\n  __global float* out) {\n  const int i = get_global_id(0);\n  if (i >= num) return;\n  out[i] = (in[i] < x) ? 1.0f : 0.0f;\n}\n\n__kernel\nvoid clkernel_ge(const int num, __global const float* in, const float x,\n  __global float* out) {\n  const int i = get_global_id(0);\n  if (i >= num) return;\n  out[i] = (in[i] >= x) ? 1.0f : 0.0f;\n}\n\n__kernel\nvoid clkernel_gt(const int num, __global const float* in, const float x,\n  __global float* 
 out) {\n  const int i = get_global_id(0);\n  if (i >= num) return;\n  out[i] = (in[i] > x) ? 1.0f : 0.0f;\n}\n\n__kernel\nvoid clkernel_pow_scalar(const int num, const float x, __global const float* in,\n  __global float* out) {\n  const int i = get_global_id(0);\n  if (i >= num) return;\n  out[i] = pow(in[i], x);\n}\n\n__kernel\nvoid clkernel_pow(const int num, __global const float* in1, __global const float* in2,\n  __global float* out) {\n  const int i = get_global_id(0);\n  if (i >= num) return;\n  out[i] = pow(in1[i], in2[i]);\n}\n\n__kernel\nvoid clkernel_relu(const int num, __global const float* in, __global float* out) {\n  const int i = get_global_id(0);\n  if (i >= num) return;\n  out[i] = (in[i] >= 0.0f) ? in[i] : 0.0f;\n}\n\n__kernel\nvoid clkernel_set(const int num, const float x, __global float* out) {\n  const int i = get_global_id(0);\n  if (i >= num) return;\n  out[i] = x;\n}\n\n__kernel\nvoid clkernel_sigmoid(const int num, __global const float* in, __global float*
  out) {\n  const int i = get_global_id(0);\n  if (i >= num) return;\n  out[i] = 1 / (1 + exp(-(in[i])));\n}\n\n__kernel\nvoid clkernel_sign(const int num, __global const float* in, __global float* out) {\n  const int i = get_global_id(0);\n  if (i >= num) return;\n  out[i] = (in[i] > 0) - (in[i] < 0);\n}\n\n__kernel\nvoid clkernel_sqrt(const int num, __global const float* in, __global float* out) {\n  const int i = get_global_id(0);\n  if (i >= num) return;\n  out[i] = sqrt(in[i]);\n}\n\n// kernel for square is called pow(2).\n\n__kernel\nvoid clkernel_subtract_scalar(const int num, __global const float* in, const float x,\n  __global float* out) {\n  const int i = get_global_id(0);\n  if (i >= num) return;\n  out[i] = in[i] - x;\n}\n\n__kernel\nvoid clkernel_subtract(const int num, __global const float* in1, __global const float* in2,\n   __global float* out) {\n  const int i = get_global_id(0);\n  if (i >= num) return;\n  out[i] = in1[i] - in2[i];\n}\n\n// reduce3 kernel from\n// 
 https://github.com/sschaetz/nvidia-opencl-examples/blob/master/OpenCL/src/oclReduction/oclReduction_kernel.cl\n__kernel\nvoid clkernel_sum(const int num, __global const float* in, __global float* out,\n  __local float* sdata) {\n  const int i = get_group_id(0)*(get_local_size(0)*2) + get_local_id(0);\n  const int tid = get_local_id(0);\n  sdata[tid] = (i < num) ? in[i] : 0.0f;\n\n  // Perform the first level of reduction.\n  if (i + get_local_size(0) < num) {\nsdata[tid] += in[i + get_local_size(0)];\n  }\n  barrier(CLK_LOCAL_MEM_FENCE);\n\n  for (int s = get_local_size(0)/2; s > 0; s >>= 1) {\nif (tid > s) {\n  sdata[tid] += sdata[tid + s];\n}\nbarrier(CLK_LOCAL_MEM_FENCE);\n  }\n\n  if (tid == 0) {\nout[get_group_id(0)] = sdata[0];\n  }\n}\n\n__kernel\nvoid clkernel_tanh(const int num, __global const float* in, __global float* out) {\n  const int i = get_global_id(0);\n  if (i >= num) return;\n  out[i] = tanh(in[i]);\n}\n\n// **************************************\n// Random funct
 ions\n// **************************************\n\n// See: distribution.cl\n\n// *********************************************************\n// BLAS functions, ref to http://docs.nvidia.com/cuda/cublas\n// *********************************************************\n\n__kernel\nvoid clkernel_amax(const int num, __global const float* in, __global int* ret,\n   __local uint* sdata, __local size_t* temp) {\n  const int gid = get_global_id(0);\n  const int tid = get_local_id(0);\n\n  for(int s = get_local_size(0)/2; s > 0; s >>= 1) {\nif (tid < s) {\n  sdata[tid] = (in[sdata[tid]] > in[tid+s]) ? sdata[tid] : tid;\n}\nbarrier(CLK_LOCAL_MEM_FENCE);\n  }\n  if (tid == 0) {\nret[0] = sdata[0];\n  }\n}\n\n\n/* TODO: Fix line 284:20.\n__kernel\nvoid clkernel_amin(const int num, __global const float* in, __global int* ret,\n   __local float* sdata, __local size_t* temp) {\n  const int gid = get_global_id(0);\n  const int tid = get_local_id(0);\n\n  // Initialize the values to pos infinity.\n  sda
 ta[tid] = (gid < num) ? in[gid] : INFINITY;\n  barrier(CLK_LOCAL_MEM_FENCE);\n\n  for(int s = get_local_size(0)/2; s > 0; s >>= 1) {\nif (tid < s) {\n  sdata[tid] = (in[sdata[tid]] < in[tid+s]) ? sdata[tid] : tid;\n}\nbarrier(CLK_LOCAL_MEM_FENCE);\n  }\n  if (tid == 0) {\nret[0] = sdata[0];\n  }\n}*/\n\n\n__kernel\nvoid clkernel_asum(const int num, __global const float* in, __global float* out,\n   __local float* sdata) {\n  const int tid = get_local_id(0);\n  const int i = get_global_id(0);\n\n  // Initialize\n  sdata[tid] = (i < num) ? in[i] : INFINITY;\n  // Perform the first level of reduction.\n  if (i + get_local_size(0) < num) {\nsdata[tid] += in[i + get_local_size(0)];\n  }\n  barrier(CLK_LOCAL_MEM_FENCE);\n\n  for(int s = get_local_size(0)/2; s > 0; s >>= 1) {\nif (tid < s) {\n  sdata[tid] = fabs(sdata[tid + s]);\n}\nbarrier(CLK_LOCAL_MEM_FENCE);\n  }\n  if (tid == 0) {\nout[0] = sdata[0];\n  }\n}\n\n__kernel\nvoid clkernel_axpy(const int num, float alpha, __global const fl
 oat* in,\n   __global float* out) {\n  const int i = get_global_id(0);\n  if (i >= num) return;\n  out[i] = fma(alpha, in[i], out[i]);\n}\n\n// This kernel is essentially the same as Sum, except that during the process\n// of reading in data to the local memory, the value is also doubled.\n// Then, just before submitting the sum to out, we do a square-root on it.\n__kernel\nvoid clkernel_nrm2(const int num, __global const float* in, __global float* out,\n   __local float* sdata) {\n  const int i = get_group_id(0)*(get_local_size(0)*2) + get_local_id(0);\n  const int tid = get_local_id(0);\n  sdata[tid] = (i < num) ? (in[i] * in[i]) : 0.0f;\n\n  // Perform the first level of reduction.\n  if (i + get_local_size(0) < num) {\nsdata[tid] += in[i + get_local_size(0)];\n  }\n  barrier(CLK_LOCAL_MEM_FENCE);\n\n  for (int s = get_local_size(0)/2; s > 0; s >>= 1) {\nif (tid > s) {\n  sdata[tid] += sdata[tid + s];\n}\nbarrier(CLK_LOCAL_MEM_FENCE);\n  }\n\n  if (tid == 0) {\nout[get_group_id(0
 )] = sqrt(sdata[0]);\n  }\n}\n\n__kernel\nvoid clkernel_scale(const int num, float x, __global float* out) {\n  const int i = get_global_id(0);\n  if (i >= num) return;\n  out[i] = x * out[i];\n}\n\n__kernel\nvoid clkernel_dot(const int num, __global const float* in1, __global const float* in2,\n    __global float* out, __local float* scratch) {\n  const int i = get_global_id(0);\n  if (i >= num) return;\n  int offset = i << 2;\n  scratch[i] = in1[offset] * in2[offset];\n\n}\n\n// First kernel from http://www.bealto.com/gpu-gemv_intro.html\n// y = \xce\xb1*A*v + \xce\xb2*y\n// fma(a, b, c) == (a * b) + c with infinite precision\n__kernel\nvoid clkernel_gemv(const int m, const int n, const float alpha,\n   __global const float* A, __global const float* v,\n   const float beta, __global float* out) {\n  const int i = get_global_id(0);\n  float sum  = 0.0f;\n  for (int k = 0; k < n; k++) {\n    sum += fma(beta, out[i + m * k], alpha * A[i + m * k] * v[k]);\n  }\n  out[i] = sum;\n}\n\n/
 / http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-dgmm\n// X[j] = x[j*inc(x)] if inc(x) \xe2\x89\xa5 0\n//= x[(\xcf\x87 \xe2\x88\x92 1)*|inc(x)| \xe2\x88\x92 j*|inc(x)|] if inc(x) < 0\n\n// C = diag( X )*A\n__kernel\nvoid clkernel_dgmm_left(const int nrow, const int ncol,\n__global const float* M, __global const float* v,\n__global float* out) {\n  const uint gidx = get_global_id(0);\n\n  uint offset = gidx * ncol;\n  for (uint i = 0; i < ncol; i++) {\nout[offset + i] = M[offset + i] * v[i];\n  }\n}\n\n// C = A*diag( X )\n__kernel\nvoid clkernel_dgmm_right(const int nrow, const int ncol,\n __global const float* M, __global const float* v,\n __global float* out) {\n  const uint gidx = get_global_id(0);\n\n  uint offset = gidx * ncol;\n  for (uint i = 0; i < ncol; i++) {\nout[offset + i] = M[offset + i] * v[gidx];\n  }\n}\n\n// TODO: Optimize with Reference from http://www.cedricnugteren.nl/tutorial.php?page=1\n//  C = \xce\xb1*A*B + \xce\xb2*C\n__kernel\nvoid clkernel_gemm(const u
 int nrowA, const uint ncolB, const uint ncolA, const float alpha,\n    __global const float* A, __global const float* B, const float beta,\n     __global float* C, __local float* Asub, __local float* Bsub) {\n\n  const uint lidx = get_local_id(0);\n  const uint lidy = get_local_id(1);\n  const uint TS = get_local_size(0); // Tile size\n  const uint gidx = TS * get_group_id(0) + lidx; // Row ID of C (0..M)\n  const uint gidy = TS * get_group_id(1) + lidy; // Row ID of C (0..N)\n\n  // Initialise the accumulation register\n  float acc = 0.0f;\n\n  // Loop over all tiles\n  const int numtiles = ncolA / TS;\n  for (int t = 0; t < numtiles; t++) {\n    const int tiledRow = TS * t + lidx;\n    const int tiledCol = TS * t + lidy;\n    Asub[lidy * TS + lidx] = A[tiledCol * nrowA + gidx];\n    Bsub[lidy * TS + lidx] = B[gidy * ncolA + tiledRow];\n\n    barrier(CLK_LOCAL_MEM_FENCE);\n\n    for(int k = 0; k < TS; k++) {\n      acc += Asub[k * TS + lidx] * Bsub[lidy * TS + k] * alpha;\n    }\n\
 n    barrier(CLK_LOCAL_MEM_FENCE);\n  }\n\n  C[gidy * nrowA + gidx] = fma(beta, C[gidy * nrowA + gidx], acc);\n}\n\n\n__kernel\nvoid clkernel_crossentropy(const uint batchsize, const uint dim,\n   __global const float* p, __global const int* t,\n   __global float* loss) {\n  const uint gidx = get_global_id(0);\n  if (gidx >= batchsize) return;\n\n  int truth_idx = t[gidx];\n  if (truth_idx <= 0) return;\n  float prob_of_truth = p[gidx * dim + truth_idx];\n  loss[gidx] = -log(fmax(prob_of_truth, -FLT_MIN));\n}\n\n\n__kernel\nvoid clkernel_softmaxentropy(const uint batchsize, const uint dim,\n __global const float* p, __global const int* t,\n __global float* grad) {\n  const uint gidx = get_global_id(0);\n  if (gidx >= batchsize) return;\n\n  int truth_idx = t[gidx];\n  if (truth_idx <= 0) return;\n  grad[gidx * dim + truth_idx] -= 1.0;\n}\n\n\n__kernel\nvoid clkernel_rowmax(const uint nrow, const uint ncol,\n                     __global const float* in, __global float* out) {\n  con
 st uint row_id = get_global_id(0);\n  if (row_id >= nrow) return;\n\n  float row_max_val = -FLT_MAX;\n  for (uint i = 0; i < ncol; i++) {\n    row_max_val = fmax(row_max_val, in[row_id * ncol + i]);\n  }\n\n  out[row_id] = row_max_val;\n}\n\n\n// **************************************\n// Matrix functions\n// **************************************\n/*\n__kernel\nvoid clkernel_addcol(int nrow, int ncol, __global const float* A, __global const float* v, __global float* out) {\n  const int i = get_global_id(0);\n  const int j = get_global_id(1);\n  if (i >= nrow) return;\n  if (j >= ncol) return;\n  ret[j] = A[j + nrow * i] + v[j];\n}\n\n__kernel\nvoid clkernel_addrow(int nrow, int ncol, __global const float* A, __global const float* v, __global float* out) {\n  const int i = get_global_id(0);\n  const int j = get_global_id(1);\n  if (i >= nrow) return;\n  if (j >= ncol) return;\n  out[i] = A[i + ncol * j] + v[i];\n}\n\n__kernel\nvoid clkernel_outerproduct(int m, const int n, __global 
 const float* in1, __global const float* in2, __global float* out) {\n  const int col = get_global_id(0);\n  const int row = get_global_id(1);\n\n  // TODO: This\n}\n\n__kernel\nvoid clkernel_sumcol(int nrow, int ncol, __global const float* in, __global float* out) {\n  const int i = get_global_id(0);\n  if (i >= nrow) return;\n\n  float sum = 0.0f;\n  for (int j = 0; j < nrow; j++) {\nsum += input[nrow * i + j];\n  }\n  out[i] = sum;\n}\n*/\n__kernel\nvoid clkernel_sumrow(int nrow, int ncol, __global const float* in, __global float* out) {\n  const int idx = get_global_id(0);\n  if (idx >= nrow) return;\n\n  float sum = 0.0f;\n  for (int j = 0; j < ncol; j++) {\nsum += in[j + ncol * idx];\n  }\n  out[idx] = sum;\n}\n\n\n// Adapted from http://code.haskell.org/HsOpenCL/tests/bench/transpose.cl\n#define BLOCK_DIM 16\n__kernel\nvoid clkernel_transpose(uint nrow, uint ncol,\n__global const float* in, __global float* out,\n__local float* sdata) {\n  uint gidx = get_global_id(0);\n  uint 
 gidy = get_global_id(1);\n\n  if ((gidx < ncol) && (gidy < nrow)) {\nuint id_in = gidy * ncol + gidx;\nsdata[get_local_id(1) * (BLOCK_DIM+1) + get_local_id(0)] = in[id_in];\n  }\n\n  barrier(CLK_LOCAL_MEM_FENCE);\n\n  gidx = get_group_id(1) * BLOCK_DIM + get_local_id(0);\n  gidy = get_group_id(0) * BLOCK_DIM + get_local_id(1);\n  if ((gidx < nrow) && (gidy < ncol)) {\nuint id_out = gidy * nrow + gidx;\nout[id_out] = sdata[get_local_id(0) * (BLOCK_DIM + 1) + get_local_id(1)];\n  }\n}\n/*\n__kernel\nvoid clkernel_transpose2(uint nrow, uint ncol, __global const float* in, __global float* out, __local float* sdata) {\n  const uint lidx = get_local_id(0);\n  const uint lidy = get_local_id(1);\n  const uint id0 = get_group_id(0) * ncol * lidx;\n  const uint id1 = get_group_id(1) * nrow * lidy;\n\n  if (id0 < nrow && id1 < ncol) {\nsdata[lidx][lidy] = in[id1 * nrow + id0];\n  }\n\n  barrier(CLK_LOCAL_MEM_FENCE);\n\n  const uint new_id0 = get_group_id(1) * nrow + lidx;\n  const uint new_id1
  = get_group_id(0) * ncol + lidy;\n\n  if (new_id0 < ncol && new_id1 < nrow) {\nout[new_id1 * ncol + new_id0] = sdata[lidx][lidy];\n  }\n}*/\n\n__kernel\nvoid clkernel_diagvec_left(uint vsize, __global const float* vin, __global float* out) {\n  const uint gid = get_global_id(0);\n\n  for (uint i = 0; i < vsize; i++)\nout[gid * vsize + i] = (i == gid) ? vin[gid] : 0.0f;\n}\n\n\n__kernel\nvoid clkernel_diagvec_right(uint vsize, __global const float* vin, __global float* out) {\n  const uint gid = get_global_id(0);\n\n  for (uint i = 0; i < vsize; i++)\nout[gid * vsize + i] = (i == gid) ? vin[gid] : 0.0f;\n}\n";
+ } //  namespace opencl 
+} //  namespace singa
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/f6cf8f5d/src/core/tensor/distribution_str.cpp
----------------------------------------------------------------------
diff --git a/src/core/tensor/distribution_str.cpp b/src/core/tensor/distribution_str.cpp
deleted file mode 100644
index 0162bca..0000000
--- a/src/core/tensor/distribution_str.cpp
+++ /dev/null
@@ -1,9 +0,0 @@
-// This file is auto-generated, do not edit manually.
-// If any error occurs during compilation, please refer to clsrc_to_str.py
-#include <string>
-
-namespace singa {
-
-std::string distribution_str = "// This code is adapted from https://github.com/amd/OpenCL-caffe/blob/stable/src/caffe/ocl/random.cl\n\n//Note: random generator has two parts\n//first part: the open sourced threefy random generator kernel from DE Shaw Research\n//second part. we wrap the kernel up to generate uniform, bernoulli and gaussion distribution generators.\n\n//begin: the open sourced random generator from DE Shaw Research\n//https://www.deshawresearch.com/resources_random123.html\ntypedef uint uint32_t;\n\nstruct r123array4x32 {\n  uint32_t v[4];\n};\n\nenum r123_enum_threefry32x4 {\n  R_32x4_0_0 = 10,\n  R_32x4_0_1 = 26,\n  R_32x4_1_0 = 11,\n  R_32x4_1_1 = 21,\n  R_32x4_2_0 = 13,\n  R_32x4_2_1 = 27,\n  R_32x4_3_0 = 23,\n  R_32x4_3_1 = 5,\n  R_32x4_4_0 = 6,\n  R_32x4_4_1 = 20,\n  R_32x4_5_0 = 17,\n  R_32x4_5_1 = 11,\n  R_32x4_6_0 = 25,\n  R_32x4_6_1 = 10,\n  R_32x4_7_0 = 18,\n  R_32x4_7_1 = 20\n};\n\ninline uint32_t RotL_32(uint32_t x, unsigned int N) {\n  return (x << (N 
 & 31)) | (x >> ((32 - N) & 31));\n}\n\ntypedef struct r123array4x32 threefry4x32_ctr_t;\ntypedef struct r123array4x32 threefry4x32_key_t;\ntypedef struct r123array4x32 threefry4x32_ukey_t;\n\ninline threefry4x32_ctr_t threefry4x32_R(unsigned int Nrounds, threefry4x32_ctr_t in, threefry4x32_key_t k) {\n  threefry4x32_ctr_t X;\n  uint32_t ks[4 + 1];\n  int i;\n  ks[4] = 0x1BD11BDA;\n\n  {\n    ks[0] = k.v[0];\n    X.v[0] = in.v[0];\n    ks[4] ^= k.v[0];\n\n    ks[1] = k.v[1];\n    X.v[1] = in.v[1];\n    ks[4] ^= k.v[1];\n\n    ks[2] = k.v[2];\n    X.v[2] = in.v[2];\n    ks[4] ^= k.v[2];\n\n    ks[3] = k.v[3];\n    X.v[3] = in.v[3];\n    ks[4] ^= k.v[3];\n  }\n\n  X.v[0] += ks[0];\n  X.v[1] += ks[1];\n  X.v[2] += ks[2];\n  X.v[3] += ks[3];\n\n  if (Nrounds > 0) {\n    X.v[0] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_0_0);\n    X.v[1] ^= X.v[0];\n    X.v[2] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_0_1);\n    X.v[3] ^= X.v[2];\n  }\n\n  if (Nrounds > 1) {\n    X.v[0] += X.v[
 3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_1_0);\n    X.v[3] ^= X.v[0];\n    X.v[2] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_1_1);\n    X.v[1] ^= X.v[2];\n  }\n\n  if (Nrounds > 2) {\n    X.v[0] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_2_0);\n    X.v[1] ^= X.v[0];\n    X.v[2] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_2_1);\n    X.v[3] ^= X.v[2];\n  }\n\n  if (Nrounds > 3) {\n    X.v[0] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_3_0);\n    X.v[3] ^= X.v[0];\n    X.v[2] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_3_1);\n    X.v[1] ^= X.v[2];\n  }\n\n  if (Nrounds > 3) {\n    X.v[0] += ks[1];\n    X.v[1] += ks[2];\n    X.v[2] += ks[3];\n    X.v[3] += ks[4];\n    X.v[4 - 1] += 1;\n  }\n\n  if (Nrounds > 4) {\n    X.v[0] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_4_0);\n    X.v[1] ^= X.v[0];\n    X.v[2] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_4_1);\n    X.v[3] ^= X.v[2];\n  }\n\n  if (Nrounds > 5) {\n    X.v[0] += X.v[3];\n    X.v[3] = RotL_32(X.v[
 3], R_32x4_5_0);\n    X.v[3] ^= X.v[0];\n    X.v[2] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_5_1);\n    X.v[1] ^= X.v[2];\n  }\n\n  if (Nrounds > 6) {\n    X.v[0] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_6_0);\n    X.v[1] ^= X.v[0];\n    X.v[2] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_6_1);\n    X.v[3] ^= X.v[2];\n  }\n\n  if (Nrounds > 7) {\n    X.v[0] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_7_0);\n    X.v[3] ^= X.v[0];\n    X.v[2] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_7_1);\n    X.v[1] ^= X.v[2];\n  }\n\n  if (Nrounds > 7) {\n    X.v[0] += ks[2];\n    X.v[1] += ks[3];\n    X.v[2] += ks[4];\n    X.v[3] += ks[0];\n    X.v[4 - 1] += 2;\n  }\n\n  if (Nrounds > 8) {\n    X.v[0] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_0_0);\n    X.v[1] ^= X.v[0];\n    X.v[2] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_0_1);\n    X.v[3] ^= X.v[2];\n  }\n\n  if (Nrounds > 9) {\n    X.v[0] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_1_0);\n    X.v[3] ^
 = X.v[0];\n    X.v[2] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_1_1);\n    X.v[1] ^= X.v[2];\n  }\n\n  if (Nrounds > 10) {\n    X.v[0] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_2_0);\n    X.v[1] ^= X.v[0];\n    X.v[2] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_2_1);\n    X.v[3] ^= X.v[2];\n  }\n\n  if (Nrounds > 11) {\n    X.v[0] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_3_0);\n    X.v[3] ^= X.v[0];\n    X.v[2] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_3_1);\n    X.v[1] ^= X.v[2];\n  }\n\n  if (Nrounds > 11) {\n    X.v[0] += ks[3];\n    X.v[1] += ks[4];\n    X.v[2] += ks[0];\n    X.v[3] += ks[1];\n    X.v[4 - 1] += 3;\n  }\n\n  if (Nrounds > 12) {\n    X.v[0] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_4_0);\n    X.v[1] ^= X.v[0];\n    X.v[2] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_4_1);\n    X.v[3] ^= X.v[2];\n  }\n\n  if (Nrounds > 13) {\n    X.v[0] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_5_0);\n    X.v[3] ^= X.v[0];\n    X.v[2] += 
 X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_5_1);\n    X.v[1] ^= X.v[2];\n  }\n\n  if (Nrounds > 14) {\n    X.v[0] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_6_0);\n    X.v[1] ^= X.v[0];\n    X.v[2] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_6_1);\n    X.v[3] ^= X.v[2];\n  }\n\n  if (Nrounds > 15) {\n    X.v[0] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_7_0);\n    X.v[3] ^= X.v[0];\n    X.v[2] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_7_1);\n    X.v[1] ^= X.v[2];\n  }\n\n  if (Nrounds > 15) {\n    X.v[0] += ks[4];\n    X.v[1] += ks[0];\n    X.v[2] += ks[1];\n    X.v[3] += ks[2];\n    X.v[4 - 1] += 4;\n  }\n\n  if (Nrounds > 16) {\n    X.v[0] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_0_0);\n    X.v[1] ^= X.v[0];\n    X.v[2] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_0_1);\n    X.v[3] ^= X.v[2];\n  }\n\n  if (Nrounds > 17) {\n    X.v[0] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_1_0);\n    X.v[3] ^= X.v[0];\n    X.v[2] += X.v[1];\n    X.v[1] = Rot
 L_32(X.v[1], R_32x4_1_1);\n    X.v[1] ^= X.v[2];\n  }\n\n  if (Nrounds > 18) {\n    X.v[0] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_2_0);\n    X.v[1] ^= X.v[0];\n    X.v[2] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_2_1);\n    X.v[3] ^= X.v[2];\n  }\n\n  if (Nrounds > 19) {\n    X.v[0] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_3_0);\n    X.v[3] ^= X.v[0];\n    X.v[2] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_3_1);\n    X.v[1] ^= X.v[2];\n  }\n\n  if (Nrounds > 19) {\n    X.v[0] += ks[0];\n    X.v[1] += ks[1];\n    X.v[2] += ks[2];\n    X.v[3] += ks[3];\n    X.v[4 - 1] += 5;\n  }\n\n  if (Nrounds > 20) {\n    X.v[0] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_4_0);\n    X.v[1] ^= X.v[0];\n    X.v[2] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_4_1);\n    X.v[3] ^= X.v[2];\n  }\n\n  if (Nrounds > 21) {\n    X.v[0] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_5_0);\n    X.v[3] ^= X.v[0];\n    X.v[2] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_5_1);
 \n    X.v[1] ^= X.v[2];\n  }\n\n  if (Nrounds > 22) {\n    X.v[0] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_6_0);\n    X.v[1] ^= X.v[0];\n    X.v[2] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_6_1);\n    X.v[3] ^= X.v[2];\n  }\n\n  if (Nrounds > 23) {\n    X.v[0] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_7_0);\n    X.v[3] ^= X.v[0];\n    X.v[2] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_7_1);\n    X.v[1] ^= X.v[2];\n  }\n\n  if (Nrounds > 23) {\n    X.v[0] += ks[1];\n    X.v[1] += ks[2];\n    X.v[2] += ks[3];\n    X.v[3] += ks[4];\n    X.v[4 - 1] += 6;\n  }\n\n  if (Nrounds > 24) {\n    X.v[0] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_0_0);\n    X.v[1] ^= X.v[0];\n    X.v[2] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_0_1);\n    X.v[3] ^= X.v[2];\n  }\n\n  if (Nrounds > 25) {\n    X.v[0] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_1_0);\n    X.v[3] ^= X.v[0];\n    X.v[2] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_1_1);\n    X.v[1] ^= X.v[2];\n
   }\n\n  if (Nrounds > 26) {\n    X.v[0] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_2_0);\n    X.v[1] ^= X.v[0];\n    X.v[2] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_2_1);\n    X.v[3] ^= X.v[2];\n  }\n\n  if (Nrounds > 27) {\n    X.v[0] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_3_0);\n    X.v[3] ^= X.v[0];\n    X.v[2] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_3_1);\n    X.v[1] ^= X.v[2];\n  }\n\n  if (Nrounds > 27) {\n    X.v[0] += ks[2];\n    X.v[1] += ks[3];\n    X.v[2] += ks[4];\n    X.v[3] += ks[0];\n    X.v[4 - 1] += 7;\n  }\n\n  if (Nrounds > 28) {\n    X.v[0] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_4_0);\n    X.v[1] ^= X.v[0];\n    X.v[2] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_4_1);\n    X.v[3] ^= X.v[2];\n  }\n\n  if (Nrounds > 29) {\n    X.v[0] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_5_0);\n    X.v[3] ^= X.v[0];\n    X.v[2] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_5_1);\n    X.v[1] ^= X.v[2];\n  }\n\n  if (Nrounds > 30
 ) {\n    X.v[0] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_6_0);\n    X.v[1] ^= X.v[0];\n    X.v[2] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_6_1);\n    X.v[3] ^= X.v[2];\n  }\n\n  if (Nrounds > 31) {\n    X.v[0] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_7_0);\n    X.v[3] ^= X.v[0];\n    X.v[2] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_7_1);\n    X.v[1] ^= X.v[2];\n  }\n\n  if (Nrounds > 31) {\n    X.v[0] += ks[3];\n    X.v[1] += ks[4];\n    X.v[2] += ks[0];\n    X.v[3] += ks[1];\n    X.v[4 - 1] += 8;\n  }\n\n  if (Nrounds > 32) {\n    X.v[0] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_0_0);\n    X.v[1] ^= X.v[0];\n    X.v[2] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_0_1);\n    X.v[3] ^= X.v[2];\n  }\n\n  if (Nrounds > 33) {\n    X.v[0] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_1_0);\n    X.v[3] ^= X.v[0];\n    X.v[2] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_1_1);\n    X.v[1] ^= X.v[2];\n  }\n\n  if (Nrounds > 34) {\n    X.v[0] += X.v[1]
 ;\n    X.v[1] = RotL_32(X.v[1], R_32x4_2_0);\n    X.v[1] ^= X.v[0];\n    X.v[2] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_2_1);\n    X.v[3] ^= X.v[2];\n  }\n\n  if (Nrounds > 35) {\n    X.v[0] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_3_0);\n    X.v[3] ^= X.v[0];\n    X.v[2] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_3_1);\n    X.v[1] ^= X.v[2];\n  }\n\n  if (Nrounds > 35) {\n    X.v[0] += ks[4];\n    X.v[1] += ks[0];\n    X.v[2] += ks[1];\n    X.v[3] += ks[2];\n    X.v[4 - 1] += 9;\n  }\n\n  if (Nrounds > 36) {\n    X.v[0] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_4_0);\n    X.v[1] ^= X.v[0];\n    X.v[2] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_4_1);\n    X.v[3] ^= X.v[2];\n  }\n\n  if (Nrounds > 37) {\n    X.v[0] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_5_0);\n    X.v[3] ^= X.v[0];\n    X.v[2] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_5_1);\n    X.v[1] ^= X.v[2];\n  }\n\n  if (Nrounds > 38) {\n    X.v[0] += X.v[1];\n    X.v[1] = RotL_32(X
 .v[1], R_32x4_6_0);\n    X.v[1] ^= X.v[0];\n    X.v[2] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_6_1);\n    X.v[3] ^= X.v[2];\n  }\n\n  if (Nrounds > 39) {\n    X.v[0] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_7_0);\n    X.v[3] ^= X.v[0];\n    X.v[2] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_7_1);\n    X.v[1] ^= X.v[2];\n  }\n\n  if (Nrounds > 39) {\n    X.v[0] += ks[0];\n    X.v[1] += ks[1];\n    X.v[2] += ks[2];\n    X.v[3] += ks[3];\n    X.v[4 - 1] += 10;\n  }\n\n  if (Nrounds > 40) {\n    X.v[0] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_0_0);\n    X.v[1] ^= X.v[0];\n    X.v[2] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_0_1);\n    X.v[3] ^= X.v[2];\n  }\n\n  if (Nrounds > 41) {\n    X.v[0] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_1_0);\n    X.v[3] ^= X.v[0];\n    X.v[2] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_1_1);\n    X.v[1] ^= X.v[2];\n  }\n  if (Nrounds > 42) {\n    X.v[0] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_2_0);\n    X
 .v[1] ^= X.v[0];\n    X.v[2] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_2_1);\n    X.v[3] ^= X.v[2];\n  }\n\n  if (Nrounds > 43) {\n    X.v[0] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_3_0);\n    X.v[3] ^= X.v[0];\n    X.v[2] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_3_1);\n    X.v[1] ^= X.v[2];\n  }\n\n  if (Nrounds > 43) {\n    X.v[0] += ks[1];\n    X.v[1] += ks[2];\n    X.v[2] += ks[3];\n    X.v[3] += ks[4];\n    X.v[4 - 1] += 11;\n  }\n\n  if (Nrounds > 44) {\n    X.v[0] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_4_0);\n    X.v[1] ^= X.v[0];\n    X.v[2] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_4_1);\n    X.v[3] ^= X.v[2];\n  }\n\n  if (Nrounds > 45) {\n    X.v[0] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_5_0);\n    X.v[3] ^= X.v[0];\n    X.v[2] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_5_1);\n    X.v[1] ^= X.v[2];\n  }\n\n  if (Nrounds > 46) {\n    X.v[0] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_6_0);\n    X.v[1] ^= X.v[0];\n    X.
 v[2] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_6_1);\n    X.v[3] ^= X.v[2];\n  }\n\n  if (Nrounds > 47) {\n    X.v[0] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_7_0);\n    X.v[3] ^= X.v[0];\n    X.v[2] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_7_1);\n    X.v[1] ^= X.v[2];\n  }\n\n  if (Nrounds > 47) {\n    X.v[0] += ks[2];\n    X.v[1] += ks[3];\n    X.v[2] += ks[4];\n    X.v[3] += ks[0];\n    X.v[4 - 1] += 12;\n  }\n\n  if (Nrounds > 48) {\n    X.v[0] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_0_0);\n    X.v[1] ^= X.v[0];\n    X.v[2] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_0_1);\n    X.v[3] ^= X.v[2];\n  }\n\n  if (Nrounds > 49) {\n    X.v[0] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_1_0);\n    X.v[3] ^= X.v[0];\n    X.v[2] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_1_1);\n    X.v[1] ^= X.v[2];\n  }\n\n  if (Nrounds > 50) {\n    X.v[0] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_2_0);\n    X.v[1] ^= X.v[0];\n    X.v[2] += X.v[3];\n    X.v
 [3] = RotL_32(X.v[3], R_32x4_2_1);\n    X.v[3] ^= X.v[2];\n  }\n\n  if (Nrounds > 51) {\n    X.v[0] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_3_0);\n    X.v[3] ^= X.v[0];\n    X.v[2] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_3_1);\n    X.v[1] ^= X.v[2];\n  }\n\n  if (Nrounds > 51) {\n    X.v[0] += ks[3];\n    X.v[1] += ks[4];\n    X.v[2] += ks[0];\n    X.v[3] += ks[1];\n    X.v[4 - 1] += 13;\n  }\n\n  if (Nrounds > 52) {\n    X.v[0] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_4_0);\n    X.v[1] ^= X.v[0];\n    X.v[2] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_4_1);\n    X.v[3] ^= X.v[2];\n  }\n\n  if (Nrounds > 53) {\n    X.v[0] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_5_0);\n    X.v[3] ^= X.v[0];\n    X.v[2] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_5_1);\n    X.v[1] ^= X.v[2];\n  }\n\n  if (Nrounds > 54) {\n    X.v[0] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_6_0);\n    X.v[1] ^= X.v[0];\n    X.v[2] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_
 32x4_6_1);\n    X.v[3] ^= X.v[2];\n  }\n\n  if (Nrounds > 55) {\n    X.v[0] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_7_0);\n    X.v[3] ^= X.v[0];\n    X.v[2] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_7_1);\n    X.v[1] ^= X.v[2];\n  }\n\n  if (Nrounds > 55) {\n    X.v[0] += ks[4];\n    X.v[1] += ks[0];\n    X.v[2] += ks[1];\n    X.v[3] += ks[2];\n    X.v[4 - 1] += 14;\n  }\n\n  if (Nrounds > 56) {\n    X.v[0] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_0_0);\n    X.v[1] ^= X.v[0];\n    X.v[2] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_0_1);\n    X.v[3] ^= X.v[2];\n  }\n\n  if (Nrounds > 57) {\n    X.v[0] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_1_0);\n    X.v[3] ^= X.v[0];\n    X.v[2] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_1_1);\n    X.v[1] ^= X.v[2];\n  }\n\n  if (Nrounds > 58) {\n    X.v[0] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_2_0);\n    X.v[1] ^= X.v[0];\n    X.v[2] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_2_1);\n    X.v[3] ^
 = X.v[2];\n  }\n\n  if (Nrounds > 59) {\n    X.v[0] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_3_0);\n    X.v[3] ^= X.v[0];\n    X.v[2] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_3_1);\n    X.v[1] ^= X.v[2];\n  }\n\n  if (Nrounds > 59) {\n    X.v[0] += ks[0];\n    X.v[1] += ks[1];\n    X.v[2] += ks[2];\n    X.v[3] += ks[3];\n    X.v[4 - 1] += 15;\n  }\n\n  if (Nrounds > 60) {\n    X.v[0] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_4_0);\n    X.v[1] ^= X.v[0];\n    X.v[2] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_4_1);\n    X.v[3] ^= X.v[2];\n  }\n\n  if (Nrounds > 61) {\n    X.v[0] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_5_0);\n    X.v[3] ^= X.v[0];\n    X.v[2] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_5_1);\n    X.v[1] ^= X.v[2];\n  }\n\n  if (Nrounds > 62) {\n    X.v[0] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_6_0);\n    X.v[1] ^= X.v[0];\n    X.v[2] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_6_1);\n    X.v[3] ^= X.v[2];\n  }\n\n  if (
 Nrounds > 63) {\n    X.v[0] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_7_0);\n    X.v[3] ^= X.v[0];\n    X.v[2] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_7_1);\n    X.v[1] ^= X.v[2];\n  }\n\n  if (Nrounds > 63) {\n    X.v[0] += ks[1];\n    X.v[1] += ks[2];\n    X.v[2] += ks[3];\n    X.v[3] += ks[4];\n    X.v[4 - 1] += 16;\n  }\n\n  if (Nrounds > 64) {\n    X.v[0] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_0_0);\n    X.v[1] ^= X.v[0];\n    X.v[2] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_0_1);\n    X.v[3] ^= X.v[2];\n  }\n\n  if (Nrounds > 65) {\n    X.v[0] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_1_0);\n    X.v[3] ^= X.v[0];\n    X.v[2] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_1_1);\n    X.v[1] ^= X.v[2];\n  }\n\n  if (Nrounds > 66) {\n    X.v[0] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_2_0);\n    X.v[1] ^= X.v[0];\n    X.v[2] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_2_1);\n    X.v[3] ^= X.v[2];\n  }\n\n  if (Nrounds > 67) {\n    X.v
 [0] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_3_0);\n    X.v[3] ^= X.v[0];\n    X.v[2] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_3_1);\n    X.v[1] ^= X.v[2];\n  }\n\n  if (Nrounds > 67) {\n    X.v[0] += ks[2];\n    X.v[1] += ks[3];\n    X.v[2] += ks[4];\n    X.v[3] += ks[0];\n    X.v[4 - 1] += 17;\n  }\n\n  if (Nrounds > 68) {\n    X.v[0] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_4_0);\n    X.v[1] ^= X.v[0];\n    X.v[2] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_4_1);\n    X.v[3] ^= X.v[2];\n  }\n\n  if (Nrounds > 69) {\n    X.v[0] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_5_0);\n    X.v[3] ^= X.v[0];\n    X.v[2] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_5_1);\n    X.v[1] ^= X.v[2];\n  }\n\n  if (Nrounds > 70) {\n    X.v[0] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_6_0);\n    X.v[1] ^= X.v[0];\n    X.v[2] += X.v[3];\n    X.v[3] = RotL_32(X.v[3], R_32x4_6_1);\n    X.v[3] ^= X.v[2];\n  }\n\n  if (Nrounds > 71) {\n    X.v[0] += X.v[3];\n    X.v[
 3] = RotL_32(X.v[3], R_32x4_7_0);\n    X.v[3] ^= X.v[0];\n    X.v[2] += X.v[1];\n    X.v[1] = RotL_32(X.v[1], R_32x4_7_1);\n    X.v[1] ^= X.v[2];\n  }\n\n  if (Nrounds > 71) {\n    X.v[0] += ks[3];\n    X.v[1] += ks[4];\n    X.v[2] += ks[0];\n    X.v[3] += ks[1];\n    X.v[4 - 1] += 18;\n  }\n  return X;\n}\n//end: the open sourced random generator from DE Shaw Research\n\n// **************************\n// BERNOULLI DISTRIBUTION\n// **************************\n\n__kernel void PRNG_threefry4x32_bernoulli(\n__global float4 *randomnumber,\nthreefry4x32_ctr_t ctr_i,\nfloat inf, float sup,\nfloat threshold,\nuint nrounds, uint numrandom) {\n\n  size_t gdx = get_global_id(0);\n\n  uint maxUint = 0;\n  maxUint--;\n  float r = (float)maxUint;\n\n  threefry4x32_ctr_t ctr = ctr_i;\n  threefry4x32_ukey_t ukey;\n\n  ukey.v[0] = ukey.v[1] = ukey.v[2] = ukey.v[3] = gdx;\n\n  threefry4x32_ctr_t random4;\n\n  if ( gdx < numrandom ) {\n    random4 = threefry4x32_R(nrounds, ctr, ukey);\n    float4 frn
 d;\n    frnd.x = ( (((float)random4.v[0]) / r) * (sup - inf) + inf ) < threshold ? 1.0f : 0.0f;\n    frnd.y = ( (((float)random4.v[1]) / r) * (sup - inf) + inf ) < threshold ? 1.0f : 0.0f;\n    frnd.z = ( (((float)random4.v[2]) / r) * (sup - inf) + inf ) < threshold ? 1.0f : 0.0f;\n    frnd.w = ( (((float)random4.v[3]) / r) * (sup - inf) + inf ) < threshold ? 1.0f : 0.0f;\n    randomnumber[gdx] = frnd;\n  }\n}\n\n// **************************\n// UNIFORM DISTRIBUTION (float)\n// **************************\n\n__kernel void PRNG_threefry4x32_uniform(\n__global float4 *randomnumber,\nthreefry4x32_ctr_t ctr_i,\nfloat inf, float sup,\nuint nrounds, uint numrandom) {\n\n  size_t gdx = get_global_id(0);\n\n  uint maxUint = 0;\n  maxUint--;\n  float r = (float)maxUint;\n\n  threefry4x32_ctr_t ctr = ctr_i;\n  threefry4x32_ukey_t ukey;\n\n  ukey.v[0] = ukey.v[1] = ukey.v[2] = ukey.v[3] = gdx;\n\n  threefry4x32_ctr_t random4;\n\n  if ( gdx < numrandom ) {\n    random4 = threefry4x32_R(nrounds,
  ctr, ukey);\n    float4 frnd;\n    frnd.x = ( (((float)random4.v[0]) / r) * (sup - inf) + inf );\n    frnd.y = ( (((float)random4.v[1]) / r) * (sup - inf) + inf );\n    frnd.z = ( (((float)random4.v[2]) / r) * (sup - inf) + inf );\n    frnd.w = ( (((float)random4.v[3]) / r) * (sup - inf) + inf );\n    randomnumber[gdx] = frnd;\n  }\n}\n\n// **************************\n// UNIFORM DISTRIBUTION (uint)\n// **************************\n\n__kernel void PRNG_threefry4x32_uint_uniform(\n__global uint4 *randomnumber,\nthreefry4x32_ctr_t ctr_i,\nuint inf, uint sup,\nuint nrounds, uint numrandom) {\n\n  size_t gdx = get_global_id(0);\n\n  threefry4x32_ctr_t ctr = ctr_i;\n  threefry4x32_ukey_t ukey;\n\n  ukey.v[0] = ukey.v[1] = ukey.v[2] = ukey.v[3] = gdx;\n\n  threefry4x32_ctr_t random4;\n\n  if ( gdx < numrandom ) {\n    random4 = threefry4x32_R(nrounds, ctr, ukey);\n    uint4 frnd;\n    frnd.x = random4.v[0] % (sup - inf) + inf;\n    frnd.y = random4.v[1] % (sup - inf) + inf;\n    frnd.z = r
 andom4.v[2] % (sup - inf) + inf;\n    frnd.w = random4.v[3] % (sup - inf) + inf;\n    randomnumber[gdx] = frnd;\n  }\n}\n\n// **************************\n// GAUSSIAN DISTRIBUTION\n// **************************\n\n__kernel void PRNG_threefry4x32_gaussian(\n__global float4 *randomnumber,\nthreefry4x32_ctr_t ctr_i,\nfloat E, float V,\nuint nrounds, uint numrandom) {\n\n  size_t gdx = get_global_id(0);\n\n  uint maxUint = 0;\n  maxUint--;\n  float r = (float)maxUint;\n\n  threefry4x32_ctr_t ctr = ctr_i;\n  threefry4x32_ukey_t ukey1, ukey2;\n\n  ukey1.v[0] = ukey2.v[1] = ukey1.v[2] = ukey2.v[3] = gdx;\n  ukey2.v[0] = ukey1.v[1] = ukey2.v[2] = ukey1.v[3] = 0;\n\n  threefry4x32_ctr_t random1, random2;\n\n  if ( gdx < numrandom ) {\n    random1 = threefry4x32_R(nrounds, ctr, ukey1);\n    random2 = threefry4x32_R(nrounds, ctr, ukey2);\n    float4 frnd1;\n\n    float r1 = (((float)random1.v[0]) / r); // generate a random sequence of uniform distribution\n    float r2 = (((float)random2.v[0]) 
 / r);\n    float r3 = (((float)random1.v[1]) / r);\n    float r4 = (((float)random2.v[1]) / r);\n    float r5 = (((float)random1.v[2]) / r);\n    float r6 = (((float)random2.v[2]) / r);\n    float r7 = (((float)random1.v[3]) / r);\n    float r8 = (((float)random2.v[3]) / r);\n\n    if(r2 == 0 || r4 == 0 || r6 == 0 || r8 == 0) {\n      r2 += 0.0001;\n      r4 += 0.0001;\n      r6 += 0.0001;\n      r8 += 0.0001;\n    }\n\n    frnd1.x = cos(2*M_PI*r1)*sqrt(-2.0*log(r2)) * V + E;// return a pseudo sequence of normal distribution using two above uniform noise data\n    //frnd2.x = sin(2*M_PI*r1)*sqrt(-2.0*log(r2));      // return the quadrature counterpart of the foregoing pseudo normal distribution sequence\n    frnd1.y = cos(2*M_PI*r3)*sqrt(-2.0*log(r4)) * V + E;// return a pseudo sequence of normal distribution using two above uniform noise data\n    //frnd2.y = sin(2*M_PI*r3)*sqrt(-2.0*log(r4));      // return the quadrature counterpart of the foregoing pseudo normal distribution seq
 uence\n    frnd1.z = cos(2*M_PI*r5)*sqrt(-2.0*log(r6)) * V + E;// return a pseudo sequence of normal distribution using two above uniform noise data\n    //frnd2.z = sin(2*M_PI*r5)*sqrt(-2.0*log(r6));      // return the quadrature counterpart of the foregoing pseudo normal distribution sequence\n    frnd1.w = cos(2*M_PI*r7)*sqrt(-2.0*log(r8)) * V + E;// return a pseudo sequence of normal distribution using two above uniform noise data\n    //frnd2.w = sin(2*M_PI*r7)*sqrt(-2.0*log(r8));      // return the quadrature counterpart of the foregoing pseudo normal distribution sequence\n\n    randomnumber[gdx] = frnd1;\n  }\n}\n";
-
-} // namespace singa
\ No newline at end of file



Mime
View raw message