I have been trying to implement some code that calls reduce on a thrust::device_ptr, and the results are not consistent with the CPU implementation when the values are large. I do have to deal with large values, so is there a way around this?
My code:
#include <cuda_runtime_api.h>
#include <stdio.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <iostream>
#define NZ 412//
#define NX 402//
using namespace std;
using real =double;
/**
 * Allocates a zero-filled dim1 x dim2 array of `real` as a single contiguous
 * buffer plus a table of row pointers into it.
 *
 * preal[0] points at the contiguous storage and preal[i] at the start of row
 * i, so preal[0] can be handed to a flat copy of dim1 * dim2 elements.
 * Release with `delete[] preal[0]; delete[] preal;`.
 *
 * @param preal [out] receives the row-pointer table; set to nullptr when
 *              either dimension is not positive.
 * @param dim1  number of rows.
 * @param dim2  number of columns.
 */
void allocate_array_2d(real**& preal, const int dim1, const int dim2) {
    // Nothing to allocate; this also avoids storing into preal[0] of a
    // zero-length pointer table.
    if (dim1 <= 0 || dim2 <= 0) {
        preal = nullptr;
        return;
    }

    preal = new real*[dim1];
    // Contiguous allocation of 2D arrays. The element count is computed in a
    // 64-bit type so dim1 * dim2 cannot overflow int; the trailing () makes
    // every element start at zero.
    preal[0] = new real[static_cast<unsigned long long>(dim1) * dim2]();
    for (int row = 1; row < dim1; ++row) {
        preal[row] = preal[row - 1] + dim2;
    }
}
// Reports a failed CUDA runtime call on stderr, tagged with file and line.
// The argument is evaluated exactly once and its status stored, so a failing
// call such as cudaMalloc(...) is not executed a second time just to format
// the message. The macro only reports; execution continues afterwards.
// Wrapped in do { } while (0) so that `cudaCheckError(x);` is one statement.
#define cudaCheckError(code)                                                   \
    do {                                                                       \
        const cudaError_t cuda_check_status_ = (code);                         \
        if (cuda_check_status_ != cudaSuccess) {                               \
            fprintf(stderr, "Cuda failure %s:%d: '%s' \n", __FILE__, __LINE__, \
                    cudaGetErrorString(cuda_check_status_));                   \
        }                                                                      \
    } while (0)
// Sums an NZ x NX array of large doubles on the GPU with thrust::reduce and
// on the CPU with a plain loop, then reports whether the two sums agree.
int main()
{
    real** a;
    allocate_array_2d(a, NZ, NX);  // input array
    for (int i = 0; i < NZ; i++) {
        for (int j = 0; j < NX; j++) {
            a[i][j] = 2.14748e+09;
        }
    }

    real* da;
    cudaCheckError(cudaMalloc(&da, NZ * NX * sizeof(real)));
    cudaCheckError(cudaMemcpy(da, a[0], NZ * NX * sizeof(real), cudaMemcpyHostToDevice));

    real sum1 = 0;
    thrust::device_ptr<real> dev_ptr = thrust::device_pointer_cast(da);
    // The init value must have type `real`: thrust::reduce takes its result
    // type from the init argument, so the integer literal 0 would make the
    // whole reduction run in int and lose these large values.
    sum1 = thrust::reduce(dev_ptr, dev_ptr + NZ * NX, real(0), thrust::plus<real>());
    cout<<" \nsum gpu "<< sum1<<"\n";

    real sum2 = 0;
    for (int i = 0; i < NZ; i++) {
        for (int j = 0; j < NX; j++) {
            sum2 += a[i][j];
        }
    }
    cout<<"\nsum cpu "<< sum2<<"\n";

    // GPU and CPU add in a different order, so compare with a small relative
    // tolerance rather than printing both verdicts unconditionally.
    const real diff = sum2 - sum1;
    const real abs_diff = diff < 0 ? -diff : diff;
    const real magnitude = sum2 < 0 ? -sum2 : sum2;
    if (abs_diff <= 1e-12 * magnitude) {
        std::cout << "\nSUCESS "<< "\n";
    } else {
        std::cout << "\nFailure & by "<<sum2-sum1<< "\n";
    }

    // Release the device buffer, then the contiguous host storage and the
    // row-pointer table.
    cudaCheckError(cudaFree(da));
    delete[] a[0];
    delete[] a;
    return 0;
}
The compiler that I am using is nvcc and my graphics card is nvidia 1650 with compute capability 7.5.

According to the documentation, thrust expects the type for summation to be reflected in the init value:
sum1 = thrust::reduce(dev_ptr, dev_ptr+NZ*NX, 0, thrust::plus<real>());
The type of that constant you have is an integral type. If you change that to a double-precision constant:
sum1 = thrust::reduce(dev_ptr, dev_ptr+NZ*NX, 0.0, thrust::plus<real>());
you get matching results, between CPU and GPU, according to my testing. (You could alternatively cast your constant to real type: (real)0 and use that, and there are other ways to address this as well, such as dropping the use of the init value and the binary op.)


Thrust's exclusive_scan_by_key function takes the same amount of time as a sequential implementation?

I'm relatively new to Thrust and I'm trying to perform a segmented scan. Here is my code, which you should be able to run as-is:
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <chrono>
// Sequential scan for CPU.
//
// Treats `in` as `s` back-to-back segments of `m` floats and computes an
// exclusive prefix sum independently inside each segment: the first output of
// a segment is 0 and out[k] = out[k-1] + in[k-1] afterwards.
//
// Returns a newly allocated array of s * m floats; the caller owns it and
// must release it with delete[]. When s or m is not positive there is nothing
// to scan and an empty (but still delete[]-able) allocation is returned, which
// keeps the per-segment "first element = 0" store from writing out of bounds.
float* test_seqScan(float* in, int s, int m) {
    if (s <= 0 || m <= 0) {
        return new float[0];
    }

    // 64-bit element count and offsets so s * m cannot overflow int.
    const long long total = static_cast<long long>(s) * m;
    float* out = new float[total];

    for (int seg = 0; seg < s; ++seg) {
        const long long base = static_cast<long long>(seg) * m;
        out[base] = 0;
        for (int j = 1; j < m; ++j) {
            out[base + j] = out[base + j - 1] + in[base + j - 1];
        }
    }
    return out;
}
// Runs a segmented exclusive scan of dev_in on the device, using dev_keys to
// delimit the segments (consecutive equal keys form one segment).
//
// The inputs are taken by const reference: passing thrust::device_vector by
// value would copy both vectors on every call, and a caller timing this
// function would be measuring those copies as well.
//
// dev_in    values to scan; expected to hold at least dev_keys.size() items.
// dev_keys  segment key for every value.
// s, m      segment count and segment length; kept for interface
//           compatibility, the output is sized from dev_keys instead.
//
// The result lives in a local device vector and is discarded on return.
void test_sumScan(const thrust::device_vector<float>& dev_in, const thrust::device_vector<int>& dev_keys, int s, int m) {
    (void)s;
    (void)m;
    // Allocate device memory for output: one element per scanned key.
    thrust::device_vector<float> dev_out(dev_keys.size());
    thrust::exclusive_scan_by_key(thrust::device, dev_keys.begin(), dev_keys.end(), dev_in.begin(), dev_out.begin());
}
// Builds s segments of m values (1, 2, ..., m in every segment), then times
// the sequential CPU scan and the Thrust segmented scan separately.
int main(){
    int s = 100;
    int m = 100000;

    // Same data in three forms: raw host array for the CPU scan, host vector
    // for upload, and one key per element identifying its segment.
    float* seq_in = new float[s * m];
    thrust::host_vector<float> par_in(s * m);
    thrust::host_vector<int> keys(s * m);
    for (int i = 0; i < s; i++) {
        for (int j = 0; j < m; j++) {
            const int idx = i * m + j;
            seq_in[idx] = j + 1;
            par_in[idx] = j + 1;
            keys[idx] = i;
        }
    }
    thrust::device_vector<float> dev_in = par_in;
    thrust::device_vector<int> dev_keys = keys;

    auto t1 = std::chrono::high_resolution_clock::now();
    float* seq_out = test_seqScan(seq_in, s, m);
    auto t2 = std::chrono::high_resolution_clock::now();
    auto duration1 = std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1).count();
    std::cout << "Sequential duration: " << duration1 << "\n\n";

    auto t3 = std::chrono::high_resolution_clock::now();
    test_sumScan(dev_in, dev_keys, s, m);
    auto t4 = std::chrono::high_resolution_clock::now();
    // The parallel duration must come from t3/t4. Reusing t2 - t1 here simply
    // reprints the sequential time, which makes both figures identical.
    auto duration2 = std::chrono::duration_cast<std::chrono::microseconds>(t4 - t3).count();
    std::cout << "Parallel duration: " << duration2 << "\n\n";

    // test_seqScan returns an array the caller owns.
    delete[] seq_out;
    delete[] seq_in;
    return 0;
}
My issue is that both these snippets of code take exactly the same amount of time to run regardless of how small or large I set s and m. I assume that I'm doing something wrong, but I don't know what; can anyone point out the issue?

Access a matrix as its transpose in tiled matrix multiplication in CUDA

I'm currently experimenting with CUDA and i came across this kernel from an answer for matrix multiplication:
Instead of computing A*B, I want to compute A_Transpose*A without storing A_Transpose (only the matrix A is passed to the kernel). I have to set the indexes properly, but I'm confused by this matrix representation. Any help would be appreciated.
most of what you need is here and here.
In the first link it is identified that A x A^T involves taking inner products of rows of matrix A, and similarly A^T x A will involve taking inner products of columns of matrix A. Also note the symmetry statement. In the second link (scroll down from that point a bit in the programming guide) you will find a complete tiled matrix multiply. You just need to index into both tiles by column.
Here is a worked example, using the code from the SO answer you linked:
$ cat t1654.cu
#include <iostream>
#include <cstdio>
#include <cstdlib>
const int TILE_DIM = 32;
// Computes C = A^T * A for a row-major ARows x ACols matrix A without ever
// storing A^T. C is ACols x ACols, row-major.
//
// Preconditions: launch with blockDim == (TILE_DIM, TILE_DIM) and a 2D grid
// that covers the ACols x ACols output. Uses two TILE_DIM x TILE_DIM tiles of
// static shared memory.
//
// Each output element C[Row][Col] is the inner product of column Row and
// column Col of A. Per step k, both tiles hold the same TILE_DIM rows of A:
//   As - the columns of A that correspond to this block's output rows,
//   Bs - the columns of A that correspond to this block's output columns.
// Out-of-range tile entries are zero-filled so they add nothing to the sum.
template <typename T>
__global__ void ATA(const T * __restrict__ A, T * __restrict__ C, int ARows, int ACols)
{
    T CValue = 0;

    int Row = blockIdx.y*TILE_DIM + threadIdx.y;
    int Col = blockIdx.x*TILE_DIM + threadIdx.x;

    __shared__ T As[TILE_DIM][TILE_DIM];
    __shared__ T Bs[TILE_DIM][TILE_DIM];

    for (int k = 0; k < (TILE_DIM + ARows - 1)/TILE_DIM; k++) {
        const int aRow = k*TILE_DIM + threadIdx.y;               // row of A loaded by this thread
        const int aCol = blockIdx.y*blockDim.y + threadIdx.x;    // column feeding the output rows

        if (aRow < ARows && aCol < ACols)
            As[threadIdx.y][threadIdx.x] = A[aRow*ACols + aCol];
        else
            As[threadIdx.y][threadIdx.x] = 0.0;

        if (aRow < ARows && Col < ACols)
            Bs[threadIdx.y][threadIdx.x] = A[aRow*ACols + Col];
        else
            Bs[threadIdx.y][threadIdx.x] = 0.0;

        // Both tiles must be completely written before any thread reads them.
        __syncthreads();

        // Both tiles are indexed by column: As[n][ty] and Bs[n][tx] walk down
        // column `Row` and column `Col` of A respectively.
        for (int n = 0; n < TILE_DIM; ++n)
            CValue += As[n][threadIdx.y] * Bs[n][threadIdx.x];

        // Do not let the next iteration overwrite tiles still being read.
        __syncthreads();
    }

    if (Row < ACols && Col < ACols)
        C[((blockIdx.y * blockDim.y + threadIdx.y)*ACols) +
          (blockIdx.x * blockDim.x) + threadIdx.x] = CValue;
}
// Out-of-place transpose of a square, row-major dim x dim matrix:
// out[col][row] = in[row][col]. One thread per element on a 2D launch;
// threads that fall outside the matrix do nothing.
template <typename T>
__global__ void transpose_naive(const T * __restrict__ in, T * __restrict__ out, const int dim){
  const int col = threadIdx.x + blockDim.x*blockIdx.x;
  const int row = threadIdx.y + blockDim.y*blockIdx.y;
  if ((col < dim) && (row < dim)) {
    out[col*dim + row] = in[row*dim + col];
  }
}
// Straightforward matrix multiply C = A * B on row-major data, one thread per
// output element on a 2D launch.
//   A is rowA x colA, B is colA x colB, C is rowA x colB.
// Threads whose (row, col) lies outside C do nothing.
template <typename T>
__global__ void mm_naive(const T * __restrict__ A, const T * __restrict__ B, T * __restrict__ C, const int rowA, const int colA, const int colB){
  const int col = threadIdx.x + blockDim.x*blockIdx.x;
  const int row = threadIdx.y + blockDim.y*blockIdx.y;
  if ((row < rowA) && (col < colB)){
    T Cval = 0;
    for (int i = 0; i < colA; i++) {
      Cval += A[row*colA + i] * B[i*colB + col];
    }
    C[row*colB + col] = Cval;
  }
}
typedef float mt;
// Reports a failed CUDA call on stderr; returns true when status is cudaSuccess.
static bool cudaOk(cudaError_t status, const char *what){
  if (status == cudaSuccess) return true;
  fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
  return false;
}

#ifdef DEBUG
// Prints an n x n row-major matrix, one row per line, followed by a blank line.
static void printSquare(const mt *M, int n){
  for (int i = 0; i < n; i++){
    for (int j = 0; j < n; j++)
      std::cout << M[i*n+j] << " ";
    std::cout << std::endl;}
  std::cout << std::endl;
}
#endif

// Exercises the ATA kernel twice:
//   test 1 - A is the identity, so A^T * A should come back as the identity;
//   test 2 - A is random, and the ATA result is compared element by element
//            with transpose_naive followed by mm_naive.
// The reference path in test 2 treats A as square (it passes n for every
// dimension), which holds for the m == n == 64 used here.
// Returns 0 when the results match, 1 on a CUDA error or a mismatch.
int main(){
  mt *d_A, *d_B, *d_C, *h_A, *h_C, *h_C1;
  int m = 64;
  int n = 64;
  h_A  = new mt[m*n];
  h_C  = new mt[n*n];
  h_C1 = new mt[n*n];
  if (!cudaOk(cudaMalloc(&d_A, m*n*sizeof(d_A[0])), "cudaMalloc d_A")) return 1;
  if (!cudaOk(cudaMalloc(&d_B, m*n*sizeof(d_A[0])), "cudaMalloc d_B")) return 1;
  if (!cudaOk(cudaMalloc(&d_C, n*n*sizeof(d_C[0])), "cudaMalloc d_C")) return 1;

  // test 1
  for (int i = 0; i < m; i++)
    for (int j = 0; j < n; j++)
      h_A[i*n+j] = (i==j)?1.0f:0.0f;
  if (!cudaOk(cudaMemcpy(d_A, h_A, m*n*sizeof(d_A[0]), cudaMemcpyHostToDevice), "upload of A (test 1)")) return 1;
  dim3 block(TILE_DIM, TILE_DIM);
  dim3 grid((n+block.x-1)/block.x, (n+block.y-1)/block.y);
  ATA<<<grid,block>>>(d_A, d_C, m, n);
  if (!cudaOk(cudaGetLastError(), "ATA launch (test 1)")) return 1;
  if (!cudaOk(cudaMemcpy(h_C, d_C, n*n*sizeof(d_C[0]), cudaMemcpyDeviceToHost), "download of C (test 1)")) return 1;
#ifdef DEBUG
  printSquare(h_C, n);
#endif

  // test 2
  for (int i = 0; i < m; i++)
    for (int j = 0; j < n; j++)
      h_A[i*n+j] = rand()%10;
  if (!cudaOk(cudaMemcpy(d_A, h_A, m*n*sizeof(d_A[0]), cudaMemcpyHostToDevice), "upload of A (test 2)")) return 1;
  ATA<<<grid,block>>>(d_A, d_C, m, n);
  if (!cudaOk(cudaGetLastError(), "ATA launch (test 2)")) return 1;
  if (!cudaOk(cudaMemcpy(h_C, d_C, n*n*sizeof(d_C[0]), cudaMemcpyDeviceToHost), "download of C (test 2)")) return 1;
#ifdef DEBUG
  printSquare(h_C, n);
#endif

  // Reference result: explicit transpose into d_B, then a naive multiply.
  transpose_naive<<<grid,block>>>(d_A, d_B, n);
  if (!cudaOk(cudaGetLastError(), "transpose_naive launch")) return 1;
  mm_naive<<<grid,block>>>(d_B, d_A, d_C, n, n, n);
  if (!cudaOk(cudaGetLastError(), "mm_naive launch")) return 1;
  if (!cudaOk(cudaMemcpy(h_C1, d_C, n*n*sizeof(d_C[0]), cudaMemcpyDeviceToHost), "download of reference C")) return 1;
#ifdef DEBUG
  printSquare(h_C1, n);
#endif

  // Stop at the first differing element and report it through the exit code.
  int status = 0;
  for (int i = 0; i < n*n; i++) {
    if (h_C[i] != h_C1[i]) {
      std::cout << "mismatch at: " << i << " was: " << h_C[i] << " should be: " << h_C1[i] << std::endl;
      status = 1;
      break;
    }
  }

  cudaFree(d_A);
  cudaFree(d_B);
  cudaFree(d_C);
  delete[] h_A;
  delete[] h_C;
  delete[] h_C1;
  return status;
}
$ nvcc -o t1654 t1654.cu
$ cuda-memcheck ./t1654
========= ERROR SUMMARY: 0 errors
Note that loading the Bs tile is identical in both cases. The main changes are in loading the As tile; also note the indexing change when computing CValue. These changes are necessary to index by column in both cases.
There may still be bugs. I have not tested the non-square case, nor have I tested the case where the matrix size is not a multiple of block size. Furthermore I've taken no advantage of the symmetry in the output. However this should help with the indexing.

Why is my Rcpp code not much faster with OpenMP parallelization?

I am trying to use OpenMP to parallelise my loop to make it faster. The problem is that the parallelised version is not faster than the sequential version.
#include <Rcpp.h>
#include <iostream>
// [[Rcpp::plugins(openmp)]]
#include <omp.h>
#include "test.h"
using namespace std;
// [[Rcpp::export]]
std::vector<double> parallel_random_sum(int n, int ncores) {
  // Returns a vector of length n whose j-th entry is j / n, filling it with an
  // OpenMP parallel loop that uses up to `ncores` threads.
  // A non-positive n yields an empty vector.
  if (n <= 0) {
    return std::vector<double>();
  }
  // The num_threads clause needs a positive value; fall back to one thread.
  const int threads = ncores > 0 ? ncores : 1;

  std::vector<double> res(n);
  // Each iteration writes only its own element, so no synchronisation is needed.
  #pragma omp parallel for num_threads(threads)
  for (int j = 0; j < n; ++j) {
    res[j] = static_cast<double>(j) / n;
  }
  return res;
}
// [[Rcpp::export]]
std::vector<double> not_parallel_random_sum(int n) {
  // Sequential counterpart of the parallel version: returns a vector of
  // length n whose j-th entry is j / n. A non-positive n yields an empty vector.
  if (n <= 0) {
    return std::vector<double>();
  }

  std::vector<double> res(n);
  for (int j = 0; j < n; ++j) {
    res[j] = static_cast<double>(j) / n;
  }
  return res;
}
/*** R
parallel_random_sum(1e7, 8),
times = 20
result ==>
parallel_random_sum(1e+07,8) 62.02360 milliseconds
not_parallel_random_sum(1e+07) 65.56082 milliseconds
The code you are trying to parallelize is just not expensive enough, making the overhead of the parallelization comparable to the gain. If you add some artificial workload in the loop by sleeping for a short amount of time, you can see the performance gain:
#include <chrono>
#include <thread>
#include <Rcpp.h>
// [[Rcpp::plugins(openmp)]]
#include <omp.h>
// [[Rcpp::depends(RcppParallel)]]
#include <RcppParallel.h>
// [[Rcpp::export]]
Rcpp::NumericVector parallel_sleep(int n, int ncores) {
  // Fills a numeric vector of length n with j / n, sleeping briefly in every
  // iteration as an artificial workload, using an OpenMP loop with `ncores`
  // threads.
  Rcpp::NumericVector res_(n);
  // The worker threads write through the RcppParallel wrapper around res_
  // rather than through the Rcpp vector itself; res_ is what gets returned.
  RcppParallel::RVector<double> res(res_);
  #pragma omp parallel for num_threads(ncores)
  for (int j = 0; j < n; ++j) {
    // NOTE(review): the listing as received had no sleep call although the
    // function name and the <chrono>/<thread> includes call for one; the
    // 10 microsecond duration is an assumption - confirm.
    std::this_thread::sleep_for(std::chrono::microseconds(10));
    res[j] = static_cast<double>(j) / n;
  }
  return res_;
}
// [[Rcpp::export]]
Rcpp::NumericVector not_parallel_sleep(int n) {
  // Sequential baseline: fills a numeric vector of length n with j / n,
  // sleeping briefly in every iteration as an artificial workload.
  Rcpp::NumericVector res(n);
  for (int j = 0; j < n; ++j) {
    // NOTE(review): the listing as received had no sleep call although the
    // function name and the <chrono>/<thread> includes call for one; the
    // 10 microsecond duration is an assumption - confirm.
    std::this_thread::sleep_for(std::chrono::microseconds(10));
    res[j] = static_cast<double>(j) / n;
  }
  return res;
}
/*** R
N <- 1e4
parallel_sleep(N, 8),
# A tibble: 2 x 14
expression min mean median max `itr/sec` mem_alloc n_gc n_itr total_time result memory time gc
<chr> <bch:tm> <bch:tm> <bch:tm> <bch> <dbl> <bch:byt> <dbl> <int> <bch:tm> <list> <list> <lis> <list>
1 parallel_sle… 73.2ms 81.3ms 82.3ms 87ms 12.3 80.7KB 0 7 569ms <dbl [1… <Rprofme… <bch… <tibbl…
2 not_parallel… 667.8ms 667.8ms 667.8ms 668ms 1.50 80.7KB 0 1 668ms <dbl [1… <Rprofme… <bch… <tibbl…
Note that I am also using data structures from RcppParallel to avoid the need for a deep copy when returning the data (cf. the comment by @coatless).

Why do products of same-size matrices in Eigen take very different amounts of time?

I used Eigen to calculate the product of two matrices: the first is A = (B*C).eval() and the second is D = (E*F).eval(). Here B, C, E and F are the same size (1500 x 1500) but hold different values. I find that the first one takes about 200 ms while the second one takes about 6000 ms, and I have no idea why this happens.
#include <iostream>
#include <time.h>
#include "Eigen/Dense"
// Fills two float matrices with constant values, multiplies them once with
// Eigen, and prints the clock() difference measured around the product.
int main() {
clock_t start, stop;
Eigen::MatrixXf mat_a(1200, 1500);
Eigen::MatrixXf mat_b(1500, 1500);
// NOTE(review): declared as 1000 x 1300, while the product assigned to it
// below is 1200 x 1500 - presumably Eigen resizes on assignment; confirm.
Eigen::MatrixXf mat_r(1000, 1300);
int i, j;
float c = 0;
// Fill mat_a with (float)(c/3 * 1.0e-40).
// NOTE(review): as listed, c is still 0 at this point, so every entry would be
// exactly 0 rather than a value around 1e-40 - confirm against the original.
for (i = 0; i < 1200; i++) {
for (j = 0; j < 1500; j++) {
mat_a(i, j) = (float)(c/3 * 1.0e-40);
//if (i % 2 == 0 && j % 2 == 0) mat_a(i, j);
//std::cout << mat_a.row(0) << std::endl;
c = 100;
// Fill mat_b with (float)(c/3 * 0.5e-10), using c = 100 set just above.
for (i = 0; i < 1500; i++) {
for (j = 0; j < 1500; j++) {
mat_b(i, j) = (float)(c/3 * 0.5e-10);
//std::cout << mat_b.row(0) << std::endl;
// Only the product itself is timed; the printed value is the raw clock()
// difference (clock ticks), not milliseconds.
start = clock();
mat_r = mat_a * mat_b;
stop = clock();
std::cout << stop - start << std::endl;
return 0;
As shown in the example code above, I find that this is caused by the values in the matrices: when mat_a holds values of about 1e-40 and mat_b holds values of about 1e-10, the problem occurs consistently.
Can anyone explain it?
This is because your matrix contains denormal numbers that are slow to deal with for the CPU. You should make sure that you are using reasonable units so that those can be considered as zeros, and then enable the flush-to-zero (FTZ) and denormals-as-zero flags (DAZ), for instance using the fast-math mode of your compiler or at runtime, see this SO question.

OpenACC: creating data while running inside a kernels region

I have a task that is to be accelerated with OpenACC. I need to do dynamic memory allocation within a kernel computation. I've built a simpler demo of it, shown below.
#include <iostream>
using namespace std;
// Intended to produce a 10-element int array holding init, init+1, ...,
// init+9 and to return a pointer to it. Marked `acc routine seq`.
// NOTE(review): ptr is declared but never assigned before ptr[i] is written;
// the `acc data create(ptr[:10])` directive below is the only thing in this
// listing that refers to storage for it - confirm what was intended.
#pragma acc routine seq
int *routine(int init) {
int *ptr;
#pragma acc data create(ptr[:10])
for (int i = 0; i < 10; ++i) {
ptr[i] = init + i;
return ptr;
// Prints the first 10 ints of arr on one line, separated by spaces, followed
// by a newline. The length 10 is hard-coded; arr must hold at least that many.
void print_array(int *arr) {
for (int i = 0; i < 10; ++i) {
cout << arr[i] << " ";
cout << endl;
// Calls routine(i) for i = 0..4 inside an `acc kernels` region and stores each
// returned pointer in arrs.
int main(void) {
int *arrs[5];
#pragma acc kernels
for (int i = 0; i < 5; ++i) {
arrs[i] = routine(i);
// NOTE(review): the body of this second loop is not present in the listing;
// print_array is defined but never called, so presumably it was called on
// arrs[i] here - confirm.
for (int i = 0; i < 5; ++i) {
return 0;
In this demo, I'm trying to call the routine while running inside a kernels construct. The routine wants to create some data on the GPU and put some values into it.
The code compiles, but it reports the following problems at runtime.
lisanhu@lisanhu-XPS-15-9550:create_and_copyout$ pgc++ -o test -acc -Minfo=accel
6, Generating acc routine seq
23, Generating implicit copyout(arrs[:])
26, Accelerator restriction: size of the GPU copy of arrs is unknown
Loop is parallelizable
Generating implicit copy(arrs[:][:])
Accelerator kernel generated
Generating Tesla code
26, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */
lisanhu@lisanhu-XPS-15-9550:create_and_copyout$ ./test
call to cuStreamSynchronize returned error 715: Illegal instruction
I'm wondering what I should do to accomplish this task (dynamically allocating memory while a kernels construct is executing). I would really appreciate your help.
This is untested, and probably very slow, but this might do what you need it to.
int main() {
const int num = 20;
int a[x] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 0};
int* sizes = (int *)malloc(num * sizeof(int));
int *ptrs[num];
int* temp, *temp2;
int sum;
int* finished = (int *)malloc(num * sizeof(int));
for (int x = 0; x < num; ++x){
finished[x] = 0;
#pragma acc kernels copyin(a[0:10]) copyout(ptrs[:num][:1]) async(num*2+1)
#pragma acc loop private(temp)
for (int i = 0; i < num; ++i){
#pragma acc loop seq async(i)
for (int j = 0; j < 1; ++j){
temp = ptrs[x];
sizes[i] = ...
while (ptrs[x] != x);
ptrs[x] = routine(a, sizes[i]);
while (true){
sum = 0;
for (int x = 0; x < num; ++x){
sum += finished[x];
if (sum == num){
for (int x = 0; x < num; ++x){
if (acc_async_test(x) != 0 && finished[x] == 0){
finished[x] = 1;
#pragma acc update host(sizes[x:1])
temp = (int *)malloc(size[x] * sizeof(int));
#pragma acc enter data copyin(temp[0:x])
temp2 = acc_deviceptr(temp);
ptrs[x] = temp2;
#pragma acc update device(ptrs[x:1][0:1])
