How can I create Pointers to an RcppEigen matrix that I can use with std::nth_element and openMP?

I am trying to implement a function in Rcpp that takes a matrix as input and calculates the quantiles specified by the user for each row of said matrix. Since I want to use OpenMP, I tried to do it using RcppEigen due to thread-safety concerns.
One reason this looks a bit complicated is that, to calculate quantiles efficiently, I tried to mimic this approach (finding quartiles, first answer) while allowing for user input. So essentially, in the first step I create a vector of indices corresponding to the quantiles. In the second step I try to access the corresponding values in the for loop.
This is the code I was trying:
// // -*- mode: C++; c-indent-level: 4; c-basic-offset: 4; indent-tabs-mode: nil; -*-
// [[Rcpp::depends(RcppEigen)]]
#include <RcppEigen.h>
// [[Rcpp::plugins(openmp)]]
#ifdef _OPENMP
#include <omp.h>
// [[Rcpp::plugins(cpp11)]]
#include <random>
// [[Rcpp::export]]
SEXP summaryParC(const Eigen::MatrixXd x,
const Eigen::VectorXd quantiles,
int nrow, int ncol, const int ncores)
const int no_quantiles = quantiles.size();
Eigen::MatrixXd result(nrow, no_quantiles);
// this part is just to give me a vector of indices I need later on in the foor loop
Eigen::VectorXi indices(no_quantiles +1);
indices[0] = -1;
for (int k=0; k<no_quantiles; k++){
if (quantiles[k] < 0.5){
indices[k+1] = floor(quantiles[k] * (ncol-1));
} else {
indices[k+1] = ceil(quantiles[k] * (ncol-1));
#pragma omp parallel num_threads(ncores)
#pragma omp for
for(int i = 0; i < nrow; i++){
// I am trying to convert it into a vector so I can sort it
Eigen::VectorXd v = (x.row(i));
auto * ptr = v; // this fails
// here I want to use the pointer to access the n-th element of the vector
for(int q=0; q<no_quantiles; q++){ //quantiles
std::nth_element(ptr + indices[q] + 1, ptr + indices[q+1], ptr + ncol);
result(i,q) = *(ptr + indices[q+1]);
return Rcpp::wrap(result);
The reason I wanted to define my own pointer is that Eigen::VectorXd v has nothing like v.begin(). Without OpenMP I would simply define x as a NumericMatrix and v as a NumericVector, and everything works fine. With OpenMP, can I not rely on that being thread-safe?
This works for smaller datasets, but crashes when used on a larger matrix:
// [[Rcpp::export]]
SEXP summaryC(NumericMatrix x,
NumericVector quantiles,
int nrow, int ncol, const int ncores)
const int no_quantiles = quantiles.size();
NumericMatrix result(nrow, no_quantiles);
int indices[no_quantiles +1];
indices[0] = -1;
for (int k=0; k<no_quantiles; k++){
if (quantiles[k] < 0.5){
indices[k+1] = floor(quantiles[k] * (ncol-1));
} else {
indices[k+1] = ceil(quantiles[k] * (ncol-1));
#pragma omp parallel num_threads(ncores)
#pragma omp for
for(int i = 0; i < nrow; i++){
// converting it into a vector so I can sort it
NumericVector v = (x.row(i));
for(int q=0; q<no_quantiles; q++){ //quantiles
std::nth_element(v.begin() + indices[q] + 1, v.begin() + indices[q+1], v.end());
result(i,q) = *(v.begin() + indices[q+1]);
return Rcpp::wrap(result);
Thank you very much!
I implemented Ralf Stubner's approach. The pointer works fine as far as I can tell. (Unfortunately, R still aborts the session when I try to run it. As Dirk Eddelbuettel pointed out, using a pointer does not solve the problem of accessing R memory.)
// [[Rcpp::export]]
SEXP summaryParC(Eigen::MatrixXd x,
const Eigen::VectorXd quantiles,
int nrow, int ncol, const int ncores)
const int no_quantiles = quantiles.size();
Eigen::MatrixXd result(nrow, no_quantiles);
Eigen::VectorXi indices(no_quantiles +1);
indices[0] = -1;
for (int k=0; k<no_quantiles; k++){
if (quantiles[k] < 0.5){
indices[k+1] = floor(quantiles[k] * (ncol-1));
} else {
indices[k+1] = ceil(quantiles[k] * (ncol-1));
#pragma omp parallel num_threads(ncores)
#pragma omp for
for(int i = 0; i < nrow; i++){
Eigen::VectorXd v = (x.row(i));
double * B =;
double * E = B + nrow;
for(int q=0; q<no_quantiles; q++){ //quantiles
std::nth_element(B + indices[q] + 1, B + indices[q+1], E);
result(i,q) = *(B + indices[q+1]);
return Rcpp::wrap(result);
2nd update: here a cleaner example of the underlying problem. I am aware of the fact that using R structures is problematic with openMP, but maybe the example can lead to a better understanding of the underlying reasons.
// [[Rcpp::plugins(openmp)]]
// [[Rcpp::plugins(cpp11)]]
#include <Rcpp.h>
#ifdef _OPENMP
#include <omp.h>
using namespace Rcpp;
// [[Rcpp::export]]
SEXP summaryC(NumericMatrix x,
int nrow, int ncol, const int ncores)
NumericMatrix result(nrow, 5);
int indices[6] = {-1, 0, 249, 500, 750, 999};
// #pragma omp parallel num_threads(ncores)
// #pragma omp for
for(int i = 0; i < nrow; i++){
NumericVector v = (x.row(i));
for(int q=0; q < 5; q++){
std::nth_element(v.begin() + indices[q] + 1, v.begin() + indices[q+1], v.end());
result(i,q) = *(v.begin() + indices[q+1]);
return Rcpp::wrap(result);
// [[Rcpp::export]]
SEXP summaryParC(NumericMatrix x,
int nrow, int ncol, const int ncores)
NumericMatrix result(nrow, 5);
int indices[6] = {-1, 0, 249, 500, 750, 999};
#pragma omp parallel num_threads(ncores)
#pragma omp for schedule(dynamic)
for(int i = 0; i < nrow; i++){
NumericVector v = (x.row(i));
for(int q=0; q<5; q++){
std::nth_element(v.begin() + indices[q] + 1, v.begin() + indices[q+1], v.end());
result(i,q) = *(v.begin() + indices[q+1]);
return Rcpp::wrap(result);
// [[Rcpp::export]]
SEXP summaryParCorder(NumericMatrix x,
int nrow, int ncol, const int ncores)
NumericMatrix result(nrow, 5);
int indices[6] = {-1, 0, 249, 500, 750, 999};
#pragma omp parallel num_threads(ncores)
#pragma omp for ordered schedule(dynamic)
for(int i = 0; i < nrow; i++){
#pragma omp ordered
NumericVector v = (x.row(i));
for(int q=0; q<5; q++){
std::nth_element(v.begin() + indices[q] + 1, v.begin() + indices[q+1], v.end());
result(i,q) = *(v.begin() + indices[q+1]);
return Rcpp::wrap(result);
***** R - code *****
#this works, but summaryParCorder is much slower.
mbm <- microbenchmark::microbenchmark(
summaryC(x = matrix(as.numeric(1:1000000), ncol = 1000),
nrow = 1000, ncol = 1000, ncores = 4),
summaryParCorder(x = matrix(as.numeric(1:1000000), ncol = 1000),
nrow = 1000, ncol = 1000, ncores = 4),
times = 20
# this breaks:
summaryParC(x = matrix(as.numeric(1:1000000), ncol = 1000),
nrow = 1000, ncol = 1000, ncores = 4)

I have not checked for compatibility with OpenMP, but Eigen::VectorXd::data() gives you the required pointer, if the vector in question is not const:
// [[Rcpp::depends(RcppEigen)]]
#include <RcppEigen.h>
// [[Rcpp::export]]
/// Returns the order statistics of `x` at the requested zero-based positions.
///
/// @param x        Values to select from. Taken by value: std::nth_element
///                 reorders this local copy in place.
/// @param indices  Zero-based positions into the sorted order. They are
///                 expected to be strictly increasing and within
///                 [0, x.size()); each step only partitions the range to the
///                 right of the previously placed position.
/// @return         Vector whose i-th entry is the value that ends up at
///                 position indices[i] after the partial reordering.
Eigen::VectorXd quantiles(Eigen::VectorXd x, const Eigen::VectorXi& indices) {
    Eigen::VectorXd result(indices.size());
    // Nothing requested: avoid reading indices[0] of an empty vector.
    if (indices.size() == 0) {
        return result;
    }

    // Eigen::VectorXd has no begin()/end(); data() yields the raw pointer
    // that std::nth_element can use as a random-access iterator.
    double* const first = x.data();
    double* const last = first + x.size();

    std::nth_element(first, first + indices[0], last);
    result(0) = x[indices[0]];
    for (int i = 1; i < indices.size(); ++i) {
        // Everything up to indices[i - 1] is already in place, so only the
        // tail after it has to be partitioned.
        std::nth_element(first + indices[i - 1] + 1, first + indices[i], last);
        result(i) = x[indices[i]];
    }
    return result;
}
/*** R
x <- runif(12)
i <- sort(sample(seq_len(12), 3)) - 1
quantiles(x, i)
Here a full solution including OpenMP:
// [[Rcpp::plugins(openmp)]]
// [[Rcpp::plugins(cpp11)]]
// [[Rcpp::depends(RcppEigen)]]
#include <RcppEigen.h>
using namespace Rcpp;
// [[Rcpp::export]]
NumericMatrix summaryC(NumericMatrix x, int nrow, int ncores)
NumericMatrix result(nrow, 5);
int indices[6] = {-1, 0, 249, 500, 750, 999};
for (int i = 0; i < nrow; i++) {
NumericVector v = (x.row(i));
for (int q = 0; q < 5; ++q) {
std::nth_element(v.begin() + indices[q] + 1, v.begin() + indices[q+1], v.end());
result(i,q) = *(v.begin() + indices[q+1]);
return result;
// [[Rcpp::export]]
/// Computes five fixed order statistics (positions 0, 249, 500, 750, 999) for
/// each of the first `nrow` rows of `x`, distributing rows over OpenMP threads.
///
/// @param x       Input matrix; every processed row is copied into a
///                thread-local Eigen vector before being partially sorted.
/// @param nrow    Number of rows to process (must not exceed x.rows()).
/// @param ncores  Number of OpenMP threads requested for the parallel region.
/// @return        nrow x 5 matrix; column q holds the value at position
///                indices[q + 1] of the partially sorted row.
Eigen::MatrixXd summaryParC(Eigen::MatrixXd x,int nrow, int ncores) {
    int indices[6] = {-1, 0, 249, 500, 750, 999};
    // The hard-coded positions reach index 999, so shorter rows (or too many
    // requested rows) would be read out of bounds. Fail before going parallel.
    if (nrow < 0 || nrow > x.rows() || x.cols() <= indices[5]) {
        Rcpp::stop("summaryParC: x needs at least nrow rows and 1000 columns");
    }

    Eigen::MatrixXd result(nrow, 5);
#pragma omp parallel num_threads(ncores)
    {
#pragma omp for schedule(dynamic)
        for (int i = 0; i < nrow; i++) {
            // Thread-local copy of the row; nth_element reorders it in place.
            Eigen::VectorXd v = x.row(i);
            double* const first = v.data();
            double* const last = first + v.size();
            for (int q = 0; q < 5; ++q) {
                // Positions up to indices[q] are already placed; partition the rest.
                std::nth_element(first + indices[q] + 1, first + indices[q+1], last);
                result(i,q) = v[indices[q+1]];
            }
        }
    }
    return result;
}
/*** R
x <- matrix(as.numeric(1:1000000), ncol = 1000)
summaryC = summaryC(x = x, nrow = 1000, ncores = 4),
summaryParC = summaryParC(x = x, nrow = 1000, ncores = 4),
times = 100)
I have never seen a crash with this parallel version. And on my dual-core machine it is about 44% faster than the serial code.


Thrust's exclusive_scan_by_key function takes the same amount of time as a sequential implementation?

I'm relatively new to Thrust and I'm trying to perform a segmented scan. Here is my code, which you should be able to run as-is:
#include <chrono>
#include <iostream>
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/host_vector.h>
#include <thrust/scan.h>
// Sequential scan for CPU
// Sequential exclusive scan for the CPU (reference implementation).
//
// `in` is treated as `s` consecutive segments of `m` floats each. For every
// segment the output starts at 0 and out[j] is the sum of in[0..j-1] within
// that segment.
//
// Returns a buffer of s * m floats allocated with new[]; the caller owns it
// and must release it with delete[]. For s <= 0 or m <= 0 an empty buffer is
// returned (still to be released with delete[]).
float* test_seqScan(float* in, int s, int m) {
    if (s <= 0 || m <= 0) {
        return new float[0];
    }
    float* out = new float[s * m];
    for (int i = 0; i < s; i++) {
        const float* seg_in = in + i * m;
        float* seg_out = out + i * m;
        // Exclusive scan: the first element of each segment is the identity.
        seg_out[0] = 0;
        for (int j = 1; j < m; j++) {
            seg_out[j] = seg_out[j - 1] + seg_in[j - 1];
        }
    }
    return out;
}
void test_sumScan(thrust::device_vector<float> dev_in, thrust::device_vector<int> dev_keys, int s, int m) {
// Allocate device memory for output
thrust::device_vector<float> dev_out(s * m);
thrust::exclusive_scan_by_key(thrust::device, dev_keys.begin(), dev_keys.end(), dev_in.begin(), dev_out.begin());
// Benchmark driver: fills s segments of length m with 1..m, then times the
// sequential CPU scan and the Thrust segmented scan and prints both durations
// in microseconds.
int main(){
    int s = 100;
    int m = 100000;

    float* seq_in = new float[s * m];
    thrust::host_vector<float> par_in(s * m);
    thrust::host_vector<int> keys(s * m);
    for (int i = 0; i < s; i++) {
        for (int j = 0; j < m; j++) {
            seq_in[i * m + j] = j + 1;
            par_in[i * m + j] = j + 1;
            // The key is the segment number, so each row is scanned on its own.
            keys[i * m + j] = i;
        }
    }

    // Upload before timing so host-to-device transfer is not measured.
    thrust::device_vector<float> dev_in = par_in;
    thrust::device_vector<int> dev_keys = keys;

    auto t1 = std::chrono::high_resolution_clock::now();
    float* seq_out = test_seqScan(seq_in, s, m);
    auto t2 = std::chrono::high_resolution_clock::now();
    auto duration1 = std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1).count();
    std::cout << "Sequential duration: " << duration1 << "\n\n";

    auto t3 = std::chrono::high_resolution_clock::now();
    test_sumScan(dev_in, dev_keys, s, m);
    auto t4 = std::chrono::high_resolution_clock::now();
    // Must be t4 - t3: measuring t2 - t1 again would just repeat the
    // sequential time and make both versions look equally fast.
    auto duration2 = std::chrono::duration_cast<std::chrono::microseconds>(t4 - t3).count();
    std::cout << "Parallel duration: " << duration2 << "\n\n";

    // test_seqScan hands back a new[] buffer; release it and the input.
    delete[] seq_out;
    delete[] seq_in;
    return 0;
}
My issue is that both these snippets of code take exactly the same amount of time to run regardless of how small or large I set s and m. I assume that I'm doing something wrong, but I don't know what; can anyone point out the issue?

Sorting with thrust library on gpu

I work on stereo vision and I have a problem with the sort from the Thrust library. When I use it in my kernel function, the application runs but misbehaves, because not all of the kernel instances in my <<< >>> call are launched; when I remove the sort, everything runs (but the result is wrong). I have searched for alternatives, but I have found nothing except Thrust for sorting on the GPU. Thanks.
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/core/core.hpp>
#include <iostream>
#include <time.h>
#include <vector>
#include <cuda.h>
#include <cuda_runtime.h>
#include <device_functions.h>
#include <device_launch_parameters.h>
#include <opencv2/cudaarithm.hpp>
#include <opencv2/core/cuda.hpp>
#include <algorithm>
#include <functional>
#include <array>
#include <thrust/device_vector.h>
#include <thrust/device_ptr.h>
#include <thrust/host_vector.h>
#include <thrust/sort.h>
#include <thrust/execution_policy.h>
#include <thrust/generate.h>
#include <thrust/equal.h>
#include <thrust/sequence.h>
#include <thrust/for_each.h>
#include <opencv2/imgproc/imgproc.hpp>
using namespace std;
using namespace cv;
const int correlationWindow = 81;
const int widthWindow = (int)sqrt((float)correlationWindow);
const int searchWindow = 52;
__constant__ int widthWindow2 = 9;
void makeVector(float *mat, float *vec, int col, int x, int y) {
int ind = 0;
for (int i = x; i < x + widthWindow2; i++) {
for (int j = y; j < y + widthWindow2; j++) {
vec[ind] = mat[col * i + j];
void disparityUpdate2(int i, int j, int col, int distance, float *d_disparity) {
d_disparity[col * i + j] = ((255 / searchWindow) * distance);
void resize(float *d_disparity, Mat &disparity) {
for (int i = 0; i < disparity.rows; i++) {
for (int j = 0; j < disparity.cols; j++) {
int ind = disparity.cols * i + j;<float>(i, j) = d_disparity[ind];
void computeSMAD2(int minX, float *d_mL, float *d_mR, float *dif, float *windowL, float *windowR, float *d_disparity, int colmLO, int colmL, int seachWindow) {
int mini;
int pOiX(threadIdx.x + minX); // + minX
int pOiY(blockIdx.x + minX);
int newPoIx(max(minX, pOiX - searchWindow));
int newPoIy(pOiY);
/*int pOiX(pox);
int pOiY(poy);
int newPoIx(npox);
int newPoIy(npoy); */
int minPoIx(newPoIx);
int smad = 0;
int bMax = (int)(correlationWindow / 2);
makeVector(d_mL, windowL, colmL, pOiY, pOiX); // ATTENTION ligne / colonne
makeVector(d_mR, windowR, colmL, newPoIy, newPoIx);
for (int h = 0; h < correlationWindow; h++) {
dif[h] = windowL[h] - windowR[h];
thrust::sort(thrust::seq, dif, dif + correlationWindow);
int median = dif[(correlationWindow - 1) / 2];
for (int h = 0; h < correlationWindow; h++) {
dif[h] = abs(dif[h] - median);
thrust::sort(thrust::seq, dif, dif + correlationWindow);
for (int i = 0; i <= bMax - 1; i++) {
smad = smad + pow(dif[i], 2);
mini = smad;
for (int i = newPoIx; i <= pOiX; i++) {
smad = 0;
makeVector(d_mR, windowR, colmL, newPoIy, i);
for (int h = 0; h < correlationWindow; h++) {
dif[h] = windowL[h] - windowR[h];
thrust::sort(thrust::seq, dif, dif + correlationWindow);
median = dif[(correlationWindow - 1) / 2];
for (int h = 0; h < correlationWindow; h++) {
dif[h] = abs(dif[h] - median);
thrust::sort(thrust::seq, dif, dif + correlationWindow);
for (int j = 0; j <= bMax - 1; j++) {
if (smad < mini) {
smad = smad + pow(dif[j], 2);
else {
if (smad < mini) {
mini = smad;
minPoIx = i;
int distance = pOiX - minPoIx;
d_disparity[colmLO * (pOiY - minX) + (pOiX - minX)] = ((255 / searchWindow) * distance);
Mat SMAD2(int minX, Mat mLO, Mat mRO) {
Mat mL = Mat::zeros(mLO.rows + 2 * minX, mLO.cols + 2 * minX, CV_32FC1);
Mat mR = Mat::zeros(mLO.rows + 2 * minX, mLO.cols + 2 * minX, CV_32FC1);
Mat disparity = Mat::zeros(mRO.rows, mRO.cols, CV_32FC1);
mLO.copyTo(mL.rowRange(minX, mL.rows - minX).colRange(minX, mL.cols - minX));
mRO.copyTo(mR.rowRange(minX, mL.rows - minX).colRange(minX, mL.cols - minX));
float *d_mL, *windowL;
float *d_mR, *windowR;
float *dif;
float *d_disparity;
cudaMallocManaged(&dif, correlationWindow * sizeof(float));
cudaMallocManaged(&windowL, correlationWindow * sizeof(float));
cudaMallocManaged(&windowR, correlationWindow * sizeof(float));
cudaMallocManaged(&d_mL, mL.rows * mL.cols * sizeof(float));
cudaMallocManaged(&d_mR, mR.rows * mR.cols * sizeof(float));
cudaMallocManaged(&d_disparity, disparity.rows * disparity.cols * sizeof(float));
/*dif = new float[correlationWindow];
windowL = new float[correlationWindow];
windowR = new float[correlationWindow];
d_mL = new float[mL.rows * mL.cols];
d_mR = new float[mR.rows * mR.cols];
d_disparity = new float[disparity.rows * disparity.cols]; */
memcpy(d_mL,, mL.rows * mL.cols * sizeof(float));
memcpy(d_mR,, mR.rows * mR.cols * sizeof(float));
memcpy(d_disparity,, disparity.rows * disparity.cols * sizeof(float));
int ind = 0;
int colmL = mL.cols;
int colmLO = mLO.cols;
int npox, npoy;
clock_t begin = clock();
computeSMAD2 <<<70, 50>>>(minX, d_mL, d_mR, dif, windowL, windowR, d_disparity, mLO.cols, mL.cols, searchWindow);
//computeSMAD2 <<<mLO.rows, mLO.cols>>>(minX, d_mL, d_mR, dif, windowL, windowR, d_disparity, mLO.cols, mL.cols, searchWindow);
for (int poy = minX; poy < mR.rows - minX; poy++)
for (int pox = minX; pox < mR.cols - minX; pox++)
//////////////////////// DE GAUCHE A DROITE
npox = max(minX, pox - searchWindow);
npoy = poy;
computeSMAD2(minX, d_mL, d_mR, disparity, d_disparity, windowL, windowR, dif, colmLO, colmL, pox, poy, npox, npoy);
} */
clock_t end = clock();
double elapsed_secs = double(end - begin) / CLOCKS_PER_SEC;
cout << "time " << elapsed_secs << endl;
return disparity;
int main(int argc, char* argv[]) {
int minX = (int)floor((float)(widthWindow / 2));
Mat mL2 = Mat::ones(70, 50, CV_8UC1) * 255;
Mat mR2 = Mat::zeros(70, 50, CV_8UC1);
Mat disparity = SMAD2(minX, mL2, mR2);
disparity.convertTo(disparity, CV_8UC1);
Mat im;
hconcat(mL2, mR2, im);
hconcat(im, disparity, im);
imshow("test", im);
return 0;
Your code is wrong. As far as I can see, all threads sort the same array dif, which causes a memory violation.

openacc create data while running inside a kernels

I have a task that is to be accelerated with OpenACC. I need to do dynamic memory allocation within a kernel computation. I've built a simpler demo for it, as follows.
#include <iostream>
using namespace std;
#pragma acc routine seq
int *routine(int init) {
int *ptr;
#pragma acc data create(ptr[:10])
for (int i = 0; i < 10; ++i) {
ptr[i] = init + i;
return ptr;
void print_array(int *arr) {
for (int i = 0; i < 10; ++i) {
cout << arr[i] << " ";
cout << endl;
int main(void) {
int *arrs[5];
#pragma acc kernels
for (int i = 0; i < 5; ++i) {
arrs[i] = routine(i);
for (int i = 0; i < 5; ++i) {
return 0;
In this demo, I'm trying to call the routine while running inside a kernel construct. The routine procedure wants to create some data within the GPU and put some values into it.
I can compile the code, but it reports runtime problems, as follows.
lisanhu#lisanhu-XPS-15-9550:create_and_copyout$ pgc++ -o test -acc -Minfo=accel
6, Generating acc routine seq
23, Generating implicit copyout(arrs[:])
26, Accelerator restriction: size of the GPU copy of arrs is unknown
Loop is parallelizable
Generating implicit copy(arrs[:][:])
Accelerator kernel generated
Generating Tesla code
26, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */
lisanhu#lisanhu-XPS-15-9550:create_and_copyout$ ./test
call to cuStreamSynchronize returned error 715: Illegal instruction
I'm wondering what I should do to accomplish this task (dynamically allocating memory within processing of a kernel construct). Really appreciate it if you could help.
This is untested, and probably very slow, but this might do what you need it to.
int main() {
const int num = 20;
int a[x] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 0};
int* sizes = (int *)malloc(num * sizeof(int));
int *ptrs[num];
int* temp, *temp2;
int sum;
int* finished = (int *)malloc(num * sizeof(int));
for (int x = 0; x < num; ++x){
finished[x] = 0;
#pragma acc kernels copyin(a[0:10]) copyout(ptrs[:num][:1]) async(num*2+1)
#pragma acc loop private(temp)
for (int i = 0; i < num; ++i){
#pragma acc loop seq async(i)
for (int j = 0; j < 1; ++j){
temp = ptrs[x];
sizes[i] = ...
while (ptrs[x] != x);
ptrs[x] = routine(a, sizes[i]);
while (true){
sum = 0;
for (int x = 0; x < num; ++x){
sum += finished[x];
if (sum == num){
for (int x = 0; x < num; ++x){
if (acc_async_test(x) != 0 && finished[x] == 0){
finished[x] = 1;
#pragma acc update host(sizes[x:1])
temp = (int *)malloc(size[x] * sizeof(int));
#pragma acc enter data copyin(temp[0:x])
temp2 = acc_deviceptr(temp);
ptrs[x] = temp2;
#pragma acc update device(ptrs[x:1][0:1])

square Matrix transpose with CUDA

I'm trying to write a matrix transpose algorithm. When I test this program with a matrix size of 1024, the result shows that not all elements are in the right places.
Why isn't my array transposing correctly? Can anyone help me or give me a hint? I would appreciate it. Thanks a lot!
Here is the whole code:
// Naive transpose kernel: each thread copies one element of `in` to its
// transposed position in `out`. `w` and `h` are the width and height used to
// compute the linear offsets below.
__global__ void transpose_naive (float *out, float *in, int w, int h )
// Global x/y coordinates of this thread within the launch grid.
unsigned int xIdx = blockDim.x * blockIdx.x + threadIdx.x;
unsigned int yIdx = blockDim.y * blockIdx.y + threadIdx.y;
// NOTE(review): `<=` also admits xIdx == w and yIdx == h, which are one past
// the last valid index (valid indices run 0..w-1 and 0..h-1); a strict `<`
// is needed to keep every access inside the w*h buffers.
if ( xIdx <=w && yIdx <=h ) {
// Linear offsets: the input is addressed with stride w, the output with stride h.
unsigned int idx_in = xIdx + w * yIdx;
unsigned int idx_out = yIdx + h * xIdx;
out[idx_out] = in[idx_in];
int main()
int nx=1024;
int mem_size = nx*nx*sizeof(float);
int t=32;
dim3 dimGrid(((nx-1)/t) +1, ((nx-1)/t) +1);
dim3 dimBlock(t,t);
float *h_idata = (float*)malloc(mem_size);
float *h_cdata = (float*)malloc(mem_size);
float *d_idata, *d_cdata;
checkCuda(cudaMalloc(&d_idata, mem_size) );
checkCuda(cudaMalloc(&d_cdata, mem_size) );
// host
for (int j = 0; j < nx; j++)
for (int i = 0; i < nx; i++)
h_idata[j*nx + i] = j*nx + i;
// device
checkCuda(cudaMemcpy(d_idata, h_idata, mem_size, cudaMemcpyHostToDevice) );
// events for timing
cudaEvent_t startEvent, stopEvent;
checkCuda(cudaEventCreate(&startEvent) );
checkCuda(cudaEventCreate(&stopEvent) );
float ms;
checkCuda( cudaEventRecord(startEvent, 0) );
transpose_naive<<<dimGrid, dimBlock>>>(d_cdata, d_idata,nx,nx);
checkCuda(cudaEventRecord(stopEvent, 0) );
checkCuda(cudaEventSynchronize(stopEvent) );
checkCuda(cudaEventElapsedTime(&ms, startEvent, stopEvent) );
checkCuda( cudaMemcpy(h_cdata, d_cdata, mem_size, cudaMemcpyDeviceToHost) );
printf("the time %5f ", ms);
// cleanup
checkCuda(cudaEventDestroy(startEvent) );
checkCuda(cudaEventDestroy(stopEvent) );
checkCuda( cudaFree(d_cdata) );
checkCuda( cudaFree(d_idata) );
I think there is something wrong with the file output ("i.txt" and "t.txt"); otherwise the program looks correct. I have made some minor changes to your code by adding error checking and printing to the standard output stream. I am printing the last 4 x 4 block (rows and columns 1020-1023) of both matrices to cross-check the transpose. Run it on your system and verify whether the matrix transpose is correct.
#include "cuda_runtime.h"
#include <stdio.h>
#include <stdlib.h>
#include "device_launch_parameters.h"
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
if (code != cudaSuccess)
fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code),file, line);
if (abort) exit(code);
// Naive out-of-place transpose: each thread copies one element of `in` to
// its transposed position in `out`.
//
// out : destination buffer, addressed as yIdx + h * xIdx
// in  : source buffer, addressed as xIdx + w * yIdx
// w,h : extents checked against the thread's x and y coordinates
__global__ void transpose_naive(float *out, float *in, int w, int h)
{
    // Global x/y coordinates of this thread within the launch grid.
    unsigned int xIdx = blockDim.x * blockIdx.x + threadIdx.x;
    unsigned int yIdx = blockDim.y * blockIdx.y + threadIdx.y;
    // Valid coordinates are 0..w-1 and 0..h-1, so the comparison has to be
    // strict; `<=` would let threads at xIdx == w or yIdx == h touch memory
    // outside the w*h buffers when the grid overshoots the matrix.
    if (xIdx < w && yIdx < h) {
        unsigned int idx_in = xIdx + w * yIdx;
        unsigned int idx_out = yIdx + h * xIdx;
        out[idx_out] = in[idx_in];
    }
}
int main()
int nx = 1024;
int mem_size = nx*nx*sizeof(float);
int t = 32;
dim3 dimGrid(((nx - 1) / t) + 1, (((nx - 1) / t) + 1));
dim3 dimBlock(t, t);
float *h_idata = (float*)malloc(mem_size);
float *h_cdata = (float*)malloc(mem_size);
float *d_idata, *d_cdata;
gpuErrchk(cudaMalloc(&d_idata, mem_size));
gpuErrchk(cudaMalloc(&d_cdata, mem_size));
// host
for (int j = 0; j < nx; j++)
for (int i = 0; i < nx; i++)
h_idata[j*nx + i] = j*nx + i;
// device
// events for timing
cudaEvent_t startEvent, stopEvent;
float ms;
gpuErrchk(cudaEventRecord(startEvent, 0));
transpose_naive << <dimGrid, dimBlock >> >(d_cdata, d_idata, nx, nx);
gpuErrchk(cudaEventRecord(stopEvent, 0));
gpuErrchk(cudaEventElapsedTime(&ms, startEvent, stopEvent));
printf("the time %5f ", ms);
for (int i = 1020; i < 1024; i++) {
for (int j = 1020; j < 1024; j++) {
printf("%.2f ", h_idata[i*nx + j]);
for (int i = 1020; i < 1024; i++) {
for (int j = 1020; j < 1024; j++) {
printf("%.2f ", h_cdata[i*nx + j]);
//savetofile(h_idata, "i.txt", nx, nx);
//savetofile(h_cdata, "t.txt", nx, nx);
// cleanup
The only flaw in the code is the incorrect bound checks in the following line of the kernel.
if ( xIdx <=w && yIdx <=h ) {
As the indices are from 0 to w-1 and 0 to h-1 for x and y dimensions respectively, the if condition should be as follows:
if ( xIdx <w && yIdx <h ) {

How to improve Dijkstra algorithm when querying n times?

I'm currently working on a problem at Codechef. You can find the problem statement here:
Delivery Boy
In short, the problem asks us to query n times for the shortest path from a start to an end. My solution is to use Dijkstra with a priority_queue, plus caching the result in a hash_map in case we have already seen a start. Unfortunately, I got "time limit exceeded" many times and I couldn't find a way to make it faster. I wonder whether I am on the right track, or whether there is a better algorithm for this problem?
By the way, since the contest is still going, please don't post any solution. A hint is more than enough to me. Thanks.
Here is my attempt:
#ifdef __GNUC__
#include <ext/hash_map>
#include <hash_map>
#include <iostream>
#include <iomanip>
#include <vector>
#include <string>
#include <algorithm>
#include <map>
#include <set>
#include <utility>
#include <stack>
#include <deque>
#include <queue>
#include <fstream>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <cmath>
#include <cassert>
using namespace std;
#ifdef __GNUC__
namespace std {
using namespace __gnu_cxx;
const int MAX_VERTICES = 250;
const int INFINIY = (1 << 28);
int weight[MAX_VERTICES + 1][MAX_VERTICES + 1];
bool visited_start[MAX_VERTICES + 1] = { 0 };
// A graph node paired with a path cost. Ordering compares the cost only, so
// the type can be used directly in a cost-ordered priority queue.
struct vertex {
    int node;  // vertex index
    int cost;  // cost associated with reaching `node`

    // Both fields default to 0, which also makes the type default-constructible.
    vertex(int node = 0, int cost = 0)
        : node(node), cost(cost) {
    }

    // Cheaper-than comparison (cost only; `node` is ignored).
    bool operator <(const vertex& rhs) const {
        return cost < rhs.cost;
    }

    // More-expensive-than comparison (cost only; `node` is ignored).
    bool operator >(const vertex& rhs) const {
        return cost > rhs.cost;
    }
};
hash_map<int, vector<vertex> > cache;
typedef priority_queue<vertex, vector<vertex>, greater<vertex> > min_pq;
// Single-source shortest paths from `start` over vertices 0..n-1, using the
// global adjacency matrix `weight` (every pair u,v is treated as an edge of
// cost weight[u][v]).
//
// Returns a vector of n entries where entry i has node == i and cost == the
// cheapest cost found from `start` to i (INFINIY if it was never improved).
vector<vertex> dijkstra_compute_path(int start, int n) {
    min_pq pq;
    vector<vertex> path;
    path.reserve(n);
    vector<int> visited(n, 0);

    for (int i = 0; i < n; ++i) {
        path.push_back(vertex(i, INFINIY));
    }
    path[start].cost = 0;
    pq.push(vertex(start, path[start].cost));

    while (!pq.empty()) {
        // extract min cost; the entry must be removed or the loop never ends
        const vertex u = pq.top();
        pq.pop();
        // A vertex is pushed again each time its cost improves, so older
        // entries for an already finalized vertex can be skipped.
        if (visited[u.node] != 0) {
            continue;
        }
        // mark it as visited
        visited[u.node] = 1;
        // for each vertex v that is adjacent to u
        for (int v = 0; v < n; ++v) {
            // if it's not visited, visit it
            if (visited[v] == 0) {
                const int better_cost = path[u.node].cost + weight[u.node][v];
                // update cost
                if (path[v].cost > better_cost) {
                    path[v].cost = better_cost;
                    pq.push(vertex(v, path[v].cost));
                }
            }
        }
    }
    return path;
}
void check_in_cache(vector<vertex>& path, int start, int no_street) {
if (visited_start[start] == 0) {
path = dijkstra_compute_path(start, no_street);
cache.insert(make_pair(start, path));
visited_start[start] = 1;
else {
path = cache[start];
void display_cost(int stop_at_gas_cost, int direct_cost) {
printf("%d ", stop_at_gas_cost);
if (stop_at_gas_cost > direct_cost) {
printf("%d\n", stop_at_gas_cost - direct_cost);
else {
void handle_case_one() {
int no_scenario;
int dummy;
int s, g, d;
scanf("%d", &dummy);
scanf("%d", &no_scenario);
for (int i = 0; i < no_scenario; ++i) {
scanf("%d %d %d", &s, &g, &d);
printf("0 0\n");
void inout_delivery_boy() {
int no_street;
int no_scenario;
int restaurant;
int gas_station;
int destination;
int stop_at_gas_cost;
int direct_cost;
vector<vertex> direct;
vector<vertex> indirect;
vector<vertex> d;
int c;
scanf("%d", &no_street);
if (no_street == 1) {
for (int x = 0; x < no_street; ++x) {
for (int y = 0; y < no_street; ++y) {
scanf("%d", &c);
weight[x][y] = c;
for (int i = 0; i < no_street; ++i) {
d.push_back(vertex(i, INFINIY));
scanf("%d", &no_scenario);
for (int i = 0; i < no_scenario; ++i) {
scanf("%d %d %d", &restaurant, &gas_station, &destination);
// check in cache
check_in_cache(direct, restaurant, no_street);
check_in_cache(indirect, gas_station, no_street);
// calculate the cost
stop_at_gas_cost = direct[gas_station].cost + indirect[destination].cost;
direct_cost = direct[destination].cost;
// output
display_cost(stop_at_gas_cost, direct_cost);
void dijkstra_test(istream& in) {
int start;
int no_street;
int temp[4] = { 0 };
vector<vertex> path;
in >> no_street;
for (int x = 0; x < no_street; ++x) {
for (int y = 0; y < no_street; ++y) {
in >> weight[x][y];
// arrange
start = 0;
temp[0] = 0;
temp[1] = 2;
temp[2] = 1;
temp[3] = 3;
// act
path = dijkstra_compute_path(start, no_street);
// assert
for (int i = 0; i < no_street; ++i) {
assert(path[i].cost == temp[i]);
// arrange
start = 1;
temp[0] = 1;
temp[1] = 0;
temp[2] = 2;
temp[3] = 4;
// act
path = dijkstra_compute_path(start, no_street);
// assert
for (int i = 0; i < no_street; ++i) {
assert(path[i].cost == temp[i]);
// arrange
start = 2;
temp[0] = 2;
temp[1] = 1;
temp[2] = 0;
temp[3] = 3;
// act
path = dijkstra_compute_path(start, no_street);
// assert
for (int i = 0; i < no_street; ++i) {
assert(path[i].cost == temp[i]);
// arrange
start = 3;
temp[0] = 1;
temp[1] = 1;
temp[2] = 1;
temp[3] = 0;
// act
path = dijkstra_compute_path(start, no_street);
// assert
for (int i = 0; i < no_street; ++i) {
assert(path[i].cost == temp[i]);
int main() {
// ifstream inf("test_data.txt");
// dijkstra_test(inf);
return 0;
Please notice that N is small in this problem. Have you tried the Floyd shortest-path algorithm to pre-calculate the shortest path between every pair of nodes? It costs O(N^3) time, which is 250^3 = 15,625,000 operations in this problem, and should easily finish within 1 second. Then you can answer each query in O(1).
the introduction of Floyd :
PS: I think cached Dijkstra has a maximum running time of O(N^3) over the whole test case as well, but the way you implement the cache spends unnecessary time on memory copying, which may lead to a TLE. Just a guess.
Indeed, the Floyd-Warshall algorithm is better than Dijkstra's in this case: the complexity for Dijkstra is O(M*N^2), and in this problem M is much higher than N, so the O(N^3) time complexity of Floyd-Warshall is better.
