(Homework) Converting a function to a CUDA kernel function - debugging

I'm working on an assignment that asks to optimise this C program using CUDA parallelisation.
This is what I managed to come up with:
//...
__global__ void gpu_score_function(void *gpu_frame_pixels, void *gpu_pattern_pixels, void *gpu_results,
int frame_rowstride, int pattern_rowstride,
int pattern_width, int pattern_height,
int frame_width, int frame_height) {
if ((blockIdx.y * blockDim.y + threadIdx.y < frame_height - pattern_height) &&
(blockIdx.x * blockDim.x + threadIdx.x < frame_width - pattern_width)) {
guchar *frame_pixels = (guchar *) gpu_frame_pixels +
(blockIdx.y * blockDim.y + threadIdx.y) * frame_rowstride +
(blockIdx.x * blockDim.x + threadIdx.x) * N_CHANNELS;
guchar *pattern_pixels = (guchar *) gpu_pattern_pixels;
int *results = (int *) gpu_results;
int res = 0;
for (int y = 0; y < pattern_height; ++y) {
if (blockIdx.y * blockDim.y + threadIdx.y + y < frame_height - pattern_height) {
for (int x = 0; x < pattern_width; ++x) {
if (blockIdx.x * blockDim.x + threadIdx.x + x < frame_width - pattern_width) {
const guchar *frame_pixel = frame_pixels + x * N_CHANNELS;
const guchar *pattern_pixel = pattern_pixels + x * N_CHANNELS;
for (int c = 0; c < N_CHANNELS; ++c) {
res += (frame_pixel[c] - 128) * (pattern_pixel[c] - 128);
}
} else {
break;
}
}
frame_pixels += frame_rowstride;
pattern_pixels += pattern_rowstride;
} else {
break;
}
}
results[(blockIdx.y * blockDim.y + threadIdx.y) * (frame_width - pattern_width) + blockIdx.x * blockDim.x + threadIdx.x] = res;
}
}
int main(int argc, const char *argv[]) {
//...
/**
* CUDA
*/
void *gpu_pattern_pixels;
void *gpu_frame_pixels;
void *gpu_results;
cudaMalloc(&gpu_pattern_pixels, pattern_height * pattern_rowstride * sizeof(guchar));
cudaMalloc(&gpu_frame_pixels, frame_height * frame_rowstride * sizeof(guchar));
cudaMalloc(&gpu_results, (frame_width - pattern_width) * (frame_height - pattern_height) * sizeof(int));
cudaMemcpy(gpu_pattern_pixels, (void *) pattern_pixels, pattern_height * pattern_rowstride * sizeof(guchar),
cudaMemcpyHostToDevice);
cudaMemcpy(gpu_frame_pixels, (void *) frame_pixels, frame_height * frame_rowstride * sizeof(guchar),
cudaMemcpyHostToDevice);
//Kernel configuration, where a two-dimensional grid and
//three-dimensional blocks are configured.
dim3 dimGrid(ceil((float) (frame_width - pattern_width) / 32), ceil((float) (frame_height - pattern_height) / 32));
dim3 dimBlock(32, 32);
gpu_score_function<<<dimGrid, dimBlock>>>(gpu_frame_pixels, gpu_pattern_pixels, gpu_results, frame_rowstride, pattern_rowstride, pattern_width, pattern_height, frame_width, frame_height);
cudaDeviceSynchronize();
int *results = (int *) malloc((frame_width - pattern_width) * (frame_height - pattern_height) * sizeof(int));
cudaMemcpy((void *) results, gpu_results,
(frame_width - pattern_width) * (frame_height - pattern_height) * sizeof(int), cudaMemcpyDeviceToHost);
int gpu_x_best, gpu_y_best;
double gpu_best_score;
for (int *cur = results; cur != results + (frame_width - pattern_width) * (frame_height - pattern_height); cur++) {
if (cur == results || *cur > gpu_best_score) {
gpu_best_score = *cur;
gpu_x_best = (cur - results) % (frame_width - pattern_width);
gpu_y_best = (cur - results) / (frame_width - pattern_width);
}
}
cudaFree(gpu_pattern_pixels);
cudaFree(gpu_frame_pixels);
cudaFree(gpu_results);
free(results);
// cudaDeviceReset causes the driver to clean up all state. While
// not mandatory in normal operation, it is good practice. It is also
// needed to ensure correct operation when the application is being
// profiled. Calling cudaDeviceReset causes all profile data to be
// flushed before the application exits
cudaDeviceReset();
/**
* END CUDA
*/
//...
return 0;
}
The program doesn't segfault, cuda-memcheck gives 0 errors and the result matrix is filled.
The problem is, the results are wrong.
I'm quite sure it's some off-by-one pointer error, but I have no idea how to spot it.
I'm working on OSX 10.9, what tools could I use to debug this program?
Any help is appreciated.

I found the bug.
The two if statements inside the for loops of gpu_score_function make no sense. Deleting them solved the problem.

Related

Is there a way to avoid CUDA atomicAdd in my situation?

I'm doing an operation as the figure below.
Here is my kernel.
As shown in the figure, I make a small matrix using about one million vectors and accumulate it in a large prepared matrix.
I need an idea that can improve performance without exceeding 8Gb of GPU global memory.
How can I avoid atomic operations? I use the GTX1080. Existing kernels take about 250ms.
__global__ void buildMatrixKernel(const CostJacobianCT *src, const int num, const int st, const int mw_width, double *A, double *b)
{
int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx < num)
{
if (src[idx].mask == 1)
{
// matrix width
int cols = 6 * (mw_width + 1);
// calc position for insert
int idx0 = (src[idx].fid0 - st);
if (idx0 == mw_width - 2)
{
idx0 = idx0 - 1;
}
else if (idx0 == mw_width - 1)
{
idx0 = idx0 - 2;
}
int idx1 = (src[idx].fid1 - st);
if (idx1 == mw_width - 2)
{
idx1 = idx1 - 1;
}
else if (idx1 == mw_width - 1)
{
idx1 = idx1 - 2;
}
int pos0 = idx0 * 6;
int pos1 = idx1 * 6;
// set tempolar matrix
double _A00[24 * 24];
double _A11[24 * 24];
double _A01[24 * 24];
double _b0[24];
double _b1[24];
for (int y = 0; y < 24; y++)
{
for (int x = 0; x < 24; x++)
{
_A00[y * 24 + x] = src[idx].w * src[idx].J0[y] * src[idx].J0[x];
_A11[y * 24 + x] = src[idx].w * src[idx].J1[y] * src[idx].J1[x];
_A01[y * 24 + x] = src[idx].w * src[idx].J0[y] * src[idx].J1[x];
}
_b0[y] = src[idx].w * src[idx].c * src[idx].J0[y];
_b1[y] = src[idx].w * src[idx].c * src[idx].J1[y];
}
// set final matrix
for (int i = 0; i < 24; i++)
{
for (int j = 0; j < 24; j++)
{
atomicAdd(&A[(i + pos0) * cols + (j + pos0)], _A00[i * 24 + j]); // 00
atomicAdd(&A[(i + pos1) * cols + (j + pos1)], _A11[i * 24 + j]); // 11
atomicAdd(&A[(i + pos0) * cols + (j + pos1)], _A01[i * 24 + j]); // 01
atomicAdd(&A[(i + pos1) * cols + (j + pos0)], _A01[j * 24 + i]); // 10
}
atomicAdd(&b[i + pos0], _b0[i]); // 0
atomicAdd(&b[i + pos1], _b1[i]); // 1
}
}
}
}
2019.3.6.
I modified the code below to see some performance improvements.
250ms -> 95ms
__global__ void buildMatrixKernel(const CostJacobianCT *src, const int num, const int st, const int mw_width, double *A, double *b)
{
int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx < num)
{
int src_idx = idx / 576;
if (src[src_idx].mask == 1)
{
int cols = 6 * (mw_width + 1);
int pos0 = src[src_idx].pos0;
int pos1 = src[src_idx].pos1;
double w = src[src_idx].w;
double c = src[src_idx].c;
int sub_idx = idx % 576;
int i = sub_idx / 24;
int j = sub_idx % 24;
double J0_i = src[src_idx].J0[i];
double J0_j = src[src_idx].J0[j];
double J1_i = src[src_idx].J1[i];
double J1_j = src[src_idx].J1[j];
atomicAdd(&A[(i + pos0) * cols + (j + pos0)], w * J0_i * J0_j); // 00
atomicAdd(&A[(i + pos1) * cols + (j + pos1)], w * J1_i * J1_j); // 11
atomicAdd(&A[(i + pos0) * cols + (j + pos1)], w * J0_i * J1_j); // 01
atomicAdd(&A[(i + pos1) * cols + (j + pos0)], w * J1_i * J0_j); // 10
if (j == 0)
{
atomicAdd(&b[i + pos0], w * c * J0_i); // 0
atomicAdd(&b[i + pos1], w * c * J1_i); // 1
}
}
}
}

square Matrix transpose with CUDA

I'm trying to write the matrix transpose algorithm. I test this program with matrix size equal to 1024, the result shows that not all elements are in the right places.
Why isn't my array transposing correctly? Does anyone can help me or give me any hint? I will appreciate it. Thanks a lot!
there is the whole cpu code:
__global__ void transpose_naive (float *out, float *in, int w, int h )
{
unsigned int xIdx = blockDim.x * blockIdx.x + threadIdx.x;
unsigned int yIdx = blockDim.y * blockIdx.y + threadIdx.y;
if ( xIdx <=w && yIdx <=h ) {
unsigned int idx_in = xIdx + w * yIdx;
unsigned int idx_out = yIdx + h * xIdx;
out[idx_out] = in[idx_in];
}
}
int main()
{
int nx=1024;
int mem_size = nx*nx*sizeof(float);
int t=32;
dim3 dimGrid(((nx-1)/t) +1, ((nx-1)/t) +1);
dim3 dimBlock(t,t);
float *h_idata = (float*)malloc(mem_size);
float *h_cdata = (float*)malloc(mem_size);
float *d_idata, *d_cdata;
checkCuda(cudaMalloc(&d_idata, mem_size) );
checkCuda(cudaMalloc(&d_cdata, mem_size) );
// host
for (int j = 0; j < nx; j++)
for (int i = 0; i < nx; i++)
h_idata[j*nx + i] = j*nx + i;
// device
checkCuda(cudaMemcpy(d_idata, h_idata, mem_size, cudaMemcpyHostToDevice) );
// events for timing
cudaEvent_t startEvent, stopEvent;
checkCuda(cudaEventCreate(&startEvent) );
checkCuda(cudaEventCreate(&stopEvent) );
float ms;
checkCuda( cudaEventRecord(startEvent, 0) );
transpose_naive<<<dimGrid, dimBlock>>>(d_cdata, d_idata,nx,nx);
checkCuda(cudaEventRecord(stopEvent, 0) );
checkCuda(cudaEventSynchronize(stopEvent) );
checkCuda(cudaEventElapsedTime(&ms, startEvent, stopEvent) );
checkCuda( cudaMemcpy(h_cdata, d_cdata, mem_size, cudaMemcpyDeviceToHost) );
printf("the time %5f ", ms);
printf("\n");
savetofile(h_idata,"i.txt",nx,nx);
savetofile(h_cdata,"t.txt",nx,nx);
error_exit:
// cleanup
checkCuda(cudaEventDestroy(startEvent) );
checkCuda(cudaEventDestroy(stopEvent) );
checkCuda( cudaFree(d_cdata) );
checkCuda( cudaFree(d_idata) );
free(h_idata);
free(h_cdata);
system("pause");
}
I think there is something wrong with file output "i.txt" and "t.txt" otherwise the program looks to be correct. I have made some minor changes in your code by adding error checking and printing on the standard output stream. I am printing the last (1020 - 1024) 3 x 3 matrix to cross check the transpose. Run it on your system and verify whether the matrix transpose is correct or not?
#include "cuda_runtime.h"
#include <stdio.h>
#include <stdlib.h>
#include "device_launch_parameters.h"
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
{
if (code != cudaSuccess)
{
fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code),file, line);
if (abort) exit(code);
}
}
__global__ void transpose_naive(float *out, float *in, int w, int h)
{
unsigned int xIdx = blockDim.x * blockIdx.x + threadIdx.x;
unsigned int yIdx = blockDim.y * blockIdx.y + threadIdx.y;
if (xIdx <= w && yIdx <= h) {
unsigned int idx_in = xIdx + w * yIdx;
unsigned int idx_out = yIdx + h * xIdx;
out[idx_out] = in[idx_in];
}
}
int main()
{
int nx = 1024;
int mem_size = nx*nx*sizeof(float);
int t = 32;
dim3 dimGrid(((nx - 1) / t) + 1, (((nx - 1) / t) + 1));
dim3 dimBlock(t, t);
float *h_idata = (float*)malloc(mem_size);
float *h_cdata = (float*)malloc(mem_size);
float *d_idata, *d_cdata;
gpuErrchk(cudaMalloc(&d_idata, mem_size));
gpuErrchk(cudaMalloc(&d_cdata, mem_size));
// host
for (int j = 0; j < nx; j++)
for (int i = 0; i < nx; i++)
h_idata[j*nx + i] = j*nx + i;
// device
gpuErrchk(cudaMemcpy(d_idata,h_idata,mem_size,cudaMemcpyHostToDevice));
// events for timing
cudaEvent_t startEvent, stopEvent;
gpuErrchk(cudaEventCreate(&startEvent));
gpuErrchk(cudaEventCreate(&stopEvent));
float ms;
gpuErrchk(cudaEventRecord(startEvent, 0));
transpose_naive << <dimGrid, dimBlock >> >(d_cdata, d_idata, nx, nx);
gpuErrchk(cudaEventRecord(stopEvent, 0));
gpuErrchk(cudaEventSynchronize(stopEvent));
gpuErrchk(cudaEventElapsedTime(&ms, startEvent, stopEvent));
gpuErrchk(cudaMemcpy(h_cdata,d_cdata,mem_size,cudaMemcpyDeviceToHost));
printf("the time %5f ", ms);
printf("\n");
for (int i = 1020; i < 1024; i++) {
for (int j = 1020; j < 1024; j++) {
printf("%.2f ", h_idata[i*nx + j]);
}
printf("\n");
}
printf("\n");
for (int i = 1020; i < 1024; i++) {
for (int j = 1020; j < 1024; j++) {
printf("%.2f ", h_cdata[i*nx + j]);
}
printf("\n");
}
//savetofile(h_idata, "i.txt", nx, nx);
//savetofile(h_cdata, "t.txt", nx, nx);
//error_exit:
// cleanup
gpuErrchk(cudaEventDestroy(startEvent));
gpuErrchk(cudaEventDestroy(stopEvent));
gpuErrchk(cudaFree(d_cdata));
gpuErrchk(cudaFree(d_idata));
free(h_idata);
free(h_cdata);
//system("pause");
}
The only flaw in the code is the incorrect bound checks in the following line of the kernel.
if ( xIdx <=w && yIdx <=h ) {
As the indices are from 0 to w-1 and 0 to h-1 for x and y dimensions respectively, the if condition should be as follows:
if ( xIdx <w && yIdx <h ) {

OpenCL kernel error on Mac OSx

I wrote some OpenCL code which works fine on LINUX, but it is failing with errors on Mac OSX. Can someone please help me to identify why these should occur. The kernel code is shown after the error. My kernel uses double, so I have the corresponding pragma at the top. But I don't know why the error shows float data type:
inline float8 __OVERLOAD__ _name(float8 x) { return _default_name(x); } \
^
/System/Library/Frameworks/OpenCL.framework/Versions/A/lib/clang/3.2/include/cl_kernel.h:4606:30: note: candidate function
__CLFN_FD_1FD_FAST_RELAX(__fast_relax_log, native_log, __cl_log);
^
/System/Library/Frameworks/OpenCL.framework/Versions/A/lib/clang/3.2/include/cl_kernel.h:421:29:
note: expanded from macro '__CLFN_FD_1FD_FAST_RELAX'
inline float16 __OVERLOAD__ _name(float16 x){ return _default_name(x); }
^
<program source>:206:19: error: call to '__fast_relax_log' is ambiguous
det_zkinin + log((2.0) * 3.14));
^~~~~~~~~~~~~~~~~
/System/Library/Frameworks/OpenCL.framework/Versions/A/lib/clang/3.2/include/cl_kernel.h:4608:22:
note: expanded from macro 'log'
#define log(__x) __fast_relax_log(__x)
^~~~~~~~~~~~~~~~
/System/Library/Frameworks/OpenCL.framework/Versions/A/lib/clang/3.2/include/cl_kernel.h:4606:30:
note: candidate function
__CLFN_FD_1FD_FAST_RELAX(__fast_relax_log, native_log, __cl_log);
^
/System/Library/Frameworks/OpenCL.framework/Versions/A/lib/clang/3.2/include/cl_kernel.h:416:27:
note: expanded from macro '__CLFN_FD_1FD_FAST_RELAX'
inline float __OVERLOAD__ _name(float x) { return _default_name(x); } \
^
/System/Library/Frameworks/OpenCL.framework/Versions/A/lib/clang/3.2/include/cl_kernel.h:4606:30
note: candidate function
__CLFN_FD_1FD_FAST_RELAX(__fast_relax_log, native_log, __cl_log);
^
^
This is the kernel code:
#pragma OPENCL EXTENSION cl_khr_fp64: enable
__kernel void ckf_kernel2(int dimx, int aligned_dimx,
int numOfCKF, int aligned_ckf,
int iter,
double epsilon,
__global double * yrlists,
__global double * zrlists,
__global double * rlists,
__global double * init_state,
__global double * init_var,
__global double * sing_j,
__global double * covMatrixSum,
__global double * cummulative,
__global double * temp_var,
__global double * x_k_f,
__global double * z_k_j,
__global double * crossCovMatrixSum,
__global double * z_k_f,
__global double * innCovMatrixSum,
__global double * zk_diff,
__global double * reduce_gain_matrix,
__global double * llk
)
{
int ckf_id = get_global_id(0);
if( ckf_id < numOfCKF){
for (int i = 0 ; i < dimx ; i++)
{
for (int idx = 0; idx < dimx * 2 ; idx++)
{
int column = idx % dimx;
int mode = (idx >= dimx) ? -1 : 1;
sing_j[(i * dimx * 2 + idx) * aligned_ckf + ckf_id] = temp_var[(i * dimx + column) * aligned_ckf + ckf_id] * epsilon * mode + init_state[i * aligned_ckf + ckf_id];
}
}
z_k_f[ckf_id] = 0;
innCovMatrixSum[ckf_id] = 0;
for (int idx = 0; idx < dimx * 2 ; idx++)
{
z_k_j[idx * aligned_ckf + ckf_id] = 0;
for (int i = 0 ; i < dimx ; i++)
z_k_j[idx * aligned_ckf + ckf_id] += sing_j[(i * dimx * 2 + idx) * aligned_ckf + ckf_id] * zrlists[iter * aligned_dimx + i ];
z_k_f[ckf_id] += z_k_j[idx* aligned_ckf + ckf_id] ;
innCovMatrixSum[ckf_id] += z_k_j[idx* aligned_ckf + ckf_id] * z_k_j[idx* aligned_ckf + ckf_id];
}
z_k_f[ckf_id] = z_k_f[ckf_id] / (dimx * 2);
innCovMatrixSum[ckf_id] = innCovMatrixSum[ckf_id] / (dimx * 2);
innCovMatrixSum[ckf_id] = (innCovMatrixSum[ckf_id] - z_k_f[ckf_id] *z_k_f[ckf_id]) + rlists[ckf_id];
// calcualte crossCovMatrixSum
for (int idx = 0; idx < dimx; idx ++)
{
crossCovMatrixSum[idx * aligned_ckf + ckf_id] = 0;
for (int i = 0 ; i < 2 * dimx ; i++)
{
crossCovMatrixSum[idx * aligned_ckf + ckf_id] += sing_j[(idx * dimx*2 + i) * aligned_ckf + ckf_id ] * z_k_j[i* aligned_ckf + ckf_id];
}
crossCovMatrixSum[idx * aligned_ckf + ckf_id] = crossCovMatrixSum[idx * aligned_ckf + ckf_id]/ (dimx * 2);
crossCovMatrixSum[idx * aligned_ckf + ckf_id] = crossCovMatrixSum[idx * aligned_ckf + ckf_id] - x_k_f[idx* aligned_ckf + ckf_id] * z_k_f[ckf_id];
}
// calculate zk_diff
int z_check = (int)yrlists[iter];
if (z_check == -1)
zk_diff[ckf_id] = 0;
else
zk_diff[ckf_id] = yrlists[iter] - z_k_f[ckf_id];
// calculate reduce_gain_matrix and (reduce_state_matrix <==> init_state);
for (int idx = 0 ; idx < dimx; idx++)
{
reduce_gain_matrix[idx * aligned_ckf + ckf_id] = (crossCovMatrixSum[idx * aligned_ckf + ckf_id] / innCovMatrixSum[ckf_id]);
init_state[idx * aligned_ckf + ckf_id] = reduce_gain_matrix[idx * aligned_ckf + ckf_id] * zk_diff[ckf_id] + x_k_f[idx* aligned_ckf + ckf_id];
}
for (int idx = 0 ; idx < dimx; idx++)
{
init_var[idx * aligned_ckf + ckf_id ] = covMatrixSum[(idx * dimx + idx) * aligned_ckf + ckf_id] -
reduce_gain_matrix[idx * aligned_ckf + ckf_id] * innCovMatrixSum[ckf_id] *
reduce_gain_matrix[idx * aligned_ckf + ckf_id];
}
double det_zkinin = zk_diff[ckf_id] * zk_diff[ckf_id] * (1.0f /innCovMatrixSum[ckf_id]);
if (innCovMatrixSum[ckf_id] <= 0)
llk[ckf_id] = 0;
else
llk[ckf_id] = 0.5 * ((log(innCovMatrixSum[ckf_id])) +
det_zkinin + log((2.0) * 3.14));
cummulative[ckf_id] += llk[ckf_id];
}
}
I suspect you are trying to run this on an integrated Intel GPU which does not support double precision. I can only reproduce your error on my own Macbook Pro if I compile your kernel code for the Intel HD 4000 - it compiles just fine when I target the CPU or the discrete NVIDIA GPU.
You can check if the device supports double precision by querying the CL_DEVICE_DOUBLE_FP_CONFIG device information parameter:
cl_device_fp_config cfg;
clGetDeviceInfo(device, CL_DEVICE_DOUBLE_FP_CONFIG, sizeof(cfg), &cfg, NULL);
printf("Double FP config = %llu\n", cfg);
If this function returns the value 0, then double precision is not supported. This explains why the compiler log is only reporting float variants of the log function.

Kernel crashes while trying to do a simple value assignment

I am learning CUDA and still at the very beginner level. I am trying a simple assignment but my code crashes when I run it and I am not sure why. Any help would be appreciated.
EDIT: Crashes on cudaMemcpy and in Image structure, the pixelVal is of type int**. Is that the cause?
Original C++ code:
void Image::reflectImage(bool flag, Image& oldImage)
/*Reflects the Image based on users input*/
{
int rows = oldImage.N;
int cols = oldImage.M;
Image tempImage(oldImage);
for(int i = 0; i < rows; i++)
{
for(int j = 0; j < cols; j++)
tempImage.pixelVal[rows - (i + 1)][j] = oldImage.pixelVal[i][j];
}
oldImage = tempImage;
}
My CUDA kernel & code:
#define NTPB 512
__global__ void fliph(int* a, int* b, int r, int c)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
int j = blockIdx.y * blockDim.y + threadIdx.y;
if (i >= r || j >= c)
return;
a[(r - i * c) + j] = b[i * c + j];
}
void Image::reflectImage(bool flag, Image& oldImage)
/*Reflects the Image based on users input*/
{
int rows = oldImage.N;
int cols = oldImage.M;
Image tempImage(oldImage);
if(flag == true) //horizontal reflection
{
//Allocate device memory
int* dpixels;
int* oldPixels;
int n = rows * cols;
cudaMalloc((void**)&dpixels, n * sizeof(int));
cudaMalloc((void**)&oldPixels, n * sizeof(int));
cudaMemcpy(dpixels, tempImage.pixelVal, n * sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(oldPixels, oldImage.pixelVal, n * sizeof(int), cudaMemcpyHostToDevice);
int nblks = (n + NTPB - 1) / NTPB;
fliph<<<nblks, NTPB>>>(dpixels, oldPixels, rows, cols);
cudaMemcpy(tempImage.pixelVal, dpixels, n * sizeof(int), cudaMemcpyDeviceToHost);
cudaFree(dpixels);
cudaFree(oldPixels);
}
oldImage = tempImage;
}
You have to create a 2D Grid in order to process the image using 2D indices i and j. In the current case, the kernel is processing only the first row of the image.
To create a 2D indexing mechanism, create a 2D block and 2D grid like this:
const int BLOCK_DIM = 16;
dim3 Block(BLOCK_DIM,BLOCK_DIM);
dim3 Grid;
Grid.x = (cols + Block.x - 1)/Block.x;
Grid.y = (rows + Block.y - 1)/Block.y;
fliph<<<Grid, Block>>>(dpixels, oldPixels, rows, cols);

CUDA: Accessing arbitrary long matrices in both dimensions

Hey there,
currently I'm using threads indexed only in one dimension to access all elements of a matrix like that:
// Thread-ID
int idx = blockIdx.x * blockDim.x + threadIdx.x;
// Offset:
int offset = gridDim.x * blockDim.x;
while(idx < MATRIX_ROWS * MATRIX_COLS)
{
row = idx % MATRIX_ROWS;
col = idx / MATRIX_ROWS;
matrix[ row ][ col ] = ...;
idx += offset;
}
Now I wondered how to access arbitrary long matrices with two dimensional indexing. I would like that one block always access the single elements of one row. Something like that (x-index is refering to the cols and the y-index to the rows of the matrix):
// Thread-IDs
int idx = blockIdx.x * blockDim.x + threadIdx.x;
int idy = blockIdx.y * blockDim.y + threadIdx.y;
// Offset:
int offset = gridDim.x * blockDim.x;
while(idx < MATRIX_COLS)
{
matrix[ idy ][ idx ] = ...;
idx += offset;
}
Now let's assume the matrix has more rows than I started blocks when calling the kernel: When starting N blocks, the first N rows of the matrix are handled right, but what about the other rows? How would you do that?
Thanks!
EDIT: I came up with an idea but I don't know if that's somehow 'ugly' coding!?
// Thread-IDs
int idx0 = blockIdx.x * blockDim.x + threadIdx.x;
int idx = idx0;
int idy = blockIdx.y * blockDim.y + threadIdx.y;
// Offset:
int offsetx = gridDim.x * blockDim.x;
int offsety = gridDim.y * blockDim.y;
while(idx < MATRIX_COLS && idy < MATRIX_ROWS)
{
matrix[ idy ][ idx ] = ...;
idx += offsetx;
if(idx > MATRIX_COLS)
{
// Jump to nex row and start from 'beginning' concerning columns
idy += offsety;
idx = idx0;
}
}
Perhaps something like this is what you are looking for?
// Thread-IDs
int idx = blockIdx.x * blockDim.x + threadIdx.x;
int idy = blockIdx.y * blockDim.y + threadIdx.y;
// Offset:
int offsetx = gridDim.x * blockDim.x;
int offsety = gridDim.y * blockDim.y;
for(row = idy; row < MATRIX_ROWS; i+=offsety) {
float * pos = matrix + row;
#pragma unroll
for(col = idx; col < MATRIX_COLS; col+=offsetx) {
pos[col] = .....;
}
}
If MATRIX_COLS is a preprocessor define or constant, the compiler might be able to unroll the inner loop and give a bit more performance.
EDIT: The first version of the code was written with column-major ordering stuck in the back of my head, so ignore the comment that was here. Access should be coalesced doing it this way.

Resources