OpenCL kernel error on Mac OS X

I wrote some OpenCL code which works fine on Linux, but it fails with errors on Mac OS X. Can someone please help me identify why these errors occur? The kernel code is shown after the error. My kernel uses double, so I have the corresponding pragma at the top, but I don't understand why the error only mentions the float data type:
inline float8 __OVERLOAD__ _name(float8 x) { return _default_name(x); } \
^
/System/Library/Frameworks/OpenCL.framework/Versions/A/lib/clang/3.2/include/cl_kernel.h:4606:30: note: candidate function
__CLFN_FD_1FD_FAST_RELAX(__fast_relax_log, native_log, __cl_log);
^
/System/Library/Frameworks/OpenCL.framework/Versions/A/lib/clang/3.2/include/cl_kernel.h:421:29:
note: expanded from macro '__CLFN_FD_1FD_FAST_RELAX'
inline float16 __OVERLOAD__ _name(float16 x){ return _default_name(x); }
^
<program source>:206:19: error: call to '__fast_relax_log' is ambiguous
det_zkinin + log((2.0) * 3.14));
^~~~~~~~~~~~~~~~~
/System/Library/Frameworks/OpenCL.framework/Versions/A/lib/clang/3.2/include/cl_kernel.h:4608:22:
note: expanded from macro 'log'
#define log(__x) __fast_relax_log(__x)
^~~~~~~~~~~~~~~~
/System/Library/Frameworks/OpenCL.framework/Versions/A/lib/clang/3.2/include/cl_kernel.h:4606:30:
note: candidate function
__CLFN_FD_1FD_FAST_RELAX(__fast_relax_log, native_log, __cl_log);
^
/System/Library/Frameworks/OpenCL.framework/Versions/A/lib/clang/3.2/include/cl_kernel.h:416:27:
note: expanded from macro '__CLFN_FD_1FD_FAST_RELAX'
inline float __OVERLOAD__ _name(float x) { return _default_name(x); } \
^
/System/Library/Frameworks/OpenCL.framework/Versions/A/lib/clang/3.2/include/cl_kernel.h:4606:30
note: candidate function
__CLFN_FD_1FD_FAST_RELAX(__fast_relax_log, native_log, __cl_log);
^
This is the kernel code:
#pragma OPENCL EXTENSION cl_khr_fp64: enable
__kernel void ckf_kernel2(int dimx, int aligned_dimx,
int numOfCKF, int aligned_ckf,
int iter,
double epsilon,
__global double * yrlists,
__global double * zrlists,
__global double * rlists,
__global double * init_state,
__global double * init_var,
__global double * sing_j,
__global double * covMatrixSum,
__global double * cummulative,
__global double * temp_var,
__global double * x_k_f,
__global double * z_k_j,
__global double * crossCovMatrixSum,
__global double * z_k_f,
__global double * innCovMatrixSum,
__global double * zk_diff,
__global double * reduce_gain_matrix,
__global double * llk
)
{
int ckf_id = get_global_id(0);
if( ckf_id < numOfCKF){
for (int i = 0 ; i < dimx ; i++)
{
for (int idx = 0; idx < dimx * 2 ; idx++)
{
int column = idx % dimx;
int mode = (idx >= dimx) ? -1 : 1;
sing_j[(i * dimx * 2 + idx) * aligned_ckf + ckf_id] = temp_var[(i * dimx + column) * aligned_ckf + ckf_id] * epsilon * mode + init_state[i * aligned_ckf + ckf_id];
}
}
z_k_f[ckf_id] = 0;
innCovMatrixSum[ckf_id] = 0;
for (int idx = 0; idx < dimx * 2 ; idx++)
{
z_k_j[idx * aligned_ckf + ckf_id] = 0;
for (int i = 0 ; i < dimx ; i++)
z_k_j[idx * aligned_ckf + ckf_id] += sing_j[(i * dimx * 2 + idx) * aligned_ckf + ckf_id] * zrlists[iter * aligned_dimx + i ];
z_k_f[ckf_id] += z_k_j[idx* aligned_ckf + ckf_id] ;
innCovMatrixSum[ckf_id] += z_k_j[idx* aligned_ckf + ckf_id] * z_k_j[idx* aligned_ckf + ckf_id];
}
z_k_f[ckf_id] = z_k_f[ckf_id] / (dimx * 2);
innCovMatrixSum[ckf_id] = innCovMatrixSum[ckf_id] / (dimx * 2);
innCovMatrixSum[ckf_id] = (innCovMatrixSum[ckf_id] - z_k_f[ckf_id] *z_k_f[ckf_id]) + rlists[ckf_id];
// calculate crossCovMatrixSum
for (int idx = 0; idx < dimx; idx ++)
{
crossCovMatrixSum[idx * aligned_ckf + ckf_id] = 0;
for (int i = 0 ; i < 2 * dimx ; i++)
{
crossCovMatrixSum[idx * aligned_ckf + ckf_id] += sing_j[(idx * dimx*2 + i) * aligned_ckf + ckf_id ] * z_k_j[i* aligned_ckf + ckf_id];
}
crossCovMatrixSum[idx * aligned_ckf + ckf_id] = crossCovMatrixSum[idx * aligned_ckf + ckf_id]/ (dimx * 2);
crossCovMatrixSum[idx * aligned_ckf + ckf_id] = crossCovMatrixSum[idx * aligned_ckf + ckf_id] - x_k_f[idx* aligned_ckf + ckf_id] * z_k_f[ckf_id];
}
// calculate zk_diff
int z_check = (int)yrlists[iter];
if (z_check == -1)
zk_diff[ckf_id] = 0;
else
zk_diff[ckf_id] = yrlists[iter] - z_k_f[ckf_id];
// calculate reduce_gain_matrix and (reduce_state_matrix <==> init_state);
for (int idx = 0 ; idx < dimx; idx++)
{
reduce_gain_matrix[idx * aligned_ckf + ckf_id] = (crossCovMatrixSum[idx * aligned_ckf + ckf_id] / innCovMatrixSum[ckf_id]);
init_state[idx * aligned_ckf + ckf_id] = reduce_gain_matrix[idx * aligned_ckf + ckf_id] * zk_diff[ckf_id] + x_k_f[idx* aligned_ckf + ckf_id];
}
for (int idx = 0 ; idx < dimx; idx++)
{
init_var[idx * aligned_ckf + ckf_id ] = covMatrixSum[(idx * dimx + idx) * aligned_ckf + ckf_id] -
reduce_gain_matrix[idx * aligned_ckf + ckf_id] * innCovMatrixSum[ckf_id] *
reduce_gain_matrix[idx * aligned_ckf + ckf_id];
}
double det_zkinin = zk_diff[ckf_id] * zk_diff[ckf_id] * (1.0f /innCovMatrixSum[ckf_id]);
if (innCovMatrixSum[ckf_id] <= 0)
llk[ckf_id] = 0;
else
llk[ckf_id] = 0.5 * ((log(innCovMatrixSum[ckf_id])) +
det_zkinin + log((2.0) * 3.14));
cummulative[ckf_id] += llk[ckf_id];
}
}

I suspect you are trying to run this on an integrated Intel GPU, which does not support double precision. I can only reproduce your error on my own MacBook Pro if I compile your kernel code for the Intel HD 4000; it compiles just fine when I target the CPU or the discrete NVIDIA GPU.
You can check if the device supports double precision by querying the CL_DEVICE_DOUBLE_FP_CONFIG device information parameter:
cl_device_fp_config cfg;
clGetDeviceInfo(device, CL_DEVICE_DOUBLE_FP_CONFIG, sizeof(cfg), &cfg, NULL);
printf("Double FP config = %llu\n", cfg);
If this function returns the value 0, then double precision is not supported. This explains why the compiler log is only reporting float variants of the log function.
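You could also check whether the device advertises the cl_khr_fp64 extension at all. A minimal sketch, assuming device is the same cl_device_id as above:
char extensions[4096];
clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, sizeof(extensions), extensions, NULL);
/* If cl_khr_fp64 is absent, the #pragma in the kernel has no effect and the
   double type will not be usable on this device. */
if (strstr(extensions, "cl_khr_fp64") == NULL)
    printf("Device does not support cl_khr_fp64 (double precision)\n");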

Related

use ctypes to call a dll file in Python

I wrote a DLL with OpenMP and use ctypes to call it from Python, but it runs extremely slowly. When I rewrite the code as a standalone C program, it only takes a few seconds to finish.
The main part of the DLL is:
double martini(double a[], double b[], double d_title[], double c[], double d[], double A[], int num_element)
{
double nu_start = 0;
double mu_start = 0;
double z_start = 0;
double step_nu = 2 * 3.1415926 / 100;
double step_mu = 3.1415926 / 100;
double step_z = 0;
double nu = 0;
double mu = 0;
double z = 0;
double integral_first = 0;
double d_uv = 0;
int i = 0;
int j = 0;
int k = 0;
int loop = 0;
#pragma omp parallel for default(none) shared(a, d_title, b, c, nu_start, mu_start, z_start, step_nu, step_mu) private( i,j,k,mu, nu, step_z, z, d_uv) reduction(+:integral_first)
for (loop = 0; loop < num_element; loop++)
{
for (i = 0; i < 100; i++)
{
mu = mu_start + (i + 1) * step_mu;
for (j = 0; j < 100; j++)
{
nu = nu_start + (j + 1) * step_nu;
for (k = 0; k < 1500; k++)
{
d_uv = (sin(mu) * sin(mu) * cos(nu) * cos(nu) + sin(mu) * sin(mu) * (a[loop] * sin(nu) - d_title[loop] * cos(nu)) * (a[loop] * sin(nu) - d_title[loop] * cos(nu)) + b[loop] * b[loop] * cos(mu) * cos(mu)) / (c[loop] * c[loop]);
step_z = 20 / (d_uv * 1500);
z = z_start + (k + 1) * step_z;
integral_first = integral_first + 1 / 2 * A[loop] * sin(mu) * (1 - 3 * sin(mu) * sin(mu) * cos(nu) * cos(nu)) * exp(-d_uv * z) * log(1 + z * z) * step_z * step_mu * step_nu / (c[loop] * c[loop]);
}
}
}
}
return integral_first;
}
I use CDLL() in Python to load this DLL and then call this function, but it is very slow.
I also wrote another C program where I fill a, b, d, ... with 500 random numbers; it gets the result in seconds.
Python code:
martini=CDLL('C:/Users/38914/source/repos/Dll2/x64/Release/Dll2.dll')
integral=martini.martini
integral.restype=c_double
a_func=(c_double*num_element)()
b_func=(c_double*num_element)()
d_title_func=(c_double*num_element)()
c_func=(c_double*num_element)()
d_func=(c_double*num_element)()
A_func=(c_double*num_element)()
a_func[:]=a
b_func[:]=b
d_title_func[:]=d_title
c_func[:]=c
d_func[:]=d
A_func[:]=A
num_element_func=c_int(num_element)
out=integral(a_func,b_func,d_title_func,c_func,d_func,A_func,num_element_func)
a, b, c, d, d_title and A are arrays that I read from a txt file in Python.
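For reference, the call signature could also be spelled out explicitly so ctypes does not have to guess the argument types. A minimal sketch, reusing the DLL path from above:
from ctypes import CDLL, POINTER, c_double, c_int
martini = CDLL('C:/Users/38914/source/repos/Dll2/x64/Release/Dll2.dll')
integral = martini.martini
# Six double* arrays followed by the element count.
integral.argtypes = [POINTER(c_double)] * 6 + [c_int]
integral.restype = c_double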

Is there a way to avoid CUDA atomicAdd in my situation?

I'm doing an operation like the one in the figure below. As shown in the figure, I build a small matrix from about one million vectors and accumulate it into a large pre-allocated matrix.
I need an idea that improves performance without exceeding the 8 GB of GPU global memory.
How can I avoid the atomic operations? I am using a GTX 1080, and the existing kernel takes about 250 ms.
Here is my kernel.
__global__ void buildMatrixKernel(const CostJacobianCT *src, const int num, const int st, const int mw_width, double *A, double *b)
{
int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx < num)
{
if (src[idx].mask == 1)
{
// matrix width
int cols = 6 * (mw_width + 1);
// calc position for insert
int idx0 = (src[idx].fid0 - st);
if (idx0 == mw_width - 2)
{
idx0 = idx0 - 1;
}
else if (idx0 == mw_width - 1)
{
idx0 = idx0 - 2;
}
int idx1 = (src[idx].fid1 - st);
if (idx1 == mw_width - 2)
{
idx1 = idx1 - 1;
}
else if (idx1 == mw_width - 1)
{
idx1 = idx1 - 2;
}
int pos0 = idx0 * 6;
int pos1 = idx1 * 6;
// set temporary matrices
double _A00[24 * 24];
double _A11[24 * 24];
double _A01[24 * 24];
double _b0[24];
double _b1[24];
for (int y = 0; y < 24; y++)
{
for (int x = 0; x < 24; x++)
{
_A00[y * 24 + x] = src[idx].w * src[idx].J0[y] * src[idx].J0[x];
_A11[y * 24 + x] = src[idx].w * src[idx].J1[y] * src[idx].J1[x];
_A01[y * 24 + x] = src[idx].w * src[idx].J0[y] * src[idx].J1[x];
}
_b0[y] = src[idx].w * src[idx].c * src[idx].J0[y];
_b1[y] = src[idx].w * src[idx].c * src[idx].J1[y];
}
// set final matrix
for (int i = 0; i < 24; i++)
{
for (int j = 0; j < 24; j++)
{
atomicAdd(&A[(i + pos0) * cols + (j + pos0)], _A00[i * 24 + j]); // 00
atomicAdd(&A[(i + pos1) * cols + (j + pos1)], _A11[i * 24 + j]); // 11
atomicAdd(&A[(i + pos0) * cols + (j + pos1)], _A01[i * 24 + j]); // 01
atomicAdd(&A[(i + pos1) * cols + (j + pos0)], _A01[j * 24 + i]); // 10
}
atomicAdd(&b[i + pos0], _b0[i]); // 0
atomicAdd(&b[i + pos1], _b1[i]); // 1
}
}
}
}
Edit (2019.3.6): I modified the code as below and saw a performance improvement: 250 ms -> 95 ms.
__global__ void buildMatrixKernel(const CostJacobianCT *src, const int num, const int st, const int mw_width, double *A, double *b)
{
int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx < num)
{
int src_idx = idx / 576;
if (src[src_idx].mask == 1)
{
int cols = 6 * (mw_width + 1);
int pos0 = src[src_idx].pos0;
int pos1 = src[src_idx].pos1;
double w = src[src_idx].w;
double c = src[src_idx].c;
int sub_idx = idx % 576;
int i = sub_idx / 24;
int j = sub_idx % 24;
double J0_i = src[src_idx].J0[i];
double J0_j = src[src_idx].J0[j];
double J1_i = src[src_idx].J1[i];
double J1_j = src[src_idx].J1[j];
atomicAdd(&A[(i + pos0) * cols + (j + pos0)], w * J0_i * J0_j); // 00
atomicAdd(&A[(i + pos1) * cols + (j + pos1)], w * J1_i * J1_j); // 11
atomicAdd(&A[(i + pos0) * cols + (j + pos1)], w * J0_i * J1_j); // 01
atomicAdd(&A[(i + pos1) * cols + (j + pos0)], w * J1_i * J0_j); // 10
if (j == 0)
{
atomicAdd(&b[i + pos0], w * c * J0_i); // 0
atomicAdd(&b[i + pos1], w * c * J1_i); // 1
}
}
}
}
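For context, the modified kernel uses one thread per entry of each 24 x 24 block, so the launch needs num * 576 threads in total. A sketch of a possible launch, where d_src, d_A and d_b are assumed device pointers and the kernel's num argument is taken to be the total thread count:
// Hypothetical launch configuration for the modified kernel.
int totalThreads = num * 576;                 // 576 = 24 * 24 entries per source element
int threadsPerBlock = 576;
int blocks = (totalThreads + threadsPerBlock - 1) / threadsPerBlock;
buildMatrixKernel<<<blocks, threadsPerBlock>>>(d_src, totalThreads, st, mw_width, d_A, d_b);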

(Homework) Converting a function to a CUDA kernel function

I'm working on an assignment that asks me to optimise this C program using CUDA parallelisation.
This is what I managed to come up with:
//...
__global__ void gpu_score_function(void *gpu_frame_pixels, void *gpu_pattern_pixels, void *gpu_results,
int frame_rowstride, int pattern_rowstride,
int pattern_width, int pattern_height,
int frame_width, int frame_height) {
if ((blockIdx.y * blockDim.y + threadIdx.y < frame_height - pattern_height) &&
(blockIdx.x * blockDim.x + threadIdx.x < frame_width - pattern_width)) {
guchar *frame_pixels = (guchar *) gpu_frame_pixels +
(blockIdx.y * blockDim.y + threadIdx.y) * frame_rowstride +
(blockIdx.x * blockDim.x + threadIdx.x) * N_CHANNELS;
guchar *pattern_pixels = (guchar *) gpu_pattern_pixels;
int *results = (int *) gpu_results;
int res = 0;
for (int y = 0; y < pattern_height; ++y) {
if (blockIdx.y * blockDim.y + threadIdx.y + y < frame_height - pattern_height) {
for (int x = 0; x < pattern_width; ++x) {
if (blockIdx.x * blockDim.x + threadIdx.x + x < frame_width - pattern_width) {
const guchar *frame_pixel = frame_pixels + x * N_CHANNELS;
const guchar *pattern_pixel = pattern_pixels + x * N_CHANNELS;
for (int c = 0; c < N_CHANNELS; ++c) {
res += (frame_pixel[c] - 128) * (pattern_pixel[c] - 128);
}
} else {
break;
}
}
frame_pixels += frame_rowstride;
pattern_pixels += pattern_rowstride;
} else {
break;
}
}
results[(blockIdx.y * blockDim.y + threadIdx.y) * (frame_width - pattern_width) + blockIdx.x * blockDim.x + threadIdx.x] = res;
}
}
int main(int argc, const char *argv[]) {
//...
/**
* CUDA
*/
void *gpu_pattern_pixels;
void *gpu_frame_pixels;
void *gpu_results;
cudaMalloc(&gpu_pattern_pixels, pattern_height * pattern_rowstride * sizeof(guchar));
cudaMalloc(&gpu_frame_pixels, frame_height * frame_rowstride * sizeof(guchar));
cudaMalloc(&gpu_results, (frame_width - pattern_width) * (frame_height - pattern_height) * sizeof(int));
cudaMemcpy(gpu_pattern_pixels, (void *) pattern_pixels, pattern_height * pattern_rowstride * sizeof(guchar),
cudaMemcpyHostToDevice);
cudaMemcpy(gpu_frame_pixels, (void *) frame_pixels, frame_height * frame_rowstride * sizeof(guchar),
cudaMemcpyHostToDevice);
//Kernel configuration: a two-dimensional grid of
//two-dimensional blocks is configured.
dim3 dimGrid(ceil((float) (frame_width - pattern_width) / 32), ceil((float) (frame_height - pattern_height) / 32));
dim3 dimBlock(32, 32);
gpu_score_function<<<dimGrid, dimBlock>>>(gpu_frame_pixels, gpu_pattern_pixels, gpu_results, frame_rowstride, pattern_rowstride, pattern_width, pattern_height, frame_width, frame_height);
cudaDeviceSynchronize();
int *results = (int *) malloc((frame_width - pattern_width) * (frame_height - pattern_height) * sizeof(int));
cudaMemcpy((void *) results, gpu_results,
(frame_width - pattern_width) * (frame_height - pattern_height) * sizeof(int), cudaMemcpyDeviceToHost);
int gpu_x_best, gpu_y_best;
double gpu_best_score;
for (int *cur = results; cur != results + (frame_width - pattern_width) * (frame_height - pattern_height); cur++) {
if (cur == results || *cur > gpu_best_score) {
gpu_best_score = *cur;
gpu_x_best = (cur - results) % (frame_width - pattern_width);
gpu_y_best = (cur - results) / (frame_width - pattern_width);
}
}
cudaFree(gpu_pattern_pixels);
cudaFree(gpu_frame_pixels);
cudaFree(gpu_results);
free(results);
// cudaDeviceReset causes the driver to clean up all state. While
// not mandatory in normal operation, it is good practice. It is also
// needed to ensure correct operation when the application is being
// profiled. Calling cudaDeviceReset causes all profile data to be
// flushed before the application exits
cudaDeviceReset();
/**
* END CUDA
*/
//...
return 0;
}
The program doesn't segfault, cuda-memcheck reports no errors, and the result matrix is filled.
The problem is, the results are wrong.
I'm quite sure it's some off-by-one pointer error, but I have no idea how to spot it.
I'm working on OS X 10.9; what tools could I use to debug this program?
Any help is appreciated.
I found the bug.
The two if statements inside the for loops of gpu_score_function make no sense. Deleting them solved the problem.
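With those two checks removed, the loops reduce to something like this (a sketch, keeping the variable names from the kernel above):
for (int y = 0; y < pattern_height; ++y) {
    for (int x = 0; x < pattern_width; ++x) {
        const guchar *frame_pixel = frame_pixels + x * N_CHANNELS;
        const guchar *pattern_pixel = pattern_pixels + x * N_CHANNELS;
        for (int c = 0; c < N_CHANNELS; ++c) {
            res += (frame_pixel[c] - 128) * (pattern_pixel[c] - 128);
        }
    }
    frame_pixels += frame_rowstride;
    pattern_pixels += pattern_rowstride;
}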

OpenCL: image object clamp to edge

I have two different kernels for my Sobel operator.
One uses a buffer object and the other uses an image object.
In my opinion these two kernels should produce the same result, but they don't.
Both versions handle the edges with clamp-to-edge.
Where is the problem?
Code with buffer object
__kernel void sobel_filter(__global uchar *ucGRAY, __global float *sobel, __global float *grad_max, int im_width, int im_height)
{
float2 xt;
int i = get_global_id(0);
int j = get_global_id(1);
int ii_p, jj_p, ii_n, jj_n; // ii_n,jj_n = (i,j)-1 ii_p,jj_p = (i,j)+1
if (i == 0)
ii_n = i;
else if (i == im_width - 1)
ii_p = i;
else
{
ii_n = i - 1;
ii_p = i + 1;
}
if (j == 0)
jj_n = i;
else if (j == im_height - 1)
jj_p = j;
else
{
jj_n = j - 1;
jj_p = j + 1;
}
xt.x = (float)(ucGRAY[(jj_n)* im_width + (ii_p)] // 3
+ ucGRAY[j * im_width + (ii_p)] * 2 //6
+ ucGRAY[(jj_p) * im_width + (ii_p)] //9
- ucGRAY[(jj_n)* im_width + (ii_n)] //1
- ucGRAY[j * im_width + (ii_n)] * 2 //4
- ucGRAY[(jj_p)* im_width + (ii_n)]) / 1020; //7
xt.y =(float)( ucGRAY[(jj_p)* im_width + (ii_n)] //7
+ucGRAY[(jj_p)* im_width + (i)] * 2 //8
+ucGRAY[(jj_p)* im_width + (ii_p)] //9
- ucGRAY[(jj_n)* im_width + (ii_n)] //1
- ucGRAY[(jj_n)* im_width + (i)] * 2 //2
- ucGRAY[(jj_n)* im_width + (ii_p)]) / 1020; //3
sobel[j * im_height + i] = length(xt);
AtomicMax(grad_max, sobel[j * im_width + i]);
}
Code with image object
const sampler_t smp = CLK_NORMALIZED_COORDS_FALSE | //Natural coordinates
CLK_ADDRESS_CLAMP_TO_EDGE | //Clamp to edge
CLK_FILTER_NEAREST; //Don't interpolate
__kernel void sobel_filter_image(read_only image2d_t ucGRAY,__global float *sobel,__global float *grad_max,int Width, int Height)
{
int2 coord = (int2)(get_global_id(0), get_global_id(1));
float2 xt;
float temp;
uchar val5=read_imageui(ucGRAY, smp, (int2)(coord.x,coord.y)).x;
uchar val1=read_imageui(ucGRAY, smp, (int2)(coord.x-1,coord.y-1)).x;
uchar val2=read_imageui(ucGRAY, smp, (int2)(coord.x,coord.y-1)).x;
uchar val3=read_imageui(ucGRAY, smp, (int2)(coord.x+1,coord.y-1)).x;
uchar val4=read_imageui(ucGRAY, smp, (int2)(coord.x-1,coord.y)).x;
uchar val6=read_imageui(ucGRAY, smp, (int2)(coord.x+1,coord.y)).x;
uchar val7=read_imageui(ucGRAY, smp, (int2)(coord.x-1,coord.y+1)).x;
uchar val8=read_imageui(ucGRAY, smp, (int2)(coord.x,coord.y+1)).x;
uchar val9=read_imageui(ucGRAY, smp, (int2)(coord.x+1,coord.y+1)).x;
xt.x = (float)(val3 + (val6 * 2) + val9
- val1 - (val4 * 2) - val7) / 1020;
xt.y = (float)(val7 + (val8 * 2) + val9
- val1 - (val2 * 2) - val3) / 1020;
sobel[coord.y * Width + coord.x] = length(xt);// G=sqrt(Gy^2+Gx^2)
AtomicMax(grad_max,sobel[coord.y * Width + coord.x]);
}
In your buffer version, you have this:
if (j == 0)
jj_n = i;
Presumably that should be:
if (j == 0)
jj_n = j;
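For comparison, the clamp-to-edge indices in the buffer version could also be computed directly with the integer min/max built-ins, which avoids the branching entirely. A sketch with the same variable names:
// Clamp neighbour coordinates to the image bounds (clamp-to-edge behaviour).
int ii_n = max(i - 1, 0);
int ii_p = min(i + 1, im_width - 1);
int jj_n = max(j - 1, 0);
int jj_p = min(j + 1, im_height - 1);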

CUDA Image Rotation

I am having trouble implementing image rotation in CUDA. I have a very simple Rotate function that works as follows:
__device__ float readPixVal( float* ImgSrc,int ImgWidth,int x,int y)
{
return (float)ImgSrc[y*ImgWidth+x];
}
__device__ void putPixVal( float* ImgSrc,int ImgWidth,int x,int y, float floatVal)
{
ImgSrc[y*ImgWidth+x] = floatVal;
}
__global__ void Rotate(float* Source, float* Destination, int sizeX, int sizeY, float deg)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;// Kernel definition
int j = blockIdx.y * blockDim.y + threadIdx.y;
if(i < sizeX && j < sizeY)
{
putPixVal(Destination, sizeX, ((float)i)*cos(deg) - ((float)j)*sin(deg), ((float)i)*sin(deg) + ((float)j)*cos(deg), readPixVal(Source, sizeX, i, j));
}
}
The problem is, I do not know how to do any interpolation. With the above, many pixels are skipped due to integer round-off. Does anyone know how to fix this, or are there any free/open-source implementations of image rotation? I could not find any for CUDA.
Generally, in this sort of image manipulation you loop over all destination pixel positions, calculating the corresponding pixel (or interpolated group of pixels) in the source image.
This ensures that you fill the resulting image evenly and uniformly, which is normally what you care about.
void rotateImage_Kernel(cufftComplex* trg, const cufftComplex* src, const unsigned int imageWidth,const unsigned int imageHeight, const float angle, const float scale)
{
// compute thread dimension
const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
const unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
//// compute target address
const unsigned int idx = x + y * imageWidth;
const int xA = (x - imageWidth/2 );
const int yA = (y - imageHeight/2 );
const int xR = (int)floor(1.0f/scale * (xA * cos(angle) - yA * sin(angle)));
const int yR = (int)floor(1.0f/scale * (xA * sin(angle) + yA * cos(angle)));
float src_x = xR + imageWidth/2;
float src_y = yR + imageHeight/2;
if ( src_x >= 0.0f && src_x < imageWidth && src_y >= 0.0f && src_y < imageHeight) {
// BI - LINEAR INTERPOLATION
float src_x0 = (float)(int)(src_x);
float src_x1 = (src_x0+1);
float src_y0 = (float)(int)(src_y);
float src_y1 = (src_y0+1);
float sx = (src_x-src_x0);
float sy = (src_y-src_y0);
int idx_src00 = min(max(0.0f,src_x0 + src_y0 * imageWidth),imageWidth*imageHeight-1.0f);
int idx_src10 = min(max(0.0f,src_x1 + src_y0 * imageWidth),imageWidth*imageHeight-1.0f);
int idx_src01 = min(max(0.0f,src_x0 + src_y1 * imageWidth),imageWidth*imageHeight-1.0f);
int idx_src11 = min(max(0.0f,src_x1 + src_y1 * imageWidth),imageWidth*imageHeight-1.0f);
trg[idx].y = 0.0f;
trg[idx].x = (1.0f-sx)*(1.0f-sy)*src[idx_src00].x;
trg[idx].x += ( sx)*(1.0f-sy)*src[idx_src10].x;
trg[idx].x += (1.0f-sx)*( sy)*src[idx_src01].x;
trg[idx].x += ( sx)*( sy)*src[idx_src11].x;
} else {
trg[idx].x = 0.0f;
trg[idx].y = 0.0f;
}
DEVICE_METHODE_LAST_COMMAND;
}
void translateImage_Kernel(cufftComplex* trg, const cufftComplex* src, const unsigned int imageWidth, const unsigned int imageHeight, const float tX, const float tY)
{
// compute thread dimension
const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
const unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
//// compute target address
const unsigned int idx = x + y * imageWidth;
const int xB = ((int)x + (int)tX );
const int yB = ((int)y + (int)tY );
if ( xB >= 0 && xB < imageWidth && yB >= 0 && yB < imageHeight) {
trg[idx] = src[xB + yB * imageWidth];
} else {
trg[idx].x = 0.0f;
trg[idx].y = 0.0f;
}
DEVICE_METHODE_LAST_COMMAND;
}
This seems to do the trick:
__global__ void Rotate(float* Source, float* Destination, int sizeX, int sizeY, float deg)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;// Kernel definition
int j = blockIdx.y * blockDim.y + threadIdx.y;
int xc = sizeX - sizeX/2;
int yc = sizeY - sizeY/2;
int newx = ((float)i-xc)*cos(deg) - ((float)j-yc)*sin(deg) + xc;
int newy = ((float)i-xc)*sin(deg) + ((float)j-yc)*cos(deg) + yc;
if (newx >= 0 && newx < sizeX && newy >= 0 && newy < sizeY)
{
putPixVal(Destination, sizeX, i , j, readPixVal(Source, sizeX, newx, newy));
}
}
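A possible way to launch this kernel, as a sketch: the 16 x 16 block size and the d_src/d_dst device pointers are assumptions, and deg must be given in radians because it is passed straight to cos()/sin().
dim3 block(16, 16);
dim3 grid((sizeX + block.x - 1) / block.x, (sizeY + block.y - 1) / block.y);
Rotate<<<grid, block>>>(d_src, d_dst, sizeX, sizeY, deg);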
