I have two different kernels for my Sobel operator.
One uses a buffer object and the other uses an image object.
In my opinion these two kernels should produce the same result, but they don't.
Both versions handle edges with clamp-to-edge.
Where is the problem?
Code with buffer object
__kernel void sobel_filter(__global uchar *ucGRAY, __global float *sobel, __global float *grad_max, int im_width, int im_height)
{
float2 xt;
int i = get_global_id(0);
int j = get_global_id(1);
int ii_p, jj_p, ii_n, jj_n; // ii_n,jj_n = (i,j)-1 ii_p,jj_p = (i,j)+1
if (i == 0)
ii_n = i;
else if (i == im_width - 1)
ii_p = i;
else
{
ii_n = i - 1;
ii_p = i + 1;
}
if (j == 0)
jj_n = i;
else if (j == im_height - 1)
jj_p = j;
else
{
jj_n = j - 1;
jj_p = j + 1;
}
xt.x = (float)(ucGRAY[(jj_n)* im_width + (ii_p)] // 3
+ ucGRAY[j * im_width + (ii_p)] * 2 //6
+ ucGRAY[(jj_p) * im_width + (ii_p)] //9
- ucGRAY[(jj_n)* im_width + (ii_n)] //1
- ucGRAY[j * im_width + (ii_n)] * 2 //4
- ucGRAY[(jj_p)* im_width + (ii_n)]) / 1020; //7
xt.y =(float)( ucGRAY[(jj_p)* im_width + (ii_n)] //7
+ucGRAY[(jj_p)* im_width + (i)] * 2 //8
+ucGRAY[(jj_p)* im_width + (ii_p)] //9
- ucGRAY[(jj_n)* im_width + (ii_n)] //1
- ucGRAY[(jj_n)* im_width + (i)] * 2 //2
- ucGRAY[(jj_n)* im_width + (ii_p)]) / 1020; //3
sobel[j * im_height + i] = length(xt);
AtomicMax(grad_max, sobel[j * im_width + i]);
}
Code with image object
const sampler_t smp = CLK_NORMALIZED_COORDS_FALSE | //Natural coordinates
CLK_ADDRESS_CLAMP_TO_EDGE | //Clamp to edge
CLK_FILTER_NEAREST; //Don't interpolate
__kernel void sobel_filter_image(read_only image2d_t ucGRAY, __global float *sobel, __global float *grad_max, int Width, int Height)
{
int2 coord = (int2)(get_global_id(0), get_global_id(1));
float2 xt;
float temp;
uchar val5=read_imageui(ucGRAY, smp, (int2)(coord.x,coord.y)).x;
uchar val1=read_imageui(ucGRAY, smp, (int2)(coord.x-1,coord.y-1)).x;
uchar val2=read_imageui(ucGRAY, smp, (int2)(coord.x,coord.y-1)).x;
uchar val3=read_imageui(ucGRAY, smp, (int2)(coord.x+1,coord.y-1)).x;
uchar val4=read_imageui(ucGRAY, smp, (int2)(coord.x-1,coord.y)).x;
uchar val6=read_imageui(ucGRAY, smp, (int2)(coord.x+1,coord.y)).x;
uchar val7=read_imageui(ucGRAY, smp, (int2)(coord.x-1,coord.y+1)).x;
uchar val8=read_imageui(ucGRAY, smp, (int2)(coord.x,coord.y+1)).x;
uchar val9=read_imageui(ucGRAY, smp, (int2)(coord.x+1,coord.y+1)).x;
xt.x = (float)(val3 + (val6 * 2) + val9
- val1 - (val4 * 2) - val7) / 1020;
xt.y = (float)(val7 + (val8 * 2) + val9
- val1 - (val2 * 2) - val3) / 1020;
sobel[coord.y * Width + coord.x] = length(xt);// G=sqrt(Gy^2+Gx^2)
AtomicMax(grad_max,sobel[coord.y * Width + coord.x]);
}
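Note: AtomicMax on a float is not an OpenCL built-in, so both kernels presumably rely on a helper defined elsewhere in the source. For reference, a common compare-and-swap implementation of a global float maximum looks like this (a sketch, not necessarily the definition used here):
void AtomicMax(volatile __global float *addr, float val)
{
    // Reinterpret the float bits as an unsigned int so atomic_cmpxchg can be used.
    union { unsigned int u; float f; } old_val, new_val;
    do {
        old_val.f = *addr;
        new_val.f = fmax(old_val.f, val);
    } while (atomic_cmpxchg((volatile __global unsigned int *)addr,
                            old_val.u, new_val.u) != old_val.u);
}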
In your buffer version, you have this:
if (j == 0)
jj_n = i;
Presumably that should be:
if (j == 0)
jj_n = j;
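Beyond that, the border handling has two more problems. When i == 0 only ii_n is assigned (ii_p is left uninitialized), when i == im_width - 1 only ii_p is assigned, and the j branches behave the same way, so border pixels read through garbage indices. There is also a stride mismatch: the result is stored at sobel[j * im_height + i] but read back for AtomicMax at sobel[j * im_width + i]. A minimal sketch of clamp-to-edge index math that matches the image version's CLK_ADDRESS_CLAMP_TO_EDGE sampler, using OpenCL's built-in integer min/max:
// Clamp every neighbour coordinate to the image border on every path.
ii_n = max(i - 1, 0);             // left neighbour
ii_p = min(i + 1, im_width - 1);  // right neighbour
jj_n = max(j - 1, 0);             // top neighbour
jj_p = min(j + 1, im_height - 1); // bottom neighbour
Then store with one consistent stride, e.g. sobel[j * im_width + i], in both places.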
I'm doing the operation shown in the figure below.
Here is my kernel.
As shown in the figure, I build a small matrix from each of about one million vectors and accumulate them into a large, pre-allocated matrix.
I need an idea that can improve performance without exceeding 8 GB of GPU global memory.
How can I avoid the atomic operations? I use a GTX 1080, and the existing kernel takes about 250 ms.
__global__ void buildMatrixKernel(const CostJacobianCT *src, const int num, const int st, const int mw_width, double *A, double *b)
{
int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx < num)
{
if (src[idx].mask == 1)
{
// matrix width
int cols = 6 * (mw_width + 1);
// calc position for insert
int idx0 = (src[idx].fid0 - st);
if (idx0 == mw_width - 2)
{
idx0 = idx0 - 1;
}
else if (idx0 == mw_width - 1)
{
idx0 = idx0 - 2;
}
int idx1 = (src[idx].fid1 - st);
if (idx1 == mw_width - 2)
{
idx1 = idx1 - 1;
}
else if (idx1 == mw_width - 1)
{
idx1 = idx1 - 2;
}
int pos0 = idx0 * 6;
int pos1 = idx1 * 6;
// set temporary matrices (weighted outer products of the Jacobians)
double _A00[24 * 24];
double _A11[24 * 24];
double _A01[24 * 24];
double _b0[24];
double _b1[24];
for (int y = 0; y < 24; y++)
{
for (int x = 0; x < 24; x++)
{
_A00[y * 24 + x] = src[idx].w * src[idx].J0[y] * src[idx].J0[x];
_A11[y * 24 + x] = src[idx].w * src[idx].J1[y] * src[idx].J1[x];
_A01[y * 24 + x] = src[idx].w * src[idx].J0[y] * src[idx].J1[x];
}
_b0[y] = src[idx].w * src[idx].c * src[idx].J0[y];
_b1[y] = src[idx].w * src[idx].c * src[idx].J1[y];
}
// set final matrix
for (int i = 0; i < 24; i++)
{
for (int j = 0; j < 24; j++)
{
atomicAdd(&A[(i + pos0) * cols + (j + pos0)], _A00[i * 24 + j]); // 00
atomicAdd(&A[(i + pos1) * cols + (j + pos1)], _A11[i * 24 + j]); // 11
atomicAdd(&A[(i + pos0) * cols + (j + pos1)], _A01[i * 24 + j]); // 01
atomicAdd(&A[(i + pos1) * cols + (j + pos0)], _A01[j * 24 + i]); // 10
}
atomicAdd(&b[i + pos0], _b0[i]); // 0
atomicAdd(&b[i + pos1], _b1[i]); // 1
}
}
}
}
Update (2019-03-06): I modified the code as below, assigning one thread per output element instead of building 24 x 24 temporaries in each thread, and saw a performance improvement: 250 ms -> 95 ms.
__global__ void buildMatrixKernel(const CostJacobianCT *src, const int num, const int st, const int mw_width, double *A, double *b)
{
int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx < num) // note: with one thread per tile element, num must now cover (entries * 576) threads
{
int src_idx = idx / 576; // 576 threads (one 24 x 24 tile) per source entry
if (src[src_idx].mask == 1)
{
int cols = 6 * (mw_width + 1);
int pos0 = src[src_idx].pos0;
int pos1 = src[src_idx].pos1;
double w = src[src_idx].w;
double c = src[src_idx].c;
int sub_idx = idx % 576;
int i = sub_idx / 24; // row within the 24 x 24 tile
int j = sub_idx % 24; // column within the 24 x 24 tile
double J0_i = src[src_idx].J0[i];
double J0_j = src[src_idx].J0[j];
double J1_i = src[src_idx].J1[i];
double J1_j = src[src_idx].J1[j];
atomicAdd(&A[(i + pos0) * cols + (j + pos0)], w * J0_i * J0_j); // 00
atomicAdd(&A[(i + pos1) * cols + (j + pos1)], w * J1_i * J1_j); // 11
atomicAdd(&A[(i + pos0) * cols + (j + pos1)], w * J0_i * J1_j); // 01
atomicAdd(&A[(i + pos1) * cols + (j + pos0)], w * J1_i * J0_j); // 10
if (j == 0)
{
atomicAdd(&b[i + pos0], w * c * J0_i); // 0
atomicAdd(&b[i + pos1], w * c * J1_i); // 1
}
}
}
}
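For reference, a minimal host-side launch matching the new indexing might look like this (a sketch: num_entries, d_src, d_A and d_b are hypothetical names for the entry count and the device buffers):
// One thread per element of each 24 x 24 tile, i.e. 576 threads per entry.
int total_threads = num_entries * 576;
int block_size = 256;
int grid_size = (total_threads + block_size - 1) / block_size;
buildMatrixKernel<<<grid_size, block_size>>>(d_src, total_threads, st, mw_width, d_A, d_b);
cudaDeviceSynchronize();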
I'm working on an assignment that asks me to optimise this C program using CUDA parallelisation.
This is what I managed to come up with:
//...
__global__ void gpu_score_function(void *gpu_frame_pixels, void *gpu_pattern_pixels, void *gpu_results,
int frame_rowstride, int pattern_rowstride,
int pattern_width, int pattern_height,
int frame_width, int frame_height) {
if ((blockIdx.y * blockDim.y + threadIdx.y < frame_height - pattern_height) &&
(blockIdx.x * blockDim.x + threadIdx.x < frame_width - pattern_width)) {
guchar *frame_pixels = (guchar *) gpu_frame_pixels +
(blockIdx.y * blockDim.y + threadIdx.y) * frame_rowstride +
(blockIdx.x * blockDim.x + threadIdx.x) * N_CHANNELS;
guchar *pattern_pixels = (guchar *) gpu_pattern_pixels;
int *results = (int *) gpu_results;
int res = 0;
for (int y = 0; y < pattern_height; ++y) {
if (blockIdx.y * blockDim.y + threadIdx.y + y < frame_height - pattern_height) {
for (int x = 0; x < pattern_width; ++x) {
if (blockIdx.x * blockDim.x + threadIdx.x + x < frame_width - pattern_width) {
const guchar *frame_pixel = frame_pixels + x * N_CHANNELS;
const guchar *pattern_pixel = pattern_pixels + x * N_CHANNELS;
for (int c = 0; c < N_CHANNELS; ++c) {
res += (frame_pixel[c] - 128) * (pattern_pixel[c] - 128);
}
} else {
break;
}
}
frame_pixels += frame_rowstride;
pattern_pixels += pattern_rowstride;
} else {
break;
}
}
results[(blockIdx.y * blockDim.y + threadIdx.y) * (frame_width - pattern_width) + blockIdx.x * blockDim.x + threadIdx.x] = res;
}
}
int main(int argc, const char *argv[]) {
//...
/**
* CUDA
*/
void *gpu_pattern_pixels;
void *gpu_frame_pixels;
void *gpu_results;
cudaMalloc(&gpu_pattern_pixels, pattern_height * pattern_rowstride * sizeof(guchar));
cudaMalloc(&gpu_frame_pixels, frame_height * frame_rowstride * sizeof(guchar));
cudaMalloc(&gpu_results, (frame_width - pattern_width) * (frame_height - pattern_height) * sizeof(int));
cudaMemcpy(gpu_pattern_pixels, (void *) pattern_pixels, pattern_height * pattern_rowstride * sizeof(guchar),
cudaMemcpyHostToDevice);
cudaMemcpy(gpu_frame_pixels, (void *) frame_pixels, frame_height * frame_rowstride * sizeof(guchar),
cudaMemcpyHostToDevice);
//Kernel configuration: a two-dimensional grid of
//two-dimensional 32x32 blocks.
dim3 dimGrid(ceil((float) (frame_width - pattern_width) / 32), ceil((float) (frame_height - pattern_height) / 32));
dim3 dimBlock(32, 32);
gpu_score_function<<<dimGrid, dimBlock>>>(gpu_frame_pixels, gpu_pattern_pixels, gpu_results, frame_rowstride, pattern_rowstride, pattern_width, pattern_height, frame_width, frame_height);
cudaDeviceSynchronize();
int *results = (int *) malloc((frame_width - pattern_width) * (frame_height - pattern_height) * sizeof(int));
cudaMemcpy((void *) results, gpu_results,
(frame_width - pattern_width) * (frame_height - pattern_height) * sizeof(int), cudaMemcpyDeviceToHost);
int gpu_x_best, gpu_y_best;
double gpu_best_score;
for (int *cur = results; cur != results + (frame_width - pattern_width) * (frame_height - pattern_height); cur++) {
if (cur == results || *cur > gpu_best_score) {
gpu_best_score = *cur;
gpu_x_best = (cur - results) % (frame_width - pattern_width);
gpu_y_best = (cur - results) / (frame_width - pattern_width);
}
}
cudaFree(gpu_pattern_pixels);
cudaFree(gpu_frame_pixels);
cudaFree(gpu_results);
free(results);
// cudaDeviceReset causes the driver to clean up all state. While
// not mandatory in normal operation, it is good practice. It is also
// needed to ensure correct operation when the application is being
// profiled. Calling cudaDeviceReset causes all profile data to be
// flushed before the application exits
cudaDeviceReset();
/**
* END CUDA
*/
//...
return 0;
}
The program doesn't segfault, cuda-memcheck reports 0 errors, and the result matrix is filled.
The problem is that the results are wrong.
I'm quite sure it's some off-by-one pointer error, but I have no idea how to spot it.
I'm working on OS X 10.9; what tools could I use to debug this program?
Any help is appreciated.
I found the bug.
The two if statements inside the for loops of gpu_score_function make no sense: the outer bounds check already guarantees that every pattern row and column stays inside the frame, so breaking out of the loops early only truncated the scores near the right and bottom borders. Deleting them solved the problem.
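For clarity, here is the corrected loop body with those redundant checks removed (the rest of the kernel is unchanged):
// Accumulate the score over the whole pattern window; the outer bounds
// check already guarantees every access stays inside the frame.
for (int y = 0; y < pattern_height; ++y) {
    for (int x = 0; x < pattern_width; ++x) {
        const guchar *frame_pixel = frame_pixels + x * N_CHANNELS;
        const guchar *pattern_pixel = pattern_pixels + x * N_CHANNELS;
        for (int c = 0; c < N_CHANNELS; ++c) {
            res += (frame_pixel[c] - 128) * (pattern_pixel[c] - 128);
        }
    }
    frame_pixels += frame_rowstride;
    pattern_pixels += pattern_rowstride;
}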
How can I convert a VPX_IMG_FMT_I420 vpx_image_t to RGB?
Example code:
int DecodeFrame()
{
vpx_video_reader_read_frame(reader);
vpx_codec_iter_t iter = NULL;
vpx_image_t *img = NULL;
size_t frame_size = 0;
const unsigned char *frame = vpx_video_reader_get_frame(reader, &frame_size);
if (vpx_codec_decode(&codec, frame, (unsigned int)frame_size, NULL, 0)) return 0;
img = vpx_codec_get_frame(&codec, &iter);
// here I need to convert img to RGB array
return 1;
}
Code from uTox:
void yuv420tobgr(uint16_t width, uint16_t height,
const uint8_t *y, const uint8_t *u, const uint8_t *v,
unsigned int ystride,
unsigned int ustride,
unsigned int vstride,
uint8_t *out)
{
unsigned long int i, j;
for (i = 0; i < height; ++i) {
for (j = 0; j < width; ++j) {
uint8_t *point = out + 4 * ((i * width) + j);
int t_y = y[((i * ystride) + j)];
int t_u = u[(((i / 2) * ustride) + (j / 2))]; // U and V planes are subsampled 2x2
int t_v = v[(((i / 2) * vstride) + (j / 2))];
t_y = t_y < 16 ? 16 : t_y;
// integer BT.601 YUV -> RGB conversion
int r = (298 * (t_y - 16) + 409 * (t_v - 128) + 128) >> 8;
int g = (298 * (t_y - 16) - 100 * (t_u - 128) - 208 * (t_v - 128) + 128) >> 8;
int b = (298 * (t_y - 16) + 516 * (t_u - 128) + 128) >> 8;
point[2] = r>255? 255 : r<0 ? 0 : r;
point[1] = g>255? 255 : g<0 ? 0 : g;
point[0] = b>255? 255 : b<0 ? 0 : b;
point[3] = ~0;
}
}
}
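To wire this up with the decoded frame, the plane pointers and strides can be taken straight from the vpx_image_t (a sketch; rgb_out is a hypothetical caller-allocated d_w * d_h * 4 buffer):
// Convert the decoded I420 image to BGRA using the routine above.
// planes[], stride[], d_w and d_h are standard vpx_image_t fields.
uint8_t *rgb_out = (uint8_t *)malloc(img->d_w * img->d_h * 4);
yuv420tobgr(img->d_w, img->d_h,
            img->planes[VPX_PLANE_Y],
            img->planes[VPX_PLANE_U],
            img->planes[VPX_PLANE_V],
            img->stride[VPX_PLANE_Y],
            img->stride[VPX_PLANE_U],
            img->stride[VPX_PLANE_V],
            rgb_out);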
I am trying to use the calcOpticalFlowPyrLK function of OpenCV for motion detection, but I am getting this error message:
OpenCV Error: Assertion failed ((npoints = prevPtsMat.checkVector(2, CV_32F, true)) >= 0) in cv::calcOpticalFlowPyrLK, file ..\..\..\..\opencv\modules\video\src\lkpyramid.cpp, line 845
I have checked that the previous corners stored in corners1 are not empty and are valid points; the goodFeaturesToTrack function works fine.
I have been stuck here for two days; any help would be appreciated.
Code Block:
int opticalflow()
{
Mat frame1, frame2, frame3, difference1, difference2, frame, mask;
VideoCapture capture;
vector <Point2d> corners1;
vector <Point2d> corners2;
int maxCorners = 200;
double qualityLevel = 0.01;
double minDistance = 5;
int blockSize = 3;
bool useHarrisDetector = false;
double k = 0.04;
vector <unsigned char> optical_flow_found_feature;
vector <float > optical_flow_feature_error;
//Read the video stream
capture.open(0);
if (!capture.isOpened())
{
printf("Error Opening Video Capture!!!\n");
}
waitKey(10);
capture.read(frame1);
cvtColor( frame1, frame1, COLOR_BGR2GRAY );
goodFeaturesToTrack( frame1, corners1, maxCorners, qualityLevel, minDistance, mask, blockSize, useHarrisDetector, k );
for(int i=0; i<corners1.size(); i++)
cout<<corners1[i].x<<" "<<corners1[i].y<<endl;
while (1)
{
capture.read(frame2);
cvtColor( frame2, frame2, COLOR_BGR2GRAY );
if(corners1.size()>10)
calcOpticalFlowPyrLK(frame1, frame2, corners1, corners2, optical_flow_found_feature, optical_flow_feature_error);// Size(21,21), 3, TermCriteria(TermCriteria::COUNT+TermCriteria::EPS, 30, 0.01), 0, 1e-4);
cout<<"Reached Here\n";
for(int i=0; i<maxCorners; i++)
{
if(optical_flow_found_feature[i] == 0)
continue;
int line_thickness = 1;
CvScalar line_color = CV_RGB(255,0,0);
Point2d p,q;
p.x = (int) corners1[i].x;
p.y = (int) corners1[i].y;
q.x = (int) corners2[i].x;
q.y = (int) corners2[i].y;
double angle = atan2( (double) p.y - q.y, (double) p.x - q.x );
double hypotenuse = sqrt((long double)(p.y - q.y)*(p.y - q.y) + (p.x - q.x)*(p.x - q.x) );
q.x = (int) (p.x - 3 * hypotenuse * cos(angle));
q.y = (int) (p.y - 3 * hypotenuse * sin(angle));
line( frame1, p, q, line_color, line_thickness, CV_AA, 0 );
p.x = (int) (q.x + 9 * cos(angle + pi / 4));
p.y = (int) (q.y + 9 * sin(angle + pi / 4));
line( frame1, p, q, line_color, line_thickness, CV_AA, 0 );
p.x = (int) (q.x + 9 * cos(angle - pi / 4));
p.y = (int) (q.y + 9 * sin(angle - pi / 4));
line( frame1, p, q, line_color, line_thickness, CV_AA, 0 );
}
display(frame1);
int c = waitKey(25);
//Exit if escape is pressed
if((char)c == 27)
{
break;
}
}
return 0;
}
Platform: Visual Studio 2010 with OpenCV 2.4.9 on Windows 8.
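A likely cause, given the assertion text: checkVector(2, CV_32F) means calcOpticalFlowPyrLK requires 32-bit float points, but the code stores its corners in vector<Point2d>, which holds 64-bit doubles (CV_64F). Switching the point type should satisfy the check (a minimal sketch, with the rest of the code unchanged):
// calcOpticalFlowPyrLK asserts checkVector(2, CV_32F), so the point
// containers must hold 32-bit floats, not doubles.
vector<Point2f> corners1;
vector<Point2f> corners2;
goodFeaturesToTrack fills a vector<Point2f> just as well, so no other change is needed.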
I am having trouble implementing image rotation in CUDA. I have a very simple Rotate function, as follows:
__device__ float readPixVal( float* ImgSrc,int ImgWidth,int x,int y)
{
return (float)ImgSrc[y*ImgWidth+x];
}
__device__ void putPixVal( float* ImgSrc,int ImgWidth,int x,int y, float floatVal)
{
ImgSrc[y*ImgWidth+x] = floatVal;
}
__global__ void Rotate(float* Source, float* Destination, int sizeX, int sizeY, float deg)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;// Kernel definition
int j = blockIdx.y * blockDim.y + threadIdx.y;
if(i < sizeX && j < sizeY)
{
putPixVal(Destination, sizeX, ((float)i)*cos(deg) - ((float)j)*sin(deg), ((float)i)*sin(deg) + ((float)j)*cos(deg), readPixVal(Source, sizeX, i, j));
}
}
The problem is that I do not know how to do any interpolation. With the above, many pixels are skipped due to integer roundoff. Does anyone know how to fix this, or are there any free/open-source implementations of image rotation? I could not find any for CUDA.
Generally, in this sort of image manipulation you loop over all destination pixel positions and calculate the corresponding pixel (or an interpolated group of pixels) in the source image.
This ensures that you fill the resulting image evenly and uniformly, which is normally what you care about.
__global__ void rotateImage_Kernel(cufftComplex* trg, const cufftComplex* src, const unsigned int imageWidth, const unsigned int imageHeight, const float angle, const float scale)
{
// compute thread dimension
const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
const unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
//// compute target address
const unsigned int idx = x + y * imageWidth;
const int xA = (x - imageWidth/2 );
const int yA = (y - imageHeight/2 );
const int xR = (int)floor(1.0f/scale * (xA * cos(angle) - yA * sin(angle)));
const int yR = (int)floor(1.0f/scale * (xA * sin(angle) + yA * cos(angle)));
float src_x = xR + imageWidth/2;
float src_y = yR + imageHeight/2;
if ( src_x >= 0.0f && src_x < imageWidth && src_y >= 0.0f && src_y < imageHeight) {
// BI - LINEAR INTERPOLATION
float src_x0 = (float)(int)(src_x);
float src_x1 = (src_x0+1);
float src_y0 = (float)(int)(src_y);
float src_y1 = (src_y0+1);
float sx = (src_x-src_x0);
float sy = (src_y-src_y0);
int idx_src00 = min(max(0.0f,src_x0 + src_y0 * imageWidth),imageWidth*imageHeight-1.0f);
int idx_src10 = min(max(0.0f,src_x1 + src_y0 * imageWidth),imageWidth*imageHeight-1.0f);
int idx_src01 = min(max(0.0f,src_x0 + src_y1 * imageWidth),imageWidth*imageHeight-1.0f);
int idx_src11 = min(max(0.0f,src_x1 + src_y1 * imageWidth),imageWidth*imageHeight-1.0f);
trg[idx].y = 0.0f;
trg[idx].x = (1.0f-sx)*(1.0f-sy)*src[idx_src00].x;
trg[idx].x += ( sx)*(1.0f-sy)*src[idx_src10].x;
trg[idx].x += (1.0f-sx)*( sy)*src[idx_src01].x;
trg[idx].x += ( sx)*( sy)*src[idx_src11].x;
} else {
trg[idx].x = 0.0f;
trg[idx].y = 0.0f;
}
DEVICE_METHODE_LAST_COMMAND;
}
__global__ void translateImage_Kernel(cufftComplex* trg, const cufftComplex* src, const unsigned int imageWidth, const unsigned int imageHeight, const float tX, const float tY)
{
// compute thread dimension
const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
const unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
//// compute target address
const unsigned int idx = x + y * imageWidth;
const int xB = ((int)x + (int)tX );
const int yB = ((int)y + (int)tY );
if ( xB >= 0 && xB < imageWidth && yB >= 0 && yB < imageHeight) {
trg[idx] = src[xB + yB * imageWidth];
} else {
trg[idx].x = 0.0f;
trg[idx].y = 0.0f;
}
DEVICE_METHODE_LAST_COMMAND;
}
This seems to do the trick
__global__ void Rotate(float* Source, float* Destination, int sizeX, int sizeY, float deg)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;// Kernel definition
int j = blockIdx.y * blockDim.y + threadIdx.y;
int xc = sizeX - sizeX/2; // rotation centre
int yc = sizeY - sizeY/2;
int newx = ((float)i-xc)*cos(deg) - ((float)j-yc)*sin(deg) + xc; // source pixel for this destination pixel (backward mapping)
int newy = ((float)i-xc)*sin(deg) + ((float)j-yc)*cos(deg) + yc;
if (newx >= 0 && newx < sizeX && newy >= 0 && newy < sizeY)
{
putPixVal(Destination, sizeX, i , j, readPixVal(Source, sizeX, newx, newy));
}
}
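This version does backward mapping with nearest-neighbour sampling, so the bilinear weighting from the answer above can be dropped in if smoother output is needed. Note also that, despite the parameter name, cos and sin take radians, so deg must be passed in radians. A minimal host-side launch might look like this (a sketch; d_src and d_dst are hypothetical device buffers of sizeX * sizeY floats):
// One thread per destination pixel.
dim3 block(16, 16);
dim3 grid((sizeX + block.x - 1) / block.x, (sizeY + block.y - 1) / block.y);
Rotate<<<grid, block>>>(d_src, d_dst, sizeX, sizeY, deg_in_radians);
cudaDeviceSynchronize();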