libVPX convert VPX_IMG_FMT_I420 -> RGB - image

How can I convert a VPX_IMG_FMT_I420 vpx_image_t to RGB?
Example code:
int DecodeFrame()
vpx_codec_iter_t iter = NULL;
vpx_image_t *img = NULL;
size_t frame_size = 0;
const unsigned char *frame = vpx_video_reader_get_frame(reader, &frame_size);
if (vpx_codec_decode(&codec, frame, (unsigned int)frame_size, NULL, 0)) return 0;
img = vpx_codec_get_frame(&codec, &iter);
// here I need to convert img to RGB array
return 1;

Code from uTox:
/*
 * Convert a planar YUV 4:2:0 picture to packed 32-bit pixels.
 *
 * width, height   picture size in pixels
 * y, u, v         start of the luma and the two chroma planes
 * ystride,
 * ustride,
 * vstride         distance in bytes between consecutive rows of each plane
 * out             destination, width * height * 4 bytes, rows tightly packed;
 *                 each pixel is written as B, G, R, 0xFF
 *
 * One chroma sample is shared by each 2x2 group of luma samples. The
 * arithmetic is 8.8 fixed point (coefficients are scaled by 256, +128 rounds).
 */
void yuv420tobgr(uint16_t width, uint16_t height,
                 const uint8_t *y, const uint8_t *u, const uint8_t *v,
                 unsigned int ystride,
                 unsigned int ustride,
                 unsigned int vstride,
                 uint8_t *out)
{
    unsigned long int i, j;
    for (i = 0; i < height; ++i) {
        for (j = 0; j < width; ++j) {
            uint8_t *point = out + 4 * ((i * width) + j);

            int t_y = y[((i * ystride) + j)];
            int t_u = u[(((i / 2) * ustride) + (j / 2))];
            int t_v = v[(((i / 2) * vstride) + (j / 2))];

            // Luma below 16 is treated as black so (t_y - 16) never goes negative.
            t_y = t_y < 16 ? 16 : t_y;

            int r = (298 * (t_y - 16) + 409 * (t_v - 128) + 128) >> 8;
            int g = (298 * (t_y - 16) - 100 * (t_u - 128) - 208 * (t_v - 128) + 128) >> 8;
            int b = (298 * (t_y - 16) + 516 * (t_u - 128) + 128) >> 8;

            // Saturate to 0..255 and store in B, G, R, A order.
            point[2] = (uint8_t)(r > 255 ? 255 : r < 0 ? 0 : r);
            point[1] = (uint8_t)(g > 255 ? 255 : g < 0 ? 0 : g);
            point[0] = (uint8_t)(b > 255 ? 255 : b < 0 ? 0 : b);
            point[3] = 0xFF;
        }
    }
}


Digital Image Processing Contrast Stretching Histogram

Here I attach the code I use to convert a gray image into a contrast-stretched image and to draw the histogram of the result. I used 122 as the low point and 244 as the high point. In the output histogram, the height of the histogram is reduced.
I cannot find the error in my code
#include "opencv2/opencv.hpp"
#include "opencv2/highgui.hpp"
#include "opencv2/core.hpp"
using namespace cv;
using namespace std;
// Loads the image named by argv[1], converts it to grayscale, applies a
// piecewise-linear contrast stretch, then prints, draws, saves and shows the
// histogram of the stretched image.
// NOTE(review): the closing braces of this snippet are missing and several
// expressions below have lost tokens; the comments describe what is visible.
int main(int argc, char* argv[]) {
Mat img = imread(argv[1], 1);
// NOTE(review): the condition is incomplete; presumably it tested whether the
// image failed to load (e.g. `!img.data`) — confirm.
if (! {
cout << "Could not find the image!" << endl;
return -1;
int height = img.rows;
int width = img.cols;
int widthstep = img.step;
int ch = img.channels();
printf("Height : %d\n", height);
printf("Width : %d\n", width);
printf("Widthstep : %d\n", widthstep);
printf("No of channels : %d\n", ch);
Mat gray_image(height, width, CV_8UC1, Scalar(0));
cvtColor(img, gray_image, COLOR_BGR2GRAY);
Mat new_image = gray_image.clone();
// NOTE(review): this outer `v` is shadowed by the `int v` declared inside the loop.
int v;
int output{};
// Contrast stretch, one pixel at a time.
for (int y = 0; y < height; y++) {
for (int x = 0; x < width; x++) {
// NOTE(review): the object before `<uchar>(y, x)` is missing; presumably
// `gray_image.at` — confirm.
int v = (int)<uchar>(y, x);
if (v >= 0 && v <= 122) {
// NOTE(review): (6 / 122) is integer division and evaluates to 0, so this
// branch always yields 0.
output = int((6 / 122) * v);
// NOTE(review): lower bound is 100 here although the previous branch already
// covers values up to 122.
else if (v > 100 && v <= 244) {
// NOTE(review): (244) / (122) is integer division (exactly 2).
output = int(((244) / (122)) * (v - 122) + 6);
else if (v > 244 && v <= 255) {
// NOTE(review): (5) / (11) is integer division and evaluates to 0, so this
// branch always yields 250.
output = int(((5) / (11)) * (v - 244) + 250);
// NOTE(review): the assignment target is missing; presumably
// `new_image.at<uchar>(y, x)` — confirm.
}<uchar>(y, x) = (uchar)output;
// Histogram bins, one per 8-bit intensity, cleared before counting.
int histn[256];
for (int i = 0; i < 256; i++) {
histn[i] = 0;
// Count how many pixels have each intensity.
for (int y = 0; y < height; y++) {
for (int x = 0; x < width; x++) {
// NOTE(review): image object missing before `<uchar>`; presumably new_image.
histn[(int)<uchar>(y, x)] = histn[(int)<uchar>(y, x)] + 1;
// Print the raw counts.
for (int i = 0; i < 256; i++) {
cout << i << ":" << histn[i] << endl;
// Histogram canvas: 512 x 400, white background, 2-pixel-wide bins.
int hist_wn = 512;
int hist_hn = 400;
int bin_wn = cvRound((double)hist_wn / 256);
Mat new_histogramImage(hist_hn, hist_wn, CV_8UC1, Scalar(255));
// Find the largest bin so bar heights can be scaled to the canvas height.
int maxn = histn[0];
for (int i = 0; i < 256; i++) {
if (maxn < histn[i]) {
maxn = histn[i];
// Rescale every count to 0..rows; the result is truncated back to int.
for (int i = 0; i < 256; i++) {
histn[i] = ((double)histn[i] / maxn) * new_histogramImage.rows;
// Draw one black vertical line per bin, growing upward from the bottom edge.
for (int i = 0; i < 256; i++) {
line(new_histogramImage, Point(bin_wn * (i), hist_hn), Point(bin_wn * (i), hist_hn - histn[i]), Scalar(0), 1, 8, 0);
imwrite("Gray_Image.png", gray_image);
imwrite("newcontrast_Image.png", new_image);
imwrite("Histogram.png", new_histogramImage);
imshow("Image", img);
imshow("Gray_Image", gray_image);
imshow("newcontrast_Image", new_image);
imshow("New_Histogram", new_histogramImage);
// NOTE(review): `histImage` is not declared anywhere in this snippet; the code
// that built the histogram of the original image appears to be missing.
imshow("Old_Histogram", histImage);
return 0;
Here are the new and old histograms that I got as outputs
I found the solution to the question. I changed the lowest and highest point values to 100 and 240, and wrote the constants as decimal (floating-point) values so that the slopes are no longer truncated by integer division.
// Corrected contrast stretch: three linear segments with break points at 100
// and 240. The floating-point literals keep the slopes from being truncated.
// NOTE(review): closing braces are missing in this snippet.
for (int y = 0; y < height; y++) {
for (int x = 0; x < width; x++) {
// NOTE(review): the image object before `<uchar>(y, x)` is missing;
// presumably `gray_image.at` — confirm.
int v = (int)<uchar>(y, x);
// 0..100 is compressed into 0..5.
if (v >= 0 && v <= 100) {
output = int((5.0/ 100.0) * v);
// 100..240 is expanded, starting at 5 with slope 245/140.
else if (v > 100 && v <= 240) {
output = int(((245.0) / (140.0)) * (v - 100.0) + 5.0);
// 240..255 is compressed into 250..255.
else if (v > 240 && v <= 255) {
output = int(((5.0) / (15.0)) * (v - 240.0) + 250.0);
// NOTE(review): the assignment target is missing; presumably
// `new_image.at<uchar>(y, x)` — confirm.
}<uchar>(y, x) = (uchar)output;

Is there a way to avoid CUDA atomicAdd in my situation?

I'm doing an operation as the figure below.
Here is my kernel.
As shown in the figure, I make a small matrix using about one million vectors and accumulate it in a large prepared matrix.
I need an idea that can improve performance without exceeding 8 GB of GPU global memory.
How can I avoid atomic operations? I use the GTX1080. Existing kernels take about 250ms.
/*
 * Accumulate the contribution of each masked-in source entry into A and b.
 * One thread processes one entry of `src`.
 *
 * src       entries providing mask, fid0, fid1, w, c, J0[24] and J1[24]
 * num       number of entries in src
 * st        value subtracted from fid0 / fid1 to get a window-relative index
 * mw_width  window width; A is addressed with a row length of 6 * (mw_width + 1)
 * A, b      accumulators updated with atomicAdd, since different entries can
 *           target the same elements
 *
 * For each entry four 24x24 blocks of A are updated (w * J0 J0^T, w * J1 J1^T,
 * w * J0 J1^T and its transpose), along with two 24-element slices of b.
 * Each product is computed and added directly, so no per-thread 24x24 scratch
 * arrays are needed; the values added are the same as before.
 */
__global__ void buildMatrixKernel(const CostJacobianCT *src, const int num, const int st, const int mw_width, double *A, double *b)
{
    const int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx >= num || src[idx].mask != 1)
        return;

    const CostJacobianCT &e = src[idx];

    // matrix width
    const int cols = 6 * (mw_width + 1);

    // Position of the 24-wide blocks. The last two window indices are pulled
    // back to mw_width - 3 so that pos + 24 does not exceed cols.
    int idx0 = e.fid0 - st;
    if (idx0 == mw_width - 2)
        idx0 = idx0 - 1;
    else if (idx0 == mw_width - 1)
        idx0 = idx0 - 2;

    int idx1 = e.fid1 - st;
    if (idx1 == mw_width - 2)
        idx1 = idx1 - 1;
    else if (idx1 == mw_width - 1)
        idx1 = idx1 - 2;

    const int pos0 = idx0 * 6;
    const int pos1 = idx1 * 6;

    for (int i = 0; i < 24; i++)
    {
        for (int j = 0; j < 24; j++)
        {
            const double a00 = e.w * e.J0[i] * e.J0[j];
            const double a11 = e.w * e.J1[i] * e.J1[j];
            const double a01 = e.w * e.J0[i] * e.J1[j];
            const double a10 = e.w * e.J0[j] * e.J1[i]; // transpose of the 01 block

            atomicAdd(&A[(i + pos0) * cols + (j + pos0)], a00); // 00
            atomicAdd(&A[(i + pos1) * cols + (j + pos1)], a11); // 11
            atomicAdd(&A[(i + pos0) * cols + (j + pos1)], a01); // 01
            atomicAdd(&A[(i + pos1) * cols + (j + pos0)], a10); // 10
        }

        const double b0 = e.w * e.c * e.J0[i];
        const double b1 = e.w * e.c * e.J1[i];
        atomicAdd(&b[i + pos0], b0); // 0
        atomicAdd(&b[i + pos1], b1); // 1
    }
}
I modified the code below to see some performance improvements.
250ms -> 95ms
/*
 * Finer-grained variant: one thread per (entry, i, j) element instead of one
 * thread per entry. Thread `idx` handles source entry idx / 576 and element
 * (i, j) = ((idx % 576) / 24, (idx % 576) % 24) of its 24x24 blocks.
 *
 * pos0 / pos1 are read from the entry, so they must be precomputed.
 *
 * NOTE(review): `idx < num` bounds the thread index, not the entry index, so
 * `num` presumably has to be (number of entries) * 576 here — confirm at the
 * launch site.
 */
__global__ void buildMatrixKernel(const CostJacobianCT *src, const int num, const int st, const int mw_width, double *A, double *b)
{
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx < num)
    {
        int src_idx = idx / 576;
        if (src[src_idx].mask == 1)
        {
            int cols = 6 * (mw_width + 1);
            int pos0 = src[src_idx].pos0;
            int pos1 = src[src_idx].pos1;
            double w = src[src_idx].w;
            double c = src[src_idx].c;

            int sub_idx = idx % 576;
            int i = sub_idx / 24;
            int j = sub_idx % 24;

            double J0_i = src[src_idx].J0[i];
            double J0_j = src[src_idx].J0[j];
            double J1_i = src[src_idx].J1[i];
            double J1_j = src[src_idx].J1[j];

            atomicAdd(&A[(i + pos0) * cols + (j + pos0)], w * J0_i * J0_j); // 00
            atomicAdd(&A[(i + pos1) * cols + (j + pos1)], w * J1_i * J1_j); // 11
            atomicAdd(&A[(i + pos0) * cols + (j + pos1)], w * J0_i * J1_j); // 01
            atomicAdd(&A[(i + pos1) * cols + (j + pos0)], w * J1_i * J0_j); // 10

            // Only the first thread of each row updates b, so every b element
            // receives its contribution exactly once per entry.
            if (j == 0)
            {
                atomicAdd(&b[i + pos0], w * c * J0_i); // 0
                atomicAdd(&b[i + pos1], w * c * J1_i); // 1
            }
        }
    }
}

C++ convert 32 bit bmp image to 24 bit bmp and 16 bit bmp

I am trying to convert a 32-bit B8G8R8A8 image to a 24-bit R8G8B8 image and a 16-bit R5G5B5 image.
But result is very strange, maybe I do not understand how to convert image properly. How to do it properly and fix colors?
Input image:
After Convert32to16():
After Convert32to24():
// stdafx.h : include file for standard system include files,
// or project specific include files that are used frequently, but
// are changed infrequently
#pragma once
#include "targetver.h"
#include <stdio.h>
#include <tchar.h>
#include <iostream>
#include <fstream>
#include <windows.h>
// TODO: reference additional headers your program requires here
#include "stdafx.h"
using std::cout;
using std::endl;
using std::ofstream;
using std::ifstream;
void Convert32to24(void* B8G8R8A8, BYTE* R8G8B8, int width, int height)
long B8G8R8A8Size = (width * height * 4);
long j = 0;
for (long i = 0; i < (B8G8R8A8Size - 3); i = i + 4)
BYTE Red = ((PBYTE)B8G8R8A8)[i + 2];
BYTE Green = ((PBYTE)B8G8R8A8)[i + 1];
BYTE Blue = ((PBYTE)B8G8R8A8)[i];
BYTE Alpha = ((PBYTE)B8G8R8A8)[i + 3];
R8G8B8[j] = Red;
R8G8B8[j + 1] = Green;
R8G8B8[j + 2] = Blue;
j = j + 3;
void Convert32to16(void* B8G8R8A8, BYTE* R5G5B5, int width, int height)
long B8G8R8A8Size = (width * height * 4);
long j = 0;
for (long i = 0; i < (B8G8R8A8Size - 3); i = i + 4)
BYTE Red = ((PBYTE)B8G8R8A8)[i + 2] >> 3;
BYTE Green = ((PBYTE)B8G8R8A8)[i + 1] >> 3;
BYTE Blue = ((PBYTE)B8G8R8A8)[i] >> 3;
BYTE Alpha = ((PBYTE)B8G8R8A8)[i + 3];
uint16_t RGB565 = ((Red >> 3) << 11) | ((Green >> 2) << 5) | (Blue >> 3);
R5G5B5[j] = RGB565 >> 8;
R5G5B5[j + 1] = RGB565 & 0xFF;
j = j + 2;
void WriteDataToBmp(const WCHAR *filename, void *imageData, int width, int height, int BitCount, int bytesPerPixel)
DWORD bytesWritten;
fileInfo.biSize = sizeof(BITMAPINFOHEADER);
fileInfo.biBitCount = BitCount;
fileInfo.biCompression = BI_RGB;
fileInfo.biWidth = width;
fileInfo.biHeight = 0 - height;
fileInfo.biPlanes = 1;
fileInfo.biSizeImage = (width * height * bytesPerPixel);
fileHeader.bfSize = sizeof(BITMAPFILEHEADER) + sizeof(BITMAPINFOHEADER) + fileInfo.biSizeImage;
fileHeader.bfType = 'MB';
fileHeader.bfOffBits = sizeof(BITMAPFILEHEADER) + sizeof(BITMAPINFOHEADER);
hdl = CreateFile(filename, GENERIC_WRITE, 0, NULL, CREATE_ALWAYS, 0, NULL);
WriteFile(hdl, &fileHeader, sizeof(fileHeader), &bytesWritten, NULL);
WriteFile(hdl, &fileInfo, sizeof(fileInfo), &bytesWritten, NULL);
WriteFile(hdl, imageData, fileInfo.biSizeImage, &bytesWritten, NULL);
// Read a little-endian signed 32-bit value from an unaligned byte position.
static int ReadInt32LE(const unsigned char* p)
{
    const unsigned int u = static_cast<unsigned int>(p[0])
                         | (static_cast<unsigned int>(p[1]) << 8)
                         | (static_cast<unsigned int>(p[2]) << 16)
                         | (static_cast<unsigned int>(p[3]) << 24);
    return static_cast<int>(u);
}

/*
 * Load the pixel data of a 32-bit .bmp file.
 *
 * The file is assumed to have a 54-byte header followed directly by
 * 4 * |width| * |height| bytes of pixels. Width and height are read from
 * header offsets 18 and 22.
 *
 * Returns a buffer allocated with new[] (the caller must delete[] it), or
 * NULL when the file cannot be opened or the header cannot be read. If the
 * file holds fewer pixel bytes than the header implies, the remainder of the
 * buffer is zero.
 */
unsigned char* ReadDataFromBmp(char* filename)
{
    FILE* f = fopen(filename, "rb");
    if (f == NULL)
        return NULL;

    unsigned char info[54];
    if (fread(info, sizeof(unsigned char), sizeof(info), f) != sizeof(info))
    {
        fclose(f);
        return NULL;
    }

    // Decode the fields byte by byte: offsets 18 and 22 are not int-aligned.
    const long long width = ReadInt32LE(&info[18]);
    const long long height = ReadInt32LE(&info[22]);
    // Height is negative for top-down bitmaps, so work with magnitudes.
    const unsigned long long absWidth = static_cast<unsigned long long>(width < 0 ? -width : width);
    const unsigned long long absHeight = static_cast<unsigned long long>(height < 0 ? -height : height);
    const size_t size = static_cast<size_t>(4ULL * absWidth * absHeight);

    unsigned char* data = new unsigned char[size]();
    fread(data, sizeof(unsigned char), size, f);
    fclose(f);
    return data;
}
int main(int args, char** cat) {
int width = 1440;
int height = 900;
int bytesOnPixel;
BYTE *OutputImage24Bit = new BYTE[width * height * 3];
BYTE *OutputImage16Bit = new BYTE[width * height * 2];
unsigned char* inputImage32Bit = ReadDataFromBmp((char*)"E:/TestImage.bmp");
bytesOnPixel = 2;
Convert32to16(inputImage32Bit, OutputImage16Bit, width, height);
WriteDataToBmp(L"E:/TestImage16bit.bmp", OutputImage16Bit, width, height, 8 * bytesOnPixel, bytesOnPixel);
bytesOnPixel = 3;
Convert32to24(inputImage32Bit, OutputImage24Bit, width, height);
WriteDataToBmp(L"E:/TestImage24bit.bmp", OutputImage24Bit, width, height, 8 * bytesOnPixel, bytesOnPixel);
return 1;
fileInfo.biCompression = BI_RGB;
16-bit bitmap uses BI_BITFIELDS compression. In addition 16-bit bitmap has to populate the color table to show if it is using 555 format, 565 format, or a different format.
24-bit and 16-bit bitmap need padding. Albeit, that's not an issue if the width in bytes is a multiple of 4. In general you cannot read/write pixel after pixel because the padding can throw everything off. Instead make 2 loops to go through the height and width. Pixel size would also depend on padding.
Note that you can do the same with GDI+ or WIC. You can change the bitmap to different formats: PixelFormat16bppRGB555, PixelFormat16bppRGB565, PixelFormat16bppARGB1555, PixelFormat24bppRGB...
GDI+ example:
int main()
Gdiplus::GdiplusStartupInput tmp;
ULONG_PTR token;
Gdiplus::GdiplusStartup(&token, &tmp, NULL);
auto *source = Gdiplus::Bitmap::FromFile(L"test.bmp");
auto *destination = source->Clone(0, 0, source->GetWidth(), source->GetHeight(),
CLSID clsid_bmp;
CLSIDFromString(L"{557cf400-1a04-11d3-9a73-0000f81ef32e}", &clsid_bmp);
destination->Save(L"copy.bmp", &clsid_bmp);
delete destination;
delete source;
return 0;
Home made version: (using std::vector for memory, instead of new/delete)
void Convert32to24(const wchar_t* file, std::vector<BYTE> &src, int width, int height)
int width_in_bytes_32 = width * 4;
int width_in_bytes_24 = ((width * 24 + 31) / 32) * 4;
DWORD size = width_in_bytes_24 * height;
std::vector<BYTE> dst(size);
for(int h = 0; h < height; h++)
for(int w = 0; w < width; w++)
int i = h * width_in_bytes_32 + w * 4;
int j = h * width_in_bytes_24 + w * 3;
dst[j + 0] = src[i + 0];
dst[j + 1] = src[i + 1];
dst[j + 2] = src[i + 2];
BITMAPFILEHEADER bf = { 'MB', 54 + size, 0, 0, 54 };
BITMAPINFOHEADER bi = { sizeof(bi), width, height, 1, 24, BI_RGB };
std::ofstream fout(file, std::ios::binary);
fout.write((char*)&bf, sizeof(bf));
fout.write((char*)&bi, sizeof(bi));
fout.write((char*)&dst[0], size);
void Convert32to16(const wchar_t* file, std::vector<BYTE> &src, int width, int height)
int width_in_bytes_32 = width * 4;
int width_in_bytes_16 = ((width * 16 + 31) / 32) * 4;
DWORD size = width_in_bytes_16 * height;
std::vector<BYTE> dst(size);
for(int h = 0; h < height; h++)
for(int w = 0; w < width; w++)
int i = h * width_in_bytes_32 + w * 4;
int j = h * width_in_bytes_16 + w * 2;
//555 format, each color is from 0 to 32, instead of 0 to 256
uint16_t blu = (uint16_t)(src[i + 0] * 31.f / 255.f);
uint16_t grn = (uint16_t)(src[i + 1] * 31.f / 255.f);
uint16_t red = (uint16_t)(src[i + 2] * 31.f / 255.f);
uint16_t sum = (red) | (grn << 5) | (blu << 10);
memcpy(&dst[j], &sum, 2);
BITMAPFILEHEADER bf = { 'MB', 54 + size, 0, 0, 54 };
BITMAPINFOHEADER bi = { sizeof(bi), width, height, 1, 16, BI_BITFIELDS };
std::ofstream fout(file, std::ios::binary);
fout.write((char*)&bf, sizeof(bf));
fout.write((char*)&bi, sizeof(bi));
//555 format
COLORREF color[]{
0b0000001111100000,//31 << 5
0b0111110000000000 //31 << 10
fout.write((char*)&color, sizeof(color));
fout.write((char*)&dst[0], size);
int main()
const wchar_t* file_32 = L"E:\\TestImage.bmp";
const wchar_t* file_16 = L"E:\\OutputImage16Bit.bmp";
const wchar_t* file_24 = L"E:\\OutputImage24Bit.bmp";
std::ifstream fin(file_32, std::ios::binary);
return 0;*)&bh, sizeof(bh));*)&bi, sizeof(bi));
if(bi.biBitCount != 32)
return 0;
std::vector<BYTE> source(bh.bfSize);*)&source[0], bh.bfSize);
Convert32to16(file_16, source, bi.biWidth, bi.biHeight);
Convert32to24(file_24, source, bi.biWidth, bi.biHeight);
return 0;

square Matrix transpose with CUDA

I'm trying to write the matrix transpose algorithm. I test this program with matrix size equal to 1024, the result shows that not all elements are in the right places.
Why isn't my array transposing correctly? Can anyone help me or give me a hint? I would appreciate it. Thanks a lot!
Here is the whole code:
/*
 * Naive transpose: out[x][y] = in[y][x], one thread per element.
 *
 * out  destination, h elements per row
 * in   source, w elements per row
 * w    width of the input (number of columns)
 * h    height of the input (number of rows)
 *
 * Valid indices are 0..w-1 and 0..h-1, so the guard uses `<`. With `<=`,
 * threads at x == w or y == h touched elements that belong to another row or
 * lie outside the buffers.
 */
__global__ void transpose_naive (float *out, float *in, int w, int h )
{
    unsigned int xIdx = blockDim.x * blockIdx.x + threadIdx.x;
    unsigned int yIdx = blockDim.y * blockIdx.y + threadIdx.y;

    if ( xIdx < w && yIdx < h ) {
        unsigned int idx_in = xIdx + w * yIdx;
        unsigned int idx_out = yIdx + h * xIdx;
        out[idx_out] = in[idx_in];
    }
}
/*
 * Transpose a 1024 x 1024 float matrix on the GPU and print the kernel time.
 * Uses one 32 x 32 thread block per tile; every CUDA call goes through
 * checkCuda.
 */
int main()
{
    int nx=1024;
    int mem_size = nx*nx*sizeof(float);
    int t=32;
    // Round the grid up so it covers the whole matrix.
    dim3 dimGrid(((nx-1)/t) +1, ((nx-1)/t) +1);
    dim3 dimBlock(t,t);

    float *h_idata = (float*)malloc(mem_size);
    float *h_cdata = (float*)malloc(mem_size);
    float *d_idata, *d_cdata;
    checkCuda(cudaMalloc(&d_idata, mem_size) );
    checkCuda(cudaMalloc(&d_cdata, mem_size) );

    // host: element (j, i) holds its own linear index
    for (int j = 0; j < nx; j++)
        for (int i = 0; i < nx; i++)
            h_idata[j*nx + i] = j*nx + i;

    // device
    checkCuda(cudaMemcpy(d_idata, h_idata, mem_size, cudaMemcpyHostToDevice) );

    // events for timing
    cudaEvent_t startEvent, stopEvent;
    checkCuda(cudaEventCreate(&startEvent) );
    checkCuda(cudaEventCreate(&stopEvent) );
    float ms;

    checkCuda( cudaEventRecord(startEvent, 0) );
    transpose_naive<<<dimGrid, dimBlock>>>(d_cdata, d_idata,nx,nx);
    // A launch does not return a status itself; fetch any launch error here.
    checkCuda(cudaGetLastError() );
    checkCuda(cudaEventRecord(stopEvent, 0) );
    checkCuda(cudaEventSynchronize(stopEvent) );
    checkCuda(cudaEventElapsedTime(&ms, startEvent, stopEvent) );
    checkCuda( cudaMemcpy(h_cdata, d_cdata, mem_size, cudaMemcpyDeviceToHost) );
    printf("the time %5f ", ms);

    // cleanup
    checkCuda(cudaEventDestroy(startEvent) );
    checkCuda(cudaEventDestroy(stopEvent) );
    checkCuda( cudaFree(d_cdata) );
    checkCuda( cudaFree(d_idata) );
    free(h_cdata);
    free(h_idata);
    return 0;
}
I think there is something wrong with file output "i.txt" and "t.txt" otherwise the program looks to be correct. I have made some minor changes in your code by adding error checking and printing on the standard output stream. I am printing the last (1020 - 1024) 3 x 3 matrix to cross check the transpose. Run it on your system and verify whether the matrix transpose is correct or not?
#include "cuda_runtime.h"
#include <stdio.h>
#include <stdlib.h>
#include "device_launch_parameters.h"
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
/*
 * Report a failed CUDA call. Used through the gpuErrchk macro, which supplies
 * the file and line of the call site.
 *
 * code   status returned by the CUDA call
 * file,
 * line   location printed in the message
 * abort  when true (the default), exit with `code` as the exit status
 */
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code),file, line);
        if (abort) exit(code);
    }
}
/*
 * Naive transpose: out[x][y] = in[y][x], one thread per element.
 *
 * out  destination, h elements per row
 * in   source, w elements per row
 * w    width of the input (number of columns)
 * h    height of the input (number of rows)
 *
 * Valid indices are 0..w-1 and 0..h-1, so the guard must be strict (`<`).
 */
__global__ void transpose_naive(float *out, float *in, int w, int h)
{
    unsigned int xIdx = blockDim.x * blockIdx.x + threadIdx.x;
    unsigned int yIdx = blockDim.y * blockIdx.y + threadIdx.y;

    if (xIdx < w && yIdx < h) {
        unsigned int idx_in = xIdx + w * yIdx;
        unsigned int idx_out = yIdx + h * xIdx;
        out[idx_out] = in[idx_in];
    }
}
/*
 * Transpose a 1024 x 1024 float matrix on the GPU, print the kernel time, and
 * print the bottom-right 4 x 4 corner of the input and of the result so the
 * transpose can be checked by eye.
 *
 * The host-to-device copy, event creation, event synchronisation and
 * device-to-host copy are all required: without them the kernel runs on
 * uninitialised data, the timing is invalid and h_cdata is never filled.
 */
int main()
{
    int nx = 1024;
    int mem_size = nx*nx*sizeof(float);
    int t = 32;
    dim3 dimGrid(((nx - 1) / t) + 1, (((nx - 1) / t) + 1));
    dim3 dimBlock(t, t);

    float *h_idata = (float*)malloc(mem_size);
    float *h_cdata = (float*)malloc(mem_size);
    float *d_idata, *d_cdata;
    gpuErrchk(cudaMalloc(&d_idata, mem_size));
    gpuErrchk(cudaMalloc(&d_cdata, mem_size));

    // host
    for (int j = 0; j < nx; j++)
        for (int i = 0; i < nx; i++)
            h_idata[j*nx + i] = j*nx + i;

    // device
    gpuErrchk(cudaMemcpy(d_idata, h_idata, mem_size, cudaMemcpyHostToDevice));

    // events for timing
    cudaEvent_t startEvent, stopEvent;
    gpuErrchk(cudaEventCreate(&startEvent));
    gpuErrchk(cudaEventCreate(&stopEvent));
    float ms;

    gpuErrchk(cudaEventRecord(startEvent, 0));
    transpose_naive << <dimGrid, dimBlock >> >(d_cdata, d_idata, nx, nx);
    // A launch does not return a status itself; fetch any launch error here.
    gpuErrchk(cudaGetLastError());
    gpuErrchk(cudaEventRecord(stopEvent, 0));
    gpuErrchk(cudaEventSynchronize(stopEvent));
    gpuErrchk(cudaEventElapsedTime(&ms, startEvent, stopEvent));
    gpuErrchk(cudaMemcpy(h_cdata, d_cdata, mem_size, cudaMemcpyDeviceToHost));
    printf("the time %5f ", ms);

    // Bottom-right corner of the input ...
    for (int i = 1020; i < 1024; i++) {
        for (int j = 1020; j < 1024; j++) {
            printf("%.2f ", h_idata[i*nx + j]);
        }
        printf("\n");
    }
    // ... and of the transposed output.
    for (int i = 1020; i < 1024; i++) {
        for (int j = 1020; j < 1024; j++) {
            printf("%.2f ", h_cdata[i*nx + j]);
        }
        printf("\n");
    }
    //savetofile(h_idata, "i.txt", nx, nx);
    //savetofile(h_cdata, "t.txt", nx, nx);

    // cleanup
    gpuErrchk(cudaEventDestroy(startEvent));
    gpuErrchk(cudaEventDestroy(stopEvent));
    gpuErrchk(cudaFree(d_cdata));
    gpuErrchk(cudaFree(d_idata));
    free(h_cdata);
    free(h_idata);
    return 0;
}
The only flaw in the code is the incorrect bound checks in the following line of the kernel.
if ( xIdx <=w && yIdx <=h ) {
As the indices are from 0 to w-1 and 0 to h-1 for x and y dimensions respectively, the if condition should be as follows:
if ( xIdx <w && yIdx <h ) {

opencl- image object clamp to edge

I have two different kernels for my sobel operator.
One uses buffer object and the other one uses image object.
In my opinion, these two kernels should produce the same result, but they do not.
Both kernels are meant to handle the image borders by clamping to the edge.
Where is the problem?
Code with buffer object
/*
 * Sobel gradient magnitude on a buffer image, one work-item per pixel.
 *
 * ucGRAY     8-bit grayscale input, im_width elements per row
 * sobel      output gradient magnitude, im_width elements per row
 * grad_max   running maximum of the output, updated through AtomicMax
 * im_width,
 * im_height  image size in pixels
 *
 * Neighbour coordinates are clamped to the image, so border pixels reuse the
 * edge row / column (clamp to edge). The 3x3 neighbourhood is numbered
 *     1 2 3
 *     4 5 6
 *     7 8 9
 * and each gradient component is divided by 1020 (= 4 * 255).
 */
__kernel void sobel_filter(__global uchar *ucGRAY, __global float *sobel, __global float *grad_max, int im_width, int im_height)
{
    float2 xt;
    int i = get_global_id(0);
    int j = get_global_id(1);

    // ii_n, jj_n = (i, j) - 1 and ii_p, jj_p = (i, j) + 1, clamped to the image.
    // All four are always assigned, and the row index is derived from j.
    int ii_n = max(i - 1, 0);
    int ii_p = min(i + 1, im_width - 1);
    int jj_n = max(j - 1, 0);
    int jj_p = min(j + 1, im_height - 1);

    xt.x = (float)(ucGRAY[(jj_n)* im_width + (ii_p)]            // 3
                 + ucGRAY[j * im_width + (ii_p)] * 2            // 6
                 + ucGRAY[(jj_p) * im_width + (ii_p)]           // 9
                 - ucGRAY[(jj_n)* im_width + (ii_n)]            // 1
                 - ucGRAY[j * im_width + (ii_n)] * 2            // 4
                 - ucGRAY[(jj_p)* im_width + (ii_n)]) / 1020;   // 7

    xt.y = (float)(ucGRAY[(jj_p)* im_width + (ii_n)]            // 7
                 + ucGRAY[(jj_p)* im_width + (i)] * 2           // 8
                 + ucGRAY[(jj_p)* im_width + (ii_p)]            // 9
                 - ucGRAY[(jj_n)* im_width + (ii_n)]            // 1
                 - ucGRAY[(jj_n)* im_width + (i)] * 2           // 2
                 - ucGRAY[(jj_n)* im_width + (ii_p)]) / 1020;   // 3

    // The row stride of the output is the image width, for the write and the read.
    const int idx = j * im_width + i;
    sobel[idx] = length(xt);
    AtomicMax(grad_max, sobel[idx]);
}
Code with image object
// Sampler for the image kernel. The addressing mode is given explicitly so
// that reads outside the image return the nearest edge pixel.
const sampler_t smp = CLK_NORMALIZED_COORDS_FALSE | //Natural coordinates
                      CLK_ADDRESS_CLAMP_TO_EDGE |   //Clamp out-of-range reads to the edge
                      CLK_FILTER_NEAREST; //Don't interpolate
/*
 * Sobel gradient magnitude on an image object, one work-item per pixel.
 *
 * ucGRAY    input image; the first channel of each texel is used
 * sobel     output gradient magnitude, Width elements per row
 * grad_max  running maximum of the output, updated through AtomicMax
 * Width,
 * Height    image size in pixels (Height is not used in the body)
 *
 * Neighbours are fetched through the program-scope sampler `smp`, so border
 * behaviour is whatever that sampler's addressing mode specifies. The 3x3
 * neighbourhood is numbered
 *     1 2 3
 *     4 5 6
 *     7 8 9
 * and the centre pixel (5) does not contribute to either gradient.
 */
__kernel void sobel_filter_image(read_only image2d_t ucGRAY,__global float *sobel,__global float *grad_max,int Width, int Height)
{
    int2 coord = (int2)(get_global_id(0), get_global_id(1));
    float2 xt;

    uchar val1=read_imageui(ucGRAY, smp, (int2)(coord.x-1,coord.y-1)).x;
    uchar val2=read_imageui(ucGRAY, smp, (int2)(coord.x,coord.y-1)).x;
    uchar val3=read_imageui(ucGRAY, smp, (int2)(coord.x+1,coord.y-1)).x;
    uchar val4=read_imageui(ucGRAY, smp, (int2)(coord.x-1,coord.y)).x;
    uchar val6=read_imageui(ucGRAY, smp, (int2)(coord.x+1,coord.y)).x;
    uchar val7=read_imageui(ucGRAY, smp, (int2)(coord.x-1,coord.y+1)).x;
    uchar val8=read_imageui(ucGRAY, smp, (int2)(coord.x,coord.y+1)).x;
    uchar val9=read_imageui(ucGRAY, smp, (int2)(coord.x+1,coord.y+1)).x;

    xt.x = (float)(val3 + (val6 * 2) + val9
                 - val1 - (val4 * 2) - val7) / 1020;
    xt.y = (float)(val7 + (val8 * 2) + val9
                 - val1 - (val2 * 2) - val3) / 1020;

    sobel[coord.y * Width + coord.x] = length(xt);// G=sqrt(Gy^2+Gx^2)
    AtomicMax(grad_max,sobel[coord.y * Width + coord.x]);
}
In your buffer version, you have this:
if (j == 0)
jj_n = i;
Presumably that should be:
if (j == 0)
jj_n = j;
