Correct use of sscanf_s - c++11

I'm trying to parse a string into a char and multiple floats in the following format:
v 1.00000 1.00000 1.00000
Originally using sscanf I could do so with:
char b;
float x = 0.0f;
float y = 0.0f;
float z = 0.0f;
int result = sscanf(lineHeader,"%c %f %f %f", &b, &x, &y, &z);
However, I'm trying to eliminate an unrelated problem which requires me to use sscanf_s.
If I simply change the code to:
int result = sscanf_s(lineHeader,"%c %f %f %f", &b, &x, &y, &z);
I get an exception stating an unsigned integer is expected. Does sscanf_s use different formatting to parse strings?

According to the documentation, sscanf_s's %c conversion specifier requires an additional parameter to check for the buffer size:
int result = sscanf_s(lineHeader,"%c %f %f %f", &b, rsize_t{1}, &vertex.x, &vertex.y, &vertex.z);
// It's a single char ^^^^^^^^^^

Related

Cuda matrix addition

I have written the following code to sum two 4x4 matrices in cuda.
#include<stdio.h>
#include<stdlib.h>
#include<math.h>
__global__ void Matrix_add(double* a, double* b, double* c,int n)
{
int row = blockIdx.x * blockDim.x + threadIdx.x;
int col = blockIdx.y * blockDim.y + threadIdx.y;
int index = row * n + col;
if(col<n && row <n)
c[index] = a[index] + b[index];
}
int main()
{
int n=4;
double **h_a;
double **h_b;
double **h_c;
double *d_a, *d_b, *d_c;
int size = n*n*sizeof(double);
h_a = (double **) malloc(n*sizeof(double*));
h_b = (double **) malloc(n*sizeof(double*));
h_c = (double **) malloc(n*sizeof(double*));
cudaMalloc((void**)&d_a,size);
cudaMalloc((void**)&d_b,size);
cudaMalloc((void**)&d_c,size);
int t=0;
for (t=0;t<n;t++)
{
h_a[t]= (double *)malloc(n*sizeof(double));
h_b[t]= (double *)malloc(n*sizeof(double));
h_c[t]= (double *)malloc(n*sizeof(double));
}
int i=0,j=0;
for(i=0;i<n;i++)
{
for(j=0;j<n;j++)
{
h_a[i][j]=sin(i)*sin(i);
h_b[i][j]=cos(i)*cos(i);
}
}
cudaMemcpy(d_a,h_a+n,size,cudaMemcpyHostToDevice);
cudaMemcpy(d_b,h_b+n,size,cudaMemcpyHostToDevice);
dim3 dimBlock(4,4);
dim3 dimGrid(1,1);
Matrix_add<<<dimGrid, dimBlock>>>(d_a,d_b,d_c,n);
cudaMemcpy(h_c+n,d_c,size,cudaMemcpyDeviceToHost);
for(i=0;i<n;i++)
{
for( j=0;j<n;j++)
{
printf("%f",h_c[i][j]);
printf("\t");
}
printf("\n");
}
for(i=0;i<n;i++)
{
free(h_a[i]);
free(h_b[i]);
free(h_c[i]);
}
free(h_a);
free(h_b);
free(h_c);
cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
return 0;
}
Result of this addition should be a 2x2 all-ones matrix but in the result all the elements of matrix are 0. Also I get this message after getting result:
Segmentation fault (core dumped)
Can anyone please help me to find out the problem.
Thank you
Your host arrays (h_a, h_b, h_c) are not contiguous in memory, so your initial cudaMemcpy() calls will read garbage into GPU memory (apparently zeros in your case).
The reason is that your hosts arrays are not actually flat, but instead are represented as arrays of pointers. I guess to fake two-dimensional arrays in C? In any case, you either need to be more careful with your cudaMemcpy()s and copy the host arrays row-by-row, or use a flat representation on the host.

Why is sizeof giving wrong answer

ok I've come across a weirdness, maybe someone can explain it.
Source code is (c++ 11) :
‪#‎include‬ <stdio.h>
struct xyz_ {
float xyz[3];
float &x = xyz[0];
float &y = xyz[1];
float &z = xyz[2];
};
int main(int argc, char *argv[])
{
xyz_ xyz;
xyz.x = 0;
xyz.y = 1;
xyz.z = 2;
xyz.xyz[1] = 1;
printf("as array %f %f %f\n",xyz.xyz[0],xyz.xyz[1],xyz.xyz[2]);
printf("as elements %f %f %f\n",xyz.x,xyz.y,xyz.z);
int sizexyz = sizeof(xyz);
int sizefloat = sizeof(float);
printf("float is %d big, but xyz is %d big\n",sizefloat,sizexyz);
return 0;
}
output is:
as array 0.000000 1.000000 2.000000
as elements 0.000000 1.000000 2.000000
float is 4 big, but xyz is 24 big
So the structure works as I would expect, but the size is twice as large as it should be. Using chars instead of float in the structure gives a segfault when run.
I wanted to use struct xyz_ as either an array of floats or individual float elements.
It is unspecified whether references require storage. In this case your output suggests that your compiler has decided to use storage to implement the references x, y and z.
Suppose you add another constructor:
struct xyz_ {
float xyz[3];
float &x = xyz[0];
float &y = xyz[1];
float &z = xyz[2];
xyz_()
{
}
xyz_(float& a, float& b, float& c)
: x(a), y(b), z(c)
{
}
};
It should be clear that now the three x, y and z members may be bound to the array elements or may be bound to something else.
Looks like what you are looking for is
union P3d {
float xyz[3];
struct {
float x, y, z;
};
};
Unfortunately for some strange reasons (apparently mostly political) this is not supported in the standard (despite compilers do actually support it).
How about this:
struct xyz_
{
float xyz[3];
float &x() {return xyz[0];}
float &y() {return xyz[1];}
float &z() {return xyz[2];}
};
Not as beautiful or elegant, but might reduce the size a bit, though I think the this pointer might occupy additional space, not sure...
Of course you would have to use x(), y() and z().
What would be the size of xyz_ if it's declared like this?
struct xyz_ {
float xyz[3];
float *x = &xyz[0];
float *y = &xyz[1];
float *z = &xyz[2];
};
The reference also needs it's own space to store the information where it is pointing at.
In C you could do the following, but it's not legal in C++11.
union xyz_ {
float xyz[3];
struct { float x, y, z; };
};

Sobel filter in cuda (cant show full image)

I have a classic problem about the output of sobel filter using CUDA.
this is a main class (main.cpp)
/*main class */
int main(int argc, char** argv)
{
IplImage* image_source = cvLoadImage("test.jpg",
CV_LOAD_IMAGE_GRAYSCALE);
IplImage* image_input = cvCreateImage(cvGetSize(image_source),
IPL_DEPTH_8U,image_source->nChannels);
IplImage* image_output = cvCreateImage(cvGetSize(image_source),
IPL_DEPTH_8U,image_source->nChannels);
/* Convert from IplImage tofloat */
cvConvert(image_source,image_input);
unsigned char *h_out = (unsigned char*)image_output->imageData;
unsigned char *h_in = (unsigned char*)image_input->imageData;
width = image_input->width;
height = image_input->height;
widthStep = image_input->widthStep;
sobel_parallel(h_in, h_out, width, height, widthStep);
cvShowImage( "CPU", image_output );
cvReleaseImage( &image_output );
waitKey(0);
}
And this is the CUDA file (kernel_gpu.cu)
__global__ void kernel ( unsigned char *d_in , unsigned char *d_out , int width ,
int height, int widthStep ) {
int col = blockIdx . x * blockDim . x + threadIdx . x ;
int row = blockIdx . y * blockDim . y + threadIdx . y ;
int dx [3][3] = { -1 , 0 , 1 ,
-2 , 0 , 2 ,
-1 , 0 , 1};
int dy [3][3] = {1 ,2 ,1 ,
0 ,0 ,0 ,
-1 , -2 , -1};
int s;
if( col < width && row < height)
{
int i = row;
int j = col;
// apply kernel in X direction
int sum_x=0;
for(int m=-1; m<=1; m++)
for(int n=-1; n<=1; n++)
{
s=d_in[(i+m)*widthStep+j+n]; // get the (i,j) pixel value
sum_x+=s*dx[m+1][n+1];
}
// apply kernel in Y direction
int sum_y=0;
for(int m=-1; m<=1; m++)
for(int n=-1; n<=1; n++)
{
s=d_in[(i+m)*widthStep+j+n]; // get the (i,j) pixel value
sum_y+=s*dy[m+1][n+1];
}
int sum=abs(sum_x)+abs(sum_y);
if (sum>255)
sum=255;
d_out[i*widthStep+j]=sum; // set the (i,j) pixel value
}
}
// Kernel Calling Function
extern "C" void sobel_parallel( unsigned char* h_in, unsigned char* h_out,
int rows, int cols, int widthStep){
unsigned char* d_in;
unsigned char* d_out;
cudaMalloc((void**) &d_in, rows*cols);
cudaMalloc((void**) &d_out, rows*cols);
cudaMemcpy(d_in, h_in, rows*cols*sizeof( unsigned char), cudaMemcpyHostToDevice);
dim3 block (16,16);
dim3 grid ((rows * cols) / 256.0);
kernel<<<grid,block>>>(d_in, d_out, rows, cols, widthStep);
cudaMemcpy(h_out, d_out, rows*cols*sizeof( unsigned char), cudaMemcpyDeviceToHost);
cudaFree(d_in);
cudaFree(d_out);
}
Error :
the result image does not appear in their entirety, only part of the image.
Why is the result(GPU) like this?? (I tried to make CPU computation using the same function and no problem).
You are creating 1 Dimensional grid, while using 2D indexing inside the kernel which will cover only the x direction and only the top 16 rows of the image will be filtered (because the height of the block is 16).
dim3 grid ((rows * cols) / 256.0); //This is incorrect in current case
Consider creating 2 dimensional grid, so that it spans all the rows of the image.
dim3 grid ((cols + 15)/16, (rows + 15)/16);
Check the width and widthStep variables to see if they are actually equal or not because in your sobel_parallel function you are implicitly assuming this (which might not be true since your data is aligned). If this is not true the code
cudaMalloc((void**) &d_in, rows*cols);
will actually allocate less memory than necessary and hence you will only process part of your image. It would be better to use
cudaMalloc((void**) &d_in, rows*widthStep);
And of course adjust the rest of your code as necessary.
You are also calling
void sobel_parallel( unsigned char* h_in, unsigned char* h_out,
int rows, int cols, int widthStep)
with
sobel_parallel(h_in, h_out, width, height, widthStep);
which exchanges rows with cols and this is again exchanged when you are calling your kernel. This will cause a problem when you use the above suggestion.

Atomic max for floats in OpenCL

I need an atomic max function for floats in OpenCL. This is my current naive code using atomic_xchg
float value = data[index];
if ( value > *max_value )
{
atomic_xchg(max_value, value);
}
This code gives the correct result when using an Intel CPU, but not for a Nvidia GPU. Is this code correct, or can anyone help me?
You can do it like this:
//Function to perform the atomic max
inline void AtomicMax(volatile __global float *source, const float operand) {
union {
unsigned int intVal;
float floatVal;
} newVal;
union {
unsigned int intVal;
float floatVal;
} prevVal;
do {
prevVal.floatVal = *source;
newVal.floatVal = max(prevVal.floatVal,operand);
} while (atomic_cmpxchg((volatile __global unsigned int *)source, prevVal.intVal, newVal.intVal) != prevVal.intVal);
}
__kernel mykern(__global float *data, __global float *max_value){
unsigned int index = get_global_id(0);
float value = data[index];
AtomicMax(max_value, value);
}
As stated in LINK.
What it does is create a union of float and int. Perform the math on the float, but compare integers when doing the atomic xchg. As long as the integers match, the operation is completed.
However, the speed decrease due to the use of these methods is very high. Use them carefully.

Manually Converting rgba8 to rgba5551

I need to convert rgba8 to rgba5551 manually. I found some helpful code from another post and want to modify it to convert from rgba8 to rgba5551. I don't really have experience with bitewise stuff and haven't had any luck messing with the code myself.
void* rgba8888_to_rgba4444( void* src, int src_bytes)
{
// compute the actual number of pixel elements in the buffer.
int num_pixels = src_bytes / 4;
unsigned long* psrc = (unsigned long*)src;
unsigned short* pdst = (unsigned short*)src;
// convert every pixel
for(int i = 0; i < num_pixels; i++){
// read a source pixel
unsigned px = psrc[i];
// unpack the source data as 8 bit values
unsigned r = (px << 8) & 0xf000;
unsigned g = (px >> 4) & 0x0f00;
unsigned b = (px >> 16) & 0x00f0;
unsigned a = (px >> 28) & 0x000f;
// and store
pdst[i] = r | g | b | a;
}
return pdst;
}
The value of RGBA5551 is that it has color info condensed into 16 bits - or two bytes, with only one bit for the alpha channel (on or off). RGBA8888, on the other hand, uses a byte for each channel. (If you don't need an alpha channel, I hear RGB565 is better - as humans are more sensitive to green). Now, with 5 bits, you get the numbers 0 through 31, so r, g, and b each need to be converted to some number between 0 and 31, and since they are originally a byte each (0-255), we multiply each by 31/255. Here is a function that takes RGBA bytes as input and outputs RGBA5551 as a short:
short int RGBA8888_to_RGBA5551(unsigned char r, unsigned char g, unsigned char b, unsigned char a){
unsigned char r5 = r*31/255; // All arithmetic is integer arithmetic, and so floating points are truncated. If you want to round to the nearest integer, adjust this code accordingly.
unsigned char g5 = g*31/255;
unsigned char b5 = b*31/255;
unsigned char a1 = (a > 0) ? 1 : 0; // 1 if a is positive, 0 else. You must decide what is sensible.
// Now that we have our 5 bit r, g, and b and our 1 bit a, we need to shift them into place before combining.
short int rShift = (short int)r5 << 11; // (short int)r5 looks like 00000000000vwxyz - 11 zeroes. I'm not sure if you need (short int), but I've wasted time tracking down bugs where I didn't typecast properly before shifting.
short int gShift = (short int)g5 << 6;
short int bShift = (short int)b5 << 1;
// Combine and return
return rShift | gShift | bShift | a1;
}
You can, of course condense this code.

Resources