Atomic max for floats in OpenCL

I need an atomic max function for floats in OpenCL. This is my current naive code using atomic_xchg:
float value = data[index];
if (value > *max_value)
{
    atomic_xchg(max_value, value);
}
This code gives the correct result on an Intel CPU, but not on an Nvidia GPU. Is this code correct? If not, can anyone help me?

You can do it like this:
// Function to perform the atomic max
inline void AtomicMax(volatile __global float *source, const float operand) {
    union {
        unsigned int intVal;
        float floatVal;
    } newVal;
    union {
        unsigned int intVal;
        float floatVal;
    } prevVal;
    do {
        prevVal.floatVal = *source;
        newVal.floatVal = max(prevVal.floatVal, operand);
    } while (atomic_cmpxchg((volatile __global unsigned int *)source, prevVal.intVal, newVal.intVal) != prevVal.intVal);
}
__kernel void mykern(__global float *data, __global float *max_value) {
    unsigned int index = get_global_id(0);
    float value = data[index];
    AtomicMax(max_value, value);
}
As stated in LINK.
What it does is create a union of a float and an unsigned int: the math is performed on the float, while the atomic compare-and-exchange operates on the integer representation. If the integer read before the update still matches the current value, no other thread has modified the location in the meantime and the new maximum is stored; otherwise the loop retries.
However, the slowdown caused by such compare-and-exchange loops is considerable. Use them carefully.
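For readers working in CUDA rather than OpenCL (the related questions below are CUDA), here is a sketch of the same bit-reinterpretation trick using atomicCAS; it illustrates the idea and is not part of the original answer:
__device__ float atomicMaxFloat(float *addr, float value)
{
    // Reinterpret the float bits as an unsigned int so atomicCAS can operate on them.
    unsigned int *addr_as_uint = (unsigned int *)addr;
    unsigned int old = *addr_as_uint, assumed;
    do {
        assumed = old;
        float desired = fmaxf(__uint_as_float(assumed), value);
        old = atomicCAS(addr_as_uint, assumed, __float_as_uint(desired));
    } while (old != assumed); // retry if another thread changed the value in between
    return __uint_as_float(old);
}
As in the OpenCL version, the maximum itself is computed in floating point; only the compare-and-swap works on the integer representation.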

Related

Cuda matrix addition

I have written the following code to sum two 4x4 matrices in CUDA.
#include<stdio.h>
#include<stdlib.h>
#include<math.h>
__global__ void Matrix_add(double* a, double* b, double* c, int n)
{
    int row = blockIdx.x * blockDim.x + threadIdx.x;
    int col = blockIdx.y * blockDim.y + threadIdx.y;
    int index = row * n + col;
    if (col < n && row < n)
        c[index] = a[index] + b[index];
}
int main()
{
    int n = 4;
    double **h_a;
    double **h_b;
    double **h_c;
    double *d_a, *d_b, *d_c;
    int size = n*n*sizeof(double);
    h_a = (double **) malloc(n*sizeof(double*));
    h_b = (double **) malloc(n*sizeof(double*));
    h_c = (double **) malloc(n*sizeof(double*));
    cudaMalloc((void**)&d_a, size);
    cudaMalloc((void**)&d_b, size);
    cudaMalloc((void**)&d_c, size);
    int t = 0;
    for (t = 0; t < n; t++)
    {
        h_a[t] = (double *)malloc(n*sizeof(double));
        h_b[t] = (double *)malloc(n*sizeof(double));
        h_c[t] = (double *)malloc(n*sizeof(double));
    }
    int i = 0, j = 0;
    for (i = 0; i < n; i++)
    {
        for (j = 0; j < n; j++)
        {
            h_a[i][j] = sin(i)*sin(i);
            h_b[i][j] = cos(i)*cos(i);
        }
    }
    cudaMemcpy(d_a, h_a+n, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, h_b+n, size, cudaMemcpyHostToDevice);
    dim3 dimBlock(4,4);
    dim3 dimGrid(1,1);
    Matrix_add<<<dimGrid, dimBlock>>>(d_a, d_b, d_c, n);
    cudaMemcpy(h_c+n, d_c, size, cudaMemcpyDeviceToHost);
    for (i = 0; i < n; i++)
    {
        for (j = 0; j < n; j++)
        {
            printf("%f", h_c[i][j]);
            printf("\t");
        }
        printf("\n");
    }
    for (i = 0; i < n; i++)
    {
        free(h_a[i]);
        free(h_b[i]);
        free(h_c[i]);
    }
    free(h_a);
    free(h_b);
    free(h_c);
    cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
    return 0;
}
The result of this addition should be a 4x4 all-ones matrix, but in the output all the elements of the matrix are 0. I also get this message after the result is printed:
Segmentation fault (core dumped)
Can anyone please help me find the problem?
Thank you
Your host arrays (h_a, h_b, h_c) are not contiguous in memory, so your initial cudaMemcpy() calls read garbage into GPU memory (apparently zeros in your case).
The reason is that your host arrays are not actually flat; they are arrays of pointers, presumably to fake two-dimensional arrays in C. In any case, you either need to be more careful with your cudaMemcpy() calls and copy the host arrays row by row, or use a flat representation on the host.
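As a sketch of the flat-representation fix (this assumes the Matrix_add kernel from the question is compiled in the same file; the single cudaMemcpy per matrix is the point of the change):
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
int main()
{
    int n = 4;
    int size = n*n*sizeof(double);
    // Flat, contiguous host buffers indexed as row * n + col.
    double *h_a = (double *)malloc(size);
    double *h_b = (double *)malloc(size);
    double *h_c = (double *)malloc(size);
    for (int i = 0; i < n; i++)
        for (int j = 0; j < n; j++)
        {
            h_a[i*n + j] = sin(i)*sin(i);
            h_b[i*n + j] = cos(i)*cos(i);
        }
    double *d_a, *d_b, *d_c;
    cudaMalloc((void**)&d_a, size);
    cudaMalloc((void**)&d_b, size);
    cudaMalloc((void**)&d_c, size);
    // Because the host data is contiguous, one copy per matrix is enough.
    cudaMemcpy(d_a, h_a, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, h_b, size, cudaMemcpyHostToDevice);
    dim3 dimBlock(4, 4);
    dim3 dimGrid(1, 1);
    Matrix_add<<<dimGrid, dimBlock>>>(d_a, d_b, d_c, n);
    cudaMemcpy(h_c, d_c, size, cudaMemcpyDeviceToHost);
    for (int i = 0; i < n; i++)
    {
        for (int j = 0; j < n; j++)
            printf("%f\t", h_c[i*n + j]);
        printf("\n");
    }
    free(h_a); free(h_b); free(h_c);
    cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
    return 0;
}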

Why is sizeof giving the wrong answer

OK, I've come across a weirdness; maybe someone can explain it.
The source code (C++11) is:
#include <stdio.h>
struct xyz_ {
    float xyz[3];
    float &x = xyz[0];
    float &y = xyz[1];
    float &z = xyz[2];
};
int main(int argc, char *argv[])
{
    xyz_ xyz;
    xyz.x = 0;
    xyz.y = 1;
    xyz.z = 2;
    xyz.xyz[1] = 1;
    printf("as array %f %f %f\n", xyz.xyz[0], xyz.xyz[1], xyz.xyz[2]);
    printf("as elements %f %f %f\n", xyz.x, xyz.y, xyz.z);
    int sizexyz = sizeof(xyz);
    int sizefloat = sizeof(float);
    printf("float is %d big, but xyz is %d big\n", sizefloat, sizexyz);
    return 0;
}
output is:
as array 0.000000 1.000000 2.000000
as elements 0.000000 1.000000 2.000000
float is 4 big, but xyz is 24 big
So the structure works as I would expect, but the size is twice as large as it should be. Using chars instead of floats in the structure gives a segfault when run.
I wanted to use struct xyz_ either as an array of floats or as individual float elements.
It is unspecified whether references require storage. In this case your output suggests that your compiler has decided to use storage to implement the references x, y and z.
Suppose you add another constructor:
struct xyz_ {
    float xyz[3];
    float &x = xyz[0];
    float &y = xyz[1];
    float &z = xyz[2];
    xyz_()
    {
    }
    xyz_(float& a, float& b, float& c)
        : x(a), y(b), z(c)
    {
    }
};
It should be clear that now the three x, y and z members may be bound to the array elements or may be bound to something else.
It looks like what you are looking for is
union P3d {
    float xyz[3];
    struct {
        float x, y, z;
    };
};
Unfortunately, for some rather strange (apparently mostly political) reasons, anonymous struct members are not supported by the C++ standard, even though compilers do in fact support them.
How about this:
struct xyz_
{
    float xyz[3];
    float &x() { return xyz[0]; }
    float &y() { return xyz[1]; }
    float &z() { return xyz[2]; }
};
Not as beautiful or elegant, but it does reduce the size: non-virtual member functions take up no space in the object itself.
Of course, you would have to write x(), y() and z().
What would be the size of xyz_ if it's declared like this?
struct xyz_ {
    float xyz[3];
    float *x = &xyz[0];
    float *y = &xyz[1];
    float *z = &xyz[2];
};
A reference also needs its own space to store the information about what it refers to, just like a pointer.
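A quick way to compare the layouts is to print their sizes (a sketch; the struct names here are illustrative and the exact numbers depend on the platform and ABI, but on a typical 64-bit system the reference and pointer versions grow by three pointer-sized members plus padding, while the member-function version stays at sizeof(float[3])):
#include <cstdio>
struct with_refs {
    float xyz[3];
    float &x = xyz[0];
    float &y = xyz[1];
    float &z = xyz[2];
};
struct with_ptrs {
    float xyz[3];
    float *x = &xyz[0];
    float *y = &xyz[1];
    float *z = &xyz[2];
};
struct with_funcs {
    float xyz[3];
    float &x() { return xyz[0]; }
    float &y() { return xyz[1]; }
    float &z() { return xyz[2]; }
};
int main()
{
    printf("with_refs  %zu\n", sizeof(with_refs));
    printf("with_ptrs  %zu\n", sizeof(with_ptrs));
    printf("with_funcs %zu\n", sizeof(with_funcs));
    return 0;
}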
In C you could do the following, but it's not legal in C++11.
union xyz_ {
    float xyz[3];
    struct { float x, y, z; };
};

Conditional reduction in CUDA

I need to sum about 100000 values stored in an array, but with conditions.
Is there a way to do that in CUDA and produce fast results?
Can anyone post a small piece of code to do that?
I think that, to perform a conditional reduction, you can directly introduce the condition as a multiplication of each addend by 0 (condition false) or 1 (condition true). In other words, suppose that the condition you would like to meet is that the addends be smaller than 10.f. Borrowing the first kernel from Optimizing Parallel Reduction in CUDA by M. Harris, the above would become
__global__ void reduce0(int *g_idata, int *g_odata) {
    extern __shared__ int sdata[];
    // each thread loads one element from global to shared mem
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
    sdata[tid] = g_idata[i]*(g_idata[i] < 10.f);
    __syncthreads();
    // do reduction in shared mem
    for (unsigned int s = 1; s < blockDim.x; s *= 2) {
        if (tid % (2*s) == 0) {
            sdata[tid] += sdata[tid + s];
        }
        __syncthreads();
    }
    // write result for this block to global mem
    if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}
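Note that reduce0 uses dynamically sized shared memory, so the launch needs the third launch-configuration argument. A minimal launch sketch (the wrapper name and block size are assumptions, not from the original answer; the kernel above has no bounds check, so N is assumed to be a multiple of the block size):
void launch_reduce0(int *d_idata, int *d_odata, int N)
{
    const int blockSize = 256;                            // arbitrary choice
    const int gridSize = (N + blockSize - 1) / blockSize;
    // Third launch parameter = bytes of dynamic shared memory per block.
    reduce0<<<gridSize, blockSize, blockSize * sizeof(int)>>>(d_idata, d_odata);
    // d_odata now holds one partial sum per block; the gridSize partial results
    // still need a final reduction (a second kernel call or a host-side sum).
}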
If you wish to use CUDA Thrust to perform a conditional reduction, you can do so with thrust::transform_reduce. Alternatively, you can create a new vector d_b containing all the elements of d_a satisfying the predicate by thrust::copy_if, and then apply thrust::reduce to d_b. I haven't checked which solution performs best; perhaps the second will perform better on sparse arrays. Below is an example implementing both approaches.
#include <cstdio>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/reduce.h>
#include <thrust/count.h>
#include <thrust/copy.h>
// --- Operator for the first approach
struct conditional_operator {
    __host__ __device__ float operator()(const float a) const {
        return a*(a < 10.f);
    }
};
// --- Operator for the second approach
struct is_smaller_than_10 {
    __host__ __device__ bool operator()(const float a) const {
        return (a < 10.f);
    }
};
int main(void)
{
    int N = 20;
    // --- Host side allocation and vector initialization
    thrust::host_vector<float> h_a(N, 1.f);
    h_a[0] = 20.f;
    h_a[1] = 20.f;
    // --- Device side allocation and vector initialization
    thrust::device_vector<float> d_a(h_a);
    // --- First approach
    float sum = thrust::transform_reduce(d_a.begin(), d_a.end(), conditional_operator(), 0.f, thrust::plus<float>());
    printf("Result = %f\n", sum);
    // --- Second approach
    int N_prime = thrust::count_if(d_a.begin(), d_a.end(), is_smaller_than_10());
    thrust::device_vector<float> d_b(N_prime);
    thrust::copy_if(d_a.begin(), d_a.begin() + N, d_b.begin(), is_smaller_than_10());
    sum = thrust::reduce(d_b.begin(), d_b.begin() + N_prime, 0.f);
    printf("Result = %f\n", sum);
    getchar();
    return 0;
}
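With the initialization above (twenty entries equal to 1.f, two of which are replaced by 20.f and therefore excluded by the predicate), both approaches should print Result = 18.000000.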

C++ Tuples and Readability

I think this is more of a philosophical question about readability and tupled types in C++11.
I am writing some code to produce Gaussian Mixture Models (the details are kind of irrelevant, but it serves as a nice example). My code is below:
GMM.hpp
#pragma once
#include <opencv2/opencv.hpp>
#include <vector>
#include <tuple>
#include "../Util/Types.hpp"
namespace LocalDescriptorAndBagOfFeature
{
    // Weighted gaussian is defined as a (weight, mean vector, covariance matrix)
    typedef std::tuple<double, cv::Mat, cv::Mat> WeightedGaussian;
    class GMM
    {
    public:
        GMM(int numGaussians);
        void Train(const FeatureSet &featureSet);
        std::vector<double> Supervector(const BagOfFeatures &bof);
        int NumGaussians(void) const;
        double operator ()(const cv::Mat &x) const;
    private:
        static double ComputeWeightedGaussian(const cv::Mat &x, WeightedGaussian wg);
        std::vector<WeightedGaussian> _Gaussians;
        int _NumGaussians;
    };
}
GMM.cpp
using namespace LocalDescriptorAndBagOfFeature;
double GMM::ComputeWeightedGaussian(const cv::Mat &x, WeightedGaussian wg)
{
    double weight;
    cv::Mat mean, covariance;
    std::tie(weight, mean, covariance) = wg;
    cv::Mat precision;
    cv::invert(covariance, precision);
    double detp = cv::determinant(precision);
    double outter = std::sqrt(detp / 2.0 * M_PI);
    cv::Mat meanDist = x - mean;
    cv::Mat meanDistTrans;
    cv::transpose(meanDist, meanDistTrans);
    cv::Mat symmetricProduct = meanDistTrans * precision * meanDist; // This is a "1x1" matrix e.g. a scalar value
    double inner = symmetricProduct.at<double>(0,0) / -2.0;
    return weight * outter * std::exp(inner);
}
double GMM::operator ()(const cv::Mat &x) const
{
    return std::accumulate(_Gaussians.begin(), _Gaussians.end(), 0, [&x](double val, WeightedGaussian wg) { return val + ComputeWeightedGaussian(x, wg); });
}
In this case, am I gaining anything (clarity, readability, speed, ...) by using a tuple representation for the weighted Gaussian distribution over using a struct, or even a class with its own operator()?
You're reducing the size of your source code a little bit, but I'd argue that you're reducing its overall readability and type safety. Specifically, if you defined:
struct WeightedGaussian {
    double weight;
    cv::Mat mean, covariance;
};
then you wouldn't have a chance of writing the incorrect
std::tie(weight, covariance, mean) = wg;
and you'd guarantee that your users would write wg.mean instead of std::get<1>(wg). The biggest downside is that std::tuple comes with definitions of operator< and operator==, while you have to implement them yourself for a custom struct:
bool operator<(const WeightedGaussian& lhs, const WeightedGaussian& rhs) {
    return std::tie(lhs.weight, lhs.mean, lhs.covariance) <
           std::tie(rhs.weight, rhs.mean, rhs.covariance);
}

Optimizing CUDA interpolation

I have developed the following interpolation with CUDA and I am looking for a way to improve it. For various reasons, I don't want to use CUDA textures.
The other point I have noticed, for some unknown reason, is that the interpolation is not performed on the whole vector if the size of the vector is greater than the number of threads (for example, with a vector of size 1000 and 512 threads): each thread does its first job and that's all. I would like to optimize the singleInterp function.
Here is my code:
__device__ float singleInterp(float* data, float x, int lx_data) {
    float res = 0;
    int i1 = 0;
    int j = lx_data;
    int imid;
    while (j > i1+1)
    {
        imid = (int)(i1+j+1)/2;
        if (data[imid] < x)
            i1 = imid;
        else
            j = imid;
    }
    if (i1 == j)
        res = data[i1+lx_data];
    else
        res = __fmaf_rn(__fdividef(data[j+lx_data]-data[i1+lx_data], (data[j]-data[i1])), x-data[i1], data[i1+lx_data]);
    return res;
}
Kernel:
__global__ void linearInterpolation(float* data, float* x_in, int lx_data) {
    int i = threadIdx.x + blockDim.x * blockIdx.x;
    int index = i;
    if (index < lx_data)
        x_in[index] = singleInterp(data, x_in[index], lx_data);
}
It seems that you are interested in 1D linear interpolation. I have already faced the problem of optimizing this kind of interpolation, and I ended up with the following code:
__global__ void linear_interpolation_kernel_function_GPU(double* __restrict__ result_d, const double* __restrict__ data_d, const double* __restrict__ x_out_d, const int M, const int N)
{
    int j = threadIdx.x + blockDim.x * blockIdx.x;
    if (j < N)
    {
        double reg_x_out = x_out_d[j/2] + M/2;
        int k = floor(reg_x_out);
        double a = (reg_x_out) - floor(reg_x_out);
        double dk = data_d[2*k+(j&1)];
        double dkp1 = data_d[2*k+2+(j&1)];
        result_d[j] = a * dkp1 + (-dk * a + dk);
    }
}
The data are assumed to be sampled at integer nodes between -M/2 and M/2.
The code is "equivalent" to 1D texture interpolation, as explained at the following web page. For 1D linear texture interpolation, see Fig. 13 of the CUDA Programming Guide. For comparisons between different solutions, please see the following thread.
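A minimal host-side launch sketch for the kernel above (the wrapper name and the block size of 256 are assumptions, not from the original answer); note that the grid is sized to cover all N outputs, which also avoids the partial-coverage problem described in the question:
void launch_interpolation(double *result_d, const double *data_d, const double *x_out_d, int M, int N)
{
    const int blockSize = 256;                            // arbitrary choice
    const int gridSize = (N + blockSize - 1) / blockSize; // enough blocks for all N outputs
    linear_interpolation_kernel_function_GPU<<<gridSize, blockSize>>>(result_d, data_d, x_out_d, M, N);
    cudaDeviceSynchronize();
}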
