Sorting arrays of structures in CUDA - sorting

I have a laptop with an NVIDIA GT750M 4Gb (compute capability 3.0) graphics card. I need to sort an array of structures on CUDA (about 3 × 10^7 elements). But I cannot figure out how, since I do not have enough experience in CUDA. When using thrust::sort I get strange results (it takes a few tens of minutes, while std::sort takes 1 minute).
struct MyStruct
{
float key;
float a;
float b;
int c;
int d;
int e;
int f;
bool flag;
}
bool minCompare(const MyStruct lhs, const MyStruct rhs)
{
return lhs.key < rhs.key;
}

As Robert Crovella has pointed out in his comment, tents of minutes most likely means that you are doing something wrong. I'm providing an example below in which I compare the performance of sorting an Array of Structures (AoS) and a Structure of Arrays (SoA) using thrust::sort and thrust::sort_by_key. I'm running on a laptop GeForce GT 540M and compiling with CUDA 5.5, so you have a more powerful card than mine. For 100000 elements the execution time is of the order of seconds in both cases. As I pointed out in my comment, the first case is more demanding in terms of computation time (1675ms) than the second (668.9ms).
#include <thrust\device_vector.h>
#include <thrust\sort.h>
struct MyStruct1
{
int key;
int value1;
int value2;
};
struct MyStruct2
{
int N;
int* key;
int* value1;
int* value2;
MyStruct2(int N_) {
N = N_;
cudaMalloc((void**)&key,N*sizeof(int));
cudaMalloc((void**)&value1,N*sizeof(int));
cudaMalloc((void**)&value2,N*sizeof(int));
}
};
__host__ __device__ bool operator<(const MyStruct1 &lhs, const MyStruct1 &rhs) { return (lhs.key < rhs.key); };
void main(void)
{
const int N = 10000;
float time;
cudaEvent_t start, stop;
/*******************************/
/* SORTING ARRAY OF STRUCTURES */
/*******************************/
thrust::host_vector<MyStruct1> h_struct1(N);
for (int i = 0; i<N; i++)
{
MyStruct1 s;
s.key = rand()*255;
s.value1 = rand()*255;
s.value2 = rand()*255;
h_struct1[i] = s;
}
thrust::device_vector<MyStruct1> d_struct(h_struct1);
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
thrust::sort(d_struct.begin(), d_struct.end());
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
printf("Sorting array of structs - elapsed time: %3.1f ms \n", time);
h_struct1 = d_struct;
//for (int i = 0; i<N; i++)
//{
// MyStruct1 s = h_struct1[i];
// printf("key %i value1 %i value2 %i\n",s.key,s.value1,s.value2);
//}
//printf("\n\n");
/*******************************/
/* SORTING STRUCTURES OF ARRAYS*/
/*******************************/
MyStruct2 d_struct2(N);
thrust::host_vector<int> h_temp_key(N);
thrust::host_vector<int> h_temp_value1(N);
thrust::host_vector<int> h_temp_value2(N);
//for (int i = 0; i<N; i++)
//{
// h_temp_key[i] = rand()*255;
// h_temp_value1[i] = rand()*255;
// h_temp_value2[i] = rand()*255;
// printf("Original data - key %i value1 %i value2 %i\n",h_temp_key[i],h_temp_value1[i],h_temp_value2[i]);
//}
//printf("\n\n");
cudaMemcpy(d_struct2.key,h_temp_key.data(),N*sizeof(int),cudaMemcpyHostToDevice);
cudaMemcpy(d_struct2.value1,h_temp_value1.data(),N*sizeof(int),cudaMemcpyHostToDevice);
cudaMemcpy(d_struct2.value2,h_temp_value2.data(),N*sizeof(int),cudaMemcpyHostToDevice);
// wrap raw pointers with device pointers
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
thrust::device_ptr<int> dev_ptr_key = thrust::device_pointer_cast(d_struct2.key);
thrust::device_ptr<int> dev_ptr_value1 = thrust::device_pointer_cast(d_struct2.value1);
thrust::device_ptr<int> dev_ptr_value2 = thrust::device_pointer_cast(d_struct2.value2);
thrust::device_vector<int> d_indices(N);
thrust::sequence(d_indices.begin(), d_indices.end(), 0, 1);
// first sort the keys and indices by the keys
thrust::sort_by_key(dev_ptr_key, dev_ptr_key + N, d_indices.begin());
// Now reorder the ID arrays using the sorted indices
thrust::gather(d_indices.begin(), d_indices.end(), dev_ptr_value1, dev_ptr_value1);
thrust::gather(d_indices.begin(), d_indices.end(), dev_ptr_value2, dev_ptr_value2);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
printf("Sorting struct of arrays - elapsed time: %3.1f ms \n", time);
cudaMemcpy(h_temp_key.data(),d_struct2.key,N*sizeof(int),cudaMemcpyDeviceToHost);
cudaMemcpy(h_temp_value1.data(),d_struct2.value1,N*sizeof(int),cudaMemcpyDeviceToHost);
cudaMemcpy(h_temp_value2.data(),d_struct2.value2,N*sizeof(int),cudaMemcpyDeviceToHost);
//for (int i = 0; i<N; i++) printf("Ordered data - key %i value1 %i value2 %i\n",h_temp_key[i],h_temp_value1[i],h_temp_value2[i]);
//printf("\n\n");
getchar();
}
For the sake of simplicity, I have skipped adding proper CUDA error check in the sense of What is the canonical way to check for errors using the CUDA runtime API?.

Just in case anyone's still looking, if you decide to use Thrust libraries, you can make a zip iterator of tuples from the structure of arrays that you want to sort, and pass this iterator to thrust::sort_by_key() like so:
void sort() {
const int N = 6;
int *keys = new int[N]; // = { 1, 4, 2, 8, 5, 7};
char *vals = new char[N]; // = {'a', 'b', 'c', 'd', 'e', 'f'};
int *addr = new int[N]; // = { 1, 2, 3, 4, 5, 6};
keys[0]=1; keys[1]=4; keys[2]=2; keys[3]=8; keys[4]=5; keys[5]=7;
vals[0]='a'; vals[1]='b'; vals[2]='c'; vals[3]='d'; vals[4]='e'; vals[5]='f';
addr[0]=1; addr[1]=2; addr[2]=3; addr[3]=4; addr[4]=5; addr[5]=6;
int *d_keys, *d_addr;
char *d_vals;
CUDA_SAFE(cudaMalloc((void **)&d_keys, N * sizeof(int)));
CUDA_SAFE(cudaMalloc((void **)&d_addr, N * sizeof(int)));
CUDA_SAFE(cudaMalloc((void **)&d_vals, N * sizeof(char)));
CUDA_SAFE(cudaMemcpy(d_keys, keys, N * sizeof(int), cudaMemcpyHostToDevice));
CUDA_SAFE(cudaMemcpy(d_vals, vals, N * sizeof(char), cudaMemcpyHostToDevice));
CUDA_SAFE(cudaMemcpy(d_addr, addr, N * sizeof(int), cudaMemcpyHostToDevice));
auto it = thrust::make_zip_iterator(thrust::make_tuple(d_vals, d_addr));
thrust::sort_by_key(thrust::device, d_keys, d_keys+N, it);
CUDA_SAFE(cudaMemcpy(keys, d_keys, N * sizeof(int), cudaMemcpyDeviceToHost));
CUDA_SAFE(cudaMemcpy(vals, d_vals, N * sizeof(char), cudaMemcpyDeviceToHost));
CUDA_SAFE(cudaMemcpy(addr, d_addr, N * sizeof(int), cudaMemcpyDeviceToHost));
printf("Keys: "); for (int i = 0; i < N; i++) printf("%d ", keys[i]); printf("\n");
printf("Vals: "); for (int i = 0; i < N; i++) printf("%c ", vals[i]); printf("\n");
printf("Addr: "); for (int i = 0; i < N; i++) printf("%d ", addr[i]); printf("\n");
}
I have excluded the headers and checks for brevity.

Related

CUDA - Parallel Reduction Sum of Even and Odd Number Separately

I am trying to implement a parallel reduction sum of even and odd number Separately in CUDA.
I'm new in CUDA programming and I'm trying so hard but I can't find a solution.
I have for example the array : [5, 8, 0, -6, 2]. And the result need to be [4, 5] (Even : 8+0-6+2=4, Odd : 5=5).
But the result of my following code is [8, 5].
I think that my problem is in the notion of "shared" but I do not understand why.
__global__ void sumEvenOdd(int *a, int *b, int N){
int column = blockIdx.x * blockIdx.x + threadIdx.x;
__shared__ int s_data[2];
if (column < N){
if (a[column] % 2 == 0){
s_data[0] += a[column];
}
else{
s_data[1] += a[column];
}
__syncthreads();
b[0] = s_data[0];
b[1] = s_data[1];
}
}
void initArray(int *a, int N){
for (unsigned int i = 0; i < N; i++){
a[i] = rand() % 100;
}
}
void verify_result(int *a, int *b, int N){
int *verify_b;
verify_b = (int*)malloc(2 * sizeof(int));
verify_b[0] = 0;
verify_b[1] = 0;
for (unsigned int i = 0; i < N; i++){
if (a[i] % 2 == 0){
verify_b[0] += a[i];
}
else{
verify_b[1] += a[i];
}
}
for (unsigned int i = 0; i < 2; i++){
assert(verify_b[i] == b[i]);
}
}
void printResult(int *a, int *b, int N){
printf("\n");
for (unsigned int i = 0; i < N; i++){
printf("%d, ", a[i]);
}
printf("\n");
for (unsigned int i = 0; i < 2; i++){
printf("%d, ", b[i]);
}
}
int main(){
//Array sizes;
int N = 5;
//Size (in bytes) of matrix
size_t bytes = N * sizeof(int);
//Host pointers
int *a, *b;
// Allocate host memory
a = (int*)malloc(bytes);
b = (int*)malloc(2 * sizeof(int));
// Initialize array
initArray(a, N);
// Device pointers
int *d_a, *d_b;
// Allocated device memory
cudaMalloc(&d_a, bytes);
cudaMalloc(&d_b, 2 * sizeof(int));
// Copy data to the device
cudaMemcpy(d_a, a, bytes, cudaMemcpyHostToDevice);
//Number of threads
int THREADS = 128;
//Number of blocks
int BLOCKS = (N + THREADS - 1) / THREADS;
// Launch kernel
sumEvenOdd<<<BLOCKS, THREADS>>>(d_a, d_b, N);
cudaDeviceSynchronize();
// Copy back to the host
cudaMemcpy(b, d_b, 2 * sizeof(int), cudaMemcpyDeviceToHost);
// Check result
verify_result(a, b, N);
printResult(a, b, N);
return 0;
}
you cannot just use
s_data[1] += a[column];
remember all units are going to execute this line at the same time, and store in the same position, so all threads are storing into s_data at the same time.
instead you should use atomic add
atomicAdd(&s_data[1], a[column]);
and you should also be initializing s_data to zeros.

Array by value multiplication memory leak in c++

I'm having some trouble with multiplying an array (char array in this particular case) by a value.
My code looks like this:
char* tab1 = copy("11");
char t = '2';
int length = strlen(tab1) + 2;
char*result = populate('0', length);
int p_length = strlen(tab1);
for (int j = p_length - 1; j >= 0; j--) {
char* tmp = multiply_chars(tab1[j], t);
v_shove(tmp, j);
char* tmp2 = add_tables(result, tmp);
delete[] result;
result = tmp2;
delete[] tmp;
}
cout << result << endl;
delete[] result;
delete[] tab1;
None of the methods used (that's populate, multiply_chars and add_tables) causes a leak when ran in an infinite loop. I've narrowed the leak to the
char* tmp2 = add_tables(result, tmp);
delete[] result;
result = tmp2;
part, but have no idea why it would happen.
I check for leaks by running snippets in an infinite loop and checking memory usage.
Any help would be appreciated! If need be I'll post the code of the methods used, but decided not to for the sake of brevity here. They all return new cstrings. Also, the t2 variable is there from when I was checking the array by array multiplication, which also leaked - decided to do array by value multiplication first.
(Now, to be completely honest this is one of the methods required for a school project, but it's such a miniscule part of it, that I thought it wouldn't hurt if I asked - the teacher isn't really big on helping with particular code problems)
The functions are:
char * add_tables(const char * table1, const char * table2)
{
char* tmp1 = get_string_trailing("0",table1);
char* tmp2 = get_string_trailing("0", table2);
int l1 = strlen(tmp1), l2 = strlen(tmp2);
if (l1 != l2) {
if (l1 > l2) {
char* t = resize_string(tmp2, l1 - l2, '0');
delete[] tmp2;
tmp2 = t;
}
else {
char* t = resize_string(tmp1, l2 - l1, '0');
delete[] tmp1;
tmp1 = t;
}
}
int length = strlen(tmp1) + 2;
char*result = new char[length];
result[length - 1] = 0;
int buffer = 0;
for (int i = length - 2; i > 0; i--) {
int t = buffer + (tmp1[i-1]-'0') + (tmp2[i-1]-'0');
result[i] = (t% 10)+'0';
buffer = (t - (t % 10))/10;
}
result[0] = buffer + '0';
char* t = get_string_trailing("0", result);
delete[]result;
result = t;
delete[]tmp1;
delete[]tmp2;
return result;
}
void v_shove(char *&c, int i)
{
char* tmp = shove(c, i);
delete[] c;
c = tmp;
}
char * populate(const char populator, int length)
{
char* result = new char[length + 1];
result[length] = 0;
for (int i = 0; i < length; i++) {
result[i] = populator;
}
return result;
}
char * multiply_chars(const char c1,const char c2)
{
char*result = new char[3];
result[2] = 0;
char tmp1 = c1 - '0', tmp2 = c2 - '0';
result[1] = (tmp1*tmp2 % 10) + '0';
result[0] = (tmp1*tmp2 - (tmp1*tmp2 % 10)) / 10 + '0';
char* r = get_string_trailing("0", result);
delete[] result;
result = r;
return result;
}
int get_length_trailing(const char * ignore,const char * table)
{
int length = 0;
int i = 0;
bool flag = true;
while (i < strlen(table)) {
if (flag) {
for (int j = 0; j < strlen(ignore); j++)
if (table[i] == ignore[j])goto BREAKPOINT;
}
flag = false;
length++;
BREAKPOINT:i++;
}
return length;
}
char * get_string_trailing(const char * ignore,const char * table)
{
int result_length = get_length_trailing(ignore, table);
char* result = new char[result_length + 1];
int counter = 0;
int i = 0;
bool flag = true;
while (i < strlen(table)) {
if (flag)
for (int j = 0; j < strlen(ignore); j++)
if (table[i] == ignore[j])goto BREAKPOINT;
flag = false;
result[counter] = table[i];
counter++;
BREAKPOINT:i++;
}
result[result_length] = 0;
if (result_length == 0) return copy("0");
return result;
}
char * shove(const char * table1, int index)
{
char*result = "0";
int length = strlen(table1) + index + 1;
result = new char[length];
result[length - 1] = 0;
if (index > 0) {
for (int i = 0; i < strlen(table1); i++)
result[i] = table1[i];
for (int i = 0; i < index; i++)
result[strlen(table1) + i] = '0';
}
else {
for (int i = 0; i < strlen(result); i++)
result[i] = table1[i];
}
char* t = get_string_trailing("0", result);
delete[] result;
result = t;
return result;
}
There is at least a memory leak in get_string_trailing: if result_length is zero, you return a copy and do not delete result. There are also confusions between "string" (such as "0") and 'char': with double quotes, the terminating string character (\0) is automatically appended to the string, while simple quotes only define a character. So "0" is made of 2 char in memory and can not be stored in a pointer (undefined behavior, overwriting memory).
To summarize: here you are writing C, not learning C++. If you have to deal with C strings (you are writing a low-level pilot in C or your professor still doesn't understand that C and C++ are different languages), at least use the functions of the string.h (in C) / cstring (in C++) header to minimize the chance of memory leak or undefined behavior. If you do not have to use C strings, use std::string and the string manipulation tools of the standard library. Your work will be much easier, and your code much less vulnerable to bugs:
#include <string>
#include <iostream>
using namespace std;
int main()
{
string tab1("11")
string t("2") // never use the single quotes for a string
cout << stoi(tab1) * stoi(t) << endl;
return;
}
That's it!

error when use multiple mpi_bcast

I have a problem with 3 mpi_bcast and one mpi_scatter, my program don't work well ,mpi_scatter don't work and globalparcsr don't scatter between nodes. when i delete second and third mpi_bcast ,mpi_scatter work well. I want broadcast a and globalindividual and globalfitness and then scatter globalparcsr, part of my code as bellow:
int malloc2dint(int ***array, int n, int m) {
/* allocate the n*m contiguous items */
int *p = (int *)malloc(n*m * sizeof(int));
if (!p) return -1;
/* allocate the row pointers into the memory */
(*array) = (int **)malloc(n * sizeof(int*));
if (!(*array)) {
free(p);
return -1;
}
/* set up the pointers into the contiguous memory */
for (int i = 0; i<n; i++)
(*array)[i] = &(p[i*m]);
return 0;
}
int main(int argc, char *argv[]) {
int size, rank, divided_pop_size, sum = 0, root = 0, procgridsize, sum2 = 0,generation=0;
int **globalindividual, **localindividual;
int *globalfitness, *localfitness;
int *globalparcsr, *localparcsr;
int **recbuf;
int *sendcounts, *parsendcount; //specifying the number of elements to send to each processor
int *displs, *pardispls; //Entry i specifies the displacement
MPI_Status status;
int offset, rows;
MPI_Init(&argc, &argv);
MPI_Comm_size(MPI_COMM_WORLD, &size);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
divided_pop_size = n_initial_pop / size;
if (rank == root)
{
malloc2dint(&globalindividual, n_initial_pop, num_vertices);
read_graph();
globalfitness = (int*)malloc(n_initial_pop * sizeof(int));
globalparcsr = (int*)malloc(n_initial_pop * sizeof(int));
globalindividual = initial_population(globalindividual, n_initial_pop);
for (int i = 0; i < n_initial_pop; i++) {
printf("\n");
for (int j = 0; j < num_vertices; j++)
printf("%d", globalindividual[i][j]);
}
}
for (int p = 0; p < size; p++) {
if (rank == p) {
malloc2dint(&localindividual,n_initial_pop + 2, num_vertices);
localindividual = initial_population(localindividual, divided_pop_size + 2);
}
}
MPI_Bcast(&a[0][0], 5000 * 5000, MPI_INT, 0, MPI_COMM_WORLD);
MPI_Barrier(MPI_COMM_WORLD);
MPI_Bcast(&globalindividual[0][0], n_initial_pop*num_vertices, MPI_INT, 0, MPI_COMM_WORLD);
MPI_Bcast(&globalfitness, n_initial_pop, MPI_INT, 0, MPI_COMM_WORLD);
MPI_Finalize();
return 0;
}
a is a 2d array and globalindividual is a 2d array with 12 rows and 8 columns and globalfitness is 1d array with size 12
please help me.

FIR filter in CUDA (as a 1D convolution)

I'm trying to implement a FIR (Finite Impulse Response) filter in CUDA. My approach is quite simple and looks somewhat like this:
#include <cuda.h>
__global__ void filterData(const float *d_data,
const float *d_numerator,
float *d_filteredData,
const int numeratorLength,
const int filteredDataLength)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
float sum = 0.0f;
if (i < filteredDataLength)
{
for (int j = 0; j < numeratorLength; j++)
{
// The first (numeratorLength-1) elements contain the filter state
sum += d_numerator[j] * d_data[i + numeratorLength - j - 1];
}
}
d_filteredData[i] = sum;
}
int main(void)
{
// (Skipping error checks to make code more readable)
int dataLength = 18042;
int filteredDataLength = 16384;
int numeratorLength= 1659;
// Pointers to data, filtered data and filter coefficients
// (Skipping how these are read into the arrays)
float *h_data = new float[dataLength];
float *h_filteredData = new float[filteredDataLength];
float *h_filter = new float[numeratorLength];
// Create device pointers
float *d_data = nullptr;
cudaMalloc((void **)&d_data, dataLength * sizeof(float));
float *d_numerator = nullptr;
cudaMalloc((void **)&d_numerator, numeratorLength * sizeof(float));
float *d_filteredData = nullptr;
cudaMalloc((void **)&d_filteredData, filteredDataLength * sizeof(float));
// Copy data to device
cudaMemcpy(d_data, h_data, dataLength * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_numerator, h_numerator, numeratorLength * sizeof(float), cudaMemcpyHostToDevice);
// Launch the kernel
int threadsPerBlock = 256;
int blocksPerGrid = (filteredDataLength + threadsPerBlock - 1) / threadsPerBlock;
filterData<<<blocksPerGrid,threadsPerBlock>>>(d_data, d_numerator, d_filteredData, numeratorLength, filteredDataLength);
// Copy results to host
cudaMemcpy(h_filteredData, d_filteredData, filteredDataLength * sizeof(float), cudaMemcpyDeviceToHost);
// Clean up
cudaFree(d_data);
cudaFree(d_numerator);
cudaFree(d_filteredData);
// Do stuff with h_filteredData...
// Clean up some more
delete [] h_data;
delete [] h_filteredData;
delete [] h_filter;
}
The filter works, but as I'm new to CUDA programming and I'm not sure how to optimize it.
A slight problem that I see is that dataLength, filteredDataLength, and numeratorLength are not known before hand in the application I intend to use the filter in. Also, even though dataLength is a multiple of 32 in the above code, it is not guaranteed to be that in the final application.
When I compare my code above to ArrayFire, my code takes about three times longer to execute.
Does anyone have any ideas on how to speed things up?
EDIT: Have changed all filterLength to numeratorLength.
I can suggest the following to speed up your code:
Use the shared memory: it is a tiny cache-like memory but extremely
faster than the global card memory. You can find more about it by
looking for __shared__ keyword in CUDA documentation. For
example, you can pre-fetch the filter numerators and big chunks
of data in shared memory, this will significantly enhance your
performance. You need to pay extra attention to the data
alignment in this case as it really matters and it can slow down
your code.
Think about unrolling the for-loop of the numerator
sum. You can check the reduce-vector example in CUDA
documentation.
You can also think about parallelizing the
numerator loop itself by itself. This can be done by adding an extra dimension (say 'y') to your thread-block. You will need to make sum a shared vector as well that has the dimension of numeratorLength. You can also check the reduce vector example on how
to quickly take the sum of this vector at the end.
You are attempting at calculating the filter output by directly evaluating the 1D convolution through a CUDA kernel.
In the case when the filter impulse response duration is long, one thing you can do to evaluate the filtered input is performing the calculations directly in the conjugate domain using FFTs. Below I'm reporting a sample code using CUDA Thrust and the cuFFT library. It is a direct translation of the Matlab-based example reported at
Low-Pass Filtering by FFT Convolution
Let me disclaim that some optimizations are possible with this code, but I preferred to leave it as it is so that it could be more easily compared to its Matlab's counterpart.
#include <stdio.h>
#include <math.h>
#include <cufft.h>
#include <thrust\device_vector.h>
#include <thrust\sequence.h>
#define pi_f 3.14159265358979f // Greek pi in single precision
/****************/
/* SIN OPERATOR */
/****************/
class sin_op {
float fk_, Fs_;
public:
sin_op(float fk, float Fs) { fk_ = fk; Fs_ = Fs; }
__host__ __device__ float operator()(float x) const { return sin(2.f*pi_f*x*fk_/Fs_); }
};
/*****************/
/* SINC OPERATOR */
/*****************/
class sinc_op {
float fc_, Fs_;
public:
sinc_op(float fc, float Fs) { fc_ = fc; Fs_ = Fs; }
__host__ __device__ float operator()(float x) const
{
if (x==0) return (2.f*fc_/Fs_);
else return (2.f*fc_/Fs_)*sin(2.f*pi_f*fc_*x/Fs_)/(2.f*pi_f*fc_*x/Fs_);
}
};
/********************/
/* HAMMING OPERATOR */
/********************/
class hamming_op {
int L_;
public:
hamming_op(int L) { L_ = L; }
__host__ __device__ float operator()(int x) const
{
return 0.54-0.46*cos(2.f*pi_f*x/(L_-1));
}
};
/*********************************/
/* MULTIPLY CUFFTCOMPLEX NUMBERS */
/*********************************/
struct multiply_cufftComplex {
__device__ cufftComplex operator()(const cufftComplex& a, const cufftComplex& b) const {
cufftComplex r;
r.x = a.x * b.x - a.y * b.y;
r.y = a.x * b.y + a.y * b.x;
return r;
}
};
/********/
/* MAIN */
/********/
void main(){
// Signal parameters:
int M = 256; // signal length
const int N = 4;
float f[N] = { 440, 880, 1000, 2000 }; // frequencies
float Fs = 5000.; // sampling rate
// Generate a signal by adding up sinusoids:
thrust::device_vector<float> d_x(M,0.f); // pre-allocate 'accumulator'
thrust::device_vector<float> d_n(M); // discrete-time grid
thrust::sequence(d_n.begin(), d_n.end(), 0, 1);
thrust::device_vector<float> d_temp(M);
for (int i=0; i<N; i++) {
float fk = f[i];
thrust::transform(d_n.begin(), d_n.end(), d_temp.begin(), sin_op(fk,Fs));
thrust::transform(d_temp.begin(), d_temp.end(), d_x.begin(), d_x.begin(), thrust::plus<float>());
}
// Filter parameters:
int L = 257; // filter length
float fc = 600.f; // cutoff frequency
// Design the filter using the window method:
thrust::device_vector<float> d_hsupp(L);
thrust::sequence(d_hsupp.begin(), d_hsupp.end(), -(L-1)/2, 1);
thrust::device_vector<float> d_hideal(L);
thrust::transform(d_hsupp.begin(), d_hsupp.end(), d_hideal.begin(), sinc_op(fc,Fs));
thrust::device_vector<float> d_l(L);
thrust::sequence(d_l.begin(), d_l.end(), 0, 1);
thrust::device_vector<float> d_h(L);
thrust::transform(d_l.begin(), d_l.end(), d_h.begin(), hamming_op(L));
// h is our filter
thrust::transform(d_hideal.begin(), d_hideal.end(), d_h.begin(), d_h.begin(), thrust::multiplies<float>());
// --- Choose the next power of 2 greater than L+M-1
int Nfft = pow(2,(ceil(log2((float)(L+M-1))))); // or 2^nextpow2(L+M-1)
// Zero pad the signal and impulse response:
thrust::device_vector<float> d_xzp(Nfft,0.f);
thrust::device_vector<float> d_hzp(Nfft,0.f);
thrust::copy(d_x.begin(), d_x.end(), d_xzp.begin());
thrust::copy(d_h.begin(), d_h.end(), d_hzp.begin());
// Transform the signal and the filter:
cufftHandle plan;
cufftPlan1d(&plan, Nfft, CUFFT_R2C, 1);
thrust::device_vector<cufftComplex> d_X(Nfft/2+1);
thrust::device_vector<cufftComplex> d_H(Nfft/2+1);
cufftExecR2C(plan, (cufftReal*)thrust::raw_pointer_cast(d_xzp.data()), (cufftComplex*)thrust::raw_pointer_cast(d_X.data()));
cufftExecR2C(plan, (cufftReal*)thrust::raw_pointer_cast(d_hzp.data()), (cufftComplex*)thrust::raw_pointer_cast(d_H.data()));
thrust::device_vector<cufftComplex> d_Y(Nfft/2+1);
thrust::transform(d_X.begin(), d_X.end(), d_H.begin(), d_Y.begin(), multiply_cufftComplex());
cufftPlan1d(&plan, Nfft, CUFFT_C2R, 1);
thrust::device_vector<float> d_y(Nfft);
cufftExecC2R(plan, (cufftComplex*)thrust::raw_pointer_cast(d_Y.data()), (cufftReal*)thrust::raw_pointer_cast(d_y.data()));
getchar();
}
Besides my other answer which I expect will be more convenient for convolution kernels with long duration, below I'm reporting a different implementation, which is more compliant with the OP's initial attempt and I expect will be more convenient for convolution kernels with short duration. Such an implementation is based on a hand-written kernel exploiting caching in shared memory. More details can be found in the book by D.B. Kirk and W.-m. W. Hwu
Programming Massively Parallel Processors, Second Edition: A Hands-on Approach
#include <stdio.h>
#include <stdlib.h>
#include "TimingGPU.cuh"
#include "Utilities.cuh"
#define RG 10
#define BLOCKSIZE 8
/****************/
/* CPU FUNCTION */
/****************/
void h_convolution_1D(const float * __restrict__ h_Signal, const float * __restrict__ h_ConvKernel, float * __restrict__ h_Result_CPU,
const int N, const int K) {
for (int i = 0; i < N; i++) {
float temp = 0.f;
int N_start_point = i - (K / 2);
for (int j = 0; j < K; j++) if (N_start_point + j >= 0 && N_start_point + j < N) {
temp += h_Signal[N_start_point+ j] * h_ConvKernel[j];
}
h_Result_CPU[i] = temp;
}
}
/********************/
/* BASIC GPU KERNEL */
/********************/
__global__ void d_convolution_1D_basic(const float * __restrict__ d_Signal, const float * __restrict__ d_ConvKernel, float * __restrict__ d_Result_GPU,
const int N, const int K) {
int i = blockIdx.x * blockDim.x + threadIdx.x;
float temp = 0.f;
int N_start_point = i - (K / 2);
for (int j = 0; j < K; j++) if (N_start_point + j >= 0 && N_start_point + j < N) {
temp += d_Signal[N_start_point+ j] * d_ConvKernel[j];
}
d_Result_GPU[i] = temp;
}
/***************************/
/* GPU KERNEL WITH CACHING */
/***************************/
__global__ void d_convolution_1D_caching(const float * __restrict__ d_Signal, const float * __restrict__ d_ConvKernel, float * __restrict__ d_Result_GPU,
const int N, const int K) {
int i = blockIdx.x * blockDim.x + threadIdx.x;
__shared__ float d_Tile[BLOCKSIZE];
d_Tile[threadIdx.x] = d_Signal[i];
__syncthreads();
float temp = 0.f;
int N_start_point = i - (K / 2);
for (int j = 0; j < K; j++) if (N_start_point + j >= 0 && N_start_point + j < N) {
if ((N_start_point + j >= blockIdx.x * blockDim.x) && (N_start_point + j < (blockIdx.x + 1) * blockDim.x))
// --- The signal element is in the tile loaded in the shared memory
temp += d_Tile[threadIdx.x + j - (K / 2)] * d_ConvKernel[j];
else
// --- The signal element is not in the tile loaded in the shared memory
temp += d_Signal[N_start_point + j] * d_ConvKernel[j];
}
d_Result_GPU[i] = temp;
}
/********/
/* MAIN */
/********/
int main(){
const int N = 15; // --- Signal length
const int K = 5; // --- Convolution kernel length
float *h_Signal = (float *)malloc(N * sizeof(float));
float *h_Result_CPU = (float *)malloc(N * sizeof(float));
float *h_Result_GPU = (float *)malloc(N * sizeof(float));
float *h_ConvKernel = (float *)malloc(K * sizeof(float));
float *d_Signal; gpuErrchk(cudaMalloc(&d_Signal, N * sizeof(float)));
float *d_Result_GPU; gpuErrchk(cudaMalloc(&d_Result_GPU, N * sizeof(float)));
float *d_ConvKernel; gpuErrchk(cudaMalloc(&d_ConvKernel, K * sizeof(float)));
for (int i=0; i < N; i++) { h_Signal[i] = (float)(rand() % RG); }
for (int i=0; i < K; i++) { h_ConvKernel[i] = (float)(rand() % RG); }
gpuErrchk(cudaMemcpy(d_Signal, h_Signal, N * sizeof(float), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_ConvKernel, h_ConvKernel, K * sizeof(float), cudaMemcpyHostToDevice));
h_convolution_1D(h_Signal, h_ConvKernel, h_Result_CPU, N, K);
d_convolution_1D_basic<<<iDivUp(N, BLOCKSIZE), BLOCKSIZE>>>(d_Signal, d_ConvKernel, d_Result_GPU, N, K);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
gpuErrchk(cudaMemcpy(h_Result_GPU, d_Result_GPU, N * sizeof(float), cudaMemcpyDeviceToHost));
for (int i = 0; i < N; i++) if (h_Result_CPU[i] != h_Result_GPU[i]) {printf("mismatch2 at %d, cpu: %d, gpu %d\n", i, h_Result_CPU[i], h_Result_GPU[i]); return 1;}
printf("Test basic passed\n");
d_convolution_1D_caching<<<iDivUp(N, BLOCKSIZE), BLOCKSIZE>>>(d_Signal, d_ConvKernel, d_Result_GPU, N, K);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
gpuErrchk(cudaMemcpy(h_Result_GPU, d_Result_GPU, N * sizeof(float), cudaMemcpyDeviceToHost));
for (int i = 0; i < N; i++) if (h_Result_CPU[i] != h_Result_GPU[i]) {printf("mismatch2 at %d, cpu: %d, gpu %d\n", i, h_Result_CPU[i], h_Result_GPU[i]); return 1;}
printf("Test caching passed\n");
return 0;
}

Infinite loop in CUDA kernel

I have a CUDA kernel where every thread traverse a tree. Because of this I have a while loop which is looped until the thread reaches a leaf. In every step down the tree it checks which of the children it should pick to follow.
The code is as follows:
__global__ void search(float* centroids, float* features, int featureCount, int *votes)
{
int tid = threadIdx.x + blockIdx.x * blockDim.x;
if(tid < featureCount)
{
int index = 0;
while (index < N)
{
votes[tid] = index;
int childIndex = index * CHILDREN + 1;
float minValue = FLT_MAX;
if(childIndex >= (N-CHILDREN)) break;
for(int i = 0; i < CHILDREN; i++)
{
int centroidIndex = childIndex + i;
float value = distance(centroids, features, centroidIndex, tid);
if(value < minValue)
{
minValue = value;
index = childIndex + i;
}
}
}
tid += blockDim.x * gridDim.x;
}
}
__device__ float distance(float* a, float* b, int aIndex, int bIndex)
{
float sum = 0.0f;
for(int i = 0; i < FEATURESIZE; i++)
{
float val = a[aIndex + i] - b[bIndex + i];
sum += val * val;
}
return sum;
}
This code goes into an infinite loop. That is what I find weird.
If I change the distance method to return a constant it works(ie. traversing left in the tree).
Have I missed something with loops in CUDA or is there just some hidden bug I can't see? Because I don't see how the code can go into an infinite loop.
Loops in CUDA C++ have the same semantics as they do in C++, so there must be a bug somewhere in your code. One strategy for debugging it would be to do so on the host.
First, because your code is scalar (e.g., it contains no calls to __syncthreads), you can refactor it into __host__ __device__ functions.
distance contains no CUDA-specific identifiers or functions, so you can simply prepend __host__:
__host__ __device__ float distance(float* a, float* b, int aIndex, int bIndex);
To refactor your search function, hoist tid (which depends on the CUDA-specific identifiers threadIndex et al.) outside of it into a parameter, and make it a __host__ __device__ function:
__host__ __device__ void search(int tid, float* centroids, float* features, int featureCount, int *votes)
{
if(tid < featureCount)
{
int index = 0;
while (index < N)
{
votes[tid] = index;
int childIndex = index * CHILDREN + 1;
float minValue = FLT_MAX;
if(childIndex >= (N-CHILDREN)) break;
for(int i = 0; i < CHILDREN; i++)
{
int centroidIndex = childIndex + i;
float value = distance(centroids, features, centroidIndex, tid);
if(value < minValue)
{
minValue = value;
index = childIndex + i;
}
}
}
}
}
Now write a __global__ function which does nothing except calculate tid and call search:
__global__ void search_kernel(float *centroids, float features, int featureCount, int *votes)
{
int tid = threadIdx.x + blockIdx.x * blockDim.x;
search(tid, centroids, features, featureCount, votes);
}
Because search is now __host__ __device__, you can debug it by calling it from the CPU, emulating what a kernel launch would do:
for(int tid = 0; tid < featureCount; ++tid)
{
search(tid, centroids, features, featureCount, votes);
}
It should hang on the host exactly as it would on the device. Stick a printf inside to find out where. Of course, you need to be sure to make host-side copies of your arrays such as centroids, because the host cannot dereference pointers to device memory.
Even though printf is available to use from __device__ functions with newer hardware, the reason you might prefer this approach is that calls to printf from a kernel do not commit until after the kernel retires. If the kernel never retires (as it apparently does not in your case) then your debugging output will never appear on the screen.

Resources