Sorting many small arrays in CUDA

I am implementing a median filter in CUDA. For a particular pixel, I extract its neighbors corresponding to a window around the pixel, say an N x N (e.g., 3 x 3) window, and now have an array of N x N elements. I do not envision using a window of more than 10 x 10 elements for my application.
This array is now locally present in the kernel and already loaded into device memory. From previous SO posts that I have read, the most common sorting algorithms are implemented by Thrust, but Thrust can only be called from the host. Thread: Thrust inside user written kernels
Is there a quick and efficient way to sort a small array of N x N elements inside the kernel?

If the number of elements is fixed and small, you can use sorting networks (http://pages.ripco.net/~jgamble/nw.html). A sorting network performs a fixed sequence of compare/swap operations for a fixed number of elements (e.g., 19 compare/swap operations for 8 elements), with no data-dependent branching, which suits GPU threads well.
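For instance, for a 3 x 3 window sorted entirely by one thread, a minimal sketch could use a branch-free odd-even transposition network with fixed loop bounds so the compiler can fully unroll it (sort9 is an illustrative name, and this is deliberately not one of the minimal comparator networks from the page above):

__device__ void sort9(float v[9])
{
    // --- Odd-even transposition: 9 fixed passes of compare/swap operations,
    //     no data-dependent branches, so every thread follows the same path
    for (int pass = 0; pass < 9; ++pass)
        for (int i = pass & 1; i + 1 < 9; i += 2) {
            float lo = fminf(v[i], v[i + 1]);
            float hi = fmaxf(v[i], v[i + 1]);
            v[i]     = lo;
            v[i + 1] = hi;
        }
    // --- After sorting, the median of the 3 x 3 window is v[4]
}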

Your problem is sorting many small arrays in CUDA.
Following Robert's suggestion in his comment, CUB offers a possible solution to this problem. Below I report an example constructed around Robert's code at cub BlockRadixSort: how to deal with large tile size or sort multiple tiles?.
The idea is to assign the small arrays to be sorted to different thread blocks and then use cub::BlockRadixSort to sort each array. Two versions are provided: one sorting the small arrays straight from global memory, and one first loading them into shared memory.
Let me finally note that your statement that CUDA Thrust is not callable from within kernels is no longer true. The post Thrust inside user written kernels that you linked to has been updated with other answers; see also the note after the code below.
#include <cub/cub.cuh>
#include <stdio.h>
#include <stdlib.h>

#include "Utilities.cuh"

using namespace cub;

/**********************************/
/* CUB BLOCKSORT KERNEL NO SHARED */
/**********************************/
template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
__global__ void BlockSortKernel(int *d_in, int *d_out)
{
    // --- Specialize BlockLoad, BlockStore, and BlockRadixSort collective types
    typedef cub::BlockLoad      <int*, BLOCK_THREADS, ITEMS_PER_THREAD, BLOCK_LOAD_TRANSPOSE>  BlockLoadT;
    typedef cub::BlockStore     <int*, BLOCK_THREADS, ITEMS_PER_THREAD, BLOCK_STORE_TRANSPOSE> BlockStoreT;
    typedef cub::BlockRadixSort <int , BLOCK_THREADS, ITEMS_PER_THREAD>                        BlockRadixSortT;

    // --- Allocate type-safe, repurposable shared memory for collectives
    __shared__ union {
        typename BlockLoadT     ::TempStorage load;
        typename BlockStoreT    ::TempStorage store;
        typename BlockRadixSortT::TempStorage sort;
    } temp_storage;

    // --- Obtain this block's segment of consecutive keys (blocked across threads)
    int thread_keys[ITEMS_PER_THREAD];
    int block_offset = blockIdx.x * (BLOCK_THREADS * ITEMS_PER_THREAD);

    BlockLoadT(temp_storage.load).Load(d_in + block_offset, thread_keys);
    __syncthreads();

    // --- Collectively sort the keys
    BlockRadixSortT(temp_storage.sort).Sort(thread_keys);
    __syncthreads();

    // --- Store the sorted segment
    BlockStoreT(temp_storage.store).Store(d_out + block_offset, thread_keys);
}
/*******************************/
/* CUB BLOCKSORT KERNEL SHARED */
/*******************************/
template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
__global__ void shared_BlockSortKernel(int *d_in, int *d_out)
{
    // --- Shared memory allocation
    __shared__ int sharedMemoryArray[BLOCK_THREADS * ITEMS_PER_THREAD];

    // --- Specialize the BlockRadixSort collective type
    typedef cub::BlockRadixSort <int , BLOCK_THREADS, ITEMS_PER_THREAD> BlockRadixSortT;

    // --- Allocate type-safe, repurposable shared memory for collectives
    __shared__ typename BlockRadixSortT::TempStorage temp_storage;

    int block_offset = blockIdx.x * (BLOCK_THREADS * ITEMS_PER_THREAD);

    // --- Load data from global to shared memory
    for (int k = 0; k < ITEMS_PER_THREAD; k++)
        sharedMemoryArray[threadIdx.x * ITEMS_PER_THREAD + k] = d_in[block_offset + threadIdx.x * ITEMS_PER_THREAD + k];
    __syncthreads();

    // --- Collectively sort the keys; the cast reinterprets this thread's shared-memory
    //     segment as the int[ITEMS_PER_THREAD] reference that Sort() expects
    BlockRadixSortT(temp_storage).Sort(*static_cast<int(*)[ITEMS_PER_THREAD]>(static_cast<void*>(sharedMemoryArray + (threadIdx.x * ITEMS_PER_THREAD))));
    __syncthreads();

    // --- Write data back from shared to global memory
    for (int k = 0; k < ITEMS_PER_THREAD; k++)
        d_out[block_offset + threadIdx.x * ITEMS_PER_THREAD + k] = sharedMemoryArray[threadIdx.x * ITEMS_PER_THREAD + k];
}
/********/
/* MAIN */
/********/
int main() {

    const int numElemsPerArray  = 8;
    const int numArrays         = 4;
    const int N                 = numArrays * numElemsPerArray;
    const int numElemsPerThread = 4;

    const int RANGE             = N * numElemsPerThread;

    // --- Allocating and initializing the data on the host
    int *h_data = (int *)malloc(N * sizeof(int));
    for (int i = 0 ; i < N; i++) h_data[i] = rand() % RANGE;

    // --- Allocating the results on the host
    int *h_result1 = (int *)malloc(N * sizeof(int));
    int *h_result2 = (int *)malloc(N * sizeof(int));

    // --- Allocating space for data and results on device
    int *d_in;   gpuErrchk(cudaMalloc((void **)&d_in,   N * sizeof(int)));
    int *d_out1; gpuErrchk(cudaMalloc((void **)&d_out1, N * sizeof(int)));
    int *d_out2; gpuErrchk(cudaMalloc((void **)&d_out2, N * sizeof(int)));

    // --- BlockSortKernel no shared
    gpuErrchk(cudaMemcpy(d_in, h_data, N * sizeof(int), cudaMemcpyHostToDevice));
    BlockSortKernel<N / numArrays / numElemsPerThread, numElemsPerThread><<<numArrays, numElemsPerArray / numElemsPerThread>>>(d_in, d_out1);
    gpuErrchk(cudaMemcpy(h_result1, d_out1, N * sizeof(int), cudaMemcpyDeviceToHost));

    printf("BlockSortKernel no shared\n\n");
    for (int k = 0; k < numArrays; k++)
        for (int i = 0; i < numElemsPerArray; i++)
            printf("Array nr. %i; Element nr. %i; Value %i\n", k, i, h_result1[k * numElemsPerArray + i]);

    // --- BlockSortKernel with shared
    gpuErrchk(cudaMemcpy(d_in, h_data, N * sizeof(int), cudaMemcpyHostToDevice));
    shared_BlockSortKernel<N / numArrays / numElemsPerThread, numElemsPerThread><<<numArrays, numElemsPerArray / numElemsPerThread>>>(d_in, d_out2);
    gpuErrchk(cudaMemcpy(h_result2, d_out2, N * sizeof(int), cudaMemcpyDeviceToHost));

    printf("\n\nBlockSortKernel with shared\n\n");
    for (int k = 0; k < numArrays; k++)
        for (int i = 0; i < numElemsPerArray; i++)
            printf("Array nr. %i; Element nr. %i; Value %i\n", k, i, h_result2[k * numElemsPerArray + i]);

    return 0;
}
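Regarding the Thrust remark above: since Thrust 1.8, the sequential algorithms can be called inside kernels through the thrust::seq execution policy, so each thread can sort its own small window. A minimal sketch (the kernel name and winSize parameter are illustrative assumptions):

#include <thrust/sort.h>
#include <thrust/execution_policy.h>

__global__ void sort_windows_kernel(int * __restrict__ d_windows, const int numWindows, const int winSize)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;

    // --- Each thread sorts its own window sequentially
    if (i < numWindows)
        thrust::sort(thrust::seq, d_windows + i * winSize, d_windows + (i + 1) * winSize);
}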

If you are using CUDA 5.x on a device of compute capability 3.5 or higher, you can use dynamic parallelism: you can launch child kernels from within your filter kernel to finish the sort job. As for how to sort in CUDA, divide-and-conquer (recursive) techniques fit this approach well.
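A minimal sketch of the dynamic parallelism idea (the names and the trivial single-thread sort in the child are illustrative assumptions; compile with relocatable device code, e.g. nvcc -arch=sm_35 -rdc=true):

__global__ void sort_window(int *w, int n)
{
    // --- Trivial single-thread bubble sort of one small window (illustrative only)
    for (int i = 0; i < n - 1; i++)
        for (int j = 0; j < n - 1 - i; j++)
            if (w[j] > w[j + 1]) { int t = w[j]; w[j] = w[j + 1]; w[j + 1] = t; }
}

__global__ void filter_kernel(int *windows, int numPixels, int winSize)
{
    int p = blockIdx.x * blockDim.x + threadIdx.x;
    if (p < numPixels)
        sort_window<<<1, 1>>>(windows + p * winSize, winSize);  // --- one child grid per pixel
    // --- Note: the parent must synchronize with the child grid before using the sorted data
}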

Related

CUDA Initialize Array on Device

I am very new to CUDA and I am trying to initialize an array on the device and return the result back to the host to print out, to show whether it was correctly initialized. I am doing this because the end goal is a dot product solution, in which I multiply two arrays together, store the results in another array, and then sum up the entire thing, so that I only need to return one value to the host.
In the code I am working on, I am only trying to see whether I am initializing the array correctly. I am trying to create an array of size N following the pattern 1,2,3,4,5,6,7,8,1,2,3...
This is the code that I've written and it compiles without issue, but when I run it the terminal hangs and I have no clue why. Could someone help me out here? I'm so incredibly confused :\
#include <stdio.h>
#include <stdlib.h>
#include <chrono>

#define ARRAY_SIZE 100
#define BLOCK_SIZE 32

__global__ void cu_kernel (int *a_d, int *b_d, int *c_d, int size)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    __shared__ int temp;

    if (temp != 8) {
        a_d[x] = temp;
        temp++;
    } else {
        a_d[x] = temp;
        temp = 1;
    }
}

int main (int argc, char *argv[])
{
    // declare pointers for arrays
    int *a_d, *b_d, *c_d, *sum_h, *sum_d, a_h[ARRAY_SIZE];

    // set space for device variables
    cudaMalloc((void**) &a_d, sizeof(int) * ARRAY_SIZE);
    cudaMalloc((void**) &b_d, sizeof(int) * ARRAY_SIZE);
    cudaMalloc((void**) &c_d, sizeof(int) * ARRAY_SIZE);
    cudaMalloc((void**) &sum_d, sizeof(int));

    // set execution configuration
    dim3 dimblock (BLOCK_SIZE);
    dim3 dimgrid (ARRAY_SIZE/BLOCK_SIZE);

    // actual computation: call the kernel
    cu_kernel <<<dimgrid, dimblock>>> (a_d, b_d, c_d, ARRAY_SIZE);

    cudaError_t result;
    // transfer results back to host
    result = cudaMemcpy (a_h, a_d, sizeof(int) * ARRAY_SIZE, cudaMemcpyDeviceToHost);
    if (result != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed.");
        exit(1);
    }

    // print the array
    printf ("Final state of the array:\n");
    for (int i = 0; i < ARRAY_SIZE; i++) {
        printf ("%d ", a_h[i]);
    }
    printf ("\n");
}
There are at least 3 issues with your kernel code:

- you are using the shared memory variable temp without initializing it;
- you are not resolving the order in which threads access a shared variable, as discussed here;
- you are (perhaps) imagining a particular order of thread execution, and CUDA provides no guarantees in that area.

The first item seems self-evident; however, naive methods to initialize it in a multi-threaded environment like CUDA will not work: firstly because of the multi-threaded access pattern already mentioned, and furthermore because, in a multi-block scenario, shared memory in one block is logically distinct from shared memory in another block.
Rather than wrestle with mechanisms unsuited to creating the pattern you desire (informed by notions carried over from a serial processing environment), I would simply do something trivial like this:
__global__ void cu_kernel (int *a_d, int *b_d, int *c_d, int size)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    if (x < size) a_d[x] = (x & 7) + 1;
}
Are there other ways to do it? Certainly:
__global__ void cu_kernel (int *a_d, int *b_d, int *c_d, int size)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    __shared__ int temp;
    if (!threadIdx.x) temp = blockIdx.x * blockDim.x;
    __syncthreads();
    if (x < size) a_d[x] = ((temp + threadIdx.x) & 7) + 1;
}
You can get as fancy as you like.
These changes will still leave the last few values of the array at zero, because your grid (ARRAY_SIZE/BLOCK_SIZE = 3 blocks of 32 threads) covers only 96 of the 100 elements; fixing that requires changes to your grid sizing. There are many questions about this already, or study a sample code like vectorAdd; a sketch follows.
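For reference, a rounded-up launch configuration following the vectorAdd pattern would look like this, keeping the if (x < size) guard in the kernel so the surplus threads of the last block do nothing:

dim3 dimblock (BLOCK_SIZE);
dim3 dimgrid ((ARRAY_SIZE + BLOCK_SIZE - 1) / BLOCK_SIZE);  // ceiling division: 4 blocks of 32 threads cover all 100 elements
cu_kernel <<<dimgrid, dimblock>>> (a_d, b_d, c_d, ARRAY_SIZE);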

How to divide n numbers among N processors using OpenMP

I was given a task to perform a CREW sort in parallel programming. As the first step of this, I have an array of size n and N processors; I need to divide the elements among the N processors, sort each part sequentially, and merge them back. How can I do this in OpenMP? I am new to OpenMP, so any resources for solving this problem would be helpful.
This is what I wrote off the top of my head. It might not be optimal and is not tested, but it should give you a direction for how to handle such a problem.
#include <stddef.h>
#include <stdlib.h>
#include <omp.h>

ptrdiff_t min(ptrdiff_t a, ptrdiff_t b) {
    return ((a > b) ? b : a);
}

void inplace_sequential_sort(double *data, ptrdiff_t n) { /* ... */ }
void inplace_merge(double *data, ptrdiff_t n1, ptrdiff_t n2) { /* ... */ }

void inplace_parallel_sort(double *data, ptrdiff_t n) {
    if (n < 2)
        return;

    /* allocate memory for helper array */
    int const max_threads = omp_get_max_threads();
    ptrdiff_t *my_n = calloc(max_threads, sizeof(*my_n));
    if (!my_n) { /* ... */ }

    #pragma omp parallel default(none) \
        shared(n, data, my_n)
    {
        /* get thread ID and actual number of threads */
        int const tid = omp_get_thread_num();
        int const N = omp_get_num_threads();

        /* distribute data among threads */
        ptrdiff_t const max_elem_per_thread = ((n + N - 1) / N);
        ptrdiff_t const my_begin = min(tid * max_elem_per_thread, n);
        my_n[tid] = min(n - my_begin, max_elem_per_thread);
        if (my_n[tid] > 1)
            inplace_sequential_sort(data + my_begin, my_n[tid]);

        /* merge sorted data sections (parallel reduction algorithm) */
        for (ptrdiff_t stride = 1; stride < N; stride *= 2) {
            #pragma omp barrier
            if (tid % (2 * stride) == 0 && my_begin + my_n[tid] != n) {
                inplace_merge(data + my_begin, my_n[tid], my_n[tid + stride]);
                my_n[tid] += my_n[tid + stride];
            }
        }
    } /* end of parallel region */
    free(my_n);
}
I assumed that you want a C solution (not C++ or Fortran) and want to sort the data in place. This is a very basic solution; OpenMP can do much more (e.g., tasking). The functions inplace_sequential_sort() and inplace_merge() have to be provided; one possible implementation is sketched below.
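For completeness, here is one possible (illustrative, untested) implementation of the two helpers, using qsort() for the sequential sort and a scratch buffer for the merge:

#include <stddef.h>
#include <stdlib.h>
#include <string.h>

static int cmp_double(const void *a, const void *b) {
    double x = *(const double *)a, y = *(const double *)b;
    return (x > y) - (x < y);
}

void inplace_sequential_sort(double *data, ptrdiff_t n) {
    qsort(data, (size_t)n, sizeof(*data), cmp_double);
}

/* merges the two adjacent sorted runs data[0..n1) and data[n1..n1+n2) */
void inplace_merge(double *data, ptrdiff_t n1, ptrdiff_t n2) {
    double *tmp = malloc((size_t)(n1 + n2) * sizeof(*tmp));
    if (!tmp) { /* handle allocation failure */ return; }
    ptrdiff_t i = 0, j = n1, k = 0;
    while (i < n1 && j < n1 + n2)
        tmp[k++] = (data[i] <= data[j]) ? data[i++] : data[j++];
    while (i < n1)      tmp[k++] = data[i++];
    while (j < n1 + n2) tmp[k++] = data[j++];
    memcpy(data, tmp, (size_t)(n1 + n2) * sizeof(*data));
    free(tmp);
}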

Binary Matrix Reduction in CUDA

I have to traverse all cells of an imaginary m * n matrix and increment a counter for every cell that meets a certain condition.
My naive solution was as follows:
#include <stdio.h>

__global__ void calculate_pi(int center, int *count) {
    int x = threadIdx.x;
    int y = blockIdx.x;

    if (x*x + y*y <= center*center) {
        *count++;
    }
}

int main() {
    int interactions;
    printf("Enter the number of interactions: ");
    scanf("%d", &interactions);

    int l = sqrt(interactions);

    int h_count = 0;
    int *d_count;

    cudaMalloc(&d_count, sizeof(int));
    cudaMemcpy(&d_count, &h_count, sizeof(int), cudaMemcpyHostToDevice);

    calculate_pi<<<l,l>>>(l/2, d_count);

    cudaMemcpy(&h_count, d_count, sizeof(int), cudaMemcpyDeviceToHost);
    cudaFree(d_count);

    printf("Sum: %d\n", h_count);
    return 0;
}
In my use case, the value of interactions can be very large, making it impossible to allocate l * l of space.
Can someone help me? Any suggestions are welcome.
There are at least 3 problems with your code:

1. Your kernel code will not work correctly with an ordinary add here:

*count++;

Note that, due to operator precedence, this statement actually increments the pointer rather than the value it points to; but even (*count)++ would be broken, because multiple threads are trying to update the same location at the same time, and CUDA does not automatically sort that out for you. For the purpose of this explanation, we will fix this with an atomicAdd(), although other methods are possible.

2. The ampersand doesn't belong here:

cudaMemcpy(&d_count, &h_count, sizeof(int), cudaMemcpyHostToDevice);
           ^

I assume that is just a typo, since you did it correctly on the subsequent cudaMemcpy operation:

cudaMemcpy(&h_count, d_count, sizeof(int), cudaMemcpyDeviceToHost);

3. This methodology (effectively creating a square array of threads using threadIdx.x for one dimension and blockIdx.x for the other) will only work up to an interactions value that leads to an l value of 1024 or less, because CUDA threadblocks are limited to 1024 threads and you are using l as the size of the threadblock in your kernel launch. To fix this you would want to learn how to create a CUDA 2D grid of arbitrary dimensions, and adjust your kernel launch and in-kernel indexing calculations accordingly. For now we will just make sure that the calculated l value is in range for your code design.
Here's an example addressing the above issues:
$ cat t1590.cu
#include <stdio.h>

__global__ void calculate_pi(int center, int *count) {
    int x = threadIdx.x;
    int y = blockIdx.x;

    if (x*x + y*y <= center*center) {
        atomicAdd(count, 1);
    }
}

int main() {
    int interactions;
    printf("Enter the number of interactions: ");
    scanf("%d", &interactions);

    int l = sqrt(interactions);
    if ((l > 1024) || (l < 1)) {printf("Error: interactions out of range\n"); return 0;}

    int h_count = 0;
    int *d_count;

    cudaMalloc(&d_count, sizeof(int));
    cudaMemcpy(d_count, &h_count, sizeof(int), cudaMemcpyHostToDevice);

    calculate_pi<<<l,l>>>(l/2, d_count);

    cudaMemcpy(&h_count, d_count, sizeof(int), cudaMemcpyDeviceToHost);
    cudaFree(d_count);

    cudaError_t err = cudaGetLastError();
    if (err == cudaSuccess){
        printf("Sum: %d\n", h_count);
        printf("fraction satisfying test: %f\n", h_count/(float)interactions);
    }
    else
        printf("CUDA error: %s\n", cudaGetErrorString(err));
    return 0;
}
$ nvcc -o t1590 t1590.cu
$ ./t1590
Enter the number of interactions: 1048576
Sum: 206381
fraction satisfying test: 0.196820
$
We see that the code indicates a calculated fraction of about 0.2. Does this appear to be correct? I claim that it does based on your test. You are effectively creating a grid of dimensions l x l, and your test asks, effectively, "which points in that grid are within a circle with its center at the origin (corner) of the grid and radius l/2?"
Pictorially, that is a quarter circle of radius l/2 shaded into one corner of the square grid, and it is reasonable to assume the shaded area is somewhat less than 0.25 of the total area. Indeed, the expected fraction is (pi*(l/2)^2/4)/l^2 = pi/16, which is approximately 0.1963 and agrees well with the measured 0.196820.
As a bonus, here is a version of the code that reduces the restriction listed in item 3 above:
#include <stdio.h>

__global__ void calculate_pi(int center, int *count) {
    int x = threadIdx.x + blockDim.x * blockIdx.x;
    int y = threadIdx.y + blockDim.y * blockIdx.y;

    if (x*x + y*y <= center*center) {
        atomicAdd(count, 1);
    }
}

int main() {
    int interactions;
    printf("Enter the number of interactions: ");
    scanf("%d", &interactions);

    int l = sqrt(interactions);

    int h_count = 0;
    int *d_count;
    const int bs = 32;
    dim3 threads(bs, bs);
    dim3 blocks((l+threads.x-1)/threads.x, (l+threads.y-1)/threads.y);

    cudaMalloc(&d_count, sizeof(int));
    cudaMemcpy(d_count, &h_count, sizeof(int), cudaMemcpyHostToDevice);

    calculate_pi<<<blocks,threads>>>(l/2, d_count);

    cudaMemcpy(&h_count, d_count, sizeof(int), cudaMemcpyDeviceToHost);
    cudaFree(d_count);

    cudaError_t err = cudaGetLastError();
    if (err == cudaSuccess){
        printf("Sum: %d\n", h_count);
        printf("fraction satisfying test: %f\n", h_count/(float)interactions);
    }
    else
        printf("CUDA error: %s\n", cudaGetErrorString(err));
    return 0;
}
This is launching a 2D grid based on l, and should work up to at least 1 billion interactions.

Conditional reduction in CUDA

I need to sum about 100000 values stored in an array, but with conditions.
Is there a way to do that in CUDA to produce fast results?
Can anyone post a small code example to do that?
I think that, to perform a conditional reduction, you can directly introduce the condition as a multiplication by 0 (false) or 1 (true) of the addends. In other words, suppose that the condition you would like to meet is that the addends be smaller than 10.f. In this case, borrowing the first code from Optimizing Parallel Reduction in CUDA by M. Harris, the above would mean:
__global__ void reduce0(int *g_idata, int *g_odata) {

    extern __shared__ int sdata[];

    // each thread loads one element from global to shared mem,
    // zeroing out the elements that fail the condition
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
    sdata[tid] = g_idata[i]*(g_idata[i]<10.f);
    __syncthreads();

    // do reduction in shared mem
    for(unsigned int s=1; s < blockDim.x; s *= 2) {
        if (tid % (2*s) == 0) {
            sdata[tid] += sdata[tid + s];
        }
        __syncthreads();
    }

    // write result for this block to global mem
    if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}
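Note that reduce0 declares its shared memory as extern __shared__, so the launch must pass the per-block shared memory size as the third parameter of the execution configuration; with illustrative names:

reduce0<<<numBlocks, blockSize, blockSize * sizeof(int)>>>(d_in, d_out);

The kernel leaves one partial sum per block in g_odata, which still has to be reduced, e.g. by a second kernel launch or on the host.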
If you wish to use CUDA Thrust to perform a conditional reduction, you can do so with thrust::transform_reduce. Alternatively, you can create a new vector d_b containing all the elements of d_a satisfying the predicate with thrust::copy_if, and then apply thrust::reduce to d_b. I haven't checked which solution performs best; perhaps the second will perform better on sparse arrays. Below is an example implementing both approaches.
#include <stdio.h>

#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/reduce.h>
#include <thrust/count.h>
#include <thrust/copy.h>

// --- Operator for the first approach
struct conditional_operator {
    __host__ __device__ float operator()(const float a) const {
        return a*(a<10.f);
    }
};

// --- Operator for the second approach
struct is_smaller_than_10 {
    __host__ __device__ bool operator()(const float a) const {
        return (a<10.f);
    }
};

int main(void)
{
    int N = 20;

    // --- Host side allocation and vector initialization
    thrust::host_vector<float> h_a(N,1.f);
    h_a[0] = 20.f;
    h_a[1] = 20.f;

    // --- Device side allocation and vector initialization
    thrust::device_vector<float> d_a(h_a);

    // --- First approach
    float sum = thrust::transform_reduce(d_a.begin(), d_a.end(), conditional_operator(), 0.f, thrust::plus<float>());
    printf("Result = %f\n",sum);

    // --- Second approach
    int N_prime = thrust::count_if(d_a.begin(), d_a.end(), is_smaller_than_10());
    thrust::device_vector<float> d_b(N_prime);
    thrust::copy_if(d_a.begin(), d_a.begin() + N, d_b.begin(), is_smaller_than_10());
    sum = thrust::reduce(d_b.begin(), d_b.begin() + N_prime, 0.f);
    printf("Result = %f\n",sum);

    getchar();

    return 0;
}

CUDA's Mersenne Twister for an arbitrary number of threads

CUDA's implementation of the Mersenne Twister (MT) random number generator is limited to 256 threads/block and 200 blocks/grid, i.e., the maximal number of threads is 51200.
Therefore, it is not possible to launch the kernel that uses the MT with
kernel<<<blocksPerGrid, threadsPerBlock>>>(devMTGPStates, ...)
where
int blocksPerGrid = (n+threadsPerBlock-1)/threadsPerBlock;
and n is the total number of threads.
What is the best way to use the MT for threads > 51200?
My approach is to use constant values for blocksPerGrid and threadsPerBlock, e.g., <<<128,128>>>, and to use the following in the kernel code:
__global__ void kernel(curandStateMtgp32 *state, int n, ...) {

    int id = threadIdx.x + blockIdx.x * blockDim.x;

    while (id < n) {
        float x = curand_normal(&state[blockIdx.x]);
        /* some more calls to curand_normal() followed
           by the algorithm that works with the data */
        id += blockDim.x * gridDim.x;
    }
}
I am not sure whether this is the correct way, or whether it can influence the MT status in an undesired way.
Thank you.
I suggest you read the CURAND documentation carefully and thoroughly.
The MT API will be most efficient when using 256 threads per block with up to 64 blocks to generate numbers.
If you need more than that, you have a variety of options:

1. Simply generate more numbers from the existing state-set (i.e., 64 blocks, 256 threads), and distribute these numbers amongst the threads that need them.
2. Use more than a single state per block (but this does not allow you to exceed the overall limit within a state-set; it just addresses the need for a single block).
3. Create multiple MT generators with independent seeds (and therefore independent state-sets).
Generally, I don't see a problem with the kernel that you've outlined, and it's roughly in line with option 1 above. However, it does not allow you to exceed 51200 threads. (Your example has <<<128, 128>>>, i.e., 16384 threads.)
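A sketch of option 3, for illustration (two independent state-sets seeded differently; error checking omitted; 64 is the per-state-set block count assumed above):

curandStateMtgp32 *states_a, *states_b;
cudaMalloc(&states_a, 64 * sizeof(curandStateMtgp32));
cudaMalloc(&states_b, 64 * sizeof(curandStateMtgp32));
// devKernelParams set up once via curandMakeMTGP32Constants(), as in the example below
curandMakeMTGP32KernelState(states_a, mtgp32dc_params_fast_11213, devKernelParams, 64, 1234);
curandMakeMTGP32KernelState(states_b, mtgp32dc_params_fast_11213, devKernelParams, 64, 5678);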
Following Robert's answer, below I'm providing a fully worked example of using cuRAND's Mersenne Twister for an arbitrary number of threads. I'm using Robert's first option: generating more numbers from the existing state-set and distributing these numbers amongst the threads that need them.
// --- Generate random numbers with cuRAND's Mersenne Twister

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#include <cuda.h>
#include <curand_kernel.h>
/* include MTGP host helper functions */
#include <curand_mtgp32_host.h>
/* include MTGP pre-computed parameter sets */
#include <curand_mtgp32dc_p_11213.h>

#define BLOCKSIZE 256
#define GRIDSIZE  64

/*******************/
/* GPU ERROR CHECK */
/*******************/
#define gpuErrchk(x) do { if((x) != cudaSuccess) { \
    printf("Error at %s:%d\n",__FILE__,__LINE__); \
    return EXIT_FAILURE;}} while(0)

#define CURAND_CALL(x) do { if((x) != CURAND_STATUS_SUCCESS) { \
    printf("Error at %s:%d\n",__FILE__,__LINE__); \
    return EXIT_FAILURE;}} while(0)

/*******************/
/* iDivUp FUNCTION */
/*******************/
__host__ __device__ int iDivUp(int a, int b) { return ((a % b) != 0) ? (a / b + 1) : (a / b); }

/*********************/
/* GENERATION KERNEL */
/*********************/
__global__ void generate_kernel(curandStateMtgp32 * __restrict__ state, float * __restrict__ result, const int N)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;

    // --- Grid-stride loop: one MTGP state per block, shared by the block's threads
    for (int k = tid; k < N; k += blockDim.x * gridDim.x)
        result[k] = curand_uniform(&state[blockIdx.x]);
}

/********/
/* MAIN */
/********/
int main()
{
    const int N = 217 * 123;

    // --- Allocate space for results on host
    float *hostResults = (float *)malloc(N * sizeof(float));

    // --- Allocate and initialize space for results on device
    float *devResults; gpuErrchk(cudaMalloc(&devResults, N * sizeof(float)));
    gpuErrchk(cudaMemset(devResults, 0, N * sizeof(float)));

    // --- Setup the pseudorandom number generator
    curandStateMtgp32 *devMTGPStates; gpuErrchk(cudaMalloc(&devMTGPStates, GRIDSIZE * sizeof(curandStateMtgp32)));
    mtgp32_kernel_params *devKernelParams; gpuErrchk(cudaMalloc(&devKernelParams, sizeof(mtgp32_kernel_params)));
    CURAND_CALL(curandMakeMTGP32Constants(mtgp32dc_params_fast_11213, devKernelParams));
    //CURAND_CALL(curandMakeMTGP32KernelState(devMTGPStates, mtgp32dc_params_fast_11213, devKernelParams, GRIDSIZE, 1234));
    CURAND_CALL(curandMakeMTGP32KernelState(devMTGPStates, mtgp32dc_params_fast_11213, devKernelParams, GRIDSIZE, time(NULL)));

    // --- Generate pseudo-random sequence and copy to the host
    generate_kernel<<<GRIDSIZE, BLOCKSIZE>>>(devMTGPStates, devResults, N);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());
    gpuErrchk(cudaMemcpy(hostResults, devResults, N * sizeof(float), cudaMemcpyDeviceToHost));

    // --- Print the first few results
    //for (int i = 0; i < N; i++) {
    for (int i = 0; i < 10; i++) {
        printf("%f\n", hostResults[i]);
    }

    // --- Cleanup
    gpuErrchk(cudaFree(devMTGPStates));
    gpuErrchk(cudaFree(devResults));
    free(hostResults);

    return 0;
}
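For reference, a possible compile line for the example above (the file name is an assumption; the MTGP host helper functions live in the cuRAND library, hence the -lcurand):

nvcc -o mt_example mt_example.cu -lcurand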
