How to generate unique random integers with curand? - random

I need to generate N unique random integers within a range (A,B) using cuda. I would prefer them to be uniformly distributed but I dont know if that conflicts with the necessity of each number being unique.
This question has been asked before without any answer with a coding hint.
How can I generate a fixed number of unique random integers within an interval without repetition?
My attempt as below generates random numbers but they are not unique.
#include <stdio.h>
#include <curand.h>
#include <curand_kernel.h>
#include <math.h>
#include <assert.h>
__global__ void setup_kernel ( curandState * state, unsigned long seed )
{
int id = threadIdx.x;
curand_init ( seed, id, 0, &state[id] );
}
__global__ void generate( curandState* globalState, int * result, int *max, int *min, int count )
{
int ind = threadIdx.x;
curandState localState = globalState[ind];
float RANDOM = curand_uniform( &localState );
globalState[ind] = localState;
if (ind < count)
result[ind] = truncf(*min +(*max - *min)*RANDOM);
}
int main( int argc, char** argv)
{
int N = 32; // no of random numbers to be generated
int MIN = 10; // max range of random number
int MAX = 100; // min range of random number
dim3 tpb(N,1,1);
curandState* devStates;
cudaMalloc ( &devStates, N*sizeof( curandState ) );
// setup seeds
setup_kernel <<< 1, tpb >>> ( devStates, time(NULL) );
int *d_result, *h_result;
cudaMalloc(&d_result, N * sizeof(int));
h_result = (int *)malloc(N * sizeof(int));
int *d_max, *h_max, *d_min, *h_min;
cudaMalloc(&d_max, sizeof(int));
h_max = (int *)malloc(sizeof(int));
cudaMalloc(&d_min, sizeof(int));
h_min = (int *)malloc(sizeof(int));
*h_max =MAX;
*h_min =MIN;
cudaMemcpy(d_max, h_max, sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(d_min, h_min, sizeof(int), cudaMemcpyHostToDevice);
// generate random numbers
generate <<< 1, tpb >>> ( devStates, d_result, d_max, d_min, N );
cudaMemcpy(h_result, d_result, N * sizeof(float), cudaMemcpyDeviceToHost);
for (int i = 0; i < N; i++)
printf("random number= %d\n", h_result[i]);
return 0;
}
20, 39, 43, 72, 39, 70, 58, 31, 44, 47, 30, 26, 42, 35, 20, 66, 94, 81, 42(repeated), 50, 90, 31(repeated), 51, 53, 39(repeated), 20, 66, 37, 42(repeated), 21, 45, 57

One possible approach, probably much less efficient than the Fisher-Yates shuffle mentioned in the comments:
Determine the length of the range of integers desired to select from (B-A). Generate a set of random numbers using CURAND of this length.
Use sort by key (e.g. thrust::sort_by_key) using this sequence of random numbers along with the sequence of the range of integers to select from, to reorder that sequence.
Take the first N numbers from that sequence (where N is the desired number of random integers to produce), as your selected values.
This will obviously be prohibitive at the point where the length of the range (B-A) of integers to select from implies memory requirements that exceed what the GPU can hold. Thrust sort-by-key requires O(N) temporary storage, so at the point where the range of integers * 8 bytes exceeds about 40% of your available GPU memory, this will become unworkable.
This has the advantages of relatively simple to implement, using ordinary libraries. It has the disadvantage of probably being much less efficient than an expertly written F-Y shuffle. However from what I can see, F-Y shuffle requires that:
All the integers in the desired sequence (A,B) be resident in memory
A set of random numbers be generated, where the set is at least of size (B-A)
global synchronization be available
Here's an example:
$ cat t1504.cu
#include <stdio.h>
#include <curand.h>
#include <curand_kernel.h>
#include <math.h>
#include <assert.h>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/sequence.h>
#include <thrust/sort.h>
__global__ void setup_kernel ( curandState * state, unsigned long seed, int n)
{
int id = threadIdx.x+blockDim.x*blockIdx.x;
if (id < n)
curand_init ( seed, id, 0, &state[id] );
}
__global__ void generate( curandState* globalState, float * result, int count )
{
int ind = threadIdx.x+blockDim.x*blockIdx.x;
if (ind < count){
curandState localState = globalState[ind];
float RANDOM = curand_uniform( &localState );
globalState[ind] = localState;
result[ind] = RANDOM;}
}
int main( int argc, char** argv)
{
int N = 32; // no of random numbers to be generated
int MIN = 10; // max range of random number
int MAX = 100; // min range of random number
curandState* devStates;
int R = MAX-MIN;
cudaMalloc ( &devStates, R*sizeof( curandState ) );
// setup seeds
setup_kernel <<< (R+255)/256, 256 >>> ( devStates, time(NULL), R );
float *d_result;
cudaMalloc(&d_result, R * sizeof(float));
// generate random numbers
generate <<< (R+255)/256, 256>>> ( devStates, d_result, R );
thrust::device_vector<int> d_r(R);
thrust::sequence(d_r.begin(), d_r.end(), MIN);
thrust::device_ptr<float> dp_res = thrust::device_pointer_cast(d_result);
thrust::sort_by_key(dp_res, dp_res+R, d_r.begin());
thrust::host_vector<int> h_result = d_r;
for (int i = 0; i < N; i++)
printf("random number= %d\n", h_result[i]);
return 0;
}
$ nvcc -o t1504 t1504.cu -lcurand
[user2#dc10 misc]$ ./t1504
random number= 16
random number= 97
random number= 31
random number= 80
random number= 61
random number= 21
random number= 98
random number= 70
random number= 46
random number= 41
random number= 30
random number= 71
random number= 52
random number= 92
random number= 48
random number= 39
random number= 59
random number= 63
random number= 96
random number= 40
random number= 81
random number= 32
random number= 34
random number= 79
random number= 73
random number= 49
random number= 19
random number= 24
random number= 11
random number= 78
random number= 42
random number= 12
$

Related

Using an algorithm to determine the ideal size for shipping boxes

I work in a logistic department for a company, recently we have been trying to narrow down the amount of different packaging options that we use.
I have all the necessary product data like length, width, height, volume and also sales data.
So I was thinking if it is possible to use an algorithm to cluster the different volumes of the products and maybe also take into account which sizes are selling the most, to determine, which box sizes would be ideal.
(Taking into account how often a product sells is secondary so that is not absolutely necessary)
What I want is that I can give the Algorithm an amount of how many different boxsizes I want and the algorithm should determine where to put the limits, so that there is a solution for every product that we have. With the goal of the optimization being minimum volume wasted while also not using more than the set amount of different boxes.
Also important to note, the orientation of the products and the amount per box is set, so there is no need to determine how to pack the products and how many go into one box idealy or something like that.
What kind of algorithms could be used for a problem like this and what are my options to program them? I was thinking of using Matlab, but would also be open for other possible options. I want to program it, not simply use an existing program like SPSS.
Thanks in advance and forgive me if my english is not the best, I'm not a native speaker.
The following C++ program will find optimal solutions for small instances. For 10 input box sizes, each having dimensions randomly chosen in the range 1..100, and for any number 1..10 of box sizes to choose, it computes the answer in a couple of seconds on my computer. For 15 input box sizes, it takes around 10s. For 20 input box sizes, I could compute up to 4 chosen box sizes in about 3 minutes, with memory becoming an issue (it used around 3GB). I had to increase the linker's default stack size to avoid stack overflows.
#include <iostream>
#include <algorithm>
#include <vector>
#include <array>
#include <map>
#include <set>
#include <functional>
#include <climits>
using namespace std;
ostream& operator<<(ostream& os, array<int, 3> a) {
return os << '(' << a[0] << ", " << a[1] << ", " << a[2] << ')';
}
template <int N>
long long vol(array<int, N> b) {
return static_cast<long long>(b[0]) * b[1] * b[2];
}
template <int N, int M>
bool fits(array<int, N> a, array<int, M> b) {
return a[0] <= b[0] && a[1] <= b[1] && a[2] <= b[2];
}
// Compares first by volume, then lexicographically.
struct CompareByVolumeDesc {
bool operator()(array<int, 3> a, array<int, 3> b) const {
return vol(a) > vol(b) || vol(a) == vol(b) && a < b;
}
};
vector<array<int, 3>> candSizes;
struct State {
vector<array<int, 4>> req;
int n;
int k;
// Needed for map<>
bool operator<(State const& other) const {
return make_tuple(n, k, req) < make_tuple(other.n, other.k, other.req);
}
} dummy = { {}, -1, -1 };
map<State, pair<int, State>> memo;
// Compute the minimum volume required for the given list of box sizes if we use exactly k of the first n candidate box sizes.
pair<long long, State> solve(State const& s) {
if (empty(s.req)) return { 0, dummy };
if (s.k == 0 || s.k > s.n) return { LLONG_MAX / 4, dummy };
auto previousAnswer = memo.find(s);
if (previousAnswer != end(memo)) return (*previousAnswer).second;
// Try using the nth candidate box size.
int nFitting = 0;
vector<array<int, 4>> notFitting;
for (auto r : s.req) {
if (fits(r, candSizes[s.n - 1])) {
nFitting += r[3];
} else {
notFitting.push_back(r);
}
}
pair<long long, State> solution;
solution.second = { s.req, s.n - 1, s.k };
solution.first = solve(solution.second).first;
if (nFitting > 0) {
State useNth = { notFitting, s.n - 1, s.k - 1 };
long long useNthVol = nFitting * vol(candSizes[s.n - 1]) + solve(useNth).first;
if (useNthVol < solution.first) solution = { useNthVol, useNth };
}
memo[s] = solution;
return solution;
}
void printOptimalSolution(State s) {
while (!empty(s.req)) {
State next = solve(s).second;
if (next.k < s.k) cout << candSizes[s.n - 1] << endl;
s = next;
}
}
int main(int argc, char** argv) {
int n, k;
cin >> n >> k;
vector<array<int, 4>> requestedBoxSizes;
set<int> lengths, widths, heights;
for (int i = 0; i < n; ++i) {
array<int, 4> d; // d[3] is actually the number of requests for this box size
cin >> d[0] >> d[1] >> d[2] >> d[3];
sort(begin(d), begin(d) + 3, std::greater<int>());
requestedBoxSizes.push_back(d);
lengths.insert(d[0]);
widths.insert(d[1]);
heights.insert(d[2]);
}
// Generate all candidate box sizes
for (int l : lengths) {
for (int w : widths) {
for (int h : heights) {
array<int, 3> cand = { l, w, h };
sort(begin(cand), end(cand), std::greater<int>());
candSizes.push_back(cand);
}
}
}
sort(begin(candSizes), end(candSizes), CompareByVolumeDesc());
candSizes.erase(unique(begin(candSizes), end(candSizes)), end(candSizes));
cout << "Number of candidate box sizes: " << size(candSizes) << endl;
State startState = { requestedBoxSizes, static_cast<int>(size(candSizes)), k };
long long minVolume = solve(startState).first;
cout << "Minimum achievable volume using " << k << " box sizes: " << minVolume << endl;
cout << "Optimal set of " << k << " box sizes:" << endl;
printOptimalSolution(startState);
return 0;
}
Example input:
15 5
100 61 35 27
17 89 96 47
31 69 30 55
37 23 39 9
94 11 48 19
38 17 29 36
63 79 80 36
59 52 37 51
86 63 54 7
32 30 11 26
50 88 51 5
74 70 33 14
67 46 4 79
83 94 89 58
65 42 37 69
Example output:
Number of candidate box sizes: 2310
Minimum achievable volume using 5 box sizes: 124069460
Optimal set of 5 box sizes:
(94, 48, 11)
(69, 52, 37)
(100, 89, 35)
(88, 79, 63)
(94, 89, 83)
I'll explain the algorithm behind this if there's interest. It's better than considering all possible combinations of k candidate box sizes, but not terribly efficient.

C/C++ rand() function for biased expectation

I am using <stdlib.h> rand() function to generate 100 random integers within range [0 ... 9]. I used the following way to generate them on equal distribution,
int random_numbers[100];
for(register int i = 0; i < 100; i++){
random_numbers[i] = rand() % 10;
}
This is working fine. But now I want to get 100 numbers where I want around 50% of those numbers to be 5. How do I do that?
Extended Problem
I want to get 100 numbers. What if I want 50% of those number will be between 0~2. I mean 50 percent of those number will consists only with number 0, 1, 2. How to do that?
I am expecting generalised steps which can be applied beyond the boundary of 10 or 100.
Hmmm, how about choosing a random number between 0 and 17, and if the number is greater than 9, change it to 5?
For 0 - 17, you would get a distribution like
0,1,2,3,4,5,6,7,8,9,5,5,5,5,5,5,5,5
Code:
int random_numbers[100];
for(register int i = 0; i < 100; i++){
random_numbers[i] = rand() % 18;
if (random_numbers[i] > 9) {
random_numbers[i] = 5;
}
}
You basically add a set of numbers beyond your desired range that, when translated to 5 give you equal numbers of 5 and non-5.
In order to get around 50% of these numbers to be in [0, 2] range you can split the full range of rand() into two equal halves and then use the same %-based technique to map the first half to [0, 2] range and the second half to [3, 9] range.
int random_numbers[100];
for(int i = 0; i < 100; i++)
{
int r = rand();
random_numbers[i] = r <= RAND_MAX / 2 ? r % 3 : r % 7 + 3;
}
To to get around 50% of these numbers to be 5 a similar technique will work. Just map the second half to [0, 9] range with 5 excluded
int random_numbers[100];
for(int i = 0; i < 100; i++)
{
int r = rand();
if (r <= RAND_MAX / 2)
r = 5;
else if ((r %= 9) >= 5)
++r;
random_numbers[i] = r;
}
I think it is easy to solve the particular problem of 50% using the techniques mentioned by other answers. Let us try to answer the question for a general case -
Let us say you want a distribution where you want the numbers {A1, A2, .. An} with the percentages {P1, P2, .. Pn} and sum of Pi is 100% (and all the percentages are integers, if not it can be adjusted).
We will create an array of 100 size and fill it with the numbers A1-An.
int distribution[100];
Now we fill each number, it's percentage number of times.
int postion = 0;
for (int i = 0; i < n; i++) {
for( int j = 0; j < P[i]; j++) {
// Add a check here to make sure the sum hasn't crossed 100
distribution[position] = A[i];
position ++;
}
}
Now that this initialization is done once, you can draw a random number as -
int number = distribution[rand() % 100];
In case your percentages are not integers but say you want precision of 0.1%, you can create an array of 1000 instead of 100.
In both case, the goal is 50% selected from one set and 50% from another. Code could call rand() and uses some bits (one) for choosing the group and the remaining bits for value selection.
If the range of numbers needed is much smaller than RAND_MAX, a first attempt could use:
int rand_special_50percent(int n, int special) {
int r = rand();
int r_div_2 = r/2;
if (r%2) {
return special;
}
int y = r_div_2%(n-1); // 9 numbers left
if (y >= special) y++;
return y;
}
int rand_low_50percent(int n, int low_special) {
int r = rand();
int r_div_2 = r/2;
if (r%2) {
return r_div_2%(low_special+1);
}
return r_div_2%(n - low_special) + low_special + 1;
}
Sample
int r5 = rand_special_50percent(10, 5);
int preferred_low_value_max = 2;
int r012 = rand_low_50percent(10, preferred_low_value_max);
Advanced:
With n above RAND_MAX/2, additional calls to rand() are needed.
When using rand()%n, unless (RAND_MAX+1u)%n == 0 (n is a divisor of RAND_MAX+1), a bias is introduced. The above code does not compensate for that.
C++11 solution (not optimal but easy)
std::piecewise_constant_distribution can generate random real numbers (float or double) for given intervals and weights for the each interval.
Not optimal because this solution is generating double and converting double to int. Also getting exactly 50 from [0,3) 100 samples is not guaranteed but for around 50 samples is guaranteed.
For your case : 2 intervals - [0,3), [3,100) and their weights [1,1]
Equal weights, so ~50% of the numbers from [0,3) and ~50% from [3,100)
#include <iostream>
#include <string>
#include <map>
#include <random>
int main()
{
std::random_device rd;
std::mt19937 gen(rd());
std::vector<double> intervals{0, 3, 3, 100};
std::vector<double> weights{ 1, 0, 1};
std::piecewise_constant_distribution<> d(intervals.begin(), intervals.end(), weights.begin());
std::map<int, int> hist;
for(int n=0; n<100; ++n) {
++hist[(int)d(gen)];
}
for(auto p : hist) {
std::cout << p.first << " : generated " << p.second << " times"<< '\n';
}
}
Output:
0 : generated 22 times
1 : generated 19 times
2 : generated 16 times
4 : generated 1 times
5 : generated 2 times
8 : generated 1 times
12 : generated 1 times
17 : generated 1 times
19 : generated 1 times
22 : generated 2 times
23 : generated 1 times
25 : generated 1 times
29 : generated 1 times
30 : generated 2 times
31 : generated 1 times
36 : generated 1 times
38 : generated 1 times
44 : generated 1 times
45 : generated 1 times
48 : generated 1 times
49 : generated 1 times
51 : generated 1 times
52 : generated 1 times
53 : generated 1 times
57 : generated 2 times
58 : generated 3 times
62 : generated 1 times
65 : generated 2 times
68 : generated 1 times
71 : generated 1 times
76 : generated 2 times
77 : generated 1 times
85 : generated 1 times
90 : generated 1 times
94 : generated 1 times
95 : generated 1 times
96 : generated 2 times

CUDA's Mersenne Twister for an arbitrary number of threads

CUDA's implementation of the Mersenne Twister (MT) random number generator is limited to a maximal number of threads/blocks of 256 and 200 blocks/grid, i.e. the maximal number of threads is 51200.
Therefore, it is not possible to launch the kernel that uses the MT with
kernel<<<blocksPerGrid, threadsPerBlock>>>(devMTGPStates, ...)
where
int blocksPerGrid = (n+threadsPerBlock-1)/threadsPerBlock;
and n is the total number of threads.
What is the best way to use the MT for threads > 51200?
My approach if to use constant values for blocksPerGrid and threadsPerBlock, e.g. <<<128,128>>> and use the following in the kernel code:
__global__ void kernel(curandStateMtgp32 *state, int n, ...) {
int id = threadIdx.x+blockIdx.x*blockDim.x;
while (id < n) {
float x = curand_normal(&state[blockIdx.x]);
/* some more calls to curand_normal() followed
by the algorithm that works with the data */
id += blockDim.x*gridDim.x;
}
}
I am not sure if this is the correct way or if it can influence the MT status in an undesired way?
Thank you.
I suggest you read the CURAND documentation carefully and thoroughly.
The MT API will be most efficient when using 256 threads per block with up to 64 blocks to generate numbers.
If you need more than that, you have a variety of options:
simply generate more numbers from the existing state - set (i.e. 64
blocks, 256 threads), and distribute these numbers amongst the
threads that need them.
Use more than a single state per block (but this does not allow you to exceed the overall limit within a state-set, it just addresses the need for a single block.)
Create multiple MT generators with independent seeds (and therefore independent state-sets).
Generally, I don't see a problem with the kernel that you've outlined, and it's roughly in line with choice 1 above. However it does not allow you to exceed 51200 threads. (your example has <<<128, 128>>> so 16384 threads)
Following Robert's answer, below I'm providing a fully worked example on using cuRAND's Mersenne Twister for an arbitrary number of threads. I'm using Robert's first option to generate more numbers from the existing state-set and distributing these numbers amongst the threads that need them.
// --- Generate random numbers with cuRAND's Mersenne Twister
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <cuda.h>
#include <curand_kernel.h>
/* include MTGP host helper functions */
#include <curand_mtgp32_host.h>
#define BLOCKSIZE 256
#define GRIDSIZE 64
/*******************/
/* GPU ERROR CHECK */
/*******************/
#define gpuErrchk(x) do { if((x) != cudaSuccess) { \
printf("Error at %s:%d\n",__FILE__,__LINE__); \
return EXIT_FAILURE;}} while(0)
#define CURAND_CALL(x) do { if((x) != CURAND_STATUS_SUCCESS) { \
printf("Error at %s:%d\n",__FILE__,__LINE__); \
return EXIT_FAILURE;}} while(0)
/*******************/
/* iDivUp FUNCTION */
/*******************/
__host__ __device__ int iDivUp(int a, int b) { return ((a % b) != 0) ? (a / b + 1) : (a / b); }
/*********************/
/* GENERATION KERNEL */
/*********************/
__global__ void generate_kernel(curandStateMtgp32 * __restrict__ state, float * __restrict__ result, const int N)
{
int tid = threadIdx.x + blockIdx.x * blockDim.x;
for (int k = tid; k < N; k += blockDim.x * gridDim.x)
result[k] = curand_uniform(&state[blockIdx.x]);
}
/********/
/* MAIN */
/********/
int main()
{
const int N = 217 * 123;
// --- Allocate space for results on host
float *hostResults = (float *)malloc(N * sizeof(float));
// --- Allocate and initialize space for results on device
float *devResults; gpuErrchk(cudaMalloc(&devResults, N * sizeof(float)));
gpuErrchk(cudaMemset(devResults, 0, N * sizeof(float)));
// --- Setup the pseudorandom number generator
curandStateMtgp32 *devMTGPStates; gpuErrchk(cudaMalloc(&devMTGPStates, GRIDSIZE * sizeof(curandStateMtgp32)));
mtgp32_kernel_params *devKernelParams; gpuErrchk(cudaMalloc(&devKernelParams, sizeof(mtgp32_kernel_params)));
CURAND_CALL(curandMakeMTGP32Constants(mtgp32dc_params_fast_11213, devKernelParams));
//CURAND_CALL(curandMakeMTGP32KernelState(devMTGPStates, mtgp32dc_params_fast_11213, devKernelParams, GRIDSIZE, 1234));
CURAND_CALL(curandMakeMTGP32KernelState(devMTGPStates, mtgp32dc_params_fast_11213, devKernelParams, GRIDSIZE, time(NULL)));
// --- Generate pseudo-random sequence and copy to the host
generate_kernel << <GRIDSIZE, BLOCKSIZE >> >(devMTGPStates, devResults, N);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
gpuErrchk(cudaMemcpy(hostResults, devResults, N * sizeof(float), cudaMemcpyDeviceToHost));
// --- Print results
//for (int i = 0; i < N; i++) {
for (int i = 0; i < 10; i++) {
printf("%f\n", hostResults[i]);
}
// --- Cleanup
gpuErrchk(cudaFree(devMTGPStates));
gpuErrchk(cudaFree(devResults));
free(hostResults);
return 0;
}

Make CURAND generate different random numbers from a uniform distribution

I am trying to use CURAND library to generate random numbers which are completely independent of each other from 0 to 100. Hence I am giving time as seed to each thread and specifying the "id = threadIdx.x + blockDim.x * blockIdx.x" as sequence and offset .
Then after getting the random number as float, I multiply it by 100 and take its integer value.
Now, the problem I am facing is that its getting the same random number for the thread [0,0] and [0,1], no matter how many times I run the code which is 11. I am unable to understand what am I doing wrong. Please help.
I am pasting my code below:
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include<curand_kernel.h>
#include "util/cuPrintf.cu"
#include<time.h>
#define NE WA*HA //Total number of random numbers
#define WA 2 // Matrix A width
#define HA 2 // Matrix A height
#define SAMPLE 100 //Sample number
#define BLOCK_SIZE 2 //Block size
__global__ void setup_kernel ( curandState * state, unsigned long seed )
{
int id = threadIdx.x + blockIdx.x + blockDim.x;
curand_init ( seed, id , id, &state[id] );
}
__global__ void generate( curandState* globalState, float* randomMatrix )
{
int ind = threadIdx.x + blockIdx.x * blockDim.x;
if(ind < NE){
curandState localState = globalState[ind];
float stopId = curand_uniform(&localState) * SAMPLE;
cuPrintf("Float random value is : %f",stopId);
int stop = stopId ;
cuPrintf("Random number %d\n",stop);
for(int i = 0; i < SAMPLE; i++){
if(i == stop){
float random = curand_normal( &localState );
cuPrintf("Random Value %f\t",random);
randomMatrix[ind] = random;
break;
}
}
globalState[ind] = localState;
}
}
/////////////////////////////////////////////////////////
// Program main
/////////////////////////////////////////////////////////
int main(int argc, char** argv)
{
// 1. allocate host memory for matrix A
unsigned int size_A = WA * HA;
unsigned int mem_size_A = sizeof(float) * size_A;
float* h_A = (float* ) malloc(mem_size_A);
time_t t;
// 2. allocate device memory
float* d_A;
cudaMalloc((void**) &d_A, mem_size_A);
// 3. create random states
curandState* devStates;
cudaMalloc ( &devStates, size_A*sizeof( curandState ) );
// 4. setup seeds
int n_blocks = size_A/BLOCK_SIZE;
time(&t);
printf("\nTime is : %u\n",(unsigned long) t);
setup_kernel <<< n_blocks, BLOCK_SIZE >>> ( devStates, (unsigned long) t );
// 4. generate random numbers
cudaPrintfInit();
generate <<< n_blocks, BLOCK_SIZE >>> ( devStates,d_A );
cudaPrintfDisplay(stdout, true);
cudaPrintfEnd();
// 5. copy result from device to host
cudaMemcpy(h_A, d_A, mem_size_A, cudaMemcpyDeviceToHost);
// 6. print out the results
printf("\n\nMatrix A (Results)\n");
for(int i = 0; i < size_A; i++)
{
printf("%f ", h_A[i]);
if(((i + 1) % WA) == 0)
printf("\n");
}
printf("\n");
// 7. clean up memory
free(h_A);
cudaFree(d_A);
}
Output that I get is :
Time is : 1347857063
[0, 0]: Float random value is : 11.675105[0, 0]: Random number 11
[0, 0]: Random Value 0.358356 [0, 1]: Float random value is : 11.675105[0, 1]: Random number 11
[0, 1]: Random Value 0.358356 [1, 0]: Float random value is : 63.840496[1, 0]: Random number 63
[1, 0]: Random Value 0.696459 [1, 1]: Float random value is : 44.712799[1, 1]: Random number 44
[1, 1]: Random Value 0.735049
There are a few things wrong here, I'm addressing the first ones here to get you started:
General points
Please check the return values of all CUDA API calls, see here for more info.
Please run cuda-memcheck to check for obvious things like out-of-bounds accesses.
Specific points
When allocating space for the RNG state, you should have space for one state per thread (not one per matrix element as you have now).
Your thread ID calculation in setup_kernel() is wrong, should be threadIdx.x + blockIdx.x * blockDim.x (* instead of +).
You use the thread ID as the sequence number as well as the offset, you should just set the offset to zero as described in the cuRAND manual:
For the highest quality parallel pseudorandom number generation, each
experiment should be assigned a unique seed. Within an experiment,
each thread of computation should be assigned a unique sequence
number.
Finally you're running two threads per block, that's incredibly inefficient. Check out the CUDA C Programming Guide, in the "maximize utilization" section for more information, but you should be looking to launch a multiple of 32 threads per block (e.g. 128, 256) and a large number of blocks (e.g. tens of thousands). If you're problem is small then consider running multiple problems at once (either batched in a single kernel launch or as kernels in different streams to get concurrent execution).

Very basic radix sort

I just wrote a simple iterative radix sort and I'm wondering if I have the right idea.
Recursive implementations seem to be much more common.
I am sorting 4-byte integers (unsigned to keep it simple).
I am using 1-byte as the 'digit'. So I have 2^8=256 buckets.
I am sorting the most significant digit (MSD) first.
After each sort I put them back into array in the order they exist in buckets and then perform the next sort.
So I end up doing 4 bucket sorts.
It seems to work for a small set of data. Since I am doing it MSD I'm guessing that's not stable and may fail with different data.
Did I miss anything major?
#include <iostream>
#include <vector>
#include <list>
using namespace std;
void radix(vector<unsigned>&);
void print(const vector<list<unsigned> >& listBuckets);
unsigned getMaxForBytes(unsigned bytes);
void merge(vector<unsigned>& data, vector<list<unsigned> >& listBuckets);
int main()
{
unsigned d[] = {5,3,6,9,2,11,9, 65534, 4,10,17,13, 268435455, 4294967294,4294967293, 268435454,65537};
vector<unsigned> v(d,d+17);
radix(v);
return 0;
}
void radix(vector<unsigned>& data)
{
int bytes = 1; // How many bytes to compare at a time
unsigned numOfBuckets = getMaxForBytes(bytes) + 1;
cout << "Numbuckets" << numOfBuckets << endl;
int chunks = sizeof(unsigned) / bytes;
for(int i = chunks - 1; i >= 0; --i)
{
vector<list<unsigned> > buckets; // lazy, wasteful allocation
buckets.resize(numOfBuckets);
unsigned mask = getMaxForBytes(bytes);
unsigned shift = i * bytes * 8;
mask = mask << shift;
for(unsigned j = 0; j < data.size(); ++j)
{
unsigned bucket = data[j] & mask; // isolate bits of current chunk
bucket = bucket >> shift; // bring bits down to least significant
buckets[bucket].push_back(data[j]);
}
print(buckets);
merge(data,buckets);
}
}
unsigned getMaxForBytes(unsigned bytes)
{
unsigned max = 0;
for(unsigned i = 1; i <= bytes; ++i)
{
max = max << 8;
max |= 0xFF;
}
return max;
}
void merge(vector<unsigned>& data, vector<list<unsigned> >& listBuckets)
{
int index = 0;
for(unsigned i = 0; i < listBuckets.size(); ++i)
{
list<unsigned>& list = listBuckets[i];
std::list<unsigned>::const_iterator it = list.begin();
for(; it != list.end(); ++it)
{
data[index] = *it;
++index;
}
}
}
void print(const vector<list<unsigned> >& listBuckets)
{
cout << "Printing listBuckets: " << endl;
for(unsigned i = 0; i < listBuckets.size(); ++i)
{
const list<unsigned>& list = listBuckets[i];
if(list.size() == 0) continue;
std::list<unsigned>::const_iterator it = list.begin(); // Why do I need std here!?
for(; it != list.end(); ++it)
{
cout << *it << ", ";
}
cout << endl;
}
}
Update:
Seems to work well in LSD form which it can be modified by changing the the chunk loop in radix as follows:
for(int i = chunks - 1; i >= 0; --i)
Let's look at en example with two-digit decimal numbers:
49, 25, 19, 27, 87, 67, 22, 90, 47, 91
Sorting by the first digit yields
19, 25, 27, 22, 49, 47, 67, 87, 90, 91
Next, you sort by the second digit, yielding
90, 91, 22, 25, 27, 47, 67, 87, 19, 49
Seems wrong, doesn't it? Or isn't this what you are doing? Maybe you can show us the code if I got you wrong.
If you are doing the second bucket sort on all groups with the same first digit(s), your algorithm would be equivalent to the recursive version. It would be stable as well. The only difference is that you'd do the bucket sorts breadth-first instead of depth-first.
You also need to make sure you Sort every bucket from MSD to LSD before reassembling.
Example:
19,76,90,34,84,12,72,38
Sort into 10 buckets [0-9] on MSD
B0=[];B1=[19,12];B2=[];B3=[34,38];B4=[];B5=[];B6=[];B7=[76,72];B8=[84];B9=[90];
if you were to reassemble and then sort again it would not work. Instead recursively sort each bucket.
B1 is sorted into B1B2=[12];B1B9=[19]
Once all have been sorted you can reassemble correctly.

Resources