CUDA - Multiple Threads - random

I am trying to make an LCG Random Number Generator run in parallel using CUDA & GPU's. However, I am having trouble actually getting multiple threads running at the same time.Here is a copy of the code:
#include <iostream>
#include <math.h>
__global__ void rng(long *cont)
{
int a=9, c=3, F, X=1;
long M=524288, Y;
printf("\nKernel X is %d\n", X[0]);
F=X;
Y=X;
printf("Kernel F is %d\nKernel Y is %d\n", F, Y);
Y=(a*Y+c)%M;
printf("%ld\t", Y);
while(Y!=F)
{
Y=(a*Y+c)%M;
printf("%ld\t", Y);
cont[0]++;
}
}
int main()
{
long cont[1]={1};
int X[1];
long *dev_cont;
int *dev_X;
cudaEvent_t beginEvent;
cudaEvent_t endEvent;
cudaEventCreate( &beginEvent );
cudaEventCreate( &endEvent );
printf("Please give the value of the seed X ");
scanf("%d", &X[0]);
printf("Host X is: %d", *X);
cudaEventRecord( beginEvent, 0);
cudaMalloc( (void**)&dev_cont, sizeof(long) );
cudaMalloc( (void**)&dev_X, sizeof(int) );
cudaMemcpy(dev_cont, cont, 1 * sizeof(long), cudaMemcpyHostToDevice);
cudaMemcpy(dev_X, X, 1 * sizeof(int), cudaMemcpyHostToDevice);
rng<<<1,1>>>(dev_cont);
cudaMemcpy(cont, dev_cont, 1 * sizeof(long), cudaMemcpyDeviceToHost);
cudaEventRecord( endEvent, 0);
cudaEventSynchronize (endEvent );
float timevalue;
cudaEventElapsedTime (&timevalue, beginEvent, endEvent);
printf("\n\nYou generated a total of %ld numbers", cont[0]);
printf("\nCUDA Kernel Time: %.2f ms\n", timevalue);
cudaFree(dev_cont);
cudaFree(dev_X);
cudaEventDestroy( endEvent );
cudaEventDestroy( beginEvent );
return 0;
}
Right now I am only sending one block with one thread. However, if I send 100 threads, the only thing that will happen is that it will produce the same number 100 times and then proceed to the next number. In theory this is what is meant to be expected but it automatically disregards the purpose of "random numbers" when a number is repeated.
The idea I want to implement is to have multiple threads. One thread will use that formula:
Y=(a*Y+c)%M but using an initial value of Y=1, then another thread will use the same formula but with an initial value of Y=1000, etc etc. However, once the first thread produces 1000 numbers, it needs to stop making more calculations because if it continues it will interfere with the second thread producing numbers with a value of Y=1000.
If anyone can point in the right direction, at least in the way of creating multiple threads with different functions or instructions inside of them, to run in parallel, I will try to figure out the rest.
Thanks!
UPDATE: July 31, 8:14PM EST
I updated my code to the following. Basically I am trying to produce 256 random numbers. I created the array where those 256 numbers will be stored. I also created an array with 10 different seed values for the values of Y in the threads. I also changed the code to request 10 threads in the device. I am also saving the numbers that are generated in an array. The code is not working correctly as it should. Please advise on how to fix it or how to make it achieve what I want.
Thanks!
#include <iostream>
#include <math.h>
__global__ void rng(long *cont, int *L, int *N)
{
int Y=threadIdx.x;
Y=N[threadIdx.x];
int a=9, c=3, i;
long M=256;
for(i=0;i<256;i++)
{
Y=(a*Y+c)%M;
N[i]=Y;
cont[0]++;
}
}
int main()
{
long cont[1]={1};
int i;
int L[10]={1,25,50,75,100,125,150,175,200,225}, N[256];
long *dev_cont;
int *dev_L, *dev_N;
cudaEvent_t beginEvent;
cudaEvent_t endEvent;
cudaEventCreate( &beginEvent );
cudaEventCreate( &endEvent );
cudaEventRecord( beginEvent, 0);
cudaMalloc( (void**)&dev_cont, sizeof(long) );
cudaMalloc( (void**)&dev_L, sizeof(int) );
cudaMalloc( (void**)&dev_N, sizeof(int) );
cudaMemcpy(dev_cont, cont, 1 * sizeof(long), cudaMemcpyHostToDevice);
cudaMemcpy(dev_L, L, 10 * sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(dev_N, N, 256 * sizeof(int), cudaMemcpyHostToDevice);
rng<<<1,10>>>(dev_cont, dev_L, dev_N);
cudaMemcpy(cont, dev_cont, 1 * sizeof(long), cudaMemcpyDeviceToHost);
cudaMemcpy(N, dev_N, 256 * sizeof(int), cudaMemcpyDeviceToHost);
cudaEventRecord( endEvent, 0);
cudaEventSynchronize (endEvent );
float timevalue;
cudaEventElapsedTime (&timevalue, beginEvent, endEvent);
printf("\n\nYou generated a total of %ld numbers", cont[0]);
printf("\nCUDA Kernel Time: %.2f ms\n", timevalue);
printf("Your numbers are:");
for(i=0;i<256;i++)
{
printf("%d\t", N[i]);
}
cudaFree(dev_cont);
cudaFree(dev_L);
cudaFree(dev_N);
cudaEventDestroy( endEvent );
cudaEventDestroy( beginEvent );
return 0;
}
#Bardia - Please let me know how I can change my code to accommodate my needs.
UPDATE: August 1, 5:39PM EST
I edited my code to accommodate #Bardia's modifications to the Kernel code. However a few errors in the generation of numbers are coming out. First, the counter that I created in the kernel to count the amount of numbers that are being created, is not working. At the end it only displays that "1" number was generated. The Timer that I created to measure the time it takes for the kernel to execute the instructions is also not working because it keeps displaying 0.00 ms. And based on the parameters that I have set for the formula, the numbers that are being generated and copied into the array and then printed on the screen do not reflect the numbers that are meant to appear (or even close). These all used to work before.
Here is the new code:
#include <iostream>
#include <math.h>
__global__ void rng(long *cont, int *L, int *N)
{
int Y=threadIdx.x;
Y=L[threadIdx.x];
int a=9, c=3, i;
long M=256;
int length=ceil((float)M/10); //256 divided by the number of threads.
for(i=(threadIdx.x*length);i<length;i++)
{
Y=(a*Y+c)%M;
N[i]=Y;
cont[0]++;
}
}
int main()
{
long cont[1]={1};
int i;
int L[10]={1,25,50,75,100,125,150,175,200,225}, N[256];
long *dev_cont;
int *dev_L, *dev_N;
cudaEvent_t beginEvent;
cudaEvent_t endEvent;
cudaEventCreate( &beginEvent );
cudaEventCreate( &endEvent );
cudaEventRecord( beginEvent, 0);
cudaMalloc( (void**)&dev_cont, sizeof(long) );
cudaMalloc( (void**)&dev_L, sizeof(int) );
cudaMalloc( (void**)&dev_N, sizeof(int) );
cudaMemcpy(dev_cont, cont, 1 * sizeof(long), cudaMemcpyHostToDevice);
cudaMemcpy(dev_L, L, 10 * sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(dev_N, N, 256 * sizeof(int), cudaMemcpyHostToDevice);
rng<<<1,10>>>(dev_cont, dev_L, dev_N);
cudaMemcpy(cont, dev_cont, 1 * sizeof(long), cudaMemcpyDeviceToHost);
cudaMemcpy(N, dev_N, 256 * sizeof(int), cudaMemcpyDeviceToHost);
cudaEventRecord( endEvent, 0);
cudaEventSynchronize (endEvent );
float timevalue;
cudaEventElapsedTime (&timevalue, beginEvent, endEvent);
printf("\n\nYou generated a total of %ld numbers", cont[0]);
printf("\nCUDA Kernel Time: %.2f ms\n", timevalue);
printf("Your numbers are:");
for(i=0;i<256;i++)
{
printf("%d\t", N[i]);
}
cudaFree(dev_cont);
cudaFree(dev_L);
cudaFree(dev_N);
cudaEventDestroy( endEvent );
cudaEventDestroy( beginEvent );
return 0;
}
This is the output I receive:
[wigberto#client2 CUDA]$ ./RNG8
You generated a total of 1 numbers
CUDA Kernel Time: 0.00 ms
Your numbers are:614350480 32767 1132936976 11079 2 0 10 0 1293351837 0 -161443660 48 0 0 614350336 32767 1293351836 0 -161444681 48 614350760 32767 1132936976 11079 2 0 10 0 1057178751 0 -161443660 48 155289096 49 614350416 32767 1057178750 0 614350816 32767 614350840 32767 155210544 49 0 0 1132937352 11079 1130370784 11079 1130382061 11079 155289096 49 1130376992 11079 0 1 1610 1 1 1 1130370408 11079 614350896 32767 614350816 32767 1057178751 0 614350840 32767 0 0 -161443150 48 0 0 1132937352 11079 1 11079 0 0 1 0 614351008 32767 614351032 32767 0 0 0 0 0 0 1130369536 1 1132937352 11079 1130370400 11079 614350944 32767 1130369536 11079 1130382061 11079 1130370784 11079 1130365792 11079 6143510880 614351008 32767 -920274837 0 614351032 32767 0 0 -161443150 48 0 0 0 0 1 0 128 0-153802168 48 614350896 32767 1132839104 11079 97 0 88 0 1 0 155249184 49 1130370784 11079 0 0-1 0 1130364928 11079 2464624 0 4198536 0 4198536 0 4197546 0 372297808 0 1130373120 11079 -161427611 48 111079 0 0 1 0 -153802272 48 155249184 49 372297840 0 -1 0 -161404446 48 0 0 0 0372298000 0 372297896 0 372297984 0 0 0 0 0 1130369536 11079 84 0 1130471067 11079 6303744 0614351656 32767 0 0 -1 0 4198536 0 4198536 0 4197546 0 1130397880 11079 0 0 0 0 0 0 00 0 0 -161404446 48 0 0 4198536 0 4198536 0 6303744 0 614351280 32767 6303744 0 614351656 32767 614351640 32767 1 0 4197371 0 0 0 0 0 [wigberto#client2 CUDA]$
#Bardia - Please advise on what is the best thing to do here.
Thanks!

You can address threads within a block by threadIdx variable.
ie., in your case you should probably set
Y = threadIdx.x and then use Y=(a*Y+c)%M
But in general implementing a good RNG on CUDA could be really difficult.
So I don't know if you want to implement your own generator just for practice..
Otherwise there is a CURAND library available which provides a number of pseudo- and quasi-random generators, ie. XORWOW, MersenneTwister, Sobol etc.

It should do the same work in all threads, because you want them to do the same work. You should always distinguish threads from each other with addressing them.
For example you should say thread #1 you do this job and save you work here and thread #2 you do that job and save your work there and then go to Host and use that data.
For a two dimensional block grid with two dimension threads in each block I use this code for addressing:
int X = blockIdx.x*blockDim.x+threadIdx.x;
int Y = blockIdx.y*blockDim.y+threadIdx.y;
The X and Y in the code above are the global address of your thread (I think for your a one dimensional grid and thread is sufficient).
Also remember that you can not use the printf function on the kernel. The GPUs can't make any interrupt. For this you can use cuPrintf function which is one of CUDA SDK's samples, but read it's instructions to use it correctly.

This answer relates to the edited part of the question.
I didn't notice that it is a recursive Algorithm and unfortunately I don't know how to parallelize a recursive algorithm.
My only idea for generating these 256 number is to generate them separately. i.e. generate 26 of them in the first thread, 26 of them on the second thread and so on.
This code will do this (this is only kernel part):
#include <iostream>
#include <math.h>
__global__ void rng(long *cont, int *L, int *N)
{
int Y=threadIdx.x;
Y=L[threadIdx.x];
int a=9, c=3, i;
long M=256;
int length=ceil((float)M/10); //256 divided by the number of threads.
for(i=(threadIdx.x*length);i<length;i++)
{
Y=(a*Y+c)%M;
N[i]=Y;
cont[0]++;
}
}

Related

CUDA kernel: performance drops by 10x when increased loop count by 10%

I have a simple CUDA kernel to test loop unrolling, then discovered another thing: when the loop count is 10, kernel takes 34 milliseconds to perform, when the loop count is 90, it takes 59 milliseconds, but when the loop count is 100, the time it takes is 423 milliseconds!
Launch configuration is the same, only loop count changed.
So, my question is, what could be the reason for this performance drop?
Here is the code, input is an array of 128x1024x1024 elements, and I'm using PyCUDA:
__global__ void copy(float *input, float *output) {
int tidx = blockIdx.y * blockDim.x + threadIdx.x;
int stride = 1024 * 1024;
for (int i = 0; i < 128; i++) {
int idx = i * stride + tidx;
float x = input[idx];
float y = 0;
for (int j = 0; j < 100; j += 10) {
x = x + sqrt(float(j));
y = sqrt(abs(x)) + sin(x) + cos(x);
x = x + sqrt(float(j+1));
y = sqrt(abs(x)) + sin(x) + cos(x);
x = x + sqrt(float(j+2));
y = sqrt(abs(x)) + sin(x) + cos(x);
x = x + sqrt(float(j+3));
y = sqrt(abs(x)) + sin(x) + cos(x);
x = x + sqrt(float(j+4));
y = sqrt(abs(x)) + sin(x) + cos(x);
x = x + sqrt(float(j+5));
y = sqrt(abs(x)) + sin(x) + cos(x);
x = x + sqrt(float(j+6));
y = sqrt(abs(x)) + sin(x) + cos(x);
x = x + sqrt(float(j+7));
y = sqrt(abs(x)) + sin(x) + cos(x);
x = x + sqrt(float(j+8));
y = sqrt(abs(x)) + sin(x) + cos(x);
x = x + sqrt(float(j+9));
y = sqrt(abs(x)) + sin(x) + cos(x);
}
output[idx] = y;
}
}
The loop count I mentioned is this line:
for (int j = 0; j < 100; j += 10)
And sample outputs here:
10 loops
griddimx: 1 griddimy: 1024 griddimz: 1
blockdimx: 1024 blockdimy: 1 blockdimz: 1
nthreads: 1048576 blocks: 1024
prefetch.py:82: UserWarning: The CUDA compiler succeeded, but said the following:
ptxas info : 0 bytes gmem, 24 bytes cmem[3]
ptxas info : Compiling entry function 'copy' for 'sm_61'
ptxas info : Function properties for copy
32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info : Used 21 registers, 336 bytes cmem[0], 52 bytes cmem[2]
computation takes 34.24 miliseconds
90 loops
griddimx: 1 griddimy: 1024 griddimz: 1
blockdimx: 1024 blockdimy: 1 blockdimz: 1
nthreads: 1048576 blocks: 1024
prefetch.py:82: UserWarning: The CUDA compiler succeeded, but said the following:
ptxas info : 0 bytes gmem, 24 bytes cmem[3]
ptxas info : Compiling entry function 'copy' for 'sm_61'
ptxas info : Function properties for copy
32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info : Used 21 registers, 336 bytes cmem[0], 52 bytes cmem[2]
computation takes 59.33 miliseconds
100 loops
griddimx: 1 griddimy: 1024 griddimz: 1
blockdimx: 1024 blockdimy: 1 blockdimz: 1
nthreads: 1048576 blocks: 1024
prefetch.py:82: UserWarning: The CUDA compiler succeeded, but said the following:
ptxas info : 0 bytes gmem, 24 bytes cmem[3]
ptxas info : Compiling entry function 'copy' for 'sm_61'
ptxas info : Function properties for copy
32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info : Used 22 registers, 336 bytes cmem[0], 52 bytes cmem[2]
computation takes 422.96 miliseconds
The problem seems to come from loop unrolling.
Indeed, the 10-loops case can be trivially unrolled by NVCC since the loop is actually always executed once (thus the for line can be removed with j set to 0).
The 90-loops case is unrolled by NVCC (there are only 9 actual iterations). The resulting code is thus much bigger but still fast since no branches are performed (GPUs hate branches). However, the 100-loops case is not unrolled by NVCC (you hit a threshold of the compiler optimizer). The resulting code is small, but it leads to more branches being executed at runtime: branching is performed for each executed loop iteration (a total of 10).
You can see the assembly code difference here.
You can force unrolling using the directive #pragma unroll. However, keep in mind that increasing the size of a code can reduce its performance.
PS: the slightly higher number of register used in the last version may decrease performance, but simulations show that it should be OK in this case.

Obtaining range of bits its from a given no

I am using the following function to extract n bits from a number. I got this function from here. For convenience I am posting it here. I would like to obtain bits from 0 to 9 and then in another statement bits 10 to 15. I am passing in 1033. I get the correct value of bits 0 to 9 but incorrect value for 10 to 15. I should get a 1 instead i am getting 1024 any suggestions ?
unsigned createMask(unsigned a, unsigned b)
{
unsigned r = 0;
for (unsigned i = a; i <= b; i++)
r |= 1 << i;
return r;
}
Now i have this
unsigned short langId = 1033 ;// 10000001001
unsigned primary = createMask(0,9) & langId; //gives 9 correct
unsigned sec = createMask(10,15) & langId; //gives 1024 incorrect should be 1
The bits of sec that you've set are still in the 10-15 bit positions. You need to shift them back towards the start. Otherwise you have a single 1 set at position 10 and 210 is giving your answer of 1024
sec >> 10
Demo

Minimize cudaDeviceSynchronize launch overhead

I'm currently doing a project with CUDA where a pipeline is refreshed with 200-10000 new events every 1ms. Each time, I want to call one(/two) kernels which compute a small list of outputs; then fed those outputs to the next element of the pipeline.
The theoretical flow is:
receive data in an std::vector
cudaMemcpy the vector to GPU
processing
generate small list of outputs
cudaMemcpy to the output std::vector
But when I'm calling cudaDeviceSynchronize on a 1block/1thread empty kernel with no processing, it already takes in average 0.7 to 1.4ms, which is already higher than my 1ms timeframe.
I could eventually change the timeframe of the pipeline in order to receive events every 5ms, but with 5x more each times. It wouldn't be ideal though.
What would be the best way to minimize the overhead of cudaDeviceSynchronize? Could streams be helpful in this situation? Or another solution to efficiently run the pipeline.
(Jetson TK1, compute capabilities 3.2)
Here's a nvprof log of the applications:
==8285== NVPROF is profiling process 8285, command: python player.py test.rec
==8285== Profiling application: python player.py test.rec
==8285== Profiling result:
Time(%) Time Calls Avg Min Max Name
94.92% 47.697ms 5005 9.5290us 1.7500us 13.083us reset_timesurface(__int64, __int64*, __int64*, __int64*, __int64*, float*, float*, bool*, bool*, Event*)
5.08% 2.5538ms 8 319.23us 99.750us 413.42us [CUDA memset]
==8285== API calls:
Time(%) Time Calls Avg Min Max Name
75.00% 5.03966s 5005 1.0069ms 25.083us 11.143ms cudaDeviceSynchronize
17.44% 1.17181s 5005 234.13us 83.750us 3.1391ms cudaLaunch
4.71% 316.62ms 9 35.180ms 23.083us 314.99ms cudaMalloc
2.30% 154.31ms 50050 3.0830us 1.0000us 2.6866ms cudaSetupArgument
0.52% 34.857ms 5005 6.9640us 2.5000us 464.67us cudaConfigureCall
0.02% 1.2048ms 8 150.60us 71.917us 183.33us cudaMemset
0.01% 643.25us 83 7.7490us 1.3330us 287.42us cuDeviceGetAttribute
0.00% 12.916us 2 6.4580us 2.0000us 10.916us cuDeviceGetCount
0.00% 5.3330us 1 5.3330us 5.3330us 5.3330us cuDeviceTotalMem
0.00% 4.0830us 1 4.0830us 4.0830us 4.0830us cuDeviceGetName
0.00% 3.4160us 2 1.7080us 1.5830us 1.8330us cuDeviceGet
A small reconstitution of the program (nvprof log at the end) - for some reason, the average of cudaDeviceSynchronize is 4 times lower, but it's still really high for an empty 1-thread kernel:
/* Compile with `nvcc test.cu -I.`
* with -I pointing to "helper_cuda.h" and "helper_string.h" from CUDA samples
**/
#include <iostream>
#include <cuda.h>
#include <helper_cuda.h>
#define MAX_INPUT_BUFFER_SIZE 131072
typedef struct {
unsigned short x;
unsigned short y;
short a;
long long b;
} Event;
long long *d_a_[2], *d_b_[2];
float *d_as_, *d_bs_;
bool *d_some_bool_[2];
Event *d_data_;
int width_ = 320;
int height_ = 240;
__global__ void reset_timesurface(long long ts,
long long *d_a_0, long long *d_a_1,
long long *d_b_0, long long *d_b_1,
float *d_as, float *d_bs,
bool *d_some_bool_0, bool *d_some_bool_1, Event *d_data) {
// nothing here
}
void reset_errors(long long ts) {
static const int n = 1024;
static const dim3 grid_size(width_ * height_ / n
+ (width_ * height_ % n != 0), 1, 1);
static const dim3 block_dim(n, 1, 1);
reset_timesurface<<<1, 1>>>(ts, d_a_[0], d_a_[1],
d_b_[0], d_b_[1],
d_as_, d_bs_,
d_some_bool_[0], d_some_bool_[1], d_data_);
cudaDeviceSynchronize();
// static long long *h_holder = (long long*)malloc(sizeof(long long) * 2000);
// cudaMemcpy(h_holder, d_a_[0], 0, cudaMemcpyDeviceToHost);
}
int main(void) {
checkCudaErrors(cudaMalloc(&(d_a_[0]), sizeof(long long)*width_*height_*2));
checkCudaErrors(cudaMemset(d_a_[0], 0, sizeof(long long)*width_*height_*2));
checkCudaErrors(cudaMalloc(&(d_a_[1]), sizeof(long long)*width_*height_*2));
checkCudaErrors(cudaMemset(d_a_[1], 0, sizeof(long long)*width_*height_*2));
checkCudaErrors(cudaMalloc(&(d_b_[0]), sizeof(long long)*width_*height_*2));
checkCudaErrors(cudaMemset(d_b_[0], 0, sizeof(long long)*width_*height_*2));
checkCudaErrors(cudaMalloc(&(d_b_[1]), sizeof(long long)*width_*height_*2));
checkCudaErrors(cudaMemset(d_b_[1], 0, sizeof(long long)*width_*height_*2));
checkCudaErrors(cudaMalloc(&d_as_, sizeof(float)*width_*height_*2));
checkCudaErrors(cudaMemset(d_as_, 0, sizeof(float)*width_*height_*2));
checkCudaErrors(cudaMalloc(&d_bs_, sizeof(float)*width_*height_*2));
checkCudaErrors(cudaMemset(d_bs_, 0, sizeof(float)*width_*height_*2));
checkCudaErrors(cudaMalloc(&(d_some_bool_[0]), sizeof(bool)*width_*height_*2));
checkCudaErrors(cudaMemset(d_some_bool_[0], 0, sizeof(bool)*width_*height_*2));
checkCudaErrors(cudaMalloc(&(d_some_bool_[1]), sizeof(bool)*width_*height_*2));
checkCudaErrors(cudaMemset(d_some_bool_[1], 0, sizeof(bool)*width_*height_*2));
checkCudaErrors(cudaMalloc(&d_data_, sizeof(Event)*MAX_INPUT_BUFFER_SIZE));
for (int i = 0; i < 5005; ++i)
reset_errors(16487L);
cudaFree(d_a_[0]);
cudaFree(d_a_[1]);
cudaFree(d_b_[0]);
cudaFree(d_b_[1]);
cudaFree(d_as_);
cudaFree(d_bs_);
cudaFree(d_some_bool_[0]);
cudaFree(d_some_bool_[1]);
cudaFree(d_data_);
cudaDeviceReset();
}
/* nvprof ./a.out
==9258== NVPROF is profiling process 9258, command: ./a.out
==9258== Profiling application: ./a.out
==9258== Profiling result:
Time(%) Time Calls Avg Min Max Name
92.64% 48.161ms 5005 9.6220us 6.4160us 13.250us reset_timesurface(__int64, __int64*, __int64*, __int64*, __int64*, float*, float*, bool*, bool*, Event*)
7.36% 3.8239ms 8 477.99us 148.92us 620.17us [CUDA memset]
==9258== API calls:
Time(%) Time Calls Avg Min Max Name
53.12% 1.22036s 5005 243.83us 9.6670us 8.5762ms cudaDeviceSynchronize
25.10% 576.78ms 5005 115.24us 44.250us 11.888ms cudaLaunch
9.13% 209.77ms 9 23.308ms 16.667us 208.54ms cudaMalloc
6.56% 150.65ms 1 150.65ms 150.65ms 150.65ms cudaDeviceReset
5.33% 122.39ms 50050 2.4450us 833ns 6.1167ms cudaSetupArgument
0.60% 13.808ms 5005 2.7580us 1.0830us 104.25us cudaConfigureCall
0.10% 2.3845ms 9 264.94us 22.333us 537.75us cudaFree
0.04% 938.75us 8 117.34us 58.917us 169.08us cudaMemset
0.02% 461.33us 83 5.5580us 1.4160us 197.58us cuDeviceGetAttribute
0.00% 15.500us 2 7.7500us 3.6670us 11.833us cuDeviceGetCount
0.00% 7.6670us 1 7.6670us 7.6670us 7.6670us cuDeviceTotalMem
0.00% 4.8340us 1 4.8340us 4.8340us 4.8340us cuDeviceGetName
0.00% 3.6670us 2 1.8330us 1.6670us 2.0000us cuDeviceGet
*/
As detailled in the comments of the original message, my problem was entirely related to the GPU I'm using (Tegra K1). Here's an answer I found for this particular problem; it might be useful for other GPUs as well. The average for cudaDeviceSynchronize on my Jetson TK1 went from 250us to 10us.
The rate of the Tegra was 72000kHz by default, we'll have to set it to 852000kHz using this command:
$ echo 852000000 > /sys/kernel/debug/clock/override.gbus/rate
$ echo 1 > /sys/kernel/debug/clock/override.gbus/state
We can find the list of available frequency using this command:
$ cat /sys/kernel/debug/clock/gbus/possible_rates
72000 108000 180000 252000 324000 396000 468000 540000 612000 648000 684000 708000 756000 804000 852000 (kHz)
More performance can be obtained (again, in exchange for a higher power draw) on both the CPU and GPU; check this link for more informations.

Generating number within range with equal probability with dice

I've been thinking about this but can't seem to figure it out. I need to pick a random integer between 1 to 50 (inclusive) in such a way that each of the integer in it would be equally likely. I will have to do this using a 8 sided dice and a 15 sided dice.
I've read somewhat similar questions related to random number generators with dices but I am still confused. I think it is somewhere along the line of partitioning the numbers into sets. Then, I would roll a die, and then, depending on the outcome, decide which die to roll again.
Can someone help me with this?
As a simple - not necessarily "optimal" solution, roll the 8 sided die, then the 15 sided:
8 sided 15 sided 1..50 result
1 or 2 1..15 1..15
3 or 4 1..15 16..30 (add 15 to 15-sided roll)
5 or 6 1..15 31..45 (add 30 to 15-sided roll)
7 or 8 1..5 46..50 (add 45 to 15-sided roll)
7 or 8 6..15 start again / reroll both dice
lets say you have two functions: d8(), which returns a number from 0 to 7, and d15(), which returns a number from 0 to 14. You want to write a d50() that returns a number from 0 to 49.
Of all the simple ways, this one is probably the most efficient in terms of how many dice you have to roll, and something like this will work for all combinations of dice you have and dice you want:
int d50()
{
int result;
do
{
result = d8()*8+d8(); //random from 0 to 63
} while(result >=50);
return result;
}
If you want really constant time, you can do this:
int d50()
{
int result = d15();
int result = result*15+d15(); //0 to 225
int result = result*8+d8(); //0 to 1799
return result/36; //integer division rounds down
}
This way combines dice until the number of possibilities (1800) is evenly divisible by 50, so the same number of possibilities correspond to each result. This works OK in this case, but doesn't work if the prime factors of the dice you have (2, 3, and 5 in this case), don't cover the factors of the dice you want (2, 5)
I think that you can consider each dice result as a subdivision of a bigger interval. So throwing one 8 sided dice you choose one out the 8 major interval that divide your range of value. Throwing a 15 sided dice means selecting one out the 15 sub-interval and so on.
Considering that 15 = 3*5, 8 = 2*2*2 and 50 = 2*5*5 you can choose 36 = 3*3*2*2 as an handy multiple of 50 so that:
15*15*8 = 50*36 = 1800
You can even think of expressing the numbers from 0 to 1799 in base 15 and choose ramdomly the three digits:
choice = [0-7]*15^2 + [0-14]*15^1 + [0-14]*15^0
So my proposal, with a test of the distribution, is (in the c++ language):
#include <iostream>
#include <random>
#include <map>
int main() {
std::map<int, int> hist;
int result;
std::random_device rd;
std::mt19937 gen(rd()); // initialiaze the random generator
std::uniform_int_distribution<> d8(0, 7); // istantiate the dices
std::uniform_int_distribution<> d15(0, 14);
for (int i = 0; i < 20000; ++i) { // make a lot of throws...
result = d8(gen) * 225;
result += d15(gen) * 15; // add to result
result += d15(gen);
++hist[ result / 36 + 1]; // count each result
}
for (auto p : hist) { // show the occurences of each result
std::cout << p.first << " : " << p.second << '\n';
}
return 0;
}
The output should be something like this:
1 : 387
2 : 360
3 : 377
4 : 393
5 : 402
...
48 : 379
49 : 378
50 : 420

Counting in different bases

If you want to count over 8 bits, in base 2, the result is like this:
0 0 0 0 0 0 0 0;
0 0 0 0 0 0 0 1;
0 0 0 0 0 0 1 0;
.....
1 1 1 1 1 1 1 1;
But how can you make an algorithm that counts - for each bit - based on a different bases, ex:
If the least significant bit is counted according to base 5 and the most significant one is on base 2, the result should look like:
0 0 0 0 0 0 0 0;
0 0 0 0 0 0 0 1;
0 0 0 0 0 0 0 2;
0 0 0 0 0 0 0 3;
0 0 0 0 0 0 0 4;
1 0 0 0 0 0 0 0;
1 0 0 0 0 0 0 1;
1 0 0 0 0 0 0 2;
...
1 0 0 0 0 0 0 4;
Can you please give me the algorithm that generates these vectors?
One algorithm for this problem is the kind of "ripple-carry" addition most students typically learn in kindergarten, except slightly modified. When you add two numbers, you do so digit-by-digit: first the ones place, then the tens place, then the hundreds place, etc. If you would ever had to write down a number larger than 10 (e.g. 7+8=15), then you write it down minus-10 and "carry" the 10 over to the next place, where you add it (it becomes a 1); this might "ripple" over many places (e.g. 999+1=1000). By repeatedly adding one using this algorithm, we can count up one-by-one.
It is important to clarify what we're after: what does it mean for different places to have different bases. If you allow places to have arbitrary number ranges, and stand for arbitrary numbers, then some bad things can happen: a number can be written in more than one way, and/or some decimal numbers cannot be written. Therefore we will limit ourselves to a "sane" scheme where if a place i has a base bi, that means the valid digits are 0,1,2,...,bi-1 (as usual, just like in decimal), and that digits in that place represents bi times the product of all bases on the right (bi-1 x bi-2 x ...). For example, if our bases were [10,10,10], the values of the digits would be [1000,100,10,1]; if our bases were [5,10,5], the values of the digits would be [250,50,5,1]
How to add 1 to a number:
Increment the least-significant digit (LSD)
Perform ripple-carry as follows:
Set the current place to the LSD
Repeat as long as the current place exceeds its allowed max digit:
Set the digit at the current place to 0
Advance the current place one to the left
Increment the number at the new place (now on the left)
Repeat the above algorithm until you have all numbers you desire.
Python:
from itertools import *
def multibaseCount(baseFunction):
currentDigits = [0]
while True:
yield currentDigits[::-1]
# add 1
currentDigits[0] += 1
# ripple-carry:
for place,digit in enumerate(currentDigits):
if currentDigits[place]>=baseFunction(place): # if exceeds max digit
currentDigits[place] = 0 # mod current digit by 10
if place==len(currentDigits)-1: # add new digit if doesn't exist
currentDigits += [0]
currentDigits[place+1] += 1 # add 1 to next digit
else: # doesn't exceed max digit; don't need to carry any more
break
Demo, where place n has base n+1:
>>> for digits in islice(multibaseCount(lambda n:n+1),30):
... print( ''.join(map(str,digits)).zfill(5) )
...
00000
00010
00100
00110
00200
00210
01000
01010
01100
01110
01200
01210
02000
02010
02100
02110
02200
02210
03000
03010
03100
03110
03200
03210
10000
10010
10100
10110
10200
10210
If you're really only interested in eight-digit "numbers" of this format, here's some pseudo-code that will get you started:
for (a=0; a<2: a++)
for (b=0; b<5; b++)
for (c=0; c<2; c++)
for (d=0; d<2; d++)
for (e=0; e<2; e++)
for (f=0; f<2; f++)
for (g=0; g<2; g++)
for (h=0; h<2; h++)
printf("%d%d%d%d%d%d%d%d\n", a, b, c, d, e, f, g, h);
In this case, it'll be base 2, 5, 2, 2, 2, 2, 2, 2. Change the indices as you wish.
too late, but here is a solution in C.
#include <stdio.h>
#include <assert.h>
void
odo(int base, int len)
{
int stack[len+1];
int pos=0;
#define DIGIT (stack[pos])
#define LOOP(code) for(int i=0; i<len; i++) code
#define PRINT LOOP(printf("%d", stack[i]));printf("\n");
LOOP(stack[i]=-1);
while(pos>=0)
{
if (pos<len-1)
{
if (DIGIT<base)
{
DIGIT++;
pos++;
assert(DIGIT==-1);
}
else
{
DIGIT=-1;
pos--;
continue;
}
}
else
{
assert(pos==len-1);
if (DIGIT<base)
{
DIGIT++;
PRINT;
}
else
{
DIGIT=-1;
pos--;
}
}
}
}
int
main(void)
{
odo(4,6);
}

Resources