CUDA more than max threads without errors? - visual-studio-2010

The original problem was launching more threads than is possible, like this:
someKernel<<<1 , 1025>>> ( ... );
and not detecting the error, as I did not know how to detect kernel call errors. This is explained well in talonmies answer in this question:
What is the canonical way to check for errors using the CUDA runtime API?
Instead of modifying the code I presented I wrote my own for conciseness:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
// Wraps a CUDA runtime call that returns cudaError_t and reports the call site on failure.
// The do/while(0) form makes the macro expand to a single statement, so it is safe
// inside un-braced if/else bodies.
#define gpuErrchk(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)

// Prints a diagnostic to stderr when `cudaError` is not cudaSuccess.
//   cudaError - status returned by a CUDA runtime call
//   file      - source file of the call site (supplied by gpuErrchk via __FILE__)
//   line      - source line of the call site (supplied by gpuErrchk via __LINE__)
//   abort     - when true (the default), terminate the process after reporting,
//               using the numeric error code as the exit status
inline void gpuAssert(cudaError_t cudaError, const char *file, int line, bool abort=true)
{
    if (cudaError != cudaSuccess)
    {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(cudaError), file, line);
        // exit() is the standard C library function declared in <stdlib.h>.
        if (abort) exit(static_cast<int>(cudaError));
    }
}
// Element-wise sum of two 5-element device arrays: dev_c[k] = dev_a[k] + dev_b[k].
// Only the x component of the thread index is used; threads whose x index is 5 or
// greater return without touching memory.
__global__ void addKernel(const int *dev_a, const int *dev_b, int *dev_c)
{
    const int lane = threadIdx.x;
    if ( lane >= 5 )
        return;
    dev_c[lane] = dev_a[lane] + dev_b[lane];
}
// Adds two 5-element vectors on the GPU with a deliberately oversized block
// (32 x 33 = 1056 threads) so that the launch-error check has something to report.
// Every runtime call, including the final frees, goes through gpuErrchk.
int main()
{
    const int arraySize = 5;
    const int a[arraySize] = { 1, 2, 3, 4, 5 };
    const int b[arraySize] = { 10, 20, 30, 40, 50 };
    int c[arraySize] = { 0 };
    int *dev_a(nullptr), *dev_b(nullptr), *dev_c(nullptr);

    gpuErrchk( cudaMalloc((void**)&dev_a, arraySize * sizeof(int)) );
    gpuErrchk( cudaMalloc((void**)&dev_b, arraySize * sizeof(int)) );
    gpuErrchk( cudaMalloc((void**)&dev_c, arraySize * sizeof(int)) );
    gpuErrchk( cudaMemcpy(dev_a, a, arraySize * sizeof(int), cudaMemcpyHostToDevice) );
    gpuErrchk( cudaMemcpy(dev_b, b, arraySize * sizeof(int), cudaMemcpyHostToDevice) );

    // Block shape under test. The equivalent 1D experiment is <<<1, 1025>>>.
    dim3 testMax2D ( 32, 33 );
    addKernel<<<1, testMax2D>>> ( dev_a , dev_b, dev_c );
    // Launch-configuration errors are reported right after the launch ...
    gpuErrchk( cudaPeekAtLastError() );
    // ... errors raised while the kernel runs are reported by the next synchronization.
    gpuErrchk( cudaDeviceSynchronize() );

    gpuErrchk( cudaMemcpy( c, dev_c, arraySize * sizeof(int), cudaMemcpyDeviceToHost) );
    printf("{1,2,3,4,5} + {10,20,30,40,50} = {%d,%d,%d,%d,%d}\n",
           c[0], c[1], c[2], c[3], c[4]);

    gpuErrchk( cudaFree(dev_a) );
    gpuErrchk( cudaFree(dev_b) );
    gpuErrchk( cudaFree(dev_c) );
    return 0;
}
I now get correct error reports. Thank you for your patience.
I don't understand this call in the gpuAssert function, so I omitted it:
if (abort) exit(code);
Is exit a custom written function or something I missed?

There are two classes of errors that can occur with kernel launches and they need to be checked for in separate steps, following a particular order.
The first class of errors is reported synchronously when a kernel call is made and prior to the kernel actually being launched on the device, i.e. these are "pre-launch" errors. These errors typically involve requesting more of a particular resource than is available (e.g. too much shared memory, too many threads). Check for these by calling cudaGetLastError() immediately after a kernel call.
The second class of errors are those that occur at some point in time after the kernel was launched on the device (e.g. memory access violation, timeout of watchdog timer). These are "post-launch" errors. The reason they are reported some time after a kernel call is a natural consequence of kernel launches occurring asynchronously. They are reported at the next opportunity, which is usually the next synchronous API call. Check for these by calling cudaDeviceSynchronize() and examining its status return.
The posted code is missing a check for errors of the first class.

Related

curand_uniform not deterministic?

I want to generate pseudo-random numbers on a CUDA device in a deterministic way, saying if I ran the program two times I expect the exact same results, given that the program uses a hardcoded seed. Following the examples provided by nvidia: https://docs.nvidia.com/cuda/curand/device-api-overview.html#device-api-example
I would expect exactly the described behavior.
But I do get different results, running the exact same code multiple times. Is there a way to get pseudo-random numbers in a deterministic way, as I described?
Following example code shows my problem:
#include <iostream>
#include <cuda.h>
#include <curand_kernel.h>
// Initializes one curandState per thread: fixed seed 123456, the thread's global
// index as the sequence number, and offset 0.
__global__ void setup_kernel(curandState *state)
{
    const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
    curand_init(123456, tid, 0, state + tid);
}
// Draws one uniform float per thread into results[global index].
// The generator slot is the global index modulo 1024, so threads whose indices
// agree modulo 1024 load and store the same state slot.
__global__ void draw_numbers(curandState *state, float* results)
{
    const auto gid  = blockIdx.x * blockDim.x + threadIdx.x;
    const auto slot = gid % 1024;
    // Work on a private copy of the generator state
    curandState rng = state[slot];
    // Produce the random number
    results[gid] = curand_uniform(&rng);
    // Store the advanced state back into its slot
    state[slot] = rng;
}
// Seeds 1024 generator states, draws 16 * 1024 uniform numbers with a 1024 x 16
// launch, and prints element 12345. All buffers are released before returning.
int main(int argc, char* argv[])
{
    // Setup
    curandState* dStates;
    cudaMalloc((void **) &dStates, sizeof(curandState) * 1024);
    setup_kernel<<<1024, 1>>>(dStates);
    // Random numbers
    float* devResults;
    cudaMalloc((void **) &devResults, sizeof(float) * 16 * 1024);
    float *hostResults = (float*) calloc(16 * 1024, sizeof(float));
    if (hostResults == nullptr)
    {
        // Host allocation failed: release the device buffers and give up.
        ::std::cerr << "Host allocation failed" << ::std::endl;
        cudaFree(devResults);
        cudaFree(dStates);
        return 1;
    }
    // Call draw random numbers
    draw_numbers<<<1024, 16>>>(dStates, devResults);
    // Copy results
    cudaMemcpy(hostResults, devResults, 16 * 1024 * sizeof(float), cudaMemcpyDeviceToHost);
    // Output number 12345
    ::std::cout << "12345 is: " << hostResults[12345] << ::std::endl;
    // Release device and host buffers
    cudaFree(devResults);
    cudaFree(dStates);
    free(hostResults);
    return 0;
}
Compiling and running the code produces different output on my machine:
$ nvcc -std=c++11 curand.cu && ./a.out && ./a.out && ./a.out
12345 is: 0.8059
12345 is: 0.53454
12345 is: 0.382981
As I said, I would expect three times the same output in this example.
curand_uniform does deterministically depend on the state it is provided.
Thanks to the comments by Robert Crovella I see now that the error was in relying on the thread execution order. Just not reusing the state would result in the same "random" numbers, when the draw_numbers kernel is called multiple times, which is not an option for me either.
My guess is that the best solution in my case is to only launch 1024 threads (as many as curandState are set up) and generating multiple random numbers in each thread (in my example 16/thread). This way I receive different random numbers on multiple calls within the program, but the same numbers for every program launch.
Updated code:
#include <iostream>
#include <cuda.h>
#include <curand_kernel.h>
// Seeds the calling thread's generator state with seed 123456, using the thread's
// global index as the sequence number and an offset of 0.
__global__ void setup_kernel(curandState *state)
{
    const auto globalIdx = blockDim.x * blockIdx.x + threadIdx.x;
    curandState *mine = state + globalIdx;
    curand_init(123456, globalIdx, 0, mine);
}
// Each thread draws `runs` uniform floats from its own generator state.
// The k-th draw of a thread with global index t is stored at results[t + k * 1024].
__global__ void draw_numbers(curandState *state, float* results, int runs)
{
    const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
    // Private copy of this thread's generator state
    curandState rng = state[tid];
    int drawn = 0;
    while (drawn < runs)
    {
        results[tid + drawn * 1024] = curand_uniform(&rng);
        ++drawn;
    }
    // Persist the advanced state so a later launch continues the sequence
    state[tid] = rng;
}
// Seeds 1024 generator states, then draws 16 numbers per state twice with a
// 16 x 64 launch (1024 threads) and prints element 12345 after each round.
// All buffers are released before returning.
int main(int argc, char* argv[])
{
    // Setup
    curandState* dStates;
    cudaMalloc((void **) &dStates, sizeof(curandState) * 1024);
    setup_kernel<<<1024, 1>>>(dStates);
    // Random numbers
    float* devResults;
    cudaMalloc((void **) &devResults, sizeof(float) * 16 * 1024);
    float *hostResults = (float*) calloc(16 * 1024, sizeof(float));
    if (hostResults == nullptr)
    {
        // Host allocation failed: release the device buffers and give up.
        ::std::cerr << "Host allocation failed" << ::std::endl;
        cudaFree(devResults);
        cudaFree(dStates);
        return 1;
    }
    // Call draw random numbers
    draw_numbers<<<16, 64>>>(dStates, devResults, 16);
    // Copy results
    cudaMemcpy(hostResults, devResults, 16 * 1024 * sizeof(float), cudaMemcpyDeviceToHost);
    // Output number 12345
    ::std::cout << "12345 is " << hostResults[12345];
    // Call draw random numbers (again)
    draw_numbers<<<16, 64>>>(dStates, devResults, 16);
    // Copy results
    cudaMemcpy(hostResults, devResults, 16 * 1024 * sizeof(float), cudaMemcpyDeviceToHost);
    // Output number 12345 again
    ::std::cout << " and " << hostResults[12345] << ::std::endl;
    // Release device and host buffers
    cudaFree(devResults);
    cudaFree(dStates);
    free(hostResults);
    return 0;
}
Producing following output:
$ nvcc -std=c++11 curand.cu && ./a.out && ./a.out && ./a.out
12345 is 0.164181 and 0.295907
12345 is 0.164181 and 0.295907
12345 is 0.164181 and 0.295907
which serves exactly my use-case.

How to read a txt file in MPI by a single process? Why my approach does not work?

I am new to MPI.
I am trying to read a text file by using standard c++ code as follows.
int main(int argc, char* argv[] ){
int np, pid, ierr;
ierr = MPI_Init(&argc, &argv);
ierr = MPI_Comm_size(MPI_COMM_WORLD, &np);
ierr = MPI_Comm_rank(MPI_COMM_WORLD, &pid);
const int imgWidth = 1000; // the width of the image (count in pixel)
const int imgHeight = 1000; // the height of the image
double* Y;
Y = (double *)malloc(imgHeight*imgWidth*sizeof(double));
if(pid == 0)
{
string input = "Im.txt";
readData(input.c_str(), Y);
}
MPI_Bcast(Y, imgHeight*imgWidth, MPI_DOUBLE, 0, MPI_COMM_WORLD);
free(Y);
MPI_Finalize();
return 1;
}
The readData function is defined as:
// Reads whitespace-separated numbers from a text file into Y.
//   fileName - path of the file to read
//   Y        - destination buffer; the caller must make it large enough for every
//              value in the file (no bound is checked here)
// Returns true if the file could be opened, false otherwise. The number of values
// read is printed to standard output.
bool readData(const char *fileName, double* Y){
    printf("Reading the data file!\n");
    std::ifstream fin(fileName);
    if (!fin.is_open())
    {
        // Report the failure instead of silently importing nothing.
        std::cerr << "Cannot open data file \"" << fileName
                  << "\" (check the path and the working directory)." << std::endl;
        return false;
    }
    int i = 0;
    while (fin >> Y[i])
    {
        i++;
    }
    std::cout << "In total, " << i << " data are imported." << std::endl;
    // close the file
    fin.close();
    return true;
}
The file "Im.txt" contains a bunch of numbers. However, when I run the program, no data is imported. Can anyone give me a hint? I do not need to use multiple processes to read this file in parallel.
Finally, I find the problem. I am working under win7 with visual studio. Seems I have to indicate explicitly the path of my file. Even I put "Im.txt" to the same folder with the source code file, it does not work.

How to fix Invalid arguments during creation of MPI derived Datatypes

I have one structure xyz as given below struct xyz { char a; int32_t b; char c[50]; uint32_t d; uchar e[10];}
I need to broadcast it, so I used MPI_Bcast(), for which I required an MPI datatype corresponding to struct xyz. For that I used the MPI_Type_create_struct() function to create a new MPI datatype, declared as MPI_Datatype MPI_my_new_datatype, oldtypes[4]; where I used MPI datatypes corresponding to the above structure members' datatypes as follows:
oldtypes[4] = {MPI_CHAR, MPI_INT, MPI_UNSIGNED, MPI_UNSIGNED_CHAR}; and to create the new datatype I used the following arguments in the function:
MPI_Type_create_struct(4,blockcounts, offsets, oldtypes, &MPI_my_new_datatype); MPI_Type_commit(&MPI_my_new_datatype);
Now it is compiling but giving run time error as below::
* An error occurred in MPI_Type_create_struct on communicator MPI_COMM_WORLD MPI_ERR_ARG: invalid argument of some other kind MPI_ERRORS_ARE_FATAL (goodbye).
Can any one find out where is the problem?
You can't "bundle up" the similar types like that. Each field needs to be addressed separately, and there are 5 of them, not 4.
Also note that, in general, it's a good idea to actually "measure" the offsets rather than infer them.
The following works:
#include <stdio.h>
#include <mpi.h>
#include <stdint.h>
/* Record sent between ranks; one MPI block per member, in declaration order. */
struct xyz_t {
    char          a;      /* single character */
    int32_t       b;      /* signed 32-bit integer */
    char          c[50];  /* character buffer of 50 bytes */
    uint32_t      d;      /* unsigned 32-bit integer */
    unsigned char e[10];  /* byte buffer of 10 bytes */
};
/*
 * Builds an MPI struct datatype with one block per member of struct xyz_t,
 * then sends one record from rank 0 to rank 1, which prints some of its fields.
 * Needs at least two ranks; returns 1 (after MPI_Finalize) if started with fewer.
 */
int main(int argc, char **argv) {
    int rank, size;
    MPI_Datatype oldtypes[5] = {MPI_CHAR, MPI_INT, MPI_CHAR, MPI_UNSIGNED, MPI_UNSIGNED_CHAR};
    int blockcounts[5] = {1, 1, 50, 1, 10};
    MPI_Datatype my_mpi_struct;
    MPI_Aint offsets[5];
    struct xyz_t old, new;

    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    /* Rank 0 sends to rank 1, so a single-process run cannot work. */
    if (size < 2) {
        if (rank == 0)
            fprintf(stderr, "This example needs at least 2 MPI processes.\n");
        MPI_Finalize();
        return 1;
    }

    /* find offsets: measure them on a real object so compiler padding is included */
    offsets[0] = (char*)&(old.a) - (char*)&old;
    offsets[1] = (char*)&(old.b) - (char*)&old;
    offsets[2] = (char*)&(old.c) - (char*)&old;
    offsets[3] = (char*)&(old.d) - (char*)&old;
    offsets[4] = (char*)&(old.e) - (char*)&old;

    MPI_Type_create_struct(5, blockcounts, offsets, oldtypes, &my_mpi_struct);
    MPI_Type_commit(&my_mpi_struct);

    if (rank == 0) {
        old.a = 'a';
        old.b = (int)'b';
        /* snprintf comes from <stdio.h> and is bounded by the field size;
           field e is unsigned char[], hence the cast. */
        snprintf(old.c, sizeof old.c, "%s", "This is field c");
        old.d = (unsigned int)'d';
        snprintf((char *)old.e, sizeof old.e, "%s", "Field e");
        MPI_Send(&old, 1, my_mpi_struct, 1, 1, MPI_COMM_WORLD);
    } else if (rank == 1) {
        MPI_Status status;
        MPI_Recv(&new, 1, my_mpi_struct, 0, 1, MPI_COMM_WORLD, &status);
        printf("new.a = %c\n", new.a);
        printf("new.b = %d\n", new.b);
        printf("new.e = %s\n", (char *)new.e);
    }

    MPI_Type_free(&my_mpi_struct);
    MPI_Finalize();
    return 0;
}
Running:
$ mpirun -np 2 ./struct
new.a = a
new.b = 98
new.e = Field e
Updated: As Dave Goodell below points out, the offset calculations would be better done as
#include <stddef.h>
/* ... */
/* offsetof(type, member) yields the member's byte offset from the start of the
   struct, so no object of the struct type is needed to measure it. */
offsets[0] = offsetof(struct xyz_t,a);
offsets[1] = offsetof(struct xyz_t,b);
offsets[2] = offsetof(struct xyz_t,c);
offsets[3] = offsetof(struct xyz_t,d);
offsets[4] = offsetof(struct xyz_t,e);
and if your MPI supports it (most should, though OpenMPI was slow with some of the MPI 2.2 types), MPI_UNSIGNED should be replaced with MPI_UINT32_T.

A question about the details about the distribution from blocks to SMs in CUDA

Let me take the hardware with computation ability 1.3 as an example.
30 SMs are available. Then at most 240 blocks are able to run at the same time (considering the limits on registers and shared memory, the actual number of concurrent blocks may be much lower). Blocks beyond the first 240 have to wait for available hardware resources.
My question is when those blocks beyond 240 will be assigned to SMs. Once some blocks of the first 240 are completed? Or when all of the first 240 blocks are finished?
I wrote such a piece of code.
#include<stdio.h>
#include<string.h>
#include<cuda_runtime.h>
#include<cutil_inline.h>
const int BLOCKNUM = 1024;
const int N=240;
// Block 0 spins until mark[N] becomes non-zero; afterwards thread 0 of every block
// sets its own block's entry in mark to 1.
__global__ void kernel ( volatile int* mark ) {
    if ( blockIdx.x == 0 ) {
        while ( mark[N] == 0 ) {
            // busy-wait on the flag written by block N
        }
    }
    if ( threadIdx.x == 0 ) {
        mark[blockIdx.x] = 1;
    }
}
// Allocates and zeroes one flag per block, launches BLOCKNUM single-thread blocks,
// then frees the flag array.
int main() {
    const size_t markBytes = sizeof ( int ) * BLOCKNUM;
    int * mark;
    void ** markSlot = ( void** ) &mark;
    cudaMalloc ( markSlot, markBytes );
    cudaMemset ( mark, 0, markBytes );
    kernel <<< BLOCKNUM, 1>>> ( mark );
    cudaFree ( mark );
    return 0;
}
This code causes a deadlock and fails to terminate. But if I change N from 240 to 239, the code is able to terminate. So I want to know some details about the scheduling of blocks.
On the GT200, it has been demonstrated through micro-benchmarking that new blocks are scheduled whenever a SM has retired all the currently active blocks which it was running. So the answer is when some blocks are finished, and the scheduling granularity is SM level. There seems to be a consensus that Fermi GPUs have a finer scheduling granularity than previous generations of hardware.
I can't find any reference about this for compute capabilities < 1.3.
Fermi architectures introduce a new block dispatcher called GigaThread engine.
GigaThread enables immediate replacement of blocks on an SM when one completes executing and also enables concurrent kernel execution.
While there is no official answer to this, you can measure through atomic operations when your blocks begin your work and when they end.
Try playing with the following code:
#include <stdio.h>
const int maxBlocks=60; //Number of blocks of size 512 threads on current device required to achieve full occupancy
// Kernel with an empty body; launched once before the timed kernel.
__global__ void emptyKernel()
{
}
// Records, per block, the order in which blocks start and finish their work.
//   control - single shared counter, incremented atomically on entry and on exit
//   output  - 2 * gridDim.x ints: output[b] receives block b's entry ticket,
//             output[b + gridDim.x] receives its exit ticket
// Only the thread with threadIdx.x == 1 of each block does any work.
__global__ void myKernel(int *control, int *output) {
if (threadIdx.x==1) {
//register that we enter: atomicAdd returns the counter value before the increment
int enter=atomicAdd(control,1);
output[blockIdx.x]=enter;
//some intensive and long task
int &var=output[blockIdx.x+gridDim.x]; //var references global memory
var=1;
for (int i=0; i<12345678; ++i) {
// int var is converted to float for tanhf and the sum is converted back to int
var+=1+tanhf(var);
}
//register that we quit: the exit ticket overwrites the scratch value in var
var=atomicAdd(control,1);
}
}
// Launches myKernel with q blocks of 512 threads, times it with CUDA events, and
// prints each block's entry and exit ticket. Uses cudaDeviceSynchronize in place of
// the deprecated cudaThreadSynchronize and frees the device buffers before returning.
int main() {
    int *gpuControl;
    cudaMalloc((void**)&gpuControl, sizeof(int));
    int cpuControl=0;
    cudaMemcpy(gpuControl,&cpuControl,sizeof(int),cudaMemcpyHostToDevice);

    int *gpuOutput;
    cudaMalloc((void**)&gpuOutput, sizeof(int)*maxBlocks*2);
    int cpuOutput[maxBlocks*2];
    for (int i=0; i<maxBlocks*2; ++i) //clear the host array just to be on the safe side
        cpuOutput[i]=-1;

    // play with these values
    const int thr=479;
    const int p=13;
    const int q=maxBlocks;

    //I found that this may actually affect the scheduler! Try with and without this call.
    emptyKernel<<<p,thr>>>();

    cudaEvent_t timerStart;
    cudaEvent_t timerStop;
    cudaEventCreate(&timerStart);
    cudaEventCreate(&timerStop);

    // Make sure the warm-up kernel has finished before timing starts.
    cudaDeviceSynchronize();
    cudaEventRecord(timerStart,0);
    myKernel<<<q,512>>>(gpuControl, gpuOutput);
    cudaEventRecord(timerStop,0);
    cudaEventSynchronize(timerStop);

    cudaMemcpy(cpuOutput,gpuOutput,sizeof(int)*maxBlocks*2,cudaMemcpyDeviceToHost);
    cudaDeviceSynchronize();

    float thisTime;
    cudaEventElapsedTime(&thisTime,timerStart,timerStop);
    cudaEventDestroy(timerStart);
    cudaEventDestroy(timerStop);

    printf("Elapsed time: %f\n",thisTime);
    // Entry tickets are in the first q slots, exit tickets in the following q slots.
    for (int i=0; i<q; ++i)
        printf("%d: %d-%d\n",i,cpuOutput[i],cpuOutput[i+q]);

    // Release the device buffers.
    cudaFree(gpuOutput);
    cudaFree(gpuControl);
    return 0;
}
What you get in the output is the block ID, followed by the enter "time" and exit "time". This way you can learn in which order those events occurred.
On Fermi, I'm sure that a block is scheduled on a SM as soon there is room for it. I.e., whenever, a SM finishes executing one block, it will execute another block if there is any block left. (However, the actual order is not deterministic).
In older versions, I don't know. But you can verify it by using the built-in clock() function.
For example, I used the following OpenCL kernel code (you can easily convert it to CUDA):
// Records a timestamp before and after the work of each work-item so the start and
// end order of work-items can be plotted afterwards.
//   start  - per-work-item timestamp taken before the work
//   end    - per-work-item timestamp taken after the work
//   buffer - data for the work placeholder (not used by the code shown)
// Pointer arguments of an OpenCL kernel need an address-space qualifier, and the
// signature must not be followed by ';' or the body is detached from the kernel.
// NOTE(review): clock() is kept from the original; it is the CUDA device timer and
// core OpenCL C does not appear to provide a built-in of that name. Substitute a
// platform timer before building.
__kernel void test(__global uint* start, __global uint* end, __global float* buffer)
{
    int id = get_global_id(0);
    start[id] = clock();
    __do_something_here;
    end[id] = clock();
}
Then output it to a file and build a graph. You will see how visual it is.

Single-Sided communications with MPI-2

Consider the following fragment of OpenMP code which transfers private data between two threads using an intermediate shared variable
// First parallel region: every thread computes its private a from its private b;
// thread 0 then publishes its value through the shared variable x.
#pragma omp parallel shared(x) private(a,b)
{
...
a = somefunction(b);
if (omp_get_thread_num() == 0) {
x = a;
}
}
// Second parallel region: thread 1 copies the shared x into its private a before
// every thread computes b from its own a.
#pragma omp parallel shared(x) private(a,b)
{
if (omp_get_thread_num() == 1) {
a = x;
}
b = anotherfunction(a);
...
}
I would need (in pseudocode) to transfer private data from one process to another using a single-sided message-passing library.
Any ideas?
This is possible, but there's a lot more "scaffolding" involved -- after all, you are communicating data between potentially completely different computers.
The coordination for this sort of thing is done between windows of data which are accessible from other processors, and with lock/unlock operations which coordinate the access of this data. The locks aren't really locks in the sense of being mutexes, but they are more like synchronization points coordinating data access to the window.
I don't have time right now to explain this in the detail I'd like, but below is an example of using MPI2 to do something like shared memory flagging in a system that doesn't have shared memory:
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include "mpi.h"
int main(int argc, char** argv)
{
int rank, size, *a, geta;
int x;
int ierr;
MPI_Win win;
const int RCVR=0;
const int SENDER=1;
ierr = MPI_Init(&argc, &argv);
ierr |= MPI_Comm_rank(MPI_COMM_WORLD, &rank);
ierr |= MPI_Comm_size(MPI_COMM_WORLD, &size);
if (ierr) {
fprintf(stderr,"Error initializing MPI library; failing.\n");
exit(-1);
}
if (rank == RCVR) {
MPI_Alloc_mem(sizeof(int), MPI_INFO_NULL, &a);
*a = 0;
} else {
a = NULL;
}
MPI_Win_create(a, 1, sizeof(int), MPI_INFO_NULL, MPI_COMM_WORLD, &win);
if (rank == SENDER) {
/* Lock recievers window */
MPI_Win_lock(MPI_LOCK_EXCLUSIVE, RCVR, 0, win);
x = 5;
/* put 1 int (from &x) to 1 int rank RCVR, at address 0 in window "win"*/
MPI_Put(&x, 1, MPI_INT, RCVR, 0, 1, MPI_INT, win);
/* Unlock */
MPI_Win_unlock(0, win);
printf("%d: My job here is done.\n", rank);
}
if (rank == RCVR) {
for (;;) {
MPI_Win_lock(MPI_LOCK_EXCLUSIVE, RCVR, 0, win);
MPI_Get(&geta, 1, MPI_INT, RCVR, 0, 1, MPI_INT, win);
MPI_Win_unlock(0, win);
if (geta == 0) {
printf("%d: a still zero; sleeping.\n",rank);
sleep(2);
} else
break;
}
printf("%d: a now %d!\n",rank,geta);
printf("a = %d\n", *a);
MPI_Win_free(&win);
if (rank == RCVR) MPI_Free_mem(a);
MPI_Finalize();
return 0;
}

Resources