Cuda kernel function only changes matrix's first row

I am trying to sum two matrices a_h_1 and a_h_2 and write the result back to a_h_1. But for some reason my kernel does not change any array elements beyond the first N. Even if I write a[8] = 45, for example, it is printed as 8 when the array is copied back to the host. What is wrong?
#include <stdio.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <cuda.h>
// Kernel that executes on the CUDA device
__global__ void matrix_summation(float *a, float *b, int M, int N)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx<M*N)
{
a[idx] = blockIdx.x;
}
}
// main routine that executes on the host
int main(void)
{
float *a_h_1,*a_h_2, *a_d_1,*a_d_2; // Pointer to host & device arrays
const int N = 5;
const int M = 5;
// Number of elements in arrays
size_t size = (N * M) * sizeof(float);
a_h_1 = (float *)malloc(size); // Allocate array1 on host
a_h_2 = (float *)malloc(size); // Allocate array2 on host
cudaMalloc((void **) &a_d_1, size); // Allocate array1 on device
cudaMalloc((void **) &a_d_2, size); // Allocate array2 on device
// Initialize host array and copy it to CUDA device
for (int i=0; i<N*M; i++){
a_h_1[i] = (float)i;
a_h_2[i] = (float)i;
}
cudaMemcpy(a_d_1, a_h_1, size, cudaMemcpyHostToDevice);
cudaMemcpy(a_d_2, a_h_2, size, cudaMemcpyHostToDevice);
// Do calculation on device:
int block_size = M;
int n_blocks = (M*N)/block_size;
matrix_summation <<< n_blocks, block_size >>> ( a_d_1, a_d_2, M, N);
// Retrieve result from device and store it in host array
cudaMemcpy(a_h_1, a_d_1, sizeof(float)*N, cudaMemcpyDeviceToHost);
// Print results
printf("\n\nROW 1 \n");
for (int i=0; i<(M*N); i++)
{
printf(" %f ", a_h_1[i]);
if((i+1)%N == 0)
{
printf("\nROW %d \n", ((i+1)/N)+1);
}
}
// Cleanup
free(a_h_1);
free(a_h_2);
cudaFree(a_d_1);
cudaFree(a_d_2);
system("pause");
}
Here is the output:
ROW 1
0.0 2.0 4.0 6.0 8.0 < this line is correct but others are not
ROW 2
5.0 6.0 7.0 8.0 9.0
ROW 3
10.0 11.0 12.0 13.0 14.0
ROW 4
15.0 16.0 17.0 18.0 19.0
ROW 5
20.0 21.0 22.0 23.0 24.0

It looks like you're not copying all of the device array to your host array. In this line:
cudaMemcpy(a_h_1, a_d_1, sizeof(float)*N, cudaMemcpyDeviceToHost);
I think you meant to copy sizeof(float)*N*M
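For reference, a minimal sketch of the corrected copy (same variable names as in the question; only the byte count changes):
cudaMemcpy(a_h_1, a_d_1, sizeof(float) * N * M, cudaMemcpyDeviceToHost); // copy all M*N elements back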

Related

Accessing dynamically allocated arrays on device (without passing them as kernel arguments)

How can an array of structs that has been dynamically allocated on the host be used by a kernel, without passing the array of structs as a kernel argument? This seems like a common procedure with a good amount of documentation online, yet it doesn't work in the following program.
Note: the following questions have been studied before posting this question:
1) copying host memory to cuda __device__ variable
2) Global variable in CUDA
3) Is there any way to dynamically allocate constant memory? CUDA
So far, unsuccessful attempts have been made to:
Dynamically allocate array of structs with cudaMalloc(), then
Use cudaMemcpyToSymbol() with the pointer returned from cudaMalloc() to copy to a __device__ variable which can be used by the kernel.
Code attempt:
NBody.cu (error checking using cudaStatus has mostly been omitted for better readability, and function to read data from file into dynamic array removed):
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
#define BLOCK 256
struct nbody {
float x, y, vx, vy, m;
};
typedef struct nbody nbody;
// Global declarations
nbody* particle;
// Device variables
__device__ unsigned int d_N; // Kernel can successfully access this
__device__ nbody d_particle; // Update: part of problem was here with (*)
// Aim of kernel: to print contents of array of structs without using kernel argument
__global__ void step_cuda_v1() {
int i = threadIdx.x + blockDim.x * blockIdx.x;
if (i < d_N) {
printf("%.f\n", d_particle.x);
}
}
int main() {
unsigned int N = 10;
unsigned int I = 1;
cudaMallocHost((void**)&particle, N * sizeof(nbody)); // Host allocation
cudaError_t cudaStatus;
for (int i = 0; i < N; i++) particle[i].x = i;
nbody* particle_buf; // device buffer
cudaSetDevice(0);
cudaMalloc((void**)&particle_buf, N * sizeof(nbody)); // Allocate device mem
cudaMemcpy(particle_buf, particle, N * sizeof(nbody), cudaMemcpyHostToDevice); // Copy data into device mem
cudaMemcpyToSymbol(d_particle, &particle_buf, sizeof(nbody*)); // Copy pointer to data into __device__ var
cudaMemcpyToSymbol(d_N, &N, sizeof(unsigned int)); // This works fine
int NThreadBlock = (N + BLOCK - 1) / BLOCK;
for (int iteration = 0; iteration <= I; iteration++) {
step_cuda_v1<<<NThreadBlock, BLOCK>>>();
//step_cuda_v1<<<1, 5>>>(particle_buf);
cudaDeviceSynchronize();
cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "ERROR: %s\n", cudaGetErrorString(cudaStatus));
exit(-1);
}
}
return 0;
}
OUTPUT:
"ERROR: kernel launch failed."
Summary:
How can I print the contents of the array of structs from the kernel, without passing it as a kernel argument?
Coding in C using VS2019 with CUDA 10.2
With the help of @Robert Crovella and @talonmies, here is the solution, which outputs the sequence 0 through 9 repeatedly.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
#define BLOCK 256
//#include "Nbody.h"
struct nbody {
float x, y, vx, vy, m;
};
typedef struct nbody nbody;
// Global declarations
nbody* particle;
// Device variables
__device__ unsigned int d_N; // Kernel can successfully access this
__device__ nbody* d_particle;
//__device__ nbody d_particle; // Update: part of problem was here with (*)
// Aim of kernel: to print contents of array of structs without using kernel argument
__global__ void step_cuda_v1() {
int i = threadIdx.x + blockDim.x * blockIdx.x;
if (i < d_N) {
printf("%.f\n", d_particle[i].x);
}
}
int main() {
unsigned int N = 10;
unsigned int I = 1;
cudaMallocHost((void**)&particle, N * sizeof(nbody)); // Host allocation
cudaError_t cudaStatus;
for (int i = 0; i < N; i++) particle[i].x = i;
nbody* particle_buf; // device buffer
cudaSetDevice(0);
cudaMalloc((void**)&particle_buf, N * sizeof(nbody)); // Allocate device mem
cudaMemcpy(particle_buf, particle, N * sizeof(nbody), cudaMemcpyHostToDevice); // Copy data into device mem
cudaMemcpyToSymbol(d_particle, &particle_buf, sizeof(nbody*)); // Copy pointer to data into __device__ var
cudaMemcpyToSymbol(d_N, &N, sizeof(unsigned int)); // This works fine
int NThreadBlock = (N + BLOCK - 1) / BLOCK;
for (int iteration = 0; iteration <= I; iteration++) {
step_cuda_v1<<<NThreadBlock, BLOCK>>>();
//step_cuda_v1<<<1, 5>>>(particle_buf);
cudaDeviceSynchronize();
cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "ERROR: %s\n", cudaGetErrorString(cudaStatus));
exit(-1);
}
}
return 0;
}
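Distilled to its essentials, the pattern used above is (just a sketch, with the same names as in the code):
__device__ nbody* d_particle;   // a device-side *pointer*, not a struct instance
// host side:
nbody* particle_buf;
cudaMalloc((void**)&particle_buf, N * sizeof(nbody));                          // allocate the device buffer
cudaMemcpy(particle_buf, particle, N * sizeof(nbody), cudaMemcpyHostToDevice); // fill it with host data
cudaMemcpyToSymbol(d_particle, &particle_buf, sizeof(nbody*));                 // store the buffer's address in the __device__ pointer
// device side: the kernel can now index d_particle[i] directly, without taking it as an argument.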

Binary Matrix Reduction in CUDA

I have to traverse all cells of an imaginary m * n matrix and add 1 to a counter for every cell that meets a certain condition.
My naive solution was as follows:
#include <stdio.h>
__global__ void calculate_pi(int center, int *count) {
int x = threadIdx.x;
int y = blockIdx.x;
if (x*x + y*y <= center*center) {
*count++;
}
}
int main() {
int interactions;
printf("Enter the number of interactions: ");
scanf("%d", &interactions);
int l = sqrt(interactions);
int h_count = 0;
int *d_count;
cudaMalloc(&d_count, sizeof(int));
cudaMemcpy(&d_count, &h_count, sizeof(int), cudaMemcpyHostToDevice);
calculate_pi<<<l,l>>>(l/2, d_count);
cudaMemcpy(&h_count, d_count, sizeof(int), cudaMemcpyDeviceToHost);
cudaFree(d_count);
printf("Sum: %d\n", h_count);
return 0;
}
In my use case, the value of interactions can be very large, making it impossible to allocate l * l of space.
Can someone help me? Any suggestions are welcome.
There are at least 3 problems with your code:
1. Your kernel code will not work correctly with an ordinary add here:
*count++;
this is because multiple threads are trying to do this at the same time, and CUDA does not automatically sort that out for you. For the purpose of this explanation, we will fix this with an atomicAdd(), although other methods are possible.
2. The ampersand doesn't belong here:
cudaMemcpy(&d_count, &h_count, sizeof(int), cudaMemcpyHostToDevice);
^
I assume that is just a typo, since you did it correctly on the subsequent cudaMemcpy operation:
cudaMemcpy(&h_count, d_count, sizeof(int), cudaMemcpyDeviceToHost);
3. This methodology (effectively creating a square array of threads using threadIdx.x for one dimension and blockIdx.x for the other) will only work up to an interactions value that leads to an l value of 1024, or less, because CUDA threadblocks are limited to 1024 threads, and you are using l as the size of the threadblock in your kernel launch. To fix this you would want to learn how to create a CUDA 2D grid of arbitrary dimensions, and adjust your kernel launch and in-kernel indexing calculations appropriately. For now we will just make sure that the calculated l value is in range for your code design.
Here's an example addressing the above issues:
$ cat t1590.cu
#include <stdio.h>
__global__ void calculate_pi(int center, int *count) {
int x = threadIdx.x;
int y = blockIdx.x;
if (x*x + y*y <= center*center) {
atomicAdd(count, 1);
}
}
int main() {
int interactions;
printf("Enter the number of interactions: ");
scanf("%d", &interactions);
int l = sqrt(interactions);
if ((l > 1024) || (l < 1)) {printf("Error: interactions out of range\n"); return 0;}
int h_count = 0;
int *d_count;
cudaMalloc(&d_count, sizeof(int));
cudaMemcpy(d_count, &h_count, sizeof(int), cudaMemcpyHostToDevice);
calculate_pi<<<l,l>>>(l/2, d_count);
cudaMemcpy(&h_count, d_count, sizeof(int), cudaMemcpyDeviceToHost);
cudaFree(d_count);
cudaError_t err = cudaGetLastError();
if (err == cudaSuccess){
printf("Sum: %d\n", h_count);
printf("fraction satisfying test: %f\n", h_count/(float)interactions);
}
else
printf("CUDA error: %s\n", cudaGetErrorString(err));
return 0;
}
$ nvcc -o t1590 t1590.cu
$ ./t1590
Enter the number of interactions: 1048576
Sum: 206381
fraction satisfying test: 0.196820
$
We see that the code indicates a calculated fraction of about 0.2. Does this appear to be correct? I claim that it does appear to be correct based on your test. You are effectively creating a grid that represents dimensions of lxl. Your test is asking, effectively, "which points in that grid are within a circle, with the center at the origin (corner) of the grid, and radius l/2 ?"
Pictorially, the selected points form (approximately) a quarter circle of radius l/2 anchored at the origin corner of the grid (the original answer illustrates this with a figure), and it is reasonable to assume that shaded area is somewhat less than 0.25 of the total area, so 0.2 is a reasonable estimate of that area.
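Quantitatively (a quick check, not part of the original answer): a quarter circle of radius l/2 has area (1/4)·π·(l/2)² = π·l²/16, so the expected fraction of the l x l grid falling inside it is π/16 ≈ 0.196, which agrees well with the measured 0.196820.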
As a bonus, here is a version of the code that reduces the restriction listed in item 3 above:
#include <stdio.h>
__global__ void calculate_pi(int center, int *count) {
int x = threadIdx.x+blockDim.x*blockIdx.x;
int y = threadIdx.y+blockDim.y*blockIdx.y;
if (x*x + y*y <= center*center) {
atomicAdd(count, 1);
}
}
int main() {
int interactions;
printf("Enter the number of interactions: ");
scanf("%d", &interactions);
int l = sqrt(interactions);
int h_count = 0;
int *d_count;
const int bs = 32;
dim3 threads(bs, bs);
dim3 blocks((l+threads.x-1)/threads.x, (l+threads.y-1)/threads.y);
cudaMalloc(&d_count, sizeof(int));
cudaMemcpy(d_count, &h_count, sizeof(int), cudaMemcpyHostToDevice);
calculate_pi<<<blocks,threads>>>(l/2, d_count);
cudaMemcpy(&h_count, d_count, sizeof(int), cudaMemcpyDeviceToHost);
cudaFree(d_count);
cudaError_t err = cudaGetLastError();
if (err == cudaSuccess){
printf("Sum: %d\n", h_count);
printf("fraction satisfying test: %f\n", h_count/(float)interactions);
}
else
printf("CUDA error: %s\n", cudaGetErrorString(err));
return 0;
}
This launches a 2D grid based on l, and should work up to at least 1 billion interactions.

Sorting many small arrays in CUDA

I am implementing a median filter in CUDA. For a particular pixel, I extract its neighbors in a window around the pixel, say an N x N (3 x 3) window, so I now have an array of N x N elements. I do not envision using a window larger than 10 x 10 elements for my application.
This array is locally present in the kernel and already loaded into device memory. From previous SO posts I have read, the most common sorting algorithms are implemented by Thrust, but Thrust can only be called from the host (see the thread Thrust inside user written kernels).
Is there a quick and efficient way to sort a small array of N x N elements inside the kernel?
If the number of elements is fixed and small, you can use sorting networks (http://pages.ripco.net/~jgamble/nw.html). A sorting network performs a fixed sequence of compare/swap operations for a fixed number of elements (e.g. 19 compare/swaps for 8 elements).
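To make that concrete, here is a small sketch (my own illustration, not taken from the linked page): an odd-even transposition network for a fixed-size per-thread array, applied in a hypothetical 3 x 3 median filter kernel. For a compile-time N the compiler can unroll the loops into a fixed sequence of compare/swap operations.
// Sort a small fixed-size array held by one thread with an odd-even
// transposition network (N passes of compare/swap on alternating pairs).
template <int N>
__device__ void sort_network(float (&v)[N])
{
    #pragma unroll
    for (int pass = 0; pass < N; ++pass) {
        #pragma unroll
        for (int j = pass & 1; j + 1 < N; j += 2) {
            float lo = fminf(v[j], v[j + 1]);
            float hi = fmaxf(v[j], v[j + 1]);
            v[j] = lo;
            v[j + 1] = hi;
        }
    }
}
// Example use: each thread gathers its 3 x 3 window, sorts it, and takes the middle element.
__global__ void median3x3(const float *in, float *out, int width, int height)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x < 1 || y < 1 || x >= width - 1 || y >= height - 1) return;
    float w[9];
    int k = 0;
    for (int dy = -1; dy <= 1; ++dy)
        for (int dx = -1; dx <= 1; ++dx)
            w[k++] = in[(y + dy) * width + (x + dx)];
    sort_network(w);
    out[y * width + x] = w[4];   // median of 9 elements
}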
Your problem is sorting many small arrays in CUDA.
Following Robert's suggestion in his comment, CUB offers a possible solution to face this problem. Below I report an example that was constructed around Robert's code at cub BlockRadixSort: how to deal with large tile size or sort multiple tiles?.
The idea is assigning the small arrays to be sorted to different thread blocks and then using cub::BlockRadixSort to sort each array. Two versions are provided, one working directly on global memory and one first loading the small arrays into shared memory.
Let me finally note that your statement that CUDA Thrust is not callable from within kernels is no longer true. The post Thrust inside user written kernels that you linked to has been updated with other answers.
#include <cub/cub.cuh>
#include <stdio.h>
#include <stdlib.h>
#include "Utilities.cuh"
using namespace cub;
/**********************************/
/* CUB BLOCKSORT KERNEL NO SHARED */
/**********************************/
template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
__global__ void BlockSortKernel(int *d_in, int *d_out)
{
// --- Specialize BlockLoad, BlockStore, and BlockRadixSort collective types
typedef cub::BlockLoad <int*, BLOCK_THREADS, ITEMS_PER_THREAD, BLOCK_LOAD_TRANSPOSE> BlockLoadT;
typedef cub::BlockStore <int*, BLOCK_THREADS, ITEMS_PER_THREAD, BLOCK_STORE_TRANSPOSE> BlockStoreT;
typedef cub::BlockRadixSort <int , BLOCK_THREADS, ITEMS_PER_THREAD> BlockRadixSortT;
// --- Allocate type-safe, repurposable shared memory for collectives
__shared__ union {
typename BlockLoadT ::TempStorage load;
typename BlockStoreT ::TempStorage store;
typename BlockRadixSortT::TempStorage sort;
} temp_storage;
// --- Obtain this block's segment of consecutive keys (blocked across threads)
int thread_keys[ITEMS_PER_THREAD];
int block_offset = blockIdx.x * (BLOCK_THREADS * ITEMS_PER_THREAD);
BlockLoadT(temp_storage.load).Load(d_in + block_offset, thread_keys);
__syncthreads();
// --- Collectively sort the keys
BlockRadixSortT(temp_storage.sort).Sort(thread_keys);
__syncthreads();
// --- Store the sorted segment
BlockStoreT(temp_storage.store).Store(d_out + block_offset, thread_keys);
}
/*******************************/
/* CUB BLOCKSORT KERNEL SHARED */
/*******************************/
template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
__global__ void shared_BlockSortKernel(int *d_in, int *d_out)
{
// --- Shared memory allocation
__shared__ int sharedMemoryArray[BLOCK_THREADS * ITEMS_PER_THREAD];
// --- Specialize BlockStore and BlockRadixSort collective types
typedef cub::BlockRadixSort <int , BLOCK_THREADS, ITEMS_PER_THREAD> BlockRadixSortT;
// --- Allocate type-safe, repurposable shared memory for collectives
__shared__ typename BlockRadixSortT::TempStorage temp_storage;
int block_offset = blockIdx.x * (BLOCK_THREADS * ITEMS_PER_THREAD);
// --- Load data to shared memory
for (int k = 0; k < ITEMS_PER_THREAD; k++) sharedMemoryArray[threadIdx.x * ITEMS_PER_THREAD + k] = d_in[block_offset + threadIdx.x * ITEMS_PER_THREAD + k];
__syncthreads();
// --- Collectively sort the keys
BlockRadixSortT(temp_storage).Sort(*static_cast<int(*)[ITEMS_PER_THREAD]>(static_cast<void*>(sharedMemoryArray + (threadIdx.x * ITEMS_PER_THREAD))));
__syncthreads();
// --- Write data to shared memory
for (int k = 0; k < ITEMS_PER_THREAD; k++) d_out[block_offset + threadIdx.x * ITEMS_PER_THREAD + k] = sharedMemoryArray[threadIdx.x * ITEMS_PER_THREAD + k];
}
/********/
/* MAIN */
/********/
int main() {
const int numElemsPerArray = 8;
const int numArrays = 4;
const int N = numArrays * numElemsPerArray;
const int numElemsPerThread = 4;
const int RANGE = N * numElemsPerThread;
// --- Allocating and initializing the data on the host
int *h_data = (int *)malloc(N * sizeof(int));
for (int i = 0 ; i < N; i++) h_data[i] = rand() % RANGE;
// --- Allocating the results on the host
int *h_result1 = (int *)malloc(N * sizeof(int));
int *h_result2 = (int *)malloc(N * sizeof(int));
// --- Allocating space for data and results on device
int *d_in; gpuErrchk(cudaMalloc((void **)&d_in, N * sizeof(int)));
int *d_out1; gpuErrchk(cudaMalloc((void **)&d_out1, N * sizeof(int)));
int *d_out2; gpuErrchk(cudaMalloc((void **)&d_out2, N * sizeof(int)));
// --- BlockSortKernel no shared
gpuErrchk(cudaMemcpy(d_in, h_data, N*sizeof(int), cudaMemcpyHostToDevice));
BlockSortKernel<N / numArrays / numElemsPerThread, numElemsPerThread><<<numArrays, numElemsPerArray / numElemsPerThread>>>(d_in, d_out1);
gpuErrchk(cudaMemcpy(h_result1, d_out1, N*sizeof(int), cudaMemcpyDeviceToHost));
printf("BlockSortKernel no shared\n\n");
for (int k = 0; k < numArrays; k++)
for (int i = 0; i < numElemsPerArray; i++)
printf("Array nr. %i; Element nr. %i; Value %i\n", k, i, h_result1[k * numElemsPerArray + i]);
// --- BlockSortKernel with shared
gpuErrchk(cudaMemcpy(d_in, h_data, N*sizeof(int), cudaMemcpyHostToDevice));
shared_BlockSortKernel<N / numArrays / numElemsPerThread, numElemsPerThread><<<numArrays, numElemsPerArray / numElemsPerThread>>>(d_in, d_out2);
gpuErrchk(cudaMemcpy(h_result2, d_out2, N*sizeof(int), cudaMemcpyDeviceToHost));
printf("\n\nBlockSortKernel with shared\n\n");
for (int k = 0; k < numArrays; k++)
for (int i = 0; i < numElemsPerArray; i++)
printf("Array nr. %i; Element nr. %i; Value %i\n", k, i, h_result2[k * numElemsPerArray + i]);
return 0;
}
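Regarding the note above that Thrust is nowadays callable from within kernels, here is a minimal sketch (my own illustration, assuming the sequential execution policy thrust::seq is the mechanism used): each thread sorts one small window in place.
#include <thrust/sort.h>
#include <thrust/execution_policy.h>
// One thread per window: sort windowSize consecutive elements with the sequential policy.
__global__ void sort_windows(float *d_data, int windowSize, int numWindows)
{
    int w = blockIdx.x * blockDim.x + threadIdx.x;
    if (w < numWindows)
        thrust::sort(thrust::seq,
                     d_data + w * windowSize,
                     d_data + (w + 1) * windowSize);
}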
If you are using CUDA 5.x or later, you can use dynamic parallelism: launch child kernels from inside your filter kernel to do the sorting. As for how to sort on the GPU, you can adapt standard recursive, divide-and-conquer approaches.
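A minimal sketch of that idea (my own illustration; dynamic parallelism requires compute capability 3.5+ and compiling with -rdc=true):
// Child kernel: a trivial single-thread insertion sort of one small window.
__global__ void sort_child(float *window, int n)
{
    for (int i = 1; i < n; ++i) {
        float key = window[i];
        int j = i - 1;
        while (j >= 0 && window[j] > key) { window[j + 1] = window[j]; --j; }
        window[j + 1] = key;
    }
}
// Parent (filter) kernel: launches one child kernel per window.
// Purely illustrative; in practice you would batch or limit the child launches.
__global__ void filter_parent(float *d_windows, int windowSize, int numWindows)
{
    int w = blockIdx.x * blockDim.x + threadIdx.x;
    if (w < numWindows)
        sort_child<<<1, 1>>>(d_windows + w * windowSize, windowSize);
}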

Cuda thrust global memory writing very slow

I am currently writing code that calculates an integral histogram on the GPU using the NVIDIA Thrust library.
For this I allocate a contiguous block of device memory, which I repeatedly update with a custom functor.
The problem is that writes to device memory are very slow, while the reads are actually fine.
The basic setup is the following:
struct HistogramCreation
{
HistogramCreation(
...
// pointer to memory
...
){}
/// The actual summation operator
__device__ void operator()(int index){
.. do the calculations ..
for(int j=0;j<30;j++){
(1) *_memoryPointer = values (also using reads to such locations) ;
}
}
};
void foo(){
cudaMalloc(_pointer,size);
HistogramCreation initialCreation( ... _pointer ...);
thrust::for_each(
thrust::make_counting_iterator(0),
thrust::make_counting_iterator(_imageSize),
initialCreation);
}
If I change the write in (1) to the following:
unsigned int val = values;
the performance is much better. This is the only global memory write I have.
With the global memory write I get about 2 s for HD footage.
With the local variable it takes about 50 ms, roughly a factor of 40 less.
Why is this so slow? How could I improve it?
Just as @OlegTitov said, frequent loads/stores to global memory should be avoided as much as possible. When that is unavoidable, coalesced memory access can keep the execution from becoming too slow; however, in most cases histogram calculation makes it hard to achieve coalesced access.
While most of the above basically restates @OlegTitov's answer, I'd just like to share an investigation I did on computing sums with NVIDIA CUDA. The result is quite interesting and I hope it will be helpful information for other CUDA developers.
The experiment was a speed test of computing a sum with various memory access patterns: using global memory (1 thread), L2 cache (atomic ops, 128 threads), and L1 cache (shared memory, 128 threads).
This experiment used a Kepler GTX 680: 1536 cores @ 1.06 GHz, GDDR5 256-bit @ 3 GHz.
Here are the kernels:
__global__
void glob(float *h) {
float* hist = h;
uint sd = SEEDRND;
uint random;
for (int i = 0; i < NUMLOOP; i++) {
if (i%NTHREADS==0) random = rnd(sd);
int rind = random % NBIN;
float randval = (float)(random % 10)*1.0f ;
hist[rind] += randval;
}
}
__global__
void atom(float *h) {
float* hist = h;
uint sd = SEEDRND;
for (int i = threadIdx.x; i < NUMLOOP; i+=NTHREADS) {
uint random = rnd(sd);
int rind = random % NBIN;
float randval = (float)(random % 10)*1.0f ;
atomicAdd(&hist[rind], randval);
}
}
__global__
void shm(float *h) {
int lid = threadIdx.x;
uint sd = SEEDRND;
__shared__ float shm[NTHREADS][NBIN];
for (int i = 0; i < NBIN; i++) shm[lid][i] = h[i];
for (int i = lid; i < NUMLOOP; i+=NTHREADS) {
uint random = rnd(sd);
int rind = random % NBIN;
float randval = (float)(random % 10)*1.0f ;
shm[lid][rind] += randval;
}
/* reduction here */
for (int i = 0; i < NBIN; i++) {
__syncthreads();
if (threadIdx.x < 64) {
shm[threadIdx.x][i] += shm[threadIdx.x+64][i];
}
__syncthreads();
if (threadIdx.x < 32) {
shm[threadIdx.x][i] += shm[threadIdx.x+32][i];
}
__syncthreads();
if (threadIdx.x < 16) {
shm[threadIdx.x][i] += shm[threadIdx.x+16][i];
}
__syncthreads();
if (threadIdx.x < 8) {
shm[threadIdx.x][i] += shm[threadIdx.x+8][i];
}
__syncthreads();
if (threadIdx.x < 4) {
shm[threadIdx.x][i] += shm[threadIdx.x+4][i];
}
__syncthreads();
if (threadIdx.x < 2) {
shm[threadIdx.x][i] += shm[threadIdx.x+2][i];
}
__syncthreads();
if (threadIdx.x == 0) {
shm[0][i] += shm[1][i];
}
}
for (int i = 0; i < NBIN; i++) h[i] = shm[0][i];
}
OUTPUT
atom: 102656.00 shm: 102656.00 glob: 102656.00
atom: 122240.00 shm: 122240.00 glob: 122240.00
... blah blah blah ...
One Thread: 126.3919 msec
Atomic: 7.5459 msec
Sh_mem: 2.2207 msec
The ratio between these kernels is roughly 57 : 3.4 : 1. Many things could be analyzed here, and it certainly does not mean that using the L1 or L2 memory spaces will always give you more than a 10x speedup of the whole program.
And here's the main and other funcs:
#include <iostream>
#include <cstdlib>
#include <cstdio>
using namespace std;
#define NUMLOOP 1000000
#define NBIN 36
#define SEEDRND 1
#define NTHREADS 128
#define NBLOCKS 1
__device__ uint rnd(uint & seed) {
#if LONG_MAX > (16807*2147483647)
int const a = 16807;
int const m = 2147483647;
seed = (long(seed * a))%m;
return seed;
#else
double const a = 16807;
double const m = 2147483647;
double temp = seed * a;
seed = (int) (temp - m * floor(temp/m));
return seed;
#endif
}
... the above kernels ...
int main()
{
float *h_hist, *h_hist2, *h_hist3, *d_hist, *d_hist2,
*d_hist3;
h_hist = (float*)malloc(NBIN * sizeof(float));
h_hist2 = (float*)malloc(NBIN * sizeof(float));
h_hist3 = (float*)malloc(NBIN * sizeof(float));
cudaMalloc((void**)&d_hist, NBIN * sizeof(float));
cudaMalloc((void**)&d_hist2, NBIN * sizeof(float));
cudaMalloc((void**)&d_hist3, NBIN * sizeof(float));
for (int i = 0; i < NBIN; i++) h_hist[i] = 0.0f;
cudaMemcpy(d_hist, h_hist, NBIN * sizeof(float),
cudaMemcpyHostToDevice);
cudaMemcpy(d_hist2, h_hist, NBIN * sizeof(float),
cudaMemcpyHostToDevice);
cudaMemcpy(d_hist3, h_hist, NBIN * sizeof(float),
cudaMemcpyHostToDevice);
cudaEvent_t start, end;
float elapsed = 0, elapsed2 = 0, elapsed3;
cudaEventCreate(&start);
cudaEventCreate(&end);
cudaEventRecord(start, 0);
atom<<<NBLOCKS, NTHREADS>>>(d_hist);
cudaThreadSynchronize();
cudaEventRecord(end, 0);
cudaEventSynchronize(start);
cudaEventSynchronize(end);
cudaEventElapsedTime(&elapsed, start, end);
cudaEventRecord(start, 0);
shm<<<NBLOCKS, NTHREADS>>>(d_hist2);
cudaThreadSynchronize();
cudaEventRecord(end, 0);
cudaEventSynchronize(start);
cudaEventSynchronize(end);
cudaEventElapsedTime(&elapsed2, start, end);
cudaEventRecord(start, 0);
glob<<<1, 1>>>(d_hist3);
cudaThreadSynchronize();
cudaEventRecord(end, 0);
cudaEventSynchronize(start);
cudaEventSynchronize(end);
cudaEventElapsedTime(&elapsed3, start, end);
cudaMemcpy(h_hist, d_hist, NBIN * sizeof(float),
cudaMemcpyDeviceToHost);
cudaMemcpy(h_hist2, d_hist2, NBIN * sizeof(float),
cudaMemcpyDeviceToHost);
cudaMemcpy(h_hist3, d_hist3, NBIN * sizeof(float),
cudaMemcpyDeviceToHost);
/* print output */
for (int i = 0; i < NBIN; i++) {
printf("atom: %10.2f shm: %10.2f glob:
%10.2f¥n",h_hist[i],h_hist2[i],h_hist3[i]);
}
printf("%12s: %8.4f msec¥n", "One Thread", elapsed3);
printf("%12s: %8.4f msec¥n", "Atomic", elapsed);
printf("%12s: %8.4f msec¥n", "Sh_mem", elapsed2);
return 0;
}
When writing GPU code you should avoid unnecessary reads and writes to/from global memory. Global memory is very slow on the GPU; that is a hardware characteristic. The best you can do is to make neighboring threads read/write neighboring addresses in global memory. This results in coalescing and speeds up the process. But in general: read your data once, process it, and write it out once.
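As an illustration of that last point (my own sketch, not from the original answer), compare a coalesced copy with a strided one:
__global__ void copy_coalesced(const float *in, float *out, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) out[i] = in[i];      // consecutive threads touch consecutive addresses: coalesced
}
__global__ void copy_strided(const float *in, float *out, int n, int stride)
{
    int i = (blockIdx.x * blockDim.x + threadIdx.x) * stride;
    if (i < n) out[i] = in[i];      // each thread jumps by stride: poorly coalesced for stride > 1
}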
Note that NVCC might optimize out a lot of your code after you make that modification: it detects that no write to global memory is made and simply removes the "unneeded" code. So this speedup may not really come from avoiding the global write per se.
I would recommend running the profiler on your actual code (the one with the global write) to see if there is anything like unaligned access or another performance problem.

Breakpoints inside CUDA kernel __global__ not hitting

Using Visual Studio 2010, Windows 7, Nsight 2.1.
#include "cuda.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
// incrementArray.cu
#include <stdio.h>
#include <assert.h>
void incrementArrayOnHost(float *a, int N)
{
int i;
for (i=0; i < N; i++) a[i] = a[i]+1.f;
}
__global__ void incrementArrayOnDevice(float *a, int N)
{
int idx = blockIdx.x*blockDim.x + threadIdx.x;
int j = idx;
int i = 2;
i = i+j; //->breakpoint here
if (idx<N) a[idx] = a[idx]+1.f; //->breakpoint here
}
int main(void)
{
float *a_h, *b_h; // pointers to host memory
float *a_d; // pointer to device memory
int i, N = 10;
size_t size = N*sizeof(float);
// allocate arrays on host
a_h = (float *)malloc(size);
b_h = (float *)malloc(size);
// allocate array on device
cudaMalloc((void **) &a_d, size);
// initialization of host data
for (i=0; i<N; i++) a_h[i] = (float)i;
// copy data from host to device
cudaMemcpy(a_d, a_h, sizeof(float)*N, cudaMemcpyHostToDevice);
// do calculation on host
incrementArrayOnHost(a_h, N);
// do calculation on device:
// Part 1 of 2. Compute execution configuration
int blockSize = 4;
int nBlocks = N/blockSize + (N%blockSize == 0?0:1);
// Part 2 of 2. Call incrementArrayOnDevice kernel
incrementArrayOnDevice <<< nBlocks, blockSize >>> (a_d, N);
// Retrieve result from device and store in b_h
cudaMemcpy(b_h, a_d, sizeof(float)*N, cudaMemcpyDeviceToHost);
// check results
for (i=0; i<N; i++) assert(a_h[i] == b_h[i]);
// cleanup
free(a_h); free(b_h); cudaFree(a_d);
return 0;
}
I've tried inserting breakpoints, as marked above, inside my __global__ void incrementArrayOnDevice(float *a, int N), but they are not being hit.
When I run debug (F5) in Visual Studio and try to step into incrementArrayOnDevice <<< nBlocks, blockSize >>> (a_d, N);, the debugger skips the entire kernel code section.
I also tried to add a watch on the variables i and j, but got the error "CXX0017: Error: symbol "i" not found."
Is this issue normal? Can someone please try this on their PC and let me know if they can hit the breakpoints? If you can, what could the problem be on my side? Please help! :(
Nsight debugging is different from VS debugging. You need to use Nsight debugging to hit breakpoints inside the kernel. However, for this you need 2 GPU cards. Do you have 2 cards in the first place? Please check.
You can debug on a single GPU, but only under the following conditions:
You have to be using the CUDA 5.0 toolkit
You have to be running a GPU that supports the 303.xx ForceWare drivers or higher
