Here's the code:
// allocation
// Allocates two tables of string buffers: MAX_PEPPER_SHAKERS buffers of
// MAX_SHAKERNAME_LENGTH chars and MAX_PEPPERS buffers of MAX_PEPPER_LENGTH
// chars. A failed per-string malloc is reported through fatalError().
// NOTE: both parameters are received by value, so the assignments below only
// change this function's own copies; the caller's variables are not modified.
void allocateSymbolStorage(char **pepperShakerList, char **pepperList)
{
// allocate storage for an array of pointers
// (the result of this malloc is used below without a NULL check)
pepperShakerList = (char **) malloc(MAX_PEPPER_SHAKERS * sizeof(char *));
for (int i = 0; i < MAX_PEPPER_SHAKERS; i++)
{
if ((pepperShakerList[i] = (char *) malloc(MAX_SHAKERNAME_LENGTH * sizeof(char))) == NULL)
fatalError("failed pepperShakerList alloc");
}
// allocate storage for an array of pointers
// (the result of this malloc is used below without a NULL check)
pepperList = (char **) malloc(MAX_PEPPERS * sizeof(char *));
for (int i = 0; i < MAX_PEPPERS; i++)
{
if ((pepperList[i] = (char *) malloc(MAX_PEPPER_LENGTH * sizeof(char))) == NULL)
fatalError("failed pepperList alloc");
}
}
// Allocates the symbol tables, works with them (elided), then frees them.
void buildPepperShakers(void)
{
// Both pointers are uninitialized here and are handed over by value, so the
// allocation call cannot store anything into them; the values later passed to
// freeSymbolStorage() are still the uninitialized ones.
char **pepperShakerList, **pepperList;
allocateSymbolStorage(pepperShakerList, pepperList);
// ....
freeSymbolStorage(pepperShakerList, pepperList);
}
Here's the VS 2010 error:
: warning C4700: uninitialized local variable 'pepperList' used
Here's the confusion:
Why the error if the char ** is being allocated in the allocate function? Is it a matter of the thing falling out of scope?
Assuming you mean pepperList rather than symbolList, and assuming that the code in allocateSymbolStorage reflects what you want to do, then VC is complaining correctly.
As it stands your code would crash because in buildPepperShakers() you are NOT getting any values back from allocateSymbolStorage.
So your allocateSymbolStorage should be declared as:
void allocateSymbolStorage(char ***pepperShakerList, char ***pepperList)
THEN you pass the addresses of local pointer-holder variables in buildPepperShakers, namely pepperList and pepperShakerList to the allocation function, so that it can THEN do allocations as per TJD's answer. That is:
// Caller side of the fix: pass the ADDRESSES of the two local pointers so the
// allocator can write the allocated tables back into them.
void buildPepperShakers(void) {
char **pepperShakerList, **pepperList;
allocateSymbolStorage(&pepperShakerList, &pepperList);
}
of course your allocateSymbolStorage body now becomes:
// Allocates the two string tables and hands them back through the
// out-parameters.
//
//   pepperShakerList_p  receives MAX_PEPPER_SHAKERS buffers of
//                       MAX_SHAKERNAME_LENGTH chars each
//   pepperList_p        receives MAX_PEPPERS buffers of MAX_PEPPER_LENGTH
//                       chars each
//
// Both out-parameters are set to NULL first, so the caller never sees an
// indeterminate pointer. Every allocation failure is reported through
// fatalError(); if fatalError() returns after a failed table allocation, the
// function returns early instead of dereferencing a NULL table.
void allocateSymbolStorage(char ***pepperShakerList_p, char ***pepperList_p)
{
    char **pepperShakerList, **pepperList;

    *pepperShakerList_p = NULL;
    *pepperList_p = NULL;

    // allocate storage for an array of pointers
    pepperShakerList = (char **) malloc(MAX_PEPPER_SHAKERS * sizeof(char *));
    if (pepperShakerList == NULL)
    {
        fatalError("failed pepperShakerList alloc");
        return;
    }
    for (int i = 0; i < MAX_PEPPER_SHAKERS; i++)
    {
        if ((pepperShakerList[i] = (char *) malloc(MAX_SHAKERNAME_LENGTH * sizeof(char))) == NULL)
            fatalError("failed pepperShakerList alloc");
    }
    *pepperShakerList_p = pepperShakerList;

    // allocate storage for an array of pointers
    pepperList = (char **) malloc(MAX_PEPPERS * sizeof(char *));
    if (pepperList == NULL)
    {
        fatalError("failed pepperList alloc");
        return;
    }
    for (int i = 0; i < MAX_PEPPERS; i++)
    {
        if ((pepperList[i] = (char *) malloc(MAX_PEPPER_LENGTH * sizeof(char))) == NULL)
            fatalError("failed pepperList alloc");
    }
    *pepperList_p = pepperList;
}
and now VC should not complain. Although this is an ugly way of doing memory management of your objects :-)
This is what you are intending, you need to dereference the pointer you pass in:
// The block holds MAX_PEPPER_SHAKERS pointers to char, so its type is char **.
*pepperShakerList = (char **) malloc(MAX_PEPPER_SHAKERS * sizeof(char *));
Related
In copyuvm function setupkvm is called to set kernel virtual memory. Why do we need to setup kernel virtual memory when we are copying user process ? Why didn't we need that when we were doing allocuvm ?
Code for copyuvm
// Given a parent process's page table, create a copy
// of it for a child.
pde_t*
copyuvm(pde_t *pgdir, uint sz)
{
  pde_t *child;
  uint va;

  // Start from a page directory produced by setupkvm(); 0 means it failed.
  child = setupkvm();
  if(child == 0)
    return 0;

  // Duplicate every page in [0, sz) of the parent into the child.
  for(va = 0; va < sz; va += PGSIZE){
    pte_t *entry;
    uint phys, perm;
    char *page;

    entry = walkpgdir(pgdir, (void *) va, 0);
    if(entry == 0)
      panic("copyuvm: pte should exist");
    if((*entry & PTE_P) == 0)
      panic("copyuvm: page not present");

    phys = PTE_ADDR(*entry);
    perm = PTE_FLAGS(*entry);

    page = kalloc();
    if(page == 0){
      freevm(child);
      return 0;
    }
    memmove(page, (char*)P2V(phys), PGSIZE);

    // Map the fresh copy at the same virtual address with the same flags.
    if(mappages(child, (void*)va, PGSIZE, V2P(page), perm) < 0){
      kfree(page);   // not mapped yet, so freevm() would not release it
      freevm(child);
      return 0;
    }
  }
  return child;
}
and for allocuvm
// Grow the mapping described by pgdir from oldsz to newsz bytes by mapping
// zero-filled pages. Returns newsz on success, oldsz when no growth was
// requested, and 0 on failure.
int
allocuvm(pde_t *pgdir, uint oldsz, uint newsz)
{
  uint va;

  if(newsz >= KERNBASE)
    return 0;
  if(newsz < oldsz)
    return oldsz;

  for(va = PGROUNDUP(oldsz); va < newsz; va += PGSIZE){
    char *page = kalloc();
    if(page == 0){
      cprintf("allocuvm out of memory\n");
      deallocuvm(pgdir, newsz, oldsz);
      return 0;
    }
    memset(page, 0, PGSIZE);
    if(mappages(pgdir, (char*)va, PGSIZE, V2P(page), PTE_W|PTE_U) >= 0)
      continue;

    // Mapping failed: undo the growth so far and release the unmapped page.
    cprintf("allocuvm out of memory (2)\n");
    deallocuvm(pgdir, newsz, oldsz);
    kfree(page);
    return 0;
  }
  return newsz;
}
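copyuvm builds a complete new address space (user + kernel) for the child, starting from a fresh page directory. That new directory has no kernel mappings yet, so copyuvm calls setupkvm to create the kernel part and then copies the user part page by page.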
On the other hand, allocuvm just extends existing virtual memory ( specifically heap portion). Since there already exists kernel portion of mappings in allocuvm, we are not bound to call setupkvm.
I have a dynamically allocated array that is sent by rank 0 to other ranks using MPI_Send()
On the receiving side, a dynamic array is allocated memory using malloc()
MPI_Recv() happens on the other ranks. At this receive function, I get invalid Buffer Pointer error.
Code is conceptually similar to this:
// Conceptual sketch of the failing program (not compilable as written):
// rank 0 fills an int array and sends it, rank 1 allocates and receives it.
// NOTE(review): without `typedef`, a_graph is declared here as a variable of
// type struct graph, yet it is used as a type name below.
struct graph{
int count;
int * array;
} a_graph;
int x = 10;
MPI_Status status;
//ONLY 2 RANKS ARE PRESENT. RANK 0 SENDS MSG TO RANK 1
if (rank == 0){
// NOTE(review): sizeof(my_graph) is the size of the pointer, not of the struct.
a_graph * my_graph = malloc(sizeof(my_graph))
my_graph->count = x;
my_graph->array = malloc(sizeof(int)*my_graph->count);
for(int i =0; i < my_graph->count; i++)
my_graph->array[i] = i;
// NOTE(review): the third argument must be an MPI datatype handle such as
// MPI_INT; `int` is a C type name.
MPI_Send(my_graph->array,my_graph->count,int,1,0,MPI_COMM_WORLD);
free(my_graph->array);
free(my_graph);
}
else if (rank == 1){
// NOTE(review): same pointer-size allocation as on rank 0.
a_graph * my_graph = malloc(sizeof(my_graph))
my_graph->count = x;
my_graph->array = malloc(sizeof(int)*my_graph->count);
MPI_Recv(my_graph->array,my_graph->count,int,0,0,MPI_COMM_WORLD,&status) // MPI INVALID BUFFER POINTER ERROR HAPPENS AT THIS RECV
}
I dont understand why this happens since memory is allocated in both sender and receiver ranks
Below is a minimal, working, and verifiable (MWVE) example which Zulan suggested you to make. Please provide MWVE in your future questions. Anyway, you need to use MPI datatype MPI_INT instead of int for sending and receiving.
#include <mpi.h>
#include <stdlib.h>
#include <stdio.h>
// Graph payload exchanged between the ranks.
typedef struct graph{
int count;   // number of ints stored in `array`
int * array; // heap-allocated buffer of `count` ints
} a_graph;
// Two-rank demo: rank 0 fills an array with 0..count-1 and sends it to rank 1,
// which receives it into its own buffer and prints it. Both ranks release
// their buffers before MPI_Finalize(); any other rank does nothing.
int main()
{
    MPI_Init(NULL, NULL);
    int rank;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    int x = 10;
    MPI_Status status;
    //ONLY 2 RANKS ARE PRESENT. RANK 0 SENDS MSG TO RANK 1
    if (rank == 0 || rank == 1){
        // Sender and receiver need identically sized buffers.
        a_graph * my_graph = malloc(sizeof *my_graph);
        if (my_graph == NULL){
            fprintf(stderr, "rank %d: out of memory\n", rank);
            MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
        }
        my_graph->count = x;
        my_graph->array = malloc(sizeof(int)*my_graph->count);
        if (my_graph->array == NULL){
            fprintf(stderr, "rank %d: out of memory\n", rank);
            MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
        }

        if (rank == 0){
            for(int i =0; i < my_graph->count; i++)
                my_graph->array[i] = i;
            // MPI_INT is the MPI datatype handle that describes a C int.
            MPI_Send(my_graph->array,my_graph->count,MPI_INT,1,0,MPI_COMM_WORLD);
        }
        else{
            MPI_Recv(my_graph->array,my_graph->count,MPI_INT,0,0,MPI_COMM_WORLD,&status);
            for (int i=0; i<my_graph->count; ++i)
            {
                printf("%i\n", my_graph->array[i]);
            }
        }

        // Release on both ranks (the receiver previously leaked its buffers).
        free(my_graph->array);
        free(my_graph);
    }
    MPI_Finalize();
    return 0;
}
I've been trying to profile an OpenCL host code for FIR filtering on MAC, Ubuntu and other platforms. My Host code and kernel are as below.
The issue is that irrespective of the number of samples that I provide for the FIR filter, clEnqueueNDRangeKernel ends up taking the same amount of time. I've also profiled clEnqueueReadBuffer and clEnqueueWriteBuffer, and somehow they also end up taking the same amount of time. On the Mac I'm profiling with mach as well as using OpenCL events; on Ubuntu I'm profiling with PAPI. I'm unable to understand why this is happening: ideally, with an increase in the number of samples, clEnqueueReadBuffer and clEnqueueWriteBuffer should take more time, and so should kernel execution.
Kernel:-
// 4-tap FIR filter: output[i] = sum_j coeff[j] * input[i + 4 - j - 1].
//
// The sum is built in a private accumulator and stored with a plain
// assignment, so the result does not depend on whatever output[] held before
// the launch. Taps that would fall at or beyond the global work size are
// treated as zero instead of being read out of bounds.
// Assumes the launch is 1-D with no global offset and a global size equal to
// the number of elements in input/output - confirm against the host code.
__kernel void fir4(
    __global float* input,
    __global float* output)
{
    const float coeff[4] = {5.0f, 7.0f, 5.0f, 7.0f};
    const int i = get_global_id(0);
    const int n = get_global_size(0);
    float acc = 0.0f;
    for (int j = 0; j < 4; j++)
    {
        const int idx = i + 4 - j - 1;
        if (idx < n)
            acc += coeff[j] * input[idx];
    }
    output[i] = acc;
}
// 8-tap FIR filter: output[i] = sum_j coeff[j] * input[i + 8 - j - 1].
//
// The sum is built in a private accumulator and stored with a plain
// assignment, so the result does not depend on whatever output[] held before
// the launch. Taps that would fall at or beyond the global work size are
// treated as zero instead of being read out of bounds.
// Assumes the launch is 1-D with no global offset and a global size equal to
// the number of elements in input/output - confirm against the host code.
__kernel void fir8(
    __global float* input,
    __global float* output)
{
    const float coeff[8] = {5.0f, 7.0f, 5.0f, 7.0f, 5.0f, 7.0f, 5.0f, 7.0f};
    const int i = get_global_id(0);
    const int n = get_global_size(0);
    float acc = 0.0f;
    for (int j = 0; j < 8; j++)
    {
        const int idx = i + 8 - j - 1;
        if (idx < n)
            acc += coeff[j] * input[idx];
    }
    output[i] = acc;
}
// 12-tap FIR filter: output[i] = sum_j coeff[j] * input[i + 12 - j - 1].
//
// The sum is built in a private accumulator and stored with a plain
// assignment, so the result does not depend on whatever output[] held before
// the launch. Taps that would fall at or beyond the global work size are
// treated as zero instead of being read out of bounds.
// Assumes the launch is 1-D with no global offset and a global size equal to
// the number of elements in input/output - confirm against the host code.
__kernel void fir12(
    __global float* input,
    __global float* output)
{
    const float coeff[12] = {5.0f, 7.0f, 5.0f, 7.0f, 5.0f, 7.0f,
                             5.0f, 7.0f, 5.0f, 7.0f, 5.0f, 7.0f};
    const int i = get_global_id(0);
    const int n = get_global_size(0);
    float acc = 0.0f;
    for (int j = 0; j < 12; j++)
    {
        const int idx = i + 12 - j - 1;
        if (idx < n)
            acc += coeff[j] * input[idx];
    }
    output[i] = acc;
}
Host Code:-
// Use a static data size for simplicity
//
#define DATA_SIZE (48000)
#define NUM_COEFF (4)
int main(int argc, char** argv)
{
uint64_t start;
uint64_t end;
uint64_t elapsed;
double elapsedmilli;
int err; // error code returned from api calls
float data[DATA_SIZE]; // original data set given to device
float coeff[NUM_COEFF];
float results_host[DATA_SIZE] = {};
float results[DATA_SIZE]; // results returned from device
unsigned int correct; // number of correct results returned
size_t global; // global domain size for our calculation
size_t local; // local domain size for our calculation
cl_event event; //Linking event to kernel for profiling
cl_platform_id platform_id = NULL; // compute device platform id
cl_device_id device_id; // compute device id
cl_context context; // compute context
cl_command_queue commands; // compute command queue
cl_program program; // compute program
cl_kernel kernel; // compute kernel
cl_mem input; // device memory used for the input array
cl_mem output; // device memory used for the output array
// Fill our data set with random float values
//
int i,j = 0;
unsigned int count = DATA_SIZE;
unsigned int taps = NUM_COEFF;
for(i = 0; i < count; i++)
data[i] = rand() / (float)RAND_MAX;
for(i=0; i < taps; i++)
{
if(!(i%2))
coeff[i] = 5;
else
coeff[i] = 7;
}
//Connect to a platform on device
err = clGetPlatformIDs(1, &platform_id, NULL);
if (err != CL_SUCCESS)
{
printf("Error: Failed to locate opencl platform!\n");
return EXIT_FAILURE;
}
// Connect to a compute device
//
int gpu = 0;
err = clGetDeviceIDs(platform_id, gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 1, &device_id, NULL);
if (err != CL_SUCCESS)
{
printf("Error: Failed to create a device group!\n");
return EXIT_FAILURE;
}
// Create a compute context
//
context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
if (!context)
{
printf("Error: Failed to create a compute context!\n");
return EXIT_FAILURE;
}
// Create a command commands
//
commands = clCreateCommandQueue(context, device_id, CL_QUEUE_PROFILING_ENABLE, &err);
if (!commands)
{
printf("Error: Failed to create a command commands!\n");
return EXIT_FAILURE;
}
//Use function and load the kernel source from .cl files in the same folder
//
char *KernelSource = load_program_source("fir.cl");
// Create the compute program from the source buffer
//
program = clCreateProgramWithSource(context, 1, (const char **) & KernelSource, NULL, &err);
if (!program)
{
printf("Error: Failed to create compute program!\n");
return EXIT_FAILURE;
}
// Build the program executable
//
err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
if (err != CL_SUCCESS)
{
size_t len;
char buffer[2048];
printf("Error: Failed to build program executable!\n");
clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
printf("%s\n", buffer);
exit(1);
}
// Create the compute kernel in the program we wish to run
//
switch(taps)
{
case(4):
{
kernel = clCreateKernel(program, "fir4", &err);
break;
}
case(8):
{
kernel = clCreateKernel(program, "fir8", &err);
break;
}
case(12):
{
kernel = clCreateKernel(program, "fir12", &err);
break;
}
default:
{
kernel = clCreateKernel(program, "fir4", &err);
break;
}
}
if (!kernel || err != CL_SUCCESS)
{
printf("Error: Failed to create compute kernel! - %d\n",err);
exit(1);
}
// Create the input and output arrays in device memory for our calculation
//
input = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * count, NULL, NULL);
output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * count, NULL, NULL);
if (!input || !output)
{
printf("Error: Failed to allocate device memory!\n");
exit(1);
}
// Write our data set into the input array in device memory
//
err = clEnqueueWriteBuffer(commands, input, CL_TRUE, 0, sizeof(float) * count, data, 0, NULL, NULL);
if (err != CL_SUCCESS)
{
printf("Error: Failed to write to source array!\n");
exit(1);
}
// Set the arguments to our compute kernel
//
err = 0;
err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input);
err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &output);
if (err != CL_SUCCESS)
{
printf("Error: Failed to set kernel arguments! %d\n", err);
exit(1);
}
// Get the maximum work group size for executing the kernel on the device
//
err = clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL);
if (err != CL_SUCCESS)
{
printf("Error: Failed to retrieve kernel work group info! %d\n", err);
exit(1);
}
// Execute the kernel over the entire range of our 1d input data set
// using the maximum number of work group items for this device
//
global = count;
local = 48;
start = mach_absolute_time();
err = clEnqueueNDRangeKernel(commands, kernel, 1, NULL, &global, &local, 0, NULL, &event);
if (err)
{
printf("Error: Failed to execute kernel!-%d\n",err);
return EXIT_FAILURE;
}
// Wait for the command commands to get serviced before reading back results
//
clWaitForEvents(1, &event);
clFinish(commands);
end = mach_absolute_time();
cl_ulong time_start, time_end;
double total_time;
clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(time_start), &time_start, NULL);
clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(time_end), &time_end, NULL);
total_time = time_end - time_start;
printf("cl:main timing:opencl clEnqueueNDRangeKernel %0.3f us\n", total_time / 1000.0);
elapsed = end - start;
struct mach_timebase_info info;
mach_timebase_info(&info);
double t = 1e-9 * (elapsed) * info.numer / info.denom;
elapsedmilli = 1e-6 * (elapsed) * info.numer / info.denom;
printf("cl:main timing:MACH clEnqueueNDRangeKernel %f ms, %d elapsed\n",elapsedmilli,elapsed);
// Read back the results from the device to verify the output
//
err = clEnqueueReadBuffer( commands, output, CL_TRUE, 0, sizeof(float) * count, results, 0, NULL, NULL );
if (err != CL_SUCCESS)
{
printf("Error: Failed to read output array! %d\n", err);
exit(1);
}
// Validate our results
//
correct = 0;
for(i=0; i<DATA_SIZE; i++)
{
for(j=0;j<NUM_COEFF;j++)
{
results_host[i]+=coeff[j]*(data[i+NUM_COEFF-j-1]);
}
//printf("Host Output[%d]-%f\n",i,results_host[i]);
}
for(i = 0; i < count; i++)
{
if(results[i] == results_host[i])
correct++;
//printf("CL Output[%d]-%f\n",i,results[i]);
}
// Print a brief summary detailing the results
//
printf("Computed '%d/%d' correct values! Samples-%d,Taps-%d\n", correct, count, DATA_SIZE, NUM_COEFF);
// Shutdown and cleanup
//
clReleaseMemObject(input);
clReleaseMemObject(output);
clReleaseProgram(program);
clReleaseKernel(kernel);
clReleaseCommandQueue(commands);
clReleaseContext(context);
return 0;
}
Adding just 10-20 multiplications and additions per item is not comparable to kernel overhead time. Try with 100 or 1000-wide coefficients array.
Using more input elements per item with that way, just increases cache hit numbers(also ratio) because more threads read from same locations.
If DATA_SIZE is several millions, then all data could not fit in cache and become slower linearly with its length. 48000 means less than 200kB. A HD5850 has 512 k L2 cache(3x bandwidth of memory) and 8kB L1 per compute unit(too fast) for example.
I have a piece of code for parallel hashing, the insert code is as follows:
// Sketch of the benchmark driver: fills keys/values, then inserts all N pairs
// into the hash table from an OpenMP parallel loop. Elided parts are marked
// with dots.
int main(int argc, char** argv){
.....
Entry* table;//hash table
for(size_t i=0;i<N;i++){
keys[i]=i;
values[i] = rand();//random key-value pairs
}
int omp_p = omp_get_max_threads();
// One loop iteration per thread slot; each iteration handles one slice of
// the index range [0, N).
#pragma omp parallel for
for(int p=0;p<omp_p;p++){
size_t start = p*N/omp_p;
size_t end = (p+1)*N/omp_p;//each thread gets contiguous chunks of the arrays
for(size_t i=start;i<end;i++){
size_t key = keys[i];
size_t value = values[i];
// insert() returning 0 is reported as a failure
if(insert(table,key,value) == 0){
printf("Failure!\n");
}
}
}
....
return 0;
}
/*
 * Inserts (key, value) into the table using cuckoo hashing.
 *
 * The pair is packed into one Entry (key in the bits above bit 31, value
 * added below) and atomically exchanged with the slot chosen by the first
 * hash function. If the slot was empty the insert is done. Otherwise the
 * entry that was displaced is re-inserted at its next candidate location,
 * for at most MAX_ITERATIONS exchanges.
 *
 * Returns 1 on success, 0 if no empty slot was found within MAX_ITERATIONS.
 */
int insert(Entry* table,size_t key, size_t value){
    Entry entry = (((Entry)key) << 32)+value; //Coalesce key and value into an entry

    /*Use cuckoo hashing*/
    size_t location = hash_function_1(key);
    for(size_t its=0;its<MAX_ITERATIONS;its++){
        /* Atomically store our entry and fetch what was in the slot before */
        entry = __sync_lock_test_and_set(&table[location],entry);
        key=get_key(entry);
        if(key == KEY_EMPTY)
            return 1;

        /*We have replaced a valid key, try to hash it using next available hash function*/
        size_t location_1 = hash_function_1(key);
        size_t location_2 = hash_function_2(key);
        size_t location_3 = hash_function_3(key);
        if(location == location_1) location = location_2;
        else if(location == location_2) location = location_3;
        else location = location_1;
    }
    return 0;
}
The insert code doesn't scale at all. If I use a single thread, for say, 10M keys, I complete in about 170ms, whereas using 16 threads, I take > 500ms. My suspicion is that this is because the cache line (consisting of the table[] array) is being moved around between the threads during the write operation (__sync_lock_test_and_set(...)) and the invalidation results in a slow down
For example if I modify the insert code to just:
/* Simplified insert: pack key and value into one Entry and store it,
 * unconditionally, in the slot chosen by the first hash function.
 * Always returns 1. */
int insert(Entry* table,size_t key, size_t value){
    const Entry packed = (((Entry)key) << 32) + value;
    const size_t slot = hash_function_1(key);

    table[slot] = packed;
    return 1;
}
I still get the same bad performance. Since this is hashing, I cannot control, where a particular element hashes to. So any suggestions? Also, if this isn't the right reason, any other pointers as to what might be going wrong? I have tried it from 1M to 100M keys, but the single threaded performance is always better.
I have a few suggestions. First, since the run time of your insert function is not constant, you should use schedule(dynamic). Second, you should let OpenMP divide the tasks and not do it yourself (one reason, but not the main reason, is that the way you have it now N has to be a multiple of omp_p). If you want to have some control over how it divides the tasks, then try changing the chunk size like this: schedule(dynamic,n), where n is the chunk size.
#pragma omp parallel for schedule(dynamic)
for(size_t i=0;i<N;i++){
size_t key = keys[i];
size_t value = values[i];
if(insert(table,key,value) == 0){
printf("Failure!\n");
}
}
I would try experimenting with a strategy based on locks, like this simple snippet shows:
#include<omp.h>
#define NHASHES 4
#define NTABLE 1000000
typedef size_t (hash_f)(size_t);
/*
 * Lock-per-slot insertion sketch: every table slot has its own OpenMP lock,
 * and each key tries its NHASHES candidate slots in order until it finds an
 * empty one. Elided parts are marked with comment placeholders.
 */
int main(int argc, char** argv) {
  /* static storage: NTABLE entries plus NTABLE locks are too large to place
     safely on the stack */
  static Entry      table [NTABLE ];
  static omp_lock_t locks [NTABLE ];
  /* hash_f is a function type, so the array must hold pointers to it */
  hash_f           *hashes[NHASHES];
  /* ... */
  for(size_t ii = 0; ii < N; ii++) {
    keys  [ii] = ii;
    values[ii] = rand();
  }
  for(size_t ii = 0; ii < NTABLE; ii++) {
    omp_init_lock(&locks[ii]);
  }
#pragma omp parallel
  {
#pragma omp for schedule(static)
    for(int ii = 0; ii < N; ii++) {
      size_t key   = keys  [ii];
      size_t value = values[ii];
      Entry  entry = (((Entry)key) << 32) + value;
      int inserted = 0;
      for (int jj = 0; jj < NHASHES && !inserted; jj++) {
        /* presumably each hash returns an index below NTABLE - confirm */
        size_t location = hashes[jj](key); // I assume this is the computationally demanding part
        omp_set_lock(&locks[location]);    // Locks the hash table location before working on it
        if ( get_key(table[location]) == KEY_EMPTY ) {
          table[location] = entry;
          inserted = 1;
        }
        /* Always release the lock, including on success: leaving the loop
           while still holding it would block every later access to the slot */
        omp_unset_lock(&locks[location]);  // Unlocks the hash table location
      }
      // Handle failures here (inserted == 0 means no candidate slot was free)
    }
  } /* pragma omp parallel */
  for(size_t ii = 0; ii < NTABLE; ii++) {
    omp_destroy_lock(&locks[ii]);
  }
  /* ... */
  return 0;
}
With a little more machinery you can handle a variable number of locks ranging from 1 (equivalent to a critical section) to NTABLE (equivalent to an atomic construct) and see if the granularity in-between provides some benefit.
I am currently writing a code, that calculates a integral Histogram on the GPU using the Nvidia thrust library.
Therefore I allocate a continuous Block of device memory which I update with a custom functor all the time.
The problem is, that the write to the device memory is veeery slow, but the reads are actually ok.
The basic setup is the following:
// Pseudo-code sketch of the functor passed to thrust::for_each: it is
// constructed with (among other things) a pointer to device memory and is
// invoked once per index.
struct HistogramCreation
{
HistogramCreation(
...
// pointer to memory
...
){}
/// The actual summation operator
__device__ void operator()(int index){
.. do the calculations ..
// Every invocation performs 30 stores through _memoryPointer; statement (1)
// is the write whose cost the question is about.
for(int j=0;j<30;j++){
(1) *_memoryPointer = values (also using reads to such locations) ;
}
}
}
// Pseudo-code driver: allocates the device block, builds the functor around
// it, and applies the functor to every index in [0, _imageSize).
void foo(){
// NOTE(review): cudaMalloc expects the ADDRESS of the pointer to fill in
// (a void**); confirm that _pointer is passed that way in the real code.
cudaMalloc(_pointer,size);
HistogramCreation initialCreation( ... _pointer ...);
thrust::for_each(
thrust::make_counting_iterator(0),
thrust::make_counting_iterator(_imageSize),
initialCreation);
}
If I change the write in (1) to the following:
unsigned int val = values;
The performance is much better. This is the only global memory write I have.
Using the memory write I get about 2s for HD Footage.
using the local variable it takes about 50 ms so about a factor of 40 less.
Why is this so slow? how could I improve it?
Just as @OlegTitov said, frequent load/store with global
memory should be avoided as much as possible. When there's a
situation where it's inevitable, then coalesced memory
access can help the execution process not to get too slow;
however, in most cases, histogram calculation makes it pretty
tough to achieve coalesced access.
While most of the above is basically just restating
@OlegTitov's answer, I'd just like to share an
investigation I did into computing a summation with NVIDIA
CUDA. Actually the result is pretty interesting and I hope
it'll be helpful information for other CUDA developers.
The experiment was basically to run a speed test of finding
summation with various memory access patterns: using global
memory (1 thread), L2 cache (atomic ops - 128 threads), and
L1 cache (shared mem - 128 threads)
This experiment used:
Kepler GTX 680,
1536 cores @ 1.06GHz
GDDR5 256-bit @ 3GHz
Here are the kernels:
// Single-thread baseline: one thread performs all NUMLOOP histogram updates
// directly on h, drawing a fresh pseudo-random number every NTHREADS
// iterations and reusing it in between.
__global__
void glob(float *h) {
    uint state = SEEDRND;
    uint draw;
    for (int iter = 0; iter < NUMLOOP; ++iter) {
        if (iter % NTHREADS == 0) {
            draw = rnd(state);
        }
        const int bin = draw % NBIN;
        const float weight = (float)(draw % 10) * 1.0f;
        h[bin] += weight;
    }
}
// Atomic variant: the threads split the NUMLOOP updates between them
// (thread t handles iterations t, t + NTHREADS, ...) and add into h with
// atomicAdd. Every thread starts from the same seed.
__global__
void atom(float *h) {
    uint state = SEEDRND;
    int iter = threadIdx.x;
    while (iter < NUMLOOP) {
        const uint draw = rnd(state);
        const int bin = draw % NBIN;
        const float weight = (float)(draw % 10) * 1.0f;
        atomicAdd(&h[bin], weight);
        iter += NTHREADS;
    }
}
// Shared-memory variant: every thread accumulates a private histogram row in
// shared memory, the rows are tree-reduced into row 0, and row 0 is added to
// the global histogram h.
//
// Launch requirements: one block of exactly NTHREADS threads, with NTHREADS a
// power of two. Uses NTHREADS * NBIN floats of static shared memory.
//
// Differences from the naive version:
//  - rows start at zero and the totals are ADDED to h, so a non-zero initial
//    h is counted once instead of NTHREADS times;
//  - a barrier separates the last reduction step from the final read of
//    row 0, and each bin of h is written by a single thread;
//  - barriers sit outside the `lid < stride` branch, so all threads reach them.
__global__
void shm(float *h) {
    const int lid = threadIdx.x;
    uint sd = SEEDRND;
    __shared__ float partial[NTHREADS][NBIN];

    for (int i = 0; i < NBIN; i++) partial[lid][i] = 0.0f;

    // Each thread only touches its own row here, so no barrier is needed yet.
    for (int i = lid; i < NUMLOOP; i += NTHREADS) {
        uint random = rnd(sd);
        int rind = random % NBIN;
        float randval = (float)(random % 10) * 1.0f;
        partial[lid][rind] += randval;
    }

    /* reduction: fold the upper half of the rows onto the lower half */
    for (int stride = NTHREADS / 2; stride > 0; stride >>= 1) {
        __syncthreads();
        if (lid < stride) {
            for (int i = 0; i < NBIN; i++)
                partial[lid][i] += partial[lid + stride][i];
        }
    }
    __syncthreads();  // row 0 is complete only after this barrier

    for (int i = lid; i < NBIN; i += NTHREADS) h[i] += partial[0][i];
}
OUTPUT
atom: 102656.00 shm: 102656.00 glob: 102656.00
atom: 122240.00 shm: 122240.00 glob: 122240.00
... blah blah blah ...
One Thread: 126.3919 msec
Atomic: 7.5459 msec
Sh_mem: 2.2207 msec
The ratio between these kernels is roughly 57:3.4:1 (one
thread : atomic : shared memory). Many things can be analyzed
here, and it certainly does not mean that using L1 or L2 memory
spaces will always give you more than a 10x speedup of the whole program.
And here's the main and other funcs:
#include <iostream>
#include <cstdlib>
#include <cstdio>
using namespace std;
#define NUMLOOP 1000000
#define NBIN 36
#define SEEDRND 1
#define NTHREADS 128
#define NBLOCKS 1
// Minimal-standard Lehmer generator: seed <- (seed * 16807) mod (2^31 - 1).
// Updates `seed` in place and returns the new value.
//
// The product is formed in 64-bit unsigned arithmetic. Multiplying the 32-bit
// seed by the multiplier first and widening afterwards would wrap around
// before the modulo is taken, and the width of `long` differs between
// platforms, so no preprocessor switch on LONG_MAX is used.
// Also callable from host code, which makes the sequence easy to check.
__host__ __device__ uint rnd(uint & seed) {
    const unsigned long long a = 16807ULL;
    const unsigned long long m = 2147483647ULL;
    seed = (uint)((seed * a) % m);
    return seed;
}
... the above kernels ...
// Aborts with a readable message when a CUDA runtime call fails.
static void checkCuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Launches `kernel` on d_hist with the given configuration and returns the
// time in milliseconds between two events recorded around the launch.
static float timeKernel(void (*kernel)(float *), int blocks, int threads, float *d_hist)
{
    cudaEvent_t start, end;
    float msec = 0.0f;

    checkCuda(cudaEventCreate(&start), "cudaEventCreate");
    checkCuda(cudaEventCreate(&end), "cudaEventCreate");

    checkCuda(cudaEventRecord(start, 0), "cudaEventRecord");
    kernel<<<blocks, threads>>>(d_hist);
    checkCuda(cudaGetLastError(), "kernel launch");
    checkCuda(cudaEventRecord(end, 0), "cudaEventRecord");

    // Waiting on `end` also waits for the kernel, so no separate device-wide
    // synchronization is needed; execution errors surface here.
    checkCuda(cudaEventSynchronize(end), "cudaEventSynchronize");
    checkCuda(cudaEventElapsedTime(&msec, start, end), "cudaEventElapsedTime");

    checkCuda(cudaEventDestroy(start), "cudaEventDestroy");
    checkCuda(cudaEventDestroy(end), "cudaEventDestroy");
    return msec;
}

// Runs the three histogram kernels on zero-initialised device histograms,
// prints the resulting bins side by side, then prints each kernel's time.
int main()
{
    const size_t bytes = NBIN * sizeof(float);
    float *h_hist, *h_hist2, *h_hist3, *d_hist, *d_hist2, *d_hist3;

    h_hist = (float*)malloc(bytes);
    h_hist2 = (float*)malloc(bytes);
    h_hist3 = (float*)malloc(bytes);
    if (h_hist == NULL || h_hist2 == NULL || h_hist3 == NULL) {
        fprintf(stderr, "host allocation failed\n");
        return EXIT_FAILURE;
    }

    checkCuda(cudaMalloc((void**)&d_hist, bytes), "cudaMalloc");
    checkCuda(cudaMalloc((void**)&d_hist2, bytes), "cudaMalloc");
    checkCuda(cudaMalloc((void**)&d_hist3, bytes), "cudaMalloc");

    // All three device histograms start from the same all-zero host array.
    for (int i = 0; i < NBIN; i++) h_hist[i] = 0.0f;
    checkCuda(cudaMemcpy(d_hist, h_hist, bytes, cudaMemcpyHostToDevice), "cudaMemcpy");
    checkCuda(cudaMemcpy(d_hist2, h_hist, bytes, cudaMemcpyHostToDevice), "cudaMemcpy");
    checkCuda(cudaMemcpy(d_hist3, h_hist, bytes, cudaMemcpyHostToDevice), "cudaMemcpy");

    const float elapsed  = timeKernel(atom, NBLOCKS, NTHREADS, d_hist);
    const float elapsed2 = timeKernel(shm, NBLOCKS, NTHREADS, d_hist2);
    const float elapsed3 = timeKernel(glob, 1, 1, d_hist3);

    checkCuda(cudaMemcpy(h_hist, d_hist, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy");
    checkCuda(cudaMemcpy(h_hist2, d_hist2, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy");
    checkCuda(cudaMemcpy(h_hist3, d_hist3, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy");

    /* print output */
    for (int i = 0; i < NBIN; i++) {
        printf("atom: %10.2f shm: %10.2f glob: %10.2f\n",
               h_hist[i], h_hist2[i], h_hist3[i]);
    }
    printf("%12s: %8.4f msec\n", "One Thread", elapsed3);
    printf("%12s: %8.4f msec\n", "Atomic", elapsed);
    printf("%12s: %8.4f msec\n", "Sh_mem", elapsed2);

    checkCuda(cudaFree(d_hist), "cudaFree");
    checkCuda(cudaFree(d_hist2), "cudaFree");
    checkCuda(cudaFree(d_hist3), "cudaFree");
    free(h_hist);
    free(h_hist2);
    free(h_hist3);
    return 0;
}
When writing GPU code you should avoid reading and writing to/from global memory. Global memory is very slow on a GPU. That's a hardware feature. The only thing you can do is to make neighboring threads read/write neighboring addresses in global memory. This will cause coalescing and speed up the process. But in general, read your data once, process it, and write it out once.
Note that NVCC might optimize out a lot of your code after you make the modification - it detects that no write to global memory is made and just removes the "unneeded" code. So this speedup may not be coming from the global write per se.
I would recommend using profiler on your actual code (the one with global write) to see if there's anything like unaligned access or other perf problem.