memory fragmentation in cuda - memory-management

I am running into a memory allocation problem that I can't understand. I am trying to allocate a char array on the GPU (my guess is that it is a memory fragmentation issue).
Here is my code:
#include<stdio.h>
#include<stdlib.h>
#include<string.h>
#include<cuda.h>
inline void gpuAssert(cudaError_t code, char *file, int line,
int abort=1)
{
if (code != cudaSuccess) {
printf("GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
__global__ void calc(char *k,char *i)
{
*i=*k;
}
int main()
{
char *dev_o=0;
char *i;
i = (char*)malloc(10*sizeof(char));
cudaMalloc((void**)&dev_o,10*sizeof(char)); //Line 31
calc<<<1,1>>>("arun",dev_o);
gpuErrchk(cudaMemcpy(&i,dev_o,10*sizeof(char),cudaMemcpyDeviceToHost));
cudaFree(dev_o);
printf("string : %s \n",i);
return 0;
}
but I'm getting this output:
GPUassert: out of memory sample2.cu 31
In the same setup, I tried allocating an integer on the GPU and it works properly.
My GPU device information is given as,
--- General Information for device 0 ---
Name:GeForce GTX 460 SE
Compute capability:2.1
Clock rate:1296000
Device copy overlap:Enabled
Kernel execition timeout :Enabled
--- Memory Information for device 0 ---
Total global mem:1073283072
Total constant Mem:65536
Max mem pitch:2147483647
Texture Alignment:512
--- MP Information for device 0 ---
Multiprocessor count:6
Shared mem per mp:49152
Registers per mp:32768
Threads in warp:32
Max threads per block:1024
Max thread dimensions:(1024, 1024, 64)
Max grid dimensions:(65535, 65535, 65535)
Can anyone tell me what is the problem and how I can overcome it?

Several things were wrong in your code.
cudaMemcpy(&i, ...) should be cudaMemcpy(i, ...).
Check the error status of your kernel launch, as explained in this post. If you don't, the error will appear to come from a later call in your code.
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
Your char *k argument to your kernel is a host pointer. You should create another device array and copy your data to the device before calling your kernel.
You were also not doing any parallel work on your threads in your calc() kernel since you were not using the thread indices, threadIdx.x. This was probably for testing though.
Here is what you would get if you fix these issues:
#include<stdio.h>
#include<stdlib.h>
#include<string.h>
#include<cuda.h>
inline void gpuAssert(cudaError_t code, char *file, int line,
int abort=1)
{
if (code != cudaSuccess) {
printf("GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
__global__ void calc(char* k, char *i)
{
i[threadIdx.x] = k[threadIdx.x];
}
int main()
{
const char* msg = "arun";
char *dev_i, *dev_k;
char *i, *k;
k = (char*)malloc(10*sizeof(char));
i = (char*)malloc(10*sizeof(char));
sprintf(k, "%s", msg); // same effect, but avoids passing msg as a format string
cudaMalloc((void**)&dev_i, 10*sizeof(char));
cudaMalloc((void**)&dev_k, 10*sizeof(char));
gpuErrchk(cudaMemcpy(dev_k, k, 10*sizeof(char), cudaMemcpyHostToDevice));
calc<<<1,5>>>(dev_k, dev_i);
gpuErrchk(cudaPeekAtLastError());
// Synchronization will be done in the next synchronous cudaMemcpy call, else
// you would need cudaDeviceSynchronize() to detect execution errors.
//gpuErrchk(cudaDeviceSynchronize());
gpuErrchk(cudaMemcpy(i, dev_i, 10*sizeof(char), cudaMemcpyDeviceToHost));
printf("string : %s\n", i);
cudaFree(dev_i);
cudaFree(dev_k);
free(i);
free(k);
return 0;
}
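For reference, a typical way to build and run the corrected program (assuming the file is still named sample2.cu and nvcc is on the PATH; the expected output is shown):
$ nvcc -o sample2 sample2.cu
$ ./sample2
string : arun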

Related

Why does L2 hardware prefetcher perform worse with only 1 KiB or 2 KiB access size?

I have a simple multi-threaded program in which each thread performs random reads on a given (in-memory) file, divided evenly amongst the threads. Each thread reads from the file into a buffer and sets a value. It is really a program designed to test memory bandwidth. Here is the program:
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <getopt.h>
#include <errno.h>
#include <stdbool.h>
#include <ctype.h>
#include <inttypes.h>
#include <pthread.h>
#include <assert.h>
#include <time.h>
#define NS_IN_SECOND 1000000000
uint64_t nano_time(void) {
struct timespec ts;
if( clock_gettime(CLOCK_REALTIME, &ts) == 0)
return ts.tv_sec * NS_IN_SECOND + ts.tv_nsec;
}
// avx512 test
#include <stdint.h>
void *__memmove_chk_avx512_no_vzeroupper(void *dest, void *src, size_t s);
/**
* To create 4 GB file: This will allocate space on disk
* $ dd < /dev/zero bs=1048576 count=4096 > testfile
*
* 100 GiB
* dd if=/dev/zero of=bigmmaptest bs=1M count=102400
* To clear cache:
* $ sync; echo 1 > /proc/sys/vm/drop_caches
*/
//#define SAMPLE_LATENCY 1
#define BYTES_IN_GB (1024*1024*1024)
// Block sized will be used for read and the same will be used for striding
// when iterating over a file in mmap.
#define DEFAULT_BLOCK_SIZE 4096 //8192
#define NANOSECONDS_IN_SECOND 1000000000
const char DEFAULT_NAME[] = "/mnt/tmp/mmaptest";
#define EXIT_MSG(...) \
do { \
printf(__VA_ARGS__); \
_exit(-1); \
} while (0)
uint64_t read_mmap_test(int fd, int tid, size_t block_size, size_t filesize, char* buf,
off_t *offsets, uint64_t *begin, uint64_t *end);
uint64_t write_mmap_test(int fd, int tid, size_t block_size, size_t filesize, char* buf,
off_t *offsets, uint64_t *begin, uint64_t *end);
uint64_t mmap_test(int fd, int tid, size_t block_size, size_t filesize, char *buf,
char optype, off_t *offsets, uint64_t *begin, uint64_t *end);
uint64_t read_syscall_test(int fd, int tid, size_t block_size, size_t filesize,
off_t *offsets, uint64_t *begin, uint64_t *end);
uint64_t write_syscall_test(int fd, int tid, size_t block_size, size_t filesize,
off_t *offsets, uint64_t *begin, uint64_t *end);
uint64_t syscall_test(int fd, int tid, size_t block_size, size_t filesize,
char optype, off_t *offsets, uint64_t *begin, uint64_t *end);
size_t get_filesize(const char* filename);
void print_help_message(const char *progname);
char* map_buffer(int fd, size_t size);
void *run_tests(void *);
static int silent = 0;
typedef struct {
int tid;
int fd;
char *mapped_buffer;
int read_mmap;
int read_syscall;
int write_mmap;
int write_syscall;
off_t *offsets;
size_t block_size;
size_t chunk_size;
int retval;
uint64_t start_time;
uint64_t end_time;
} threadargs_t;
size_t filesize;
int main(int argc, char **argv) {
char *fname = (char*) DEFAULT_NAME;
char *mapped_buffer = NULL;
int c, fd, i, flags = O_RDWR, numthreads = 1, ret, option_index;
static int randomaccess = 0,
read_mmap = 0, read_syscall = 0,
write_mmap = 0, write_syscall = 0,
mixed_mmap = 0, write_tr = 0;
off_t *offsets = 0;
size_t block_size = DEFAULT_BLOCK_SIZE, numblocks,
new_file_size = 0;
uint64_t min_start_time, max_end_time = 0, retval;
// permissions
uint64_t mode = S_IRWXU | S_IRWXG;
pthread_t *threads;
threadargs_t *threadargs;
static struct option long_options[] =
{
// Options set a flag
{"randomaccess", no_argument, &randomaccess, 1},
{"readmmap", no_argument, &read_mmap, 1},
{"readsyscall", no_argument, &read_syscall, 1},
{"silent", no_argument, &silent, 1},
{"writemmap", no_argument, &write_mmap, 1},
{"writesyscall", no_argument, &write_syscall, 1},
{"mixedmmap", no_argument, &mixed_mmap, 1},
// Options take an argument
{"block", required_argument, 0, 'b'},
{"file", required_argument, 0, 'f'},
{"help", no_argument, 0, 'h'},
{"size", no_argument, 0, 's'},
{"threads", required_argument, 0, 't'},
{"writethreads", no_argument, 0, 'w'},
{0, 0, 0, 0}
};
//read operations
while(1) {
c = getopt_long(argc, argv, "b:f:h:s:t:w:",
long_options, &option_index);
// is end of the option
if (c == -1)
break;
switch(c)
{
case 0:
break;
case 'b':
block_size = atoi(optarg);
break;
case 'f':
fname = optarg;
break;
case 'h':
print_help_message(argv[0]);
_exit(0);
case 's':
new_file_size = (size_t)(atoi(optarg)) * BYTES_IN_GB;
break;
case 't':
numthreads = (int) (atoi(optarg));
break;
case 'w':
write_tr = atoi(optarg);
break;
default:
break;
}
}
if(!silent){
printf("PID: %d\n", getpid());
printf("Using file %s \n", fname);
}
if ((filesize = get_filesize(fname)) == -1) {
if (read_mmap || read_syscall) {
printf("Cannot obtain file size for %s: %s"
"File must exist prior to running read tests.\n",
fname, strerror(errno));
_exit(-1);
}
else
filesize = new_file_size;
}
fd = open((const char*)fname, flags, mode);
if(fd <0) {
printf("Clould not open/create file %s: %s\n",
fname, strerror(errno));
_exit(-1);
}
if(block_size < 0 || block_size > filesize){
printf("Invalid block size: %zu for file of size "
"%zu. Block size must be greater than 0 and no"
"greater than the file size.\n",
block_size, filesize);
_exit(-1);
}
/*
* Generate random block number for random file access.
* Sequential for sequential access
*/
numblocks = filesize/block_size;
if(filesize % block_size > 0)
numblocks++;
offsets = (off_t *) malloc(numblocks * sizeof(off_t));
if(offsets == 0){
printf("Failed to allocate memory: %s\n", strerror(errno));
_exit(-1);
}
for (uint64_t i = 0; i < numblocks; i++)
if(randomaccess)
offsets[i] = ((int)random() % numblocks) * block_size;
else
offsets[i] = i*block_size;
if (numblocks % numthreads != 0)
EXIT_MSG("We have %" PRIu64 " blocks and %d threads. "
"Threads must evenly divide blocks. "
"Please fix the args.\n",
(uint_least64_t)numblocks, numthreads);
if( read_mmap || write_mmap || mixed_mmap)
assert((mapped_buffer = map_buffer(fd, filesize)) != NULL);
threads = (pthread_t*)malloc(numthreads * sizeof(pthread_t));
threadargs =
(threadargs_t*)malloc(numthreads * sizeof(threadargs_t));
if (threads == NULL || threadargs == NULL)
EXIT_MSG("Could not allocate thread array for %d threads.\n", numthreads);
for (i = 0; i < numthreads; i++) {
if(mixed_mmap){
if (i < write_tr) {
write_mmap = 1;
} else {
read_mmap = 1;
}
}
threadargs[i].fd = fd;
threadargs[i].tid = i;
threadargs[i].block_size = block_size;
threadargs[i].chunk_size = filesize/numthreads;
threadargs[i].mapped_buffer = mapped_buffer;
threadargs[i].offsets = &offsets[numblocks/numthreads * i];
threadargs[i].read_mmap = read_mmap;
threadargs[i].read_syscall = read_syscall;
threadargs[i].write_mmap = write_mmap;
threadargs[i].write_syscall = write_syscall;
int ret = pthread_create(&threads[i], NULL, run_tests, &threadargs[i]);
if (ret!=0)
EXIT_MSG("pthread_create for %dth thread failed: %s\n",
i, strerror(errno));
}
for (i = 0; i< numthreads; i++){
ret = pthread_join(threads[i], NULL);
if (ret !=0)
EXIT_MSG("Thread %d failed in join: %s\n",
i, strerror(errno));
}
// for mixed mode determine read and write aggregate b/w.
if(mixed_mmap) {
// Write b/w
min_start_time = threadargs[0].start_time;
max_end_time = 0;
// Since tid 0 to write_tr-1 did writes, find it's min and max.
for(i=0; i < write_tr; i++){
min_start_time = (threadargs[i].start_time < min_start_time)?
threadargs[i].start_time:min_start_time;
max_end_time = (threadargs[i].end_time > max_end_time)?
threadargs[i].end_time:max_end_time;
}
printf("Write: %.2f\n",
(double)write_tr*(filesize/numthreads)/(double)(max_end_time-min_start_time)
* NANOSECONDS_IN_SECOND / BYTES_IN_GB);
// Read b/w
min_start_time = threadargs[write_tr].start_time;
max_end_time = 0;
for(i=write_tr; i < numthreads; i++){
min_start_time = (threadargs[i].start_time < min_start_time)?
threadargs[i].start_time:min_start_time;
max_end_time = (threadargs[i].end_time > max_end_time)?
threadargs[i].end_time:max_end_time;
}
printf("Read: %.2f\n",
(double)(numthreads-write_tr)*(filesize/numthreads)/(double)(max_end_time-min_start_time)
* NANOSECONDS_IN_SECOND / BYTES_IN_GB);
}
/**
* For total run time. Find the smallest start time
* and largest end time across all threads.
*/
min_start_time = threadargs[0].start_time;
max_end_time = 0;
for (i=0; i< numthreads; i++){
min_start_time = (threadargs[i].start_time < min_start_time)?
threadargs[i].start_time:min_start_time;
max_end_time = (threadargs[i].end_time > max_end_time)?
threadargs[i].end_time:max_end_time;
}
printf("%.2f\n",
(double)filesize/(double)(max_end_time-min_start_time)
* NANOSECONDS_IN_SECOND / BYTES_IN_GB);
munmap(mapped_buffer, filesize);
close(fd);
}
void * run_tests(void *args) {
uint64_t retval;
threadargs_t t = *(threadargs_t*)args;
if(t.read_mmap) {
if(!silent)
printf("Running read mmap test:\n");
retval = read_mmap_test(t.fd, t.tid, t.block_size, t.chunk_size,
t.mapped_buffer, t.offsets,
&((threadargs_t*)args)->start_time,
&((threadargs_t*)args)->end_time);
}
else if(t.read_syscall) {
if(!silent)
printf("Running read syscall test:\n");
retval = read_syscall_test(t.fd, t.tid, t.block_size, t.chunk_size,
t.offsets,
&((threadargs_t*)args)->start_time,
&((threadargs_t*)args)->end_time);
}
else if(t.write_mmap) {
if(!silent)
printf("Running write mmap test:\n");
retval = write_mmap_test(t.fd, t.tid, t.block_size, t.chunk_size,
t.mapped_buffer, t.offsets,
&((threadargs_t*)args)->start_time,
&((threadargs_t*)args)->end_time);
}
else if(t.write_syscall) {
if(!silent)
printf("Running write syscall test:\n");
retval = write_syscall_test(t.fd, t.tid, t.block_size, t.chunk_size,
t.offsets,
&((threadargs_t*)args)->start_time,
&((threadargs_t*)args)->end_time);
}
return (void*) 0;
}
#define READ 1
#define WRITE 2
/**
********* SYSCALL section
*/
uint64_t read_syscall_test(int fd, int tid, size_t block_size, size_t filesize,
off_t *offsets, uint64_t *begin, uint64_t *end) {
return syscall_test(fd, tid, block_size, filesize, READ, offsets,
begin, end);
}
uint64_t write_syscall_test(int fd, int tid, size_t block_size, size_t filesize,
off_t *offsets, uint64_t *begin, uint64_t *end) {
return syscall_test(fd, tid, block_size, filesize, WRITE, offsets,
begin, end);
}
uint64_t syscall_test(int fd, int tid, size_t block_size, size_t filesize,
char optype, off_t *offsets, uint64_t *begin, uint64_t *end) {
bool done = false;
char * buffer = NULL;
int i = 0;
size_t total_bytes_transferred = 0;
uint64_t begin_time, end_time, ret_token = 0;
buffer = (char*)malloc(block_size);
if(buffer == NULL) {
printf("Failed to allocate memory: %s\n", strerror(errno));
return -1;
}
memset((void*)buffer, 0, block_size);
begin_time= nano_time();
while(!done) {
size_t bytes_transferred = 0;
if(optype == READ)
bytes_transferred = pread(fd, buffer, block_size, offsets[i++]);
else if (optype == WRITE)
bytes_transferred = pwrite(fd, buffer, block_size, offsets[i++]);
if (bytes_transferred == 0)
done = true;
else if(bytes_transferred == -1){
printf("Failed to IO: %s\n", strerror(errno));
return -1;
}
else {
total_bytes_transferred += bytes_transferred;
if (optype == WRITE && total_bytes_transferred == filesize)
done = true;
// Do random operation
ret_token += buffer[0];
}
if (i*block_size >= filesize)
done = true;
}
end_time = nano_time();
if(!silent){
printf("%s: %" PRIu64 " bytes transferred in %" PRIu64 ""
" ns.\n", (optype == READ)?"read-syscall":"write-syscall",
(uint_least64_t)total_bytes_transferred, (end_time-begin_time));
// Throughput in GB/s
printf("(tid %d) %.2f\n", tid,
(double)filesize/(double)(end_time-begin_time)
* NANOSECONDS_IN_SECOND / BYTES_IN_GB);
}
*begin = begin_time;
*end = end_time;
return ret_token;
}
/**
* MMAP tests
*/
uint64_t read_mmap_test(int fd, int tid, size_t block_size, size_t filesize,
char *buf, off_t *offsets, uint64_t *begin, uint64_t *end) {
return mmap_test(fd, tid, block_size, filesize, buf, READ, offsets, begin, end);
}
uint64_t write_mmap_test(int fd, int tid, size_t block_size, size_t filesize,
char *buf, off_t *offsets, uint64_t *begin, uint64_t *end){
return mmap_test(fd, tid, block_size, filesize, buf, WRITE, offsets, begin, end);
}
// Add memory addr
#if SAMPLE_LATENCY
#define BEGIN_LAT_SAMPLE \
if (num_samples < MAX_LAT_SAMPLES && i%LAT_SAMPL_INTERVAL == 0) \
lat_begin_time = nano_time();
#define END_LAT_SAMPLE \
if (num_samples < MAX_LAT_SAMPLES && i%LAT_SAMPL_INTERVAL == 0) { \
lat_end_time = nano_time(); \
latency_samples[i/LAT_SAMPL_INTERVAL % MAX_LAT_SAMPLES] = \
lat_end_time - lat_begin_time; \
num_samples++; \
}
#define MAX_LAT_SAMPLES 50
//#define LAT_SAMPL_INTERVAL (1000*1048576)
#define LAT_SAMPL_INTERVAL block_size
#else
#define BEGIN_LAT_SAMPLE ;
#define END_LAT_SAMPLE
#endif
uint64_t mmap_test(int fd, int tid, size_t block_size, size_t filesize, char *mapped_buffer,
char optype, off_t *offsets, uint64_t *begin, uint64_t *end) {
bool done = false;
char *buffer = NULL;
uint64_t i, j, numblocks, ret;
uint64_t begin_time, end_time, ret_token = 0;
#if SAMPLE_LATENCY
uint64_t lat_begin_time, lat_end_time;
size_t latency_samples[MAX_LAT_SAMPLES];
int num_samples = 0;
memset((void*)latency_samples, 0, sizeof(latency_samples));
#endif
buffer = (char*)malloc(block_size);
if(buffer == NULL) {
printf("Failed to allocate memory: %s\n", strerror(errno));
return -1;
}
memset((void*)buffer, 1, block_size);
begin_time = nano_time();
for(i=0; i<filesize; i+=block_size){
off_t offset = offsets[i/block_size];
BEGIN_LAT_SAMPLE;
if(optype == READ) {
//__memmove_chk_avx512_no_vzeroupper(buffer, &mapped_buffer[offset], block_size);
memcpy(buffer, &mapped_buffer[offset], block_size);
ret_token += buffer[0];
}
else if (optype == WRITE) {
//__memmove_chk_avx512_no_vzeroupper(&mapped_buffer[offset], buffer, block_size);
memcpy(&mapped_buffer[offset], buffer, block_size);
ret_token += mapped_buffer[i];
}
END_LAT_SAMPLE;
}
end_time = nano_time();
if(!silent) {
printf("%s: %" PRIu64 " bytes read in %" PRIu64 " ns.\n",
(optype==READ)?"readmap":"writemap",
(uint_least64_t)filesize, (end_time-begin_time));
// print GB/s
printf("(tid %d) %.2f\n", tid,
(double)filesize/(double)(end_time-begin_time)
* NANOSECONDS_IN_SECOND / BYTES_IN_GB);
}
*begin = begin_time;
*end = end_time;
#if SAMPLE_LATENCY
printf("\nSample latency for %ld byte block:\n", block_size);
for (i = 0; i < MAX_LAT_SAMPLES; i++)
printf("\t%ld: %ld\n", i, latency_samples[i]);
#endif
return ret_token;
}
char* map_buffer(int fd, size_t size) {
char *mapped_buffer = NULL;
// Populate
mapped_buffer = (char*)mmap(NULL, size, PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_POPULATE, fd, 0);
// Shared
// mapped_buffer = (char*)mmap(NULL, size, PROT_READ | PROT_WRITE,
// MAP_SHARED, fd, 0);
// Anon test
// mapped_buffer = (char*)mmap(NULL, size, PROT_READ | PROT_WRITE,
// MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
if(mapped_buffer == MAP_FAILED)
EXIT_MSG("Failed to mmap file of size %zu: %s\n",
size, strerror(errno));
// Might also need to gurantee page aligned - posix_memalign()
// int mret = madvise(mapped_buffer, filesize, MADV_HUGEPAGE);
// if(mret!=0) {
// fprintf(stderr, "failed madvise: %s\n", strerror(errno));
// }
return mapped_buffer;
}
size_t get_filesize(const char* filename){
int retval;
struct stat st;
retval = stat(filename, &st);
if(retval)
return -1;
else
return st.st_size;
}
void print_help_message(const char *progname) {
/* take only the last portion of the path */
const char *basename = strrchr(progname, '/');
basename = basename ? basename + 1 : progname;
printf("usage: %s [OPTION]\n", basename);
printf(" -h, --help\n"
" Print this help and exit.\n");
printf(" -b, --block[=BLOCKSIZE]\n"
" Block size used for read system calls.\n"
" For mmap tests, the size of the stride when iterating\n"
" over the file.\n"
" Defaults to %d.\n", DEFAULT_BLOCK_SIZE);
printf(" -f, --file[=FILENAME]\n"
" Perform all tests on this file (defaults to %s).\n",
DEFAULT_NAME);
printf(" --readsyscall\n"
" Perform a read test using system calls.\n");
printf(" --readmmap\n"
" Perform a read test using mmap.\n");
printf(" --writesyscall\n"
" Perform a write test using system calls.\n");
printf(" --writemmap\n"
" Perform a write test using mmap.\n");
printf(" --randomaccess\n"
" Perform random access.\n");
printf(" --threads\n"
" Number of threads to use. Defaults to one.\n");
printf(" --mixedmmap\n"
" Perfom read and write concurrently at different offsets\n");
printf(" -w, -writethreads[=0]\n"
" Number of threads that should perform write\n");
}
To compile:
$ gcc testm.c -o testm -lpthread -static -O2 -fno-builtin-memcpy
Commands to run the program:
$ dd if=/dev/zero of=bigmmaptest bs=1M count=25600 # 25 GiB file
$ ./testm -b 1024 -f bigmmaptest --threads 16 --randomaccess --readmmap
I am on a 32 core Xeon 5218 2nd Gen. L1d KiB /L2 MiB /L3 MiB -- 512 / 16 / 22
When the memcpy size is 1 KiB I get 21.7 GB/s but when the size is 256B I get 26.68 GB/s and 34.8 GB/s when the size is 4 KiB. Why is there a drop in the middle?
I observe that 2 KiB also performs poorly when compared to 256B and 4 KiB.
What's more interesting is that when I disable the L2 hardware prefetcher, without any other changes, the bandwidth automatically increases for 1 KiB and 2 KiB. Without prefetch, a 2 KiB memcpy gives 34.8 GB/s. All of these are aggregate bandwidths.
With perf, I did measure L2 load-store misses but they turned out to not change drastically. This effect is also not seen for 8 threads and below.
I am on Linux 5.0.4. I am using the glibc memcpy (gcc 7.5.0), and even with -O2 I observe the above quirk: a 1 KiB access size gives 18.76 GiB/s with L2 prefetch and 30.32 GiB/s without, while for comparison a 256 B access size provides 24.7 GiB/s with prefetch and 24.8 GiB/s without. Clearly, the drop in performance is caused by the L2 cache pollution from the prefetcher, as it is not observed at smaller thread counts. I considered whether SMT could be the reason for the increased pollution, but I observe the effect distinctly at 16 threads on 16 physical cores.
Skimming through glibc memcpy code, I can see that any access below the size of 4 KiB uses AVX 256 instructions, so there is nothing changing there.
The smaller 256B size not seeing a drop from the L2 streamer might be due to the sequence of cache misses being too short to activate the streamer and waste bandwidth (and slots in the LFBs and L2 <-> L3 superqueue) on requests that won't be useful.
For aligned 4k, there are no bytes within the same page that you're not fetching, so the L2 prefetcher is positively useful, or at least not harmful. (Demand loads come in pretty quickly for later lines when running memcpy so I'm guessing speeds were about the same with/without HW prefetch enabled, unless HW prefetch helps getting started on a new 4k chunk while still waiting for the end of the previous.)
The L2 only sees physical addresses, and AFAIK it doesn't try to prefetch across a 4k boundary. (Even if it's within the same 2M hugepage, because it doesn't know that either.) The "next-page prefetcher" Intel mentions being new in Ivy Bridge is AFAIK just a TLB prefetch, not data.
So with aligned 4k memcpy, HW prefetch stops automatically at the end of the data you're actually going to read, not wasting any bandwidth. Since mmap gives you page-aligned memory, these 4k memcopies are from a single source page.
(The destination is irrelevant as it probably stays hot in L1d cache, with maybe an occasional eviction to L2, and the reload from it after memcpy can come from store-forwarding, not even having to wait for memcpy's store to commit to L1d.)
Prediction: If your smaller memcpy source starts part way into a 4k page, but still ends at the end of a 4k page, you'd probably see similar behaviour to prefetch disabled. e.g. generate a random page number, and start 3072 bytes into it, doing a 1 KiB copy. So all your 1 KiB copies come from the ends of pages, never middles.
(You'd still have more dTLB misses per byte memcpyed, because each TLB entry is only covering 1 K of the data you ever actually read. You did use MAP_POPULATE, so you shouldn't be seeing page faults in the timed region, assuming you have enough RAM.)
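As a rough sketch of that prediction (reusing the offsets[], numblocks, block_size and filesize names from the benchmark above, and assuming block_size <= 4096 with 4 KiB pages), the offset generation loop could be biased so that every copy ends exactly at a page boundary:
// Hypothetical variant of the offset generation: pick a random page, then
// start block_size bytes before its end, so each block_size-byte copy
// finishes exactly at a 4 KiB page boundary (nothing left for the L2
// streamer to prefetch within that page).
size_t pagesize = 4096;
size_t numpages = filesize / pagesize;
for (uint64_t i = 0; i < numblocks; i++) {
    size_t page = (size_t)random() % numpages;
    offsets[i] = (off_t)(page * pagesize + (pagesize - block_size));
}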
L1d KiB /L2 MiB /L3 MiB -- 512 / 16 / 22
Those are aggregate totals, but L1d and L2 are private per-core! You have 32 KiB L1d and 1 MiB L2 per core, because this is Cascade Lake, same layout as Skylake-X.
And BTW, I'd consider using a fast PRNG like xorshift+ or xorshift* inside the timing loop; that's easily random enough to defeat prefetching; even a simple LFSR or even LCG with a power-of-2 modulo would do that (and be very cheap, just an imul and add). It avoids having to read offsets from another array, if you really want to isolate just the memcpy memory accesses. Probably doesn't make a difference though. One advantage of a very simple PRNG with a period equal to the space you're trying to cover (like an LCG) is that you won't generate the same address twice, giving you a random permutation of the blocks. But with a big enough block of memory, random cache hits even from L3 are unlikely even without that hard-to-achieve property.
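If you want to try that, here is a minimal sketch of an xorshift64* generator that is cheap enough to call inside the timing loop (the state variable and the way the result is mapped to an offset are illustrative, not part of the benchmark above):
#include <stdint.h>
// xorshift64*: a few shifts, xors and one multiply per random number.
static inline uint64_t xorshift64star(uint64_t *state) {
    uint64_t x = *state;
    x ^= x >> 12;
    x ^= x << 25;
    x ^= x >> 27;
    *state = x;
    return x * 0x2545F4914F6CDD1DULL;
}
// Example use inside the loop, instead of reading from offsets[]:
//   off_t offset = (off_t)(xorshift64star(&state) % numblocks) * block_size;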
Your current array of offsets is fine. (I didn't look at the code super closely, so I'm just assuming there aren't bugs.)

CUDA unified memory and Windows 10

While using CudaMallocManaged() to allocate an array of structs with arrays inside, I'm getting the error "out of memory" even though I have enough free memory. Here's some code that replicates my problem:
#include <iostream>
#include <cuda.h>
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
#define N 100000
#define ARR_SZ 100
struct Struct
{
float* arr;
};
int main()
{
Struct* struct_arr;
gpuErrchk( cudaMallocManaged((void**)&struct_arr, sizeof(Struct)*N) );
for(int i = 0; i < N; ++i)
gpuErrchk( cudaMallocManaged((void**)&(struct_arr[i].arr), sizeof(float)*ARR_SZ) ); //out of memory...
for(int i = 0; i < N; ++i)
cudaFree(struct_arr[i].arr);
cudaFree(struct_arr);
/*float* f;
gpuErrchk( cudaMallocManaged((void**)&f, sizeof(float)*N*ARR_SZ) ); //this works ok
cudaFree(f);*/
return 0;
}
There doesn't seem to be a problem when I call cudaMallocManaged() once to allocate a single chunk of memory, as I'm showing in the last piece of commented code.
I have a GeForce GTX 1070 Ti, and I'm using Windows 10. A friend tried to compile the same code on a PC with Linux and it worked correctly, while it showed the same issue on another PC with Windows 10. WDDM TDR is deactivated.
Any help would be appreciated. Thanks.
There is an allocation granularity.
This means that if you ask for 1 byte, or 400 bytes, what is actually used up is something like 4096 bytes (in fact, as measured below, more like 65536 bytes). So a bunch of very small allocations will actually use up memory at a much faster rate than what you would predict based on the requested allocation size. The solution is not to make very small allocations, but instead to allocate in larger chunks.
An alternative strategy here would also be to flatten your allocation, and carve out pieces from it for each of your arrays:
#include <iostream>
#include <cstdio>
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
#define N 100000
#define ARR_SZ 100
struct Struct
{
float* arr;
};
int main()
{
Struct* struct_arr;
float* f;
gpuErrchk( cudaMallocManaged((void**)&struct_arr, sizeof(Struct)*N) );
gpuErrchk( cudaMallocManaged((void**)&f, sizeof(float)*N*ARR_SZ) );
for(int i = 0; i < N; ++i)
struct_arr[i].arr = f+i*ARR_SZ;
cudaFree(struct_arr);
cudaFree(f);
return 0;
}
Making ARR_SZ divisible by 4 means the various created pointers can also be up-cast to larger vector types such as float2 or float4, if your use case had any intention of doing that.
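As an illustrative sketch of that point (the kernel name and launch shape are hypothetical; it assumes the flattened struct_arr/f layout from the code above and ARR_SZ % 4 == 0), each struct's array could then be read with vectorized float4 loads:
#include <cuda_runtime.h>
// Hypothetical kernel: each thread sums one struct's ARR_SZ floats via float4 loads.
// This works because each arr pointer is f + i*ARR_SZ, which stays 16-byte
// aligned when ARR_SZ is a multiple of 4.
__global__ void sum_arrays(const Struct *struct_arr, float *out, int n)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= n) return;
    const float4 *v = reinterpret_cast<const float4 *>(struct_arr[idx].arr);
    float acc = 0.0f;
    for (int i = 0; i < ARR_SZ / 4; ++i) {
        float4 x = v[i];
        acc += x.x + x.y + x.z + x.w;
    }
    out[idx] = acc;
}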
A possible reason the original code works on linux is because managed memory on linux, in a proper setup, can oversubscribe the GPU physical memory. The result is the actual allocation limit is much higher than what the GPU on-board memory would suggest. It might also be that the linux case has a bit more free memory, or perhaps the allocation granularity on linux is different (smaller).
Based on a question in the comments, I decided to estimate the allocation granularity, using this code:
#include <iostream>
#include <cstdio>
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char* file, int line, bool abort = true)
{
if (code != cudaSuccess)
{
fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
#define N 100000
#define ARR_SZ 100
struct Struct
{
float* arr;
};
int main()
{
Struct* struct_arr;
//float* f;
gpuErrchk(cudaMallocManaged((void**)& struct_arr, sizeof(Struct) * N));
#if 0
gpuErrchk(cudaMallocManaged((void**)& f, sizeof(float) * N * ARR_SZ));
for (int i = 0; i < N; ++i)
struct_arr[i].arr = f + i * ARR_SZ;
#else
size_t fre, tot;
gpuErrchk(cudaMemGetInfo(&fre, &tot));
std::cout << "Free: " << fre << " total: " << tot << std::endl;
for (int i = 0; i < N; ++i)
gpuErrchk(cudaMallocManaged((void**) & (struct_arr[i].arr), sizeof(float) * ARR_SZ));
gpuErrchk(cudaMemGetInfo(&fre, &tot));
std::cout << "Free: " << fre << " total: " << tot << std::endl;
for (int i = 0; i < N; ++i)
cudaFree(struct_arr[i].arr);
#endif
cudaFree(struct_arr);
//cudaFree(f);
return 0;
}
When I compile a debug project with that code, and run that on a windows 10 desktop with RTX 2070 GPU (8GB memory, same as GTX 1070 Ti) I get the following output:
Microsoft Windows [Version 10.0.17763.973]
(c) 2018 Microsoft Corporation. All rights reserved.
C:\Users\Robert Crovella>cd C:\Users\Robert Crovella\source\repos\test12\x64\Debug
C:\Users\Robert Crovella\source\repos\test12\x64\Debug>test12
Free: 7069866393 total: 8589934592
Free: 516266393 total: 8589934592
C:\Users\Robert Crovella\source\repos\test12\x64\Debug>test12
Free: 7069866393 total: 8589934592
Free: 516266393 total: 8589934592
C:\Users\Robert Crovella\source\repos\test12\x64\Debug>
Note that on my machine there is only 0.5GB of reported free memory left after the 100,000 allocations. So if for any reason your 8GB GPU starts out with less free memory (entirely possible) you may run into an out-of-memory error, even though I did not.
The calculation of the allocation granularity is as follows:
(7069866393 - 516266393) / 100000 = 65536 bytes per allocation(!)
So my previous estimate of 4096 bytes per allocation was way off, by at least 1 order of magnitude, on my machine/test setup.
The allocation granularity may vary based on:
windows or linux
WDDM or TCC
x86 or Power9
managed vs ordinary cudaMalloc
possibly other factors (e.g. CUDA version)
so my advice to future readers would be not to assume that it is always 65536 bytes per allocation, minimum.

Matrix Multiplication : Performance decreases after coalescing global memory access in CUDA

I have recently started working with GPUs using CUDA. As a starter programme I tried to implement a simple matrix multiplication efficiently:
C = AB
Starting with the naive matrix multiplication (each thread loads all the elements of A and B for an element in C), the tiled implementation (threads collaboratively load a tile of elements from A and B into shared memory to reduce global memory traffic) provides a good speed-up.
However, in the tiled implementation too the access to the global memory is not in coalesced order. So, to increase performance, it is better to transpose matrix B and then multiply. Below is my code:
#include<stdio.h>
#include<stdlib.h>
#include<cuda_runtime.h>
#include <time.h>
#include <sys/time.h>
void querydeviceprop();
void allocate_matrix(float *h_a, float *h_b, int matDim);
void verify(float *h_c, float *h_c_check, int matDim);
void print_matrix(float *ha, int m,int n);
void transpose_matrix(float *ha, int matDim);
void mat_mul();
#define TILE_WIDTH 16 //should be equal to numThread for tiling implementation
__global__ void MatrixMult_tiling(float *d_a,float *d_b,float *d_c, int dim){
__shared__ float ta[TILE_WIDTH][TILE_WIDTH]; //to load one tile of A
__shared__ float tb[TILE_WIDTH][TILE_WIDTH]; //to load one tile of A
int bx,by,tx,ty,i,j;
float res;
int row, col;
bx=blockIdx.x; by=blockIdx.y;
tx=threadIdx.x; ty=threadIdx.y;
row=by*TILE_WIDTH+ty;
col=bx*TILE_WIDTH+tx;
res=0;
for(i=0;i<dim/TILE_WIDTH;i++){
//collaboratively load the elements. Each thread loads a single element.
ta[ty][tx]=d_a[row*dim+TILE_WIDTH*i+tx];
tb[ty][tx]=d_b[(ty+i*TILE_WIDTH)*dim+col];
__syncthreads();
for(j=0;j<TILE_WIDTH;j++){
res=res+ta[ty][j]*tb[j][tx];
}
__syncthreads();
}
d_c[row*dim+col]=res;
}
__global__ void MatrixMult_tiling_coalesced(float *d_a,float *d_b,float *d_c, int dim){
__shared__ float ta[TILE_WIDTH][TILE_WIDTH]; //to load one tile of A
__shared__ float tb[TILE_WIDTH][TILE_WIDTH]; //to load one tile of A
int bx,by,tx,ty,i,j;
float res;
int row, col;
bx=blockIdx.x; by=blockIdx.y;
tx=threadIdx.x; ty=threadIdx.y;
row=by*TILE_WIDTH+ty;
col=bx*TILE_WIDTH+tx;
res=0;
for(i=0;i<dim/TILE_WIDTH;i++){
//collaboratively load the elements. Each thread loads a single element.
ta[ty][tx]=d_a[row*dim+TILE_WIDTH*i+tx];
tb[ty][tx]=d_b[bx*TILE_WIDTH*dim + TILE_WIDTH*i+ty*dim+tx];
__syncthreads();
for(j=0;j<TILE_WIDTH;j++){
res=res+ta[ty][j]*tb[tx][j];
}
__syncthreads();
}
d_c[row*dim+col]=res;
}
__global__ void MatrixMult_naive(float *d_a,float *d_b,float *d_c, int dim){
int row,col,i;
col=blockIdx.y*blockDim.y+threadIdx.y;
row=blockIdx.x*blockDim.x+threadIdx.x;
float res;
if(row<dim && col<dim){
res=0;
for(i=0;i<dim;i++){
res=res+(d_a[row*dim+i]*d_b[i*dim+col]);
}
d_c[row*dim+col]=res;
}
}
int main(){
mat_mul();
return 0;
}
void mat_mul(){
cudaSetDevice(0);
time_t t;
cudaError_t err = cudaSuccess;
srand((unsigned) time(&t));
cudaEvent_t start, stop;
float milliseconds=0;
cudaEventCreate(&start);
cudaEventCreate(&stop);
int matDim = 8192;
float *h_a, *h_b, *h_c, *h_c_check;
/*declare the host memories*/
h_a=(float *)malloc(matDim*matDim*sizeof(float));
h_b=(float *)malloc(matDim*matDim*sizeof(float));
h_c=(float *)malloc(matDim*matDim*sizeof(float));
h_c_check=(float *)malloc(matDim*matDim*sizeof(float));
// Verify that allocations succeeded
if (h_a == NULL || h_b == NULL || h_c == NULL || h_c_check ==NULL)
{
fprintf(stderr, "Failed to allocate host vectors!\n");
exit(EXIT_FAILURE);
}
allocate_matrix(h_a,h_b,matDim); // allocate memory to hold the matrix
//allocate cuda memory
float *d_a=NULL;
float *d_b=NULL;
float *d_c=NULL;
err=cudaMalloc((void **)&d_a, matDim*matDim*sizeof(float));
if (err != cudaSuccess)
{
fprintf(stderr, "Failed to allocate device matrix A (error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
err=cudaMalloc((void **)&d_b, matDim*matDim*sizeof(float));
if (err != cudaSuccess)
{
fprintf(stderr, "Failed to allocate device matrix A (error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
err=cudaMalloc((void **)&d_c, matDim*matDim*sizeof(float));
if (err != cudaSuccess)
{
fprintf(stderr, "Failed to allocate device matrix A (error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
printf("Matrix dimension is : %d\n",matDim);
// Copy the host input matrix A and B in host memory to the device matrix in device memory
//printf("Copy input data from the host memory to the CUDA device\n");
cudaEventRecord(start);
err = cudaMemcpy(d_a, h_a, matDim*matDim*sizeof(float), cudaMemcpyHostToDevice);
cudaEventRecord(stop);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&milliseconds, start, stop);
//printf("GPU memcpy matrix A %10.10f ms\n",milliseconds);
if (err != cudaSuccess)
{
fprintf(stderr, "Failed to copy vector A from host to device (error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
cudaEventRecord(start);
err = cudaMemcpy(d_b, h_b, matDim*matDim*sizeof(float), cudaMemcpyHostToDevice);
cudaEventRecord(stop);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&milliseconds, start, stop);
//printf("GPU memcpy matrix B %10.10f ms\n",milliseconds);
if (err != cudaSuccess)
{
fprintf(stderr, "Failed to copy vector B from host to device (error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
/*constants for kernel launch*/
int numThread=16; //number of threads per Block axis
int numBlocks=matDim/numThread;
if(matDim%numThread)
numBlocks++;
dim3 dimGrid(numBlocks,numBlocks);
dim3 dimBlock(numThread,numThread);
//-------------------------------------------------------------
//-------transpose and copy to GPU-------
transpose_matrix(h_b, matDim);//transpose first the b matrix
err = cudaMemcpy(d_b, h_b, matDim*matDim*sizeof(float), cudaMemcpyHostToDevice);
cudaEventSynchronize(stop);
if (err != cudaSuccess){
fprintf(stderr, "Failed to copy vector A from host to device (error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
//--------transpose and copy ends-------------
cudaEventRecord(start);
MatrixMult_tiling_coalesced<<<dimGrid,dimBlock>>>(d_a, d_b, d_c, matDim);
cudaEventRecord(stop);
err = cudaGetLastError();
if (err != cudaSuccess){
fprintf(stderr, "Failed to launch vector matrix kernel (error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
cudaEventSynchronize(stop);
cudaEventElapsedTime(&milliseconds, start, stop);
printf("GPU time tiled & coalesced %10.10f ms\n",milliseconds);
//printf("Copy output data from the CUDA device to the host memory\n");
cudaEventRecord(start);
err = cudaMemcpy(h_c_check, d_c, matDim*matDim*sizeof(float), cudaMemcpyDeviceToHost);
cudaEventRecord(stop);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&milliseconds, start, stop);
//printf("GPU memcpy time tiled & coalesced %10.10f ms\n",milliseconds);
//------------transpose back the original B matrix----------------
transpose_matrix(h_b, matDim);//transpose first the b matrix
err = cudaMemcpy(d_b, h_b, matDim*matDim*sizeof(float), cudaMemcpyHostToDevice);
cudaEventSynchronize(stop);
if (err != cudaSuccess){
fprintf(stderr, "Failed to copy vector A from host to device (error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
//------------transpose back the original matrix ends-------------
//-------------------------------------------------------------
cudaEventRecord(start);
MatrixMult_tiling<<<dimGrid,dimBlock>>>(d_a, d_b, d_c, matDim);
cudaEventRecord(stop);
err = cudaGetLastError();
if (err != cudaSuccess)
{
fprintf(stderr, "Failed to launch vector matrix kernel (error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
cudaEventSynchronize(stop);
cudaEventElapsedTime(&milliseconds, start, stop);
printf("GPU time tiled %10.10f ms\n",milliseconds);
//printf("Copy output data from the CUDA device to the host memory\n");
cudaEventRecord(start);
err = cudaMemcpy(h_c, d_c, matDim*matDim*sizeof(float), cudaMemcpyDeviceToHost);
cudaEventRecord(stop);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&milliseconds, start, stop);
//printf("GPU memcpy time tiled %10.10f ms\n",milliseconds);
//-------------------------------------------------------------
/*
cudaEventRecord(start);
MatrixMult_naive<<<dimGrid,dimBlock>>>(d_a, d_b, d_c, matDim);
cudaEventRecord(stop);
err = cudaGetLastError();
if (err != cudaSuccess)
{
fprintf(stderr, "Failed to launch vector matrix kernel (error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
cudaEventSynchronize(stop);
cudaEventElapsedTime(&milliseconds, start, stop);
printf("GPU time naive %10.10f ms\n",milliseconds);
printf("Copy output data from the CUDA device to the host memory\n");
cudaEventRecord(start);
err = cudaMemcpy(h_c, d_c, matDim*matDim*sizeof(float), cudaMemcpyDeviceToHost);
cudaEventRecord(stop);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&milliseconds, start, stop);
printf("GPU memcpy time naive %10.10f ms\n",milliseconds);
*/
//-------------------------------------------------------------
verify(h_c, h_c_check, matDim);
// Free device global memory
err = cudaFree(d_a);
if (err != cudaSuccess)
{
fprintf(stderr, "Failed to free device vector A (error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
err = cudaFree(d_b);
if (err != cudaSuccess)
{
fprintf(stderr, "Failed to free device vector B (error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
err = cudaFree(d_c);
if (err != cudaSuccess)
{
fprintf(stderr, "Failed to free device vector C (error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
// Free host memory
free(h_a);
free(h_b);
free(h_c);
printf("Done\n");
}
void allocate_matrix(float *h_a, float *h_b, int matDim){
int i,j;
// Initialize the host input vectors
for (i = 0; i < matDim; i++)
{
for(j=0;j< matDim;j++){
h_a[i*matDim+j] = rand()%10;
h_b[i*matDim+j] = rand()%10;
}
}
}
void print_matrix(float *ha, int m,int n){
int i, j;
for(i=0;i<m;i++){
for(j=0;j<n;j++){
printf(" %.1f ",ha[i*m+j]);
}
printf("\n");
}
}
void transpose_matrix(float *h_a, int matDim){
int i, j;
int temp;
for(i=0;i<matDim;i++)
{
for(j=0;j<i;j++)
{
temp=h_a[i*matDim+j];
h_a[i*matDim+j]=h_a[j*matDim+i];
h_a[j*matDim+i]=temp;
}
}
}
void verify(float *h_c, float *h_c_check, int matDim){
int i,j;
//check the code
for (i = 0; i < matDim; i++)
{
for(j=0;j<matDim;j++){
if (fabs(h_c[i*matDim+j] - h_c_check[i*matDim+j]) > 1e-5)
{
printf("cpu : %f , gpu : %f\t",h_c[i*matDim+j],h_c_check[i*matDim+j]);
fprintf(stderr, "Result verification failed at element %d,%d !\n\n", i,j);
exit(EXIT_FAILURE);
}
}
}
printf("Test PASSED\n");
}
MatrixMult_tiling_coalesced and MatrixMult_tiling are the functions with and without coalesced memory access to the elements of B, respectively.
Now, the problem is that the time taken by MatrixMult_tiling_coalesced is almost double the time taken by MatrixMult_tiling.
I understand that in MatrixMult_tiling the elements are loaded into the tiles in a coalesced manner (i.e. in row-major order) within each tile, but the tiles are arranged along a column, whereas the tiles in MatrixMult_tiling_coalesced are arranged along a row, so MatrixMult_tiling_coalesced should be faster than the other one.
But in practice I can see the opposite is true.
I would appreciate it if someone could point out the reason.
Thanks in advance.
EDIT 1:
After Robert's answer (see below) I understand the problem was in the load operation during the element-wise multiplication. I changed
tb[ty][tx]=d_b[bx*TILE_WIDTH*dim + TILE_WIDTH*i+ty*dim+tx];
to
tb[tx][ty]=d_b[bx*TILE_WIDTH*dim + TILE_WIDTH*i+ty*dim+tx];
and
res=res+ta[ty][j]*tb[tx][j];
to
res=res+ta[ty][j]*tb[j][tx];
This increased the performance of the MatrixMult_tiling_coalesced function from 1500 ms to 1000 ms. However, the MatrixMult_tiling function takes only 879 ms, so the coalesced routine is still slower. I don't understand where the problem is now.
EDIT 2 :
I realized that in EDIT 1 I had just moved the bank conflict problem from the element-wise multiplication to the element loading section. The following changes to the code have no bank conflicts:
tb[tx][ty]=d_b[bx*TILE_WIDTH*dim + TILE_WIDTH*i+ty*dim+tx];
to
tb[ty][tx]=d_b[bx*TILE_WIDTH*dim + TILE_WIDTH*i+ty*dim+tx];
And
res=res+ta[ty][j]*tb[j][tx];
to
res=res+ta[ty][j]*tb[ty][j];
But it is still slightly slower than the MatrixMult_tiling function: MatrixMult_tiling_coalesced takes 982 ms vs. 870 ms for MatrixMult_tiling. It should be at least comparable to MatrixMult_tiling, if not faster.
FINAL EDIT :
Edit 2 will not produce a correct result, so the code with Edit 1 is the optimum. Transposing one of the multiplicand matrices is probably not a good idea. :-(
Thanks all for helping.
B certainly isn't the matrix I would transpose in C=AB. But that is neither here nor there.
I'm not sure why you think:
in the tiled implementation too the access to the global memory is not in coalesced order
I don't see any lines of code in your MatrixMult_tiling that result in uncoalesced access.
Just to make sure we don't trip over terminology, "coalesced" or "uncoalesced" are terms that we apply to access patterns to global memory (not shared memory). Your global memory access patterns are in these lines in your tiled kernel:
ta[ty][tx]=d_a[row*dim+TILE_WIDTH*i+tx];
tb[ty][tx]=d_b[(ty+i*TILE_WIDTH)*dim+col];
...
d_c[row*dim+col]=res;
and none of those patterns to global memory are uncoalesced. In each of the generated indices into d_a, d_b and d_c, if you perform the substitutions, you will find that the threadIdx.x variable is present in all of them and is not multiplied by any value, constant or otherwise. Therefore these patterns will all coalesce (nicely).
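To make that substitution concrete, here is the expansion written out (using the kernel's own definitions of row and col; this is just a restatement of the indices above, not new code):
// Substituting row = by*TILE_WIDTH+ty and col = bx*TILE_WIDTH+tx:
//   d_a[row*dim + TILE_WIDTH*i + tx]    =  d_a[(by*TILE_WIDTH+ty)*dim + TILE_WIDTH*i + tx]
//   d_b[(ty + i*TILE_WIDTH)*dim + col]  =  d_b[(ty + i*TILE_WIDTH)*dim + bx*TILE_WIDTH + tx]
//   d_c[row*dim + col]                  =  d_c[(by*TILE_WIDTH+ty)*dim + bx*TILE_WIDTH + tx]
// In every case the index has the form (something independent of tx) + tx, so
// threads with adjacent tx read/write adjacent float elements: coalesced access.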
I would appreciate it if someone could point out the reason.
You have done something bad when it comes to shared memory.
In your tiled kernel, your multiplication operation looks like this:
res=res+ta[ty][j]*tb[j][tx];
For this case:
ta[ty][j]
we have a situation where all threads in the warp (which have linearly increasing tx values but the same ty value) are reading the same location in shared memory. This is an "optimal" access pattern - it does not present any bank conflicts, and will be serviced in the shortest possible time.
For this case:
tb[j][tx]
we have a situation where adjacent threads in the warp are reading adjacent locations in shared memory. This is also an "optimal", un-bank-conflicted pattern, and will be serviced in the shortest possible time.
However in your MatrixMult_tiling_coalesced kernel, the corresponding multiplication operation is:
res=res+ta[ty][j]*tb[tx][j];
Again, with this case:
ta[ty][j]
we have a shared memory "broadcast" pattern (all threads in a warp read from the same location) which is optimal and fast. But in this case:
tb[tx][j]
you have actually created columnar access into shared memory. This is the worst possible access pattern for shared memory, and it will result in a 32-way serialization (or possibly 16-way serialization, in the case of your 16x16 threadblocks) of the load process, and definitely worse performance. Why? Remember that for a given load, j is constant across the warp, and tx increases linearly across the warp. Therefore, let's say j is 1 on a particular loop iteration. The threads in warp 0 will read:
tb[0][1], tb[1][1], tb[2][1], tb[3][1], ...
and these locations all belong to a particular "column" of shared memory, i.e. they all belong to the same shared memory bank. This is the worst-case pattern for shared memory.
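A quick way to see the serialization is to write out the bank index (a sketch assuming 32 banks of 4-byte words and TILE_WIDTH = 16, as in the code above):
// tb is float tb[16][16], so element tb[a][b] has linear index a*16 + b and,
// with 32 banks of 4-byte words, lives in shared memory bank (a*16 + b) % 32.
//
// Conflicted pattern tb[tx][j]:   bank = (tx*16 + j) % 32
//   j is fixed across the warp and tx*16 mod 32 is either 0 or 16, so every
//   access in the warp lands in one of just two banks (j and j+16) and has to
//   be serviced in multiple serialized passes instead of one.
//
// Un-conflicted pattern tb[j][tx]: bank = (j*16 + tx) % 32
//   consecutive tx map to consecutive banks, so the warp spreads across the
//   banks and the load completes in a single pass.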
For completeness, I claim that all of your global memory access patterns in your MatrixMult_tiling_coalesced kernel are also coalesced:
ta[ty][tx]=d_a[row*dim+TILE_WIDTH*i+tx];
tb[ty][tx]=d_b[bx*TILE_WIDTH*dim + TILE_WIDTH*i+ty*dim+tx];
...
d_c[row*dim+col]=res;
so there should be no major difference in the global memory access pattern/activity/efficiency, between your two kernel implementations.
As a side note, I assume this is all a learning exercise. If you are interested in high-performance matrix multiply on the GPU, I would encourage you to consider using CUBLAS.
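For what that might look like, here is a minimal sketch of calling cuBLAS SGEMM on the existing device buffers (the wrapper name is made up and error checking is omitted). Note that cuBLAS uses column-major storage, so a row-major C = A*B is most easily obtained by asking for the equivalent column-major product with the operands swapped; link with -lcublas.
#include <cublas_v2.h>
// Computes C = A*B for dim x dim row-major matrices already resident on the
// device. Passing d_b as the first operand lets column-major cuBLAS produce
// the row-major result directly, since (A*B)^T = B^T * A^T.
void mat_mul_cublas(const float *d_a, const float *d_b, float *d_c, int dim)
{
    cublasHandle_t handle;
    cublasCreate(&handle);
    const float alpha = 1.0f, beta = 0.0f;
    cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
                dim, dim, dim,
                &alpha,
                d_b, dim,
                d_a, dim,
                &beta,
                d_c, dim);
    cublasDestroy(handle);
}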

Pass host pointer array to device global memory pointer array?

Suppose we have;
struct collapsed {
char **seq;
int num;
};
...
__device__ collapsed *xdev;
...
collapsed *x_dev
cudaGetSymbolAddress((void **)&x_dev, xdev);
cudaMemcpyToSymbol(x_dev, x, sizeof(collapsed)*size); //x already defined collapsed * , this line gives ERROR
Why do you think I am getting the error at the last line: invalid device symbol?
The first problem here is that x_dev isn't a device symbol. It might contain an address in a device memory, but that address cannot be passed to cudaMemcpyToSymbol. The call should just be:
cudaMemcpyToSymbol(xdev, ......);
Which brings up the second problem. Doing this:
cudaMemcpyToSymbol(xdev, x, sizeof(collapsed)*size);
would be illegal. xdev is a pointer, so the only valid value you can copy to xdev is a device address. If x is the address of a struct collapsed in device memory, then the only valid version of this memory transfer operation is
cudaMemcpyToSymbol(xdev, &x, sizeof(collapsed *));
i.e. x must previously have been set to the address of memory allocated on the device, something like:
collapsed *x;
cudaMalloc((void **)&x, sizeof(collapsed)*size);
cudaMemcpy(x, host_src, sizeof(collapsed)*size, cudaMemcpyHostToDevice);
As promised, here is a complete working example. First the code:
#include <cstdlib>
#include <iostream>
#include <cuda_runtime.h>
struct collapsed {
char **seq;
int num;
};
__device__ collapsed xdev;
__global__
void kernel(const size_t item_sz)
{
if (threadIdx.x < xdev.num) {
char *p = xdev.seq[threadIdx.x];
char val = 0x30 + threadIdx.x;
for(size_t i=0; i<item_sz; i++) {
p[i] = val;
}
}
}
#define gpuQ(ans) { gpu_assert((ans), __FILE__, __LINE__); }
void gpu_assert(cudaError_t code, const char *file, const int line)
{
if (code != cudaSuccess)
{
std::cerr << "gpu_assert: " << cudaGetErrorString(code) << " "
<< file << " " << line << std::endl;
exit(code);
}
}
int main(void)
{
const int nitems = 32;
const size_t item_sz = 16;
const size_t buf_sz = size_t(nitems) * item_sz;
// Gpu memory for sequences
char *_buf;
gpuQ( cudaMalloc((void **)&_buf, buf_sz) );
gpuQ( cudaMemset(_buf, 0x7a, buf_sz) );
// Host array for holding sequence device pointers
char **seq = new char*[nitems];
size_t offset = 0;
for(int i=0; i<nitems; i++, offset += item_sz) {
seq[i] = _buf + offset;
}
// Device array holding sequence pointers
char **_seq;
size_t seq_sz = sizeof(char*) * size_t(nitems);
gpuQ( cudaMalloc((void **)&_seq, seq_sz) );
gpuQ( cudaMemcpy(_seq, seq, seq_sz, cudaMemcpyHostToDevice) );
// Host copy of the xdev structure to copy to the device
collapsed xdev_host;
xdev_host.num = nitems;
xdev_host.seq = _seq;
// Copy to device symbol
gpuQ( cudaMemcpyToSymbol(xdev, &xdev_host, sizeof(collapsed)) );
// Run Kernel
kernel<<<1,nitems>>>(item_sz);
// Copy back buffer
char *buf = new char[buf_sz];
gpuQ( cudaMemcpy(buf, _buf, buf_sz, cudaMemcpyDeviceToHost) );
// Print out seq values
// Each string should be ASCII starting from '0' (0x30)
char *seq_vals = buf;
for(int i=0; i<nitems; i++, seq_vals += item_sz) {
std::string s;
s.append(seq_vals, item_sz);
std::cout << s << std::endl;
}
return 0;
}
and here it is compiled and run:
$ /usr/local/cuda/bin/nvcc -arch=sm_12 -Xptxas=-v -g -G -o erogol erogol.cu
./erogol.cu(19): Warning: Cannot tell what pointer points to, assuming global memory space
ptxas info : 8 bytes gmem, 4 bytes cmem[14]
ptxas info : Compiling entry function '_Z6kernelm' for 'sm_12'
ptxas info : Used 5 registers, 20 bytes smem, 4 bytes cmem[1]
$ /usr/local/cuda/bin/cuda-memcheck ./erogol
========= CUDA-MEMCHECK
0000000000000000
1111111111111111
2222222222222222
3333333333333333
4444444444444444
5555555555555555
6666666666666666
7777777777777777
8888888888888888
9999999999999999
::::::::::::::::
;;;;;;;;;;;;;;;;
<<<<<<<<<<<<<<<<
================
>>>>>>>>>>>>>>>>
????????????????
################
AAAAAAAAAAAAAAAA
BBBBBBBBBBBBBBBB
CCCCCCCCCCCCCCCC
DDDDDDDDDDDDDDDD
EEEEEEEEEEEEEEEE
FFFFFFFFFFFFFFFF
GGGGGGGGGGGGGGGG
HHHHHHHHHHHHHHHH
IIIIIIIIIIIIIIII
JJJJJJJJJJJJJJJJ
KKKKKKKKKKKKKKKK
LLLLLLLLLLLLLLLL
MMMMMMMMMMMMMMMM
NNNNNNNNNNNNNNNN
OOOOOOOOOOOOOOOO
========= ERROR SUMMARY: 0 errors
Some notes:
To simplify things a bit, I have only used a single memory allocation _buf to hold all of the string data. Each value of seq is set to a different address within _buf. This is functionally equivalent to running a separate cudaMalloc call for each pointer, but much faster.
The key concept is to assemble a copy of the structure you wish to access on the device in host memory, then copy that to the device. All of the pointers in my xdev_host are device pointers. The CUDA API doesn't have any sort of deep copy or automatic pointer translation facility, so it is the programmer's responsibility to make sure this is correct.
Each thread in the kernel just fills its sequence with a different ASCII character. Note that I have declared my xdev as a structure, rather than a pointer to a structure, and copy values rather than a reference to the __device__ symbol (again to simplify things slightly). But otherwise the sequence of operations is what you would need to make your design pattern work.
Because I only have access to a compute 1.x device, the compiler issues a warning. On compute 2.x and 3.x devices this won't happen because of the improved memory model in those devices. The warning is normal and can be safely ignored.
Because each sequence is just written into a different part of _buf, I can transfer all the sequences back to the host with a single cudaMemcpy call.

Page faults on OS X when reading with MMAP

I am trying to benchmark file system I/O on Mac OS X using mmap.
#include <unistd.h>
#include <fcntl.h>
#include <dirent.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <sys/mman.h>
#include <stdio.h>
#include <math.h>
char c;
int main(int argc, char ** argv)
{
if (argc != 2)
{
printf("no files\n");
exit(1);
}
int fd = open(argv[1], O_RDONLY);
fcntl(fd, F_NOCACHE, 1);
int offset=0;
int size=0x100000;
int pagesize = getpagesize();
struct stat stats;
fstat(fd, &stats);
int filesize = stats.st_size;
printf("%d byte pages\n", pagesize);
printf("file %s # %d bytes\n", argv[1], filesize);
while(offset < filesize)
{
if(offset + size > filesize)
{
int pages = ceil((filesize-offset)/(double)pagesize);
size = pages*pagesize;
}
printf("mapping offset %x with size %x\n", offset, size);
void * mem = mmap(0, size, PROT_READ, 0, fd, offset);
if(mem == -1)
return 0;
offset+=size;
int i=0;
for(; i<size; i+=pagesize)
{
c = *((char *)mem+i);
}
munmap(mem, size);
}
return 0;
}
The idea is that I'll map a file or a portion of it and then cause a page fault by dereferencing it. I am slowly losing my sanity since this doesn't work at all, and I've done similar things on Linux before.
Change this line
void * mem = mmap(0, size, PROT_READ, 0, fd, offset);
to
void * mem = mmap(0, size, PROT_READ, MAP_PRIVATE, fd, offset);
And, don't compare mem with -1. Use this instead:
if(mem == MAP_FAILED) { ... }
It's both more readable and more portable.
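Putting both changes together, the mapping step of the loop would look something like this (a sketch of just that fragment, not a rewrite of the whole program):
void *mem = mmap(0, size, PROT_READ, MAP_PRIVATE, fd, offset);
if (mem == MAP_FAILED) {
    perror("mmap");
    return 1;
}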
General advice: if you're on a different UNIX platform from what you're used to, it's a good idea to open the man page. For mmap on OS X, it can be found here. It says
Conforming applications must specify either MAP_PRIVATE or MAP_SHARED.
So, specifying 0 as the fourth argument is not OK on OS X. I believe this is true for BSD in general.
