AVX vs. SSE: expected to see a larger speedup

I expected AVX to be about 1.5x faster than SSE. All 3 arrays (3 arrays * 16384 elements * 4 bytes/element = 196608 bytes) should fit in the L2 cache (256 KB) on an Intel Core CPU (Broadwell).
Are there any special compiler directives or flags that I should be using?
Compiler Version
$ clang --version
Apple LLVM version 9.0.0 (clang-900.0.38)
Target: x86_64-apple-darwin16.7.0
Thread model: posix
InstalledDir: /Applications/Xcode.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin
Compile line
$ make avx
clang -O3 -fno-tree-vectorize -msse -msse2 -msse3 -msse4.1 -mavx -mavx2 avx.c ; ./a.out 123
n: 123
AVX Time taken: 0 seconds 177 milliseconds
vector+vector:begin int: 1 5 127 0
SSE Time taken: 0 seconds 195 milliseconds
vector+vector:begin int: 1 5 127 0
avx.c
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>   // uint32_t (used in p128_as_int)
#include <x86intrin.h>
#include <time.h>
#ifndef __cplusplus
#include <stdalign.h> // C11 defines _Alignas(). This header defines alignas()
#endif

#define REPS 50000
#define AR 16384

// add int vectors via AVX
__attribute__((noinline))
void add_iv_avx(__m256i *restrict a, __m256i *restrict b, __m256i *restrict out, int N) {
    __m256i *x = __builtin_assume_aligned(a, 32);
    __m256i *y = __builtin_assume_aligned(b, 32);
    __m256i *z = __builtin_assume_aligned(out, 32);
    const int loops = N / 8; // 8 is the number of int32 in __m256i
    for (int i = 0; i < loops; i++) {
        _mm256_store_si256(&z[i], _mm256_add_epi32(x[i], y[i]));
    }
}

// add int vectors via SSE; https://en.wikipedia.org/wiki/Restrict
__attribute__((noinline))
void add_iv_sse(__m128i *restrict a, __m128i *restrict b, __m128i *restrict out, int N) {
    __m128i *x = __builtin_assume_aligned(a, 16);
    __m128i *y = __builtin_assume_aligned(b, 16);
    __m128i *z = __builtin_assume_aligned(out, 16);
    const int loops = N / 4; // 4 is the number of int32 in __m128i
    for (int i = 0; i < loops; i++) {
        //out[i] = _mm_add_epi32(a[i], b[i]); // this also works
        // the buffers are 32-byte aligned, so the aligned _mm_store_si128 would work too
        _mm_storeu_si128(&z[i], _mm_add_epi32(x[i], y[i]));
    }
}

// printing
void p128_as_int(__m128i in) {
    alignas(16) uint32_t v[4];
    _mm_store_si128((__m128i*)v, in);
    printf("int: %u %u %u %u\n", v[0], v[1], v[2], v[3]);
}

__attribute__((noinline))
void debug_print(int *h) {
    printf("vector+vector:begin ");
    p128_as_int(*(__m128i*) &h[0]);
}

int main(int argc, char *argv[]) {
    int n = atoi(argv[1]);
    printf("n: %d\n", n);
    int *x, *y, *z;
    if (posix_memalign((void**)&x, 32, 16384*sizeof(int))) { free(x); return EXIT_FAILURE; }
    if (posix_memalign((void**)&y, 32, 16384*sizeof(int))) { free(y); return EXIT_FAILURE; }
    if (posix_memalign((void**)&z, 32, 16384*sizeof(int))) { free(z); return EXIT_FAILURE; }
    x[0]=0; x[1]=2; x[2]=4;
    y[0]=1; y[1]=3; y[2]=n;
    // touch each 4K page in x,y,z to avoid copy-on-write optimizations
    for (int i=512; i<AR; i+=512) { x[i]=1; y[i]=1; z[i]=1; }
    // warmup
    for (int i=0; i<REPS; ++i) { add_iv_avx((__m256i*)x, (__m256i*)y, (__m256i*)z, AR); }
    // AVX
    clock_t start = clock();
    for (int i=0; i<REPS; ++i) { add_iv_avx((__m256i*)x, (__m256i*)y, (__m256i*)z, AR); }
    int msec = (clock()-start) * 1000 / CLOCKS_PER_SEC;
    printf(" AVX Time taken: %d seconds %d milliseconds\n", msec/1000, msec%1000);
    debug_print(z);
    // warmup
    for (int i=0; i<REPS; ++i) { add_iv_sse((__m128i*)x, (__m128i*)y, (__m128i*)z, AR); }
    // SSE
    start = clock();
    for (int i=0; i<REPS; ++i) { add_iv_sse((__m128i*)x, (__m128i*)y, (__m128i*)z, AR); }
    msec = (clock()-start) * 1000 / CLOCKS_PER_SEC;
    printf("\n SSE Time taken: %d seconds %d milliseconds\n", msec/1000, msec%1000);
    debug_print(z);
    return EXIT_SUCCESS;
}

The problem is that your data doesn't fit in the L1 cache.
The L1 bandwidth of Broadwell is much larger than the L2 bandwidth.
The L1 bandwidth is large enough to load two 32-byte vectors every CPU cycle, so a better AVX vs. SSE speedup
might be expected if your data set were much smaller. However, note that
the combined L1 read/write bandwidth is less than 2*32(r)+32(w)=96 bytes per cycle.
In practice 75 bytes per cycle is possible, see here.
The second graph on this page shows that indeed the L2 bandwidth is much smaller:
At Test_block_size=128KB (=32KB per core) the bandwidth is 900GB/s.
At Test_block_size=1MB (=256KB per core) the bandwidth is only 300GB/s.
(Note that Haswell 4770k has more or less the same L1 and L2 cache architecture as Broadwell.)
Try reducing AR to 2000 and increasing REPS to 1000000 and see what happens to the SSE vs. AVX speedup.
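For example, a minimal way to try that with the avx.c above (the cache arithmetic is my assumption, based on a 32 KB per-core L1d):

// Working set small enough for L1d: 3 arrays * 2000 elements * 4 bytes ~= 24 KB,
// versus 3 * 64 KB = 192 KB with AR = 16384 (which only fits in L2).
// REPS is scaled up so the total amount of work stays roughly comparable.
#define REPS 1000000
#define AR   2000

With the data resident in L1d, the AVX loop can in principle move 2*32 B of loads plus 32 B of store per cycle versus 48 B per cycle for the SSE loop, so the speedup should move toward 2x; when the data only fits in L2, both versions are throttled to the same L2 byte rate, which is why the two timings are currently so close.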

Related

large-size page-locked memory copy get wrong result in CUDA

I found an issue with large page-locked memory allocations in CUDA. Here are the source code and makefile. The code allocates 10 GB of page-locked memory and copies some data from device memory into it; the data in device memory is set to 1.0 before the copy.
#include <cuda.h>
#include <assert.h>
#include <cuda_runtime.h>
#include "helper_cuda.h"
__global__
void test_k(double* x, size_t n)
{
int gid = blockIdx.x*blockDim.x + threadIdx.x;
if(gid<n) x[gid] = 1.0 ;
}
int main(int argc, char* argv[])
{
size_t n = size_t(10)*1024*1024*1024/sizeof(double);
printf("\n n: %zu, page-locked memory size: %zu MB\n", n, n*sizeof(double)/1024/1024);
double* x_h = NULL, *x_d = NULL;
int gpuid = 0;
if(argc>1 ) gpuid = atoi(argv[1]);
printf("select gpu %d\n", gpuid);
checkCudaErrors(cudaSetDevice(gpuid));
checkCudaErrors(cudaMallocHost(&x_h, sizeof(double)*n));
checkCudaErrors(cudaMalloc(&x_d, sizeof(double)*n));
for(int i = 0; i < n; ++i) x_h[i]=0.0;
int nthd = 256;
int nblk = (n+nthd-1) / nthd;
test_k<<<nblk, nthd, 0, 0>>>(x_d, n);
checkCudaErrors(cudaMemcpy(x_h, x_d, sizeof(double)*n, cudaMemcpyDeviceToHost));
int errCount = 0;
for(size_t i = 0; i < n; ++i){
if(x_h[i] == 0.0) errCount++;
}
printf("%s errCount: %d, which should be 0\n", errCount?"Error:":"Correct", errCount);
checkCudaErrors(cudaFree(x_d));
checkCudaErrors(cudaFreeHost(x_h));
return 0;
}
CUDA_PATH = /depot/cuda/cuda-11.2/
CUDA_INC = -I$(CUDA_PATH)/include -I$(CUDA_PATH)/samples/common/inc
NVCC = $(CUDA_PATH)/bin/nvcc
NVCCXXFLAGS = -std=c++11 -O3 -w -m64 -Xptxas -dlcm=cg -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 $(CUDA_INC)
all: testLargePin
testLargePin: testLargePin.cu
	$(NVCC) $^ $(NVCCXXFLAGS) -o $@
clean:
rm testLargePin -f
I run the binary on three different GPU servers (all with A100-SXM4-40GB). On machine 1, the result is correct. On machine 2, it reports
CUDA error at testLargePin.cu:31 code=719(cudaErrorLaunchFailure) "cudaMemcpy(x_h, x_d, sizeof(double)*n, cudaMemcpyDeviceToHost)"
On machine 3, the copy is wrong: there are lots of zeros in the page-locked array.
n: 1342177280, page-locked memory size: 10240 MB
select gpu 0
Error: errCount: 1024, which should be 0
Does anyone know the reason and how to fix the issue? For example, is there an API to check the maximum page-locked memory size on a given machine? Thanks in advance.
According to NVIDIA (https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TYPES.html#group__CUDA__TYPES_1gc6c391505e117393cc2558fff6bfc2e9),
error 719 covers things like dereferencing an invalid device pointer, accessing out-of-bounds shared memory, or a system-specific problem...
In my experience, synchronization has helped with memory errors and inconsistent results. Did you try adding cudaDeviceSynchronize(); after checkCudaErrors(cudaMemcpy(x_h, x_d, sizeof(double)*n, cudaMemcpyDeviceToHost));?
As for page-locked memory, there is no limit imposed by CUDA itself. I think you have to check the available amount on your host side.
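A minimal host-side sketch of that check (my assumption: a Linux host where sysconf is available; the values are only a snapshot, and pinning limits such as ulimit -l can still apply):

#include <stdio.h>
#include <unistd.h>

// Rough sanity check before pinning a large buffer: page-locked memory must be
// backed by physical RAM, so requesting more than what is currently available
// is likely to fail or misbehave.
int can_pin(size_t bytes) {
    long page_size = sysconf(_SC_PAGESIZE);
    long avail_pages = sysconf(_SC_AVPHYS_PAGES);
    if (page_size <= 0 || avail_pages <= 0) return 0;
    size_t avail = (size_t)page_size * (size_t)avail_pages;
    printf("requested %zu bytes, ~%zu bytes of physical RAM currently available\n",
           bytes, avail);
    return bytes < avail;
}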

Why does L2 hardware prefetcher perform worse with only 1 KiB or 2 KiB access size?

I have a simple multi-threaded program where each thread performs random reads on a given file (in memory), divided evenly amongst the threads. Each thread reads from the file into a buffer and sets a value. This is really a program designed to test memory bandwidth. Here is the program:
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <getopt.h>
#include <errno.h>
#include <stdbool.h>
#include <ctype.h>
#include <inttypes.h>
#include <pthread.h>
#include <assert.h>
#include <time.h>
#define NS_IN_SECOND 1000000000
uint64_t nano_time(void) {
struct timespec ts;
if( clock_gettime(CLOCK_REALTIME, &ts) == 0)
return ts.tv_sec * NS_IN_SECOND + ts.tv_nsec;
return 0; // clock_gettime failed; avoid falling off the end without a return value
}
// avx512 test
#include <stdint.h>
void *__memmove_chk_avx512_no_vzeroupper(void *dest, void *src, size_t s);
/**
* To create 4 GB file: This will allocate space on disk
* $ dd < /dev/zero bs=1048576 count=4096 > testfile
*
* 100 GiB
* dd if=/dev/zero of=bigmmaptest bs=1M count=102400
* To clear cache:
* $ sync; echo 1 > /proc/sys/vm/drop_caches
*/
//#define SAMPLE_LATENCY 1
#define BYTES_IN_GB (1024*1024*1024)
// Block size will be used for reads, and the same size will be used for striding
// when iterating over a file in mmap.
#define DEFAULT_BLOCK_SIZE 4096 //8192
#define NANOSECONDS_IN_SECOND 1000000000
const char DEFAULT_NAME[] = "/mnt/tmp/mmaptest";
#define EXIT_MSG(...) \
do { \
printf(__VA_ARGS__); \
_exit(-1); \
} while (0)
uint64_t read_mmap_test(int fd, int tid, size_t block_size, size_t filesize, char* buf,
off_t *offsets, uint64_t *begin, uint64_t *end);
uint64_t write_mmap_test(int fd, int tid, size_t block_size, size_t filesize, char* buf,
off_t *offsets, uint64_t *begin, uint64_t *end);
uint64_t mmap_test(int fd, int tid, size_t block_size, size_t filesize, char *buf,
char optype, off_t *offsets, uint64_t *begin, uint64_t *end);
uint64_t read_syscall_test(int fd, int tid, size_t block_size, size_t filesize,
off_t *offsets, uint64_t *begin, uint64_t *end);
uint64_t write_syscall_test(int fd, int tid, size_t block_size, size_t filesize,
off_t *offsets, uint64_t *begin, uint64_t *end);
uint64_t syscall_test(int fd, int tid, size_t block_size, size_t filesize,
char optype, off_t *offsets, uint64_t *begin, uint64_t *end);
size_t get_filesize(const char* filename);
void print_help_message(const char *progname);
char* map_buffer(int fd, size_t size);
void *run_tests(void *);
static int silent = 0;
typedef struct {
int tid;
int fd;
char *mapped_buffer;
int read_mmap;
int read_syscall;
int write_mmap;
int write_syscall;
off_t *offsets;
size_t block_size;
size_t chunk_size;
int retval;
uint64_t start_time;
uint64_t end_time;
} threadargs_t;
size_t filesize;
int main(int argc, char **argv) {
char *fname = (char*) DEFAULT_NAME;
char *mapped_buffer = NULL;
int c, fd, i, flags = O_RDWR, numthreads = 1, ret, option_index;
static int randomaccess = 0,
read_mmap = 0, read_syscall = 0,
write_mmap = 0, write_syscall = 0,
mixed_mmap = 0, write_tr = 0;
off_t *offsets = 0;
size_t block_size = DEFAULT_BLOCK_SIZE, numblocks,
new_file_size = 0;
uint64_t min_start_time, max_end_time = 0, retval;
// permissions
uint64_t mode = S_IRWXU | S_IRWXG;
pthread_t *threads;
threadargs_t *threadargs;
static struct option long_options[] =
{
// Options set a flag
{"randomaccess", no_argument, &randomaccess, 1},
{"readmmap", no_argument, &read_mmap, 1},
{"readsyscall", no_argument, &read_syscall, 1},
{"silent", no_argument, &silent, 1},
{"writemmap", no_argument, &write_mmap, 1},
{"writesyscall", no_argument, &write_syscall, 1},
{"mixedmmap", no_argument, &mixed_mmap, 1},
// Options take an argument
{"block", required_argument, 0, 'b'},
{"file", required_argument, 0, 'f'},
{"help", no_argument, 0, 'h'},
{"size", no_argument, 0, 's'},
{"threads", required_argument, 0, 't'},
{"writethreads", no_argument, 0, 'w'},
{0, 0, 0, 0}
};
//read operations
while(1) {
c = getopt_long(argc, argv, "b:f:h:s:t:w:",
long_options, &option_index);
// is end of the option
if (c == -1)
break;
switch(c)
{
case 0:
break;
case 'b':
block_size = atoi(optarg);
break;
case 'f':
fname = optarg;
break;
case 'h':
print_help_message(argv[0]);
_exit(0);
case 's':
new_file_size = (size_t)(atoi(optarg)) * BYTES_IN_GB;
break;
case 't':
numthreads = (int) (atoi(optarg));
break;
case 'w':
write_tr = atoi(optarg);
break;
default:
break;
}
}
if(!silent){
printf("PID: %d\n", getpid());
printf("Using file %s \n", fname);
}
if ((filesize = get_filesize(fname)) == -1) {
if (read_mmap || read_syscall) {
printf("Cannot obtain file size for %s: %s"
"File must exist prior to running read tests.\n",
fname, strerror(errno));
_exit(-1);
}
else
filesize = new_file_size;
}
fd = open((const char*)fname, flags, mode);
if(fd <0) {
printf("Clould not open/create file %s: %s\n",
fname, strerror(errno));
_exit(-1);
}
if(block_size < 0 || block_size > filesize){
printf("Invalid block size: %zu for file of size "
"%zu. Block size must be greater than 0 and no"
"greater than the file size.\n",
block_size, filesize);
_exit(-1);
}
/*
* Generate random block number for random file access.
* Sequential for sequential access
*/
numblocks = filesize/block_size;
if(filesize % block_size > 0)
numblocks++;
offsets = (off_t *) malloc(numblocks * sizeof(off_t));
if(offsets == 0){
printf("Failed to allocate memory: %s\n", strerror(errno));
_exit(-1);
}
for (uint64_t i = 0; i < numblocks; i++)
if(randomaccess)
offsets[i] = ((int)random() % numblocks) * block_size;
else
offsets[i] = i*block_size;
if (numblocks % numthreads != 0)
EXIT_MSG("We have %" PRIu64 " blocks and %d threads. "
"Threads must evenly divide blocks. "
"Please fix the args.\n",
(uint_least64_t)numblocks, numthreads);
if( read_mmap || write_mmap || mixed_mmap)
assert((mapped_buffer = map_buffer(fd, filesize)) != NULL);
threads = (pthread_t*)malloc(numthreads * sizeof(pthread_t));
threadargs =
(threadargs_t*)malloc(numthreads * sizeof(threadargs_t));
if (threads == NULL || threadargs == NULL)
EXIT_MSG("Could not allocate thread array for %d threads.\n", numthreads);
for (i = 0; i < numthreads; i++) {
if(mixed_mmap){
if (i < write_tr) {
write_mmap = 1;
} else {
read_mmap = 1;
}
}
threadargs[i].fd = fd;
threadargs[i].tid = i;
threadargs[i].block_size = block_size;
threadargs[i].chunk_size = filesize/numthreads;
threadargs[i].mapped_buffer = mapped_buffer;
threadargs[i].offsets = &offsets[numblocks/numthreads * i];
threadargs[i].read_mmap = read_mmap;
threadargs[i].read_syscall = read_syscall;
threadargs[i].write_mmap = write_mmap;
threadargs[i].write_syscall = write_syscall;
int ret = pthread_create(&threads[i], NULL, run_tests, &threadargs[i]);
if (ret!=0)
EXIT_MSG("pthread_create for %dth thread failed: %s\n",
i, strerror(errno));
}
for (i = 0; i< numthreads; i++){
ret = pthread_join(threads[i], NULL);
if (ret !=0)
EXIT_MSG("Thread %d failed in join: %s\n",
i, strerror(errno));
}
// for mixed mode determine read and write aggregate b/w.
if(mixed_mmap) {
// Write b/w
min_start_time = threadargs[0].start_time;
max_end_time = 0;
// Since tids 0 to write_tr-1 did writes, find their min and max.
for(i=0; i < write_tr; i++){
min_start_time = (threadargs[i].start_time < min_start_time)?
threadargs[i].start_time:min_start_time;
max_end_time = (threadargs[i].end_time > max_end_time)?
threadargs[i].end_time:max_end_time;
}
printf("Write: %.2f\n",
(double)write_tr*(filesize/numthreads)/(double)(max_end_time-min_start_time)
* NANOSECONDS_IN_SECOND / BYTES_IN_GB);
// Read b/w
min_start_time = threadargs[write_tr].start_time;
max_end_time = 0;
for(i=write_tr; i < numthreads; i++){
min_start_time = (threadargs[i].start_time < min_start_time)?
threadargs[i].start_time:min_start_time;
max_end_time = (threadargs[i].end_time > max_end_time)?
threadargs[i].end_time:max_end_time;
}
printf("Read: %.2f\n",
(double)(numthreads-write_tr)*(filesize/numthreads)/(double)(max_end_time-min_start_time)
* NANOSECONDS_IN_SECOND / BYTES_IN_GB);
}
/**
* For total run time. Find the smallest start time
* and largest end time across all threads.
*/
min_start_time = threadargs[0].start_time;
max_end_time = 0;
for (i=0; i< numthreads; i++){
min_start_time = (threadargs[i].start_time < min_start_time)?
threadargs[i].start_time:min_start_time;
max_end_time = (threadargs[i].end_time > max_end_time)?
threadargs[i].end_time:max_end_time;
}
printf("%.2f\n",
(double)filesize/(double)(max_end_time-min_start_time)
* NANOSECONDS_IN_SECOND / BYTES_IN_GB);
munmap(mapped_buffer, filesize);
close(fd);
}
void * run_tests(void *args) {
uint64_t retval;
threadargs_t t = *(threadargs_t*)args;
if(t.read_mmap) {
if(!silent)
printf("Running read mmap test:\n");
retval = read_mmap_test(t.fd, t.tid, t.block_size, t.chunk_size,
t.mapped_buffer, t.offsets,
&((threadargs_t*)args)->start_time,
&((threadargs_t*)args)->end_time);
}
else if(t.read_syscall) {
if(!silent)
printf("Running read syscall test:\n");
retval = read_syscall_test(t.fd, t.tid, t.block_size, t.chunk_size,
t.offsets,
&((threadargs_t*)args)->start_time,
&((threadargs_t*)args)->end_time);
}
else if(t.write_mmap) {
if(!silent)
printf("Running write mmap test:\n");
retval = write_mmap_test(t.fd, t.tid, t.block_size, t.chunk_size,
t.mapped_buffer, t.offsets,
&((threadargs_t*)args)->start_time,
&((threadargs_t*)args)->end_time);
}
else if(t.write_syscall) {
if(!silent)
printf("Running write syscall test:\n");
retval = write_syscall_test(t.fd, t.tid, t.block_size, t.chunk_size,
t.offsets,
&((threadargs_t*)args)->start_time,
&((threadargs_t*)args)->end_time);
}
return (void*) 0;
}
#define READ 1
#define WRITE 2
/**
********* SYSCALL section
*/
uint64_t read_syscall_test(int fd, int tid, size_t block_size, size_t filesize,
off_t *offsets, uint64_t *begin, uint64_t *end) {
return syscall_test(fd, tid, block_size, filesize, READ, offsets,
begin, end);
}
uint64_t write_syscall_test(int fd, int tid, size_t block_size, size_t filesize,
off_t *offsets, uint64_t *begin, uint64_t *end) {
return syscall_test(fd, tid, block_size, filesize, WRITE, offsets,
begin, end);
}
uint64_t syscall_test(int fd, int tid, size_t block_size, size_t filesize,
char optype, off_t *offsets, uint64_t *begin, uint64_t *end) {
bool done = false;
char * buffer = NULL;
int i = 0;
size_t total_bytes_transferred = 0;
uint64_t begin_time, end_time, ret_token = 0;
buffer = (char*)malloc(block_size);
if(buffer == NULL) {
printf("Failed to allocate memory: %s\n", strerror(errno));
return -1;
}
memset((void*)buffer, 0, block_size);
begin_time= nano_time();
while(!done) {
size_t bytes_transferred = 0;
if(optype == READ)
bytes_transferred = pread(fd, buffer, block_size, offsets[i++]);
else if (optype == WRITE)
bytes_transferred = pwrite(fd, buffer, block_size, offsets[i++]);
if (bytes_transferred == 0)
done = true;
else if(bytes_transferred == -1){
printf("Failed to IO: %s\n", strerror(errno));
return -1;
}
else {
total_bytes_transferred += bytes_transferred;
if (optype == WRITE && total_bytes_transferred == filesize)
done = true;
// Do random operation
ret_token += buffer[0];
}
if (i*block_size >= filesize)
done = true;
}
end_time = nano_time();
if(!silent){
printf("%s: %" PRIu64 " bytes transferred in %" PRIu64 ""
" ns.\n", (optype == READ)?"read-syscall":"write-syscall",
(uint_least64_t)total_bytes_transferred, (end_time-begin_time));
// Throughput in GB/s
printf("(tid %d) %.2f\n", tid,
(double)filesize/(double)(end_time-begin_time)
* NANOSECONDS_IN_SECOND / BYTES_IN_GB);
}
*begin = begin_time;
*end = end_time;
return ret_token;
}
/**
* MMAP tests
*/
uint64_t read_mmap_test(int fd, int tid, size_t block_size, size_t filesize,
char *buf, off_t *offsets, uint64_t *begin, uint64_t *end) {
return mmap_test(fd, tid, block_size, filesize, buf, READ, offsets, begin, end);
}
uint64_t write_mmap_test(int fd, int tid, size_t block_size, size_t filesize,
char *buf, off_t *offsets, uint64_t *begin, uint64_t *end){
return mmap_test(fd, tid, block_size, filesize, buf, WRITE, offsets, begin, end);
}
// Add memory addr
#if SAMPLE_LATENCY
#define BEGIN_LAT_SAMPLE \
if (num_samples < MAX_LAT_SAMPLES && i%LAT_SAMPL_INTERVAL == 0) \
lat_begin_time = nano_time();
#define END_LAT_SAMPLE \
if (num_samples < MAX_LAT_SAMPLES && i%LAT_SAMPL_INTERVAL == 0) { \
lat_end_time = nano_time(); \
latency_samples[i/LAT_SAMPL_INTERVAL % MAX_LAT_SAMPLES] = \
lat_end_time - lat_begin_time; \
num_samples++; \
}
#define MAX_LAT_SAMPLES 50
//#define LAT_SAMPL_INTERVAL (1000*1048576)
#define LAT_SAMPL_INTERVAL block_size
#else
#define BEGIN_LAT_SAMPLE ;
#define END_LAT_SAMPLE
#endif
uint64_t mmap_test(int fd, int tid, size_t block_size, size_t filesize, char *mapped_buffer,
char optype, off_t *offsets, uint64_t *begin, uint64_t *end) {
bool done = false;
char *buffer = NULL;
uint64_t i, j, numblocks, ret;
uint64_t begin_time, end_time, ret_token = 0;
#if SAMPLE_LATENCY
uint64_t lat_begin_time, lat_end_time;
size_t latency_samples[MAX_LAT_SAMPLES];
int num_samples = 0;
memset((void*)latency_samples, 0, sizeof(latency_samples));
#endif
buffer = (char*)malloc(block_size);
if(buffer == NULL) {
printf("Failed to allocate memory: %s\n", strerror(errno));
return -1;
}
memset((void*)buffer, 1, block_size);
begin_time = nano_time();
for(i=0; i<filesize; i+=block_size){
off_t offset = offsets[i/block_size];
BEGIN_LAT_SAMPLE;
if(optype == READ) {
//__memmove_chk_avx512_no_vzeroupper(buffer, &mapped_buffer[offset], block_size);
memcpy(buffer, &mapped_buffer[offset], block_size);
ret_token += buffer[0];
}
else if (optype == WRITE) {
//__memmove_chk_avx512_no_vzeroupper(&mapped_buffer[offset], buffer, block_size);
memcpy(&mapped_buffer[offset], buffer, block_size);
ret_token += mapped_buffer[i];
}
END_LAT_SAMPLE;
}
end_time = nano_time();
if(!silent) {
printf("%s: %" PRIu64 " bytes read in %" PRIu64 " ns.\n",
(optype==READ)?"readmap":"writemap",
(uint_least64_t)filesize, (end_time-begin_time));
// print GB/s
printf("(tid %d) %.2f\n", tid,
(double)filesize/(double)(end_time-begin_time)
* NANOSECONDS_IN_SECOND / BYTES_IN_GB);
}
*begin = begin_time;
*end = end_time;
#if SAMPLE_LATENCY
printf("\nSample latency for %ld byte block:\n", block_size);
for (i = 0; i < MAX_LAT_SAMPLES; i++)
printf("\t%ld: %ld\n", i, latency_samples[i]);
#endif
return ret_token;
}
char* map_buffer(int fd, size_t size) {
char *mapped_buffer = NULL;
// Populate
mapped_buffer = (char*)mmap(NULL, size, PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_POPULATE, fd, 0);
// Shared
// mapped_buffer = (char*)mmap(NULL, size, PROT_READ | PROT_WRITE,
// MAP_SHARED, fd, 0);
// Anon test
// mapped_buffer = (char*)mmap(NULL, size, PROT_READ | PROT_WRITE,
// MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
if(mapped_buffer == MAP_FAILED)
EXIT_MSG("Failed to mmap file of size %zu: %s\n",
size, strerror(errno));
// Might also need to guarantee page alignment - posix_memalign()
// int mret = madvise(mapped_buffer, filesize, MADV_HUGEPAGE);
// if(mret!=0) {
// fprintf(stderr, "failed madvise: %s\n", strerror(errno));
// }
return mapped_buffer;
}
size_t get_filesize(const char* filename){
int retval;
struct stat st;
retval = stat(filename, &st);
if(retval)
return -1;
else
return st.st_size;
}
void print_help_message(const char *progname) {
/* take only the last portion of the path */
const char *basename = strrchr(progname, '/');
basename = basename ? basename + 1 : progname;
printf("usage: %s [OPTION]\n", basename);
printf(" -h, --help\n"
" Print this help and exit.\n");
printf(" -b, --block[=BLOCKSIZE]\n"
" Block size used for read system calls.\n"
" For mmap tests, the size of the stride when iterating\n"
" over the file.\n"
" Defaults to %d.\n", DEFAULT_BLOCK_SIZE);
printf(" -f, --file[=FILENAME]\n"
" Perform all tests on this file (defaults to %s).\n",
DEFAULT_NAME);
printf(" --readsyscall\n"
" Perform a read test using system calls.\n");
printf(" --readmmap\n"
" Perform a read test using mmap.\n");
printf(" --writesyscall\n"
" Perform a write test using system calls.\n");
printf(" --writemmap\n"
" Perform a write test using mmap.\n");
printf(" --randomaccess\n"
" Perform random access.\n");
printf(" --threads\n"
" Number of threads to use. Defaults to one.\n");
printf(" --mixedmmap\n"
" Perfom read and write concurrently at different offsets\n");
printf(" -w, -writethreads[=0]\n"
" Number of threads that should perform write\n");
}
To compile:
$ gcc testm.c -o testm -lpthread -static -O2 -fno-builtin-memcpy
Commands to run the program:
$ dd if=/dev/zero of=bigmmaptest bs=1M count=25600 # 25 GiB file
$ ./testm -b 1024 -f bigmmaptest --threads 16 --randomaccess --readmmap
I am on a 32 core Xeon 5218 2nd Gen. L1d KiB /L2 MiB /L3 MiB -- 512 / 16 / 22
When the memcpy size is 1 KiB I get 21.7 GB/s but when the size is 256B I get 26.68 GB/s and 34.8 GB/s when the size is 4 KiB. Why is there a drop in the middle?
I observe that 2 KiB also performs poorly when compared to 256B and 4 KiB.
What's more interesting is, when I disable the L2 hardware prefetcher and without any other changes my bandwidth automatically increases for 1 KiB and 2 KiB. Without prefetch 2 KiB memcpy gives 34.8 GB/s. All of these are aggregate bandwidth.
With perf, I did measure L2 load-store misses but they turned out to not change drastically. This effect is also not seen for 8 threads and below.
I am on Linux 5.0.4. I am using the glibc memcpy (gcc 7.5.0), and even with -O2 I observe the above quirk: a 1 KiB access size gives 18.76 GiB/s with L2 prefetch and 30.32 GiB/s without. For comparison, a 256 B access size provides 24.7 GiB/s with prefetch and 24.8 GiB/s without. Clearly, the drop in performance is because of the L2 cache pollution caused by the prefetcher, as this is not observed with smaller thread counts. I was considering whether SMT could be the reason for the increased pollution, but I observe the effect distinctly at 16 threads on 16 physical cores.
Skimming through glibc memcpy code, I can see that any access below the size of 4 KiB uses AVX 256 instructions, so there is nothing changing there.
The smaller 256B size not seeing a drop from the L2 streamer might be due to the sequence of cache misses being too short to activate the streamer and waste bandwidth (and slots in the LFBs and L2 <-> L3 superqueue) on requests that won't be useful.
For aligned 4k, there are no bytes within the same page that you're not fetching, so the L2 prefetcher is positively useful, or at least not harmful. (Demand loads come in pretty quickly for later lines when running memcpy so I'm guessing speeds were about the same with/without HW prefetch enabled, unless HW prefetch helps getting started on a new 4k chunk while still waiting for the end of the previous.)
The L2 only sees physical addresses, and AFAIK it doesn't try to prefetch across a 4k boundary. (Even if it's within the same 2M hugepage, because it doesn't know that either.) The "next-page prefetcher" Intel mentions being new in Ivy Bridge is AFAIK just a TLB prefetch, not data.
So with aligned 4k memcpy, HW prefetch stops automatically at the end of the data you're actually going to read, not wasting any bandwidth. Since mmap gives you page-aligned memory, these 4k memcopies are from a single source page.
(The destination is irrelevant as it probably stays hot in L1d cache, with maybe an occasional eviction to L2, and the reload from it after memcpy can come from store-forwarding, not even having to wait for memcpy's store to commit to L1d.)
Prediction: If your smaller memcpy source starts part way into a 4k page, but still ends at the end of a 4k page, you'd probably see similar behaviour to prefetch disabled. E.g. generate a random page number, and start 3072 bytes into it, doing a 1 KiB copy. So all your 1 KiB copies come from the ends of pages, never middles.
(You'd still have more dTLB misses per byte memcpyed, because each TLB entry only covers 1 KiB of the data you ever actually read. You did use MAP_POPULATE, so you shouldn't be seeing page faults in the timed region, assuming you have enough RAM.)
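If you want to test that prediction, a hypothetical variant of the offset generation in main() could look like this (it assumes block_size <= 4096 and reuses the question's offsets/numblocks/block_size names):

// Place every offset so that the block_size-byte copy finishes exactly at the
// end of a 4 KiB page, e.g. for block_size = 1024 the copy starts 3072 bytes
// into a randomly chosen page.
void make_page_end_offsets(off_t *offsets, size_t numblocks,
                           size_t block_size, size_t filesize) {
    size_t pages = filesize / 4096;
    for (size_t i = 0; i < numblocks; i++) {
        size_t page = (size_t)random() % pages;            // random 4 KiB page
        offsets[i] = (off_t)(page * 4096 + (4096 - block_size));
    }
}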
L1d KiB /L2 MiB /L3 MiB -- 512 / 16 / 22
Those are aggregate totals, but L1d and L2 are private per-core! You have 32kiB L1d and 1MiB L2 per core, because this is Cascade Lake, same layout as Skylake-X.
And BTW, I'd consider using a fast PRNG like xorshift+ or xorshift* inside the timing loop; that's easily random enough to defeat prefetching; even a simple LFSR or even LCG with a power-of-2 modulo would do that (and be very cheap, just an imul and add). It avoids having to read offsets from another array, if you really want to isolate just the memcpy memory accesses. Probably doesn't make a difference though. One advantage of a very simple PRNG with a period equal to the space you're trying to cover (like an LCG) is that you won't generate the same address twice, giving you a random permutation of the blocks. But with a big enough block of memory, random cache hits even from L3 are unlikely even without that hard-to-achieve property.
Your current array of offsets is fine. (I didn't look at the code super closely, so I'm just assuming there aren't bugs.)
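For reference, a minimal sketch of the xorshift* idea mentioned above (standard xorshift64* constants; how you fold it into the timed loop is up to you):

#include <stdint.h>

// xorshift64*: cheap enough to run inside the timed loop and random enough to
// defeat the hardware prefetchers. The state must be seeded to a non-zero value.
static inline uint64_t xorshift64star(uint64_t *state) {
    uint64_t x = *state;
    x ^= x >> 12;
    x ^= x << 25;
    x ^= x >> 27;
    *state = x;
    return x * 0x2545F4914F6CDD1DULL;
}

// Example use: a block-aligned offset into the mapped file.
// off_t offset = (off_t)((xorshift64star(&state) % numblocks) * block_size);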

Speed up random memory access using prefetch

I am trying to speed up a single program by using prefetches. My program is purely a test. Here is what it does:
It uses two int buffers of the same size
It reads one-by-one all the values of the first buffer
It reads the value at the index in the second buffer
It sums all the values taken from the second buffer
It does all the previous steps for bigger and bigger
At the end, I print the number of voluntary and involuntary CPU context switches
Initially, each element of the first buffer contains its own index (cf. the createIndexBuffer function in the code just below).
This will be clearer in the code of my program:
#include <stdio.h>
#include <stdlib.h>
#include <limits.h>
#include <sys/time.h>
#define BUFFER_SIZE ((unsigned long) 4096 * 100000)
unsigned int randomUint()
{
int value = rand() % UINT_MAX;
return value;
}
unsigned int * createValueBuffer()
{
unsigned int * valueBuffer = (unsigned int *) malloc(BUFFER_SIZE * sizeof(unsigned int));
for (unsigned long i = 0 ; i < BUFFER_SIZE ; i++)
{
valueBuffer[i] = randomUint();
}
return (valueBuffer);
}
unsigned int * createIndexBuffer()
{
unsigned int * indexBuffer = (unsigned int *) malloc(BUFFER_SIZE * sizeof(unsigned int));
for (unsigned long i = 0 ; i < BUFFER_SIZE ; i++)
{
indexBuffer[i] = i;
}
return (indexBuffer);
}
unsigned long long computeSum(unsigned int * indexBuffer, unsigned int * valueBuffer)
{
unsigned long long sum = 0;
for (unsigned int i = 0 ; i < BUFFER_SIZE ; i++)
{
unsigned int index = indexBuffer[i];
sum += valueBuffer[index];
}
return (sum);
}
unsigned int computeTimeInMicroSeconds()
{
unsigned int * valueBuffer = createValueBuffer();
unsigned int * indexBuffer = createIndexBuffer();
struct timeval startTime, endTime;
gettimeofday(&startTime, NULL);
unsigned long long sum = computeSum(indexBuffer, valueBuffer);
gettimeofday(&endTime, NULL);
printf("Sum = %llu\n", sum);
free(indexBuffer);
free(valueBuffer);
return ((endTime.tv_sec - startTime.tv_sec) * 1000 * 1000) + (endTime.tv_usec - startTime.tv_usec);
}
int main()
{
printf("sizeof buffers = %ldMb\n", BUFFER_SIZE * sizeof(unsigned int) / (1024 * 1024));
unsigned int timeInMicroSeconds = computeTimeInMicroSeconds();
printf("Time: %u micro-seconds = %.3f seconds\n", timeInMicroSeconds, (double) timeInMicroSeconds / (1000 * 1000));
}
If I launch it, I get the following output:
$ gcc TestPrefetch.c -O3 -o TestPrefetch && ./TestPrefetch
sizeof buffers = 1562Mb
Sum = 439813150288855829
Time: 201172 micro-seconds = 0.201 seconds
Quick and fast!!!
According to my knowledge (I may be wrong), one of the reasons for having such a fast program is that, as I access my two buffers sequentially, the data can be prefetched into the CPU cache.
We can make the access pattern more complex so that the data is (almost) never prefetched into the CPU cache. For example, we can just change the createIndexBuffer function to:
unsigned int * createIndexBuffer()
{
unsigned int * indexBuffer = (unsigned int *) malloc(BUFFER_SIZE * sizeof(unsigned int));
for (unsigned long i = 0 ; i < BUFFER_SIZE ; i++)
{
indexBuffer[i] = rand() % BUFFER_SIZE;
}
return (indexBuffer);
}
Let's try the program once again:
$ gcc TestPrefetch.c -O3 -o TestPrefetch && ./TestPrefetch
sizeof buffers = 1562Mb
Sum = 439835307963131237
Time: 3730387 micro-seconds = 3.730 seconds
More than 18 times slower!!!
We now arrive at my problem. Given the new createIndexBuffer function, I would like to speed up the computeSum function using prefetching:
unsigned long long computeSum(unsigned int * indexBuffer, unsigned int * valueBuffer)
{
unsigned long long sum = 0;
for (unsigned int i = 0 ; i < BUFFER_SIZE ; i++)
{
__builtin_prefetch((char *) &indexBuffer[i + 1], 0, 0);
unsigned int index = indexBuffer[i];
sum += valueBuffer[index];
}
return (sum);
}
Of course, I also have to change createIndexBuffer so that it allocates a buffer with one more element.
I relaunch my program: no better! Since a prefetch may take longer than one for-loop iteration, I can prefetch not one element ahead but two:
__builtin_prefetch((char *) &indexBuffer[i + 2], 0, 0);
Still not better! Two loop iterations ahead? Not better. Three? I tried values up to 50 (!) but I cannot improve the performance of my computeSum function.
I would like help understanding why.
Thank you very much for your help
I believe the above code is already optimized automatically by the CPU, leaving little room for manual optimization.
1. The main problem is that indexBuffer is accessed sequentially. The hardware prefetcher senses this and prefetches further values automatically, without the need to call prefetch manually. So, during iteration #i, the values indexBuffer[i+1], indexBuffer[i+2], ... are already in cache. (By the way, there is no need to add an artificial element to the end of the array: memory access errors are silently ignored by prefetch instructions.)
What you really need to do is to prefetch valueBuffer instead:
__builtin_prefetch((char *) &valueBuffer[indexBuffer[i + 1]], 0, 0);
2. But adding the above line of code won't help either in such a simple scenario. The cost of accessing memory is hundreds of cycles, while an add instruction takes ~1 cycle. Your code already spends 99% of its time in memory accesses, so a manual prefetch can at best hide that one cycle of arithmetic and no more.
Manual prefetching would really work well if your math were much heavier (try it), e.g. an expression with a large number of divisions that cannot be optimized out (20-30 cycles each) or a call to some math function (log, sin).
3. But even this is not guaranteed to help. The dependency between loop iterations is very weak; it exists only through the sum variable. This allows the CPU to execute instructions speculatively: it may start fetching valueBuffer[i+1] concurrently while still executing the math for valueBuffer[i].
A prefetch normally fetches a full cache line, typically 64 bytes. So the random example always fetches 64 bytes for a 4-byte int: 16 times the data you actually need, which fits well with the observed slowdown by a factor of 18. The code is simply limited by memory throughput, not latency.
Sorry. What I gave you was not the correct version of my code. The correct version is, what you said:
__builtin_prefetch((char *) &valueBuffer[indexBuffer[i + prefetchStep]], 0, 0);
However, even with the right version, it is unfortunately no better.
Then I adapted my program to try your suggestion using the sin function.
My adapted program is the following one:
#include <stdio.h>
#include <stdlib.h>
#include <limits.h>
#include <sys/time.h>
#include <math.h>
#define BUFFER_SIZE ((unsigned long) 4096 * 50000)
unsigned int randomUint()
{
int value = rand() % UINT_MAX;
return value;
}
unsigned int * createValueBuffer()
{
unsigned int * valueBuffer = (unsigned int *) malloc(BUFFER_SIZE * sizeof(unsigned int));
for (unsigned long i = 0 ; i < BUFFER_SIZE ; i++)
{
valueBuffer[i] = randomUint();
}
return (valueBuffer);
}
unsigned int * createIndexBuffer(unsigned short prefetchStep)
{
unsigned int * indexBuffer = (unsigned int *) malloc((BUFFER_SIZE + prefetchStep) * sizeof(unsigned int));
for (unsigned long i = 0 ; i < BUFFER_SIZE ; i++)
{
indexBuffer[i] = rand() % BUFFER_SIZE;
}
return (indexBuffer);
}
double computeSum(unsigned int * indexBuffer, unsigned int * valueBuffer, unsigned short prefetchStep)
{
double sum = 0;
for (unsigned int i = 0 ; i < BUFFER_SIZE ; i++)
{
__builtin_prefetch((char *) &valueBuffer[indexBuffer[i + prefetchStep]], 0, 0);
unsigned int index = indexBuffer[i];
sum += sin(valueBuffer[index]);
}
return (sum);
}
unsigned int computeTimeInMicroSeconds(unsigned short prefetchStep)
{
unsigned int * valueBuffer = createValueBuffer();
unsigned int * indexBuffer = createIndexBuffer(prefetchStep);
struct timeval startTime, endTime;
gettimeofday(&startTime, NULL);
double sum = computeSum(indexBuffer, valueBuffer, prefetchStep);
gettimeofday(&endTime, NULL);
printf("prefetchStep = %d, Sum = %f - ", prefetchStep, sum);
free(indexBuffer);
free(valueBuffer);
return ((endTime.tv_sec - startTime.tv_sec) * 1000 * 1000) + (endTime.tv_usec - startTime.tv_usec);
}
int main()
{
printf("sizeof buffers = %ldMb\n", BUFFER_SIZE * sizeof(unsigned int) / (1024 * 1024));
for (unsigned short prefetchStep = 0 ; prefetchStep < 250 ; prefetchStep++)
{
unsigned int timeInMicroSeconds = computeTimeInMicroSeconds(prefetchStep);
printf("Time: %u micro-seconds = %.3f seconds\n", timeInMicroSeconds, (double) timeInMicroSeconds / (1000 * 1000));
}
}
The output is:
$ gcc TestPrefetch.c -O3 -o TestPrefetch -lm && taskset -c 7 ./TestPrefetch
sizeof buffers = 781Mb
prefetchStep = 0, Sum = -1107.523504 - Time: 20895326 micro-seconds = 20.895 seconds
prefetchStep = 1, Sum = 13456.262424 - Time: 12706720 micro-seconds = 12.707 seconds
prefetchStep = 2, Sum = -20179.289469 - Time: 12136174 micro-seconds = 12.136 seconds
prefetchStep = 3, Sum = 12068.302534 - Time: 11233803 micro-seconds = 11.234 seconds
prefetchStep = 4, Sum = 21071.238160 - Time: 10855348 micro-seconds = 10.855 seconds
prefetchStep = 5, Sum = -22648.280105 - Time: 10517861 micro-seconds = 10.518 seconds
prefetchStep = 6, Sum = 22665.381676 - Time: 9205809 micro-seconds = 9.206 seconds
prefetchStep = 7, Sum = 2461.741268 - Time: 11391088 micro-seconds = 11.391 seconds
...
So here it works better! Honestly, I was almost sure it would not be better, because the cost of the math function is higher than that of the memory access.
If anyone could give me more information about why it is better now, I would appreciate it.
Thank you very much

CUDA performance of atomic operation on different address in warp

To my knowledge, if atomic operations are performed on the same memory address by the threads of a warp, the performance of the warp could be 32 times slower.
But what if the atomic operations of the threads in a warp are on 32 different memory locations? Is there any performance penalty at all, or will it be as fast as a normal operation?
My use case is that I have 32 different positions, and each thread in a warp needs one of these positions, but which position is data-dependent. So each thread could use atomicCAS to check whether the desired location is empty. If it is not empty, it scans the next position.
If I am lucky, the 32 threads will atomicCAS to 32 different memory locations; is there any performance penalty in this case?
I assume Kepler architecture is used
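For concreteness, the probing scheme described in the question might look roughly like this (a sketch with hypothetical names: a table of 32 int slots where EMPTY marks a free slot; this is not code from the post):

#define EMPTY (-1)

// Each thread claims one of the 32 slots, starting from a data-dependent
// preferred slot and linearly probing on failure. atomicCAS returns the old
// value, so seeing EMPTY means this thread won the slot.
__device__ int claim_slot(int *slots, int preferred, int marker)
{
    for (int k = 0; k < 32; ++k) {
        int pos = (preferred + k) & 31;                    // wrap around the 32 slots
        if (atomicCAS(&slots[pos], EMPTY, marker) == EMPTY)
            return pos;                                    // claimed an empty slot
    }
    return -1;                                             // table full
}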
In the code below, I'm adding a constant value to the elements of an array (dev_input). I'm comparing two kernels, one using atomicAdd and one using regular addition. This is an example taken to the extreme in which atomicAdd operates on completely different addresses, so there will be no need for serialization of the operations.
#include <stdio.h>
#define BLOCK_SIZE 1024
int iDivUp(int a, int b) { return ((a % b) != 0) ? (a / b + 1) : (a / b); }
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
__global__ void regular_addition(float *dev_input, float val, int N) {
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < N) dev_input[i] = dev_input[i] + val;
}
__global__ void atomic_operations(float *dev_input, float val, int N) {
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < N) atomicAdd(&dev_input[i],val);
}
int main(){
int N = 8192*32;
float* output = (float*)malloc(N*sizeof(float));
float* dev_input; gpuErrchk(cudaMalloc((void**)&dev_input, N*sizeof(float)));
gpuErrchk(cudaMemset(dev_input, 0, N*sizeof(float)));
int NumBlocks = iDivUp(N,BLOCK_SIZE);
float time, timing1 = 0.f, timing2 = 0.f;
cudaEvent_t start, stop;
int niter = 32;
for (int i=0; i<niter; i++) {
gpuErrchk(cudaEventCreate(&start));
gpuErrchk(cudaEventCreate(&stop));
gpuErrchk(cudaEventRecord(start,0));
atomic_operations<<<NumBlocks,BLOCK_SIZE>>>(dev_input,3,N);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
gpuErrchk(cudaEventRecord(stop,0));
gpuErrchk(cudaEventSynchronize(stop));
gpuErrchk(cudaEventElapsedTime(&time, start, stop));
timing1 = timing1 + time;
}
printf("Time for atomic operations: %3.5f ms \n", timing1/(float)niter);
for (int i=0; i<niter; i++) {
gpuErrchk(cudaEventCreate(&start));
gpuErrchk(cudaEventCreate(&stop));
gpuErrchk(cudaEventRecord(start,0));
regular_addition<<<NumBlocks,BLOCK_SIZE>>>(dev_input,3,N);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
gpuErrchk(cudaEventRecord(stop,0));
gpuErrchk(cudaEventSynchronize(stop));
gpuErrchk(cudaEventElapsedTime(&time, start, stop));
timing2 = timing2 + time;
}
printf("Time for regular addition: %3.5f ms \n", timing2/(float)niter);
}
Testing this code on my NVIDIA GeForce GT540M, CUDA 5.5, Windows 7, I obtain approximately the same results for the two kernels, i.e., about 0.7ms.
Now change the instruction
if (i < N) atomicAdd(&dev_input[i],val);
to
if (i < N) atomicAdd(&dev_input[i%32],val);
which is closer to the case of your interest, namely, each atomicAdd operates on different addresses within a warp. The result I obtain is that no performance penalty is observed.
Finally, change the above instruction to
if (i < N) atomicAdd(&dev_input[0],val);
This is the other extreme in which atomicAdd always operates on the same address. In this case, the execution time rises to 5.1ms.
The above tests have been performed on a Fermi architecture. You can try to run the above code on your Kepler card.
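If it helps when comparing across architectures, the device the code actually runs on can be reported with a standard runtime query (error checking omitted for brevity):

#include <stdio.h>
#include <cuda_runtime.h>

int main(void) {
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);   // device 0
    printf("%s: compute capability %d.%d, %d SMs\n",
           prop.name, prop.major, prop.minor, prop.multiProcessorCount);
    return 0;
}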

Cuda thrust global memory writing very slow

I am currently writing code that calculates an integral histogram on the GPU using the NVIDIA Thrust library.
For this I allocate a contiguous block of device memory which I keep updating with a custom functor.
The problem is that the writes to device memory are very slow, but the reads are actually OK.
The basic setup is the following:
struct HistogramCreation
{
HistogramCreation(
...
// pointer to memory
...
){}
/// The actual summation operator
__device__ void operator()(int index){
.. do the calculations ..
for(int j=0;j<30;j++){
(1) *_memoryPointer = values (also using reads to such locations) ;
}
}
}
void foo(){
cudaMalloc(_pointer,size);
HistogramCreation initialCreation( ... _pointer ...);
thrust::for_each(
thrust::make_counting_iterator(0),
thrust::make_counting_iterator(_imageSize),
initialCreation);
}
If I change the write in (1) to the following:
unsigned int val = values;
the performance is much better. This is the only global memory write I have.
Using the memory write I get about 2 s for HD footage.
Using the local variable it takes about 50 ms, so about a factor of 40 less.
Why is this so slow? How could I improve it?
Just as @OlegTitov said, frequent loads/stores to global memory should be avoided as much as possible. When that is unavoidable, coalesced memory access can help keep the execution from getting too slow; however, in most cases histogram calculation is pretty tough to set up for coalesced access.
While most of the above is basically just restating @OlegTitov's answer, I'd just like to share an investigation I did about computing summations with NVIDIA CUDA. Actually the result is pretty interesting and I hope it'll be helpful information for other CUDA developers.
The experiment was basically to run a speed test of computing a summation with various memory access patterns: using global memory (1 thread), L2 cache (atomic ops - 128 threads), and L1 cache (shared mem - 128 threads).
This experiment used:
Kepler GTX 680,
1546 cores @ 1.06GHz
GDDR5 256-bit @ 3GHz
Here are the kernels:
__global__
void glob(float *h) {
float* hist = h;
uint sd = SEEDRND;
uint random;
for (int i = 0; i < NUMLOOP; i++) {
if (i%NTHREADS==0) random = rnd(sd);
int rind = random % NBIN;
float randval = (float)(random % 10)*1.0f ;
hist[rind] += randval;
}
}
__global__
void atom(float *h) {
float* hist = h;
uint sd = SEEDRND;
for (int i = threadIdx.x; i < NUMLOOP; i+=NTHREADS) {
uint random = rnd(sd);
int rind = random % NBIN;
float randval = (float)(random % 10)*1.0f ;
atomicAdd(&hist[rind], randval);
}
}
__global__
void shm(float *h) {
int lid = threadIdx.x;
uint sd = SEEDRND;
__shared__ float shm[NTHREADS][NBIN];
for (int i = 0; i < NBIN; i++) shm[lid][i] = h[i];
for (int i = lid; i < NUMLOOP; i+=NTHREADS) {
uint random = rnd(sd);
int rind = random % NBIN;
float randval = (float)(random % 10)*1.0f ;
shm[lid][rind] += randval;
}
/* reduction here */
for (int i = 0; i < NBIN; i++) {
__syncthreads();
if (threadIdx.x < 64) {
shm[threadIdx.x][i] += shm[threadIdx.x+64][i];
}
__syncthreads();
if (threadIdx.x < 32) {
shm[threadIdx.x][i] += shm[threadIdx.x+32][i];
}
__syncthreads();
if (threadIdx.x < 16) {
shm[threadIdx.x][i] += shm[threadIdx.x+16][i];
}
__syncthreads();
if (threadIdx.x < 8) {
shm[threadIdx.x][i] += shm[threadIdx.x+8][i];
}
__syncthreads();
if (threadIdx.x < 4) {
shm[threadIdx.x][i] += shm[threadIdx.x+4][i];
}
__syncthreads();
if (threadIdx.x < 2) {
shm[threadIdx.x][i] += shm[threadIdx.x+2][i];
}
__syncthreads();
if (threadIdx.x == 0) {
shm[0][i] += shm[1][i];
}
}
for (int i = 0; i < NBIN; i++) h[i] = shm[0][i];
}
OUTPUT
atom: 102656.00 shm: 102656.00 glob: 102656.00
atom: 122240.00 shm: 122240.00 glob: 122240.00
... blah blah blah ...
One Thread: 126.3919 msec
Atomic: 7.5459 msec
Sh_mem: 2.2207 msec
The ratio between these kernels is 57:17:1. Many things can be analyzed here, and it truly does not mean that using the L1 or L2 memory spaces will always give you more than a 10x speedup of the whole program.
And here's the main and other funcs:
#include <iostream>
#include <climits>   // LONG_MAX (used in rnd below)
#include <cstdlib>
#include <cstdio>
using namespace std;
#define NUMLOOP 1000000
#define NBIN 36
#define SEEDRND 1
#define NTHREADS 128
#define NBLOCKS 1
__device__ uint rnd(uint & seed) {
#if LONG_MAX > (16807*2147483647)
int const a = 16807;
int const m = 2147483647;
seed = (long(seed * a))%m;
return seed;
#else
double const a = 16807;
double const m = 2147483647;
double temp = seed * a;
seed = (int) (temp - m * floor(temp/m));
return seed;
#endif
}
... the above kernels ...
int main()
{
float *h_hist, *h_hist2, *h_hist3, *d_hist, *d_hist2,
*d_hist3;
h_hist = (float*)malloc(NBIN * sizeof(float));
h_hist2 = (float*)malloc(NBIN * sizeof(float));
h_hist3 = (float*)malloc(NBIN * sizeof(float));
cudaMalloc((void**)&d_hist, NBIN * sizeof(float));
cudaMalloc((void**)&d_hist2, NBIN * sizeof(float));
cudaMalloc((void**)&d_hist3, NBIN * sizeof(float));
for (int i = 0; i < NBIN; i++) h_hist[i] = 0.0f;
cudaMemcpy(d_hist, h_hist, NBIN * sizeof(float),
cudaMemcpyHostToDevice);
cudaMemcpy(d_hist2, h_hist, NBIN * sizeof(float),
cudaMemcpyHostToDevice);
cudaMemcpy(d_hist3, h_hist, NBIN * sizeof(float),
cudaMemcpyHostToDevice);
cudaEvent_t start, end;
float elapsed = 0, elapsed2 = 0, elapsed3;
cudaEventCreate(&start);
cudaEventCreate(&end);
cudaEventRecord(start, 0);
atom<<<NBLOCKS, NTHREADS>>>(d_hist);
cudaThreadSynchronize();
cudaEventRecord(end, 0);
cudaEventSynchronize(start);
cudaEventSynchronize(end);
cudaEventElapsedTime(&elapsed, start, end);
cudaEventRecord(start, 0);
shm<<<NBLOCKS, NTHREADS>>>(d_hist2);
cudaThreadSynchronize();
cudaEventRecord(end, 0);
cudaEventSynchronize(start);
cudaEventSynchronize(end);
cudaEventElapsedTime(&elapsed2, start, end);
cudaEventRecord(start, 0);
glob<<<1, 1>>>(d_hist3);
cudaThreadSynchronize();
cudaEventRecord(end, 0);
cudaEventSynchronize(start);
cudaEventSynchronize(end);
cudaEventElapsedTime(&elapsed3, start, end);
cudaMemcpy(h_hist, d_hist, NBIN * sizeof(float),
cudaMemcpyDeviceToHost);
cudaMemcpy(h_hist2, d_hist2, NBIN * sizeof(float),
cudaMemcpyDeviceToHost);
cudaMemcpy(h_hist3, d_hist3, NBIN * sizeof(float),
cudaMemcpyDeviceToHost);
/* print output */
for (int i = 0; i < NBIN; i++) {
printf("atom: %10.2f shm: %10.2f glob:
%10.2f¥n",h_hist[i],h_hist2[i],h_hist3[i]);
}
printf("%12s: %8.4f msec¥n", "One Thread", elapsed3);
printf("%12s: %8.4f msec¥n", "Atomic", elapsed);
printf("%12s: %8.4f msec¥n", "Sh_mem", elapsed2);
return 0;
}
When writing GPU code you should avoid reading and writing to/from global memory. Global memory is very slow on the GPU; that's a hardware feature. The only thing you can do is to make neighboring threads read/write neighboring addresses in global memory. This causes coalescing and speeds up the process. But in general: read your data once, process it, and write it out once.
Note that NVCC might optimize out a lot of your code after you make the modification - it detects that no write to global memory is made and just removes the "unneeded" code. So this speedup may not be coming from the global write per se.
I would recommend using a profiler on your actual code (the one with the global write) to see if there is anything like unaligned access or another perf problem.
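For instance (an assumption on my part: the metric names below are from the older nvprof profiler, and ./your_app is a placeholder for your binary), something like
$ nvprof --metrics gld_efficiency,gst_efficiency,gld_throughput,gst_throughput ./your_app
would show whether the global loads/stores are coalesced and what throughput they actually achieve.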
