Understanding the execution time of a memory access code with varying strides - performance

I was trying to understand the execution time of the following code for different step sizes, but some of the numbers are really confusing to me. It is a simple piece of code that reads memory with increasing strides.
Here is my code:
#include <iostream>
#include <algorithm>
#include <ctime>
#include <stdio.h>
#include <stdlib.h>   // for malloc

using namespace std;

int main() {
    int a = 1024;
    int numLoop = a;
    int temp = 0;
    int *arr;
    arr = (int*) malloc(100000000 * sizeof(int));
    for (int array = a; array < (a + 1); array += 1) {
        for (int step = 1; step < (numLoop + 1); step *= 2) {
            clock_t begin = clock();
            for (int v = 1; v < (step + 1); v++) {
                for (int i = 0; i < array; i += step) {
                    temp = (temp + arr[i]) % 33721;
                }
            }
            clock_t end = clock();
            double elapsed_secs = double(end - begin) / CLOCKS_PER_SEC;
            printf("%0.6lf %d %d\n", elapsed_secs, step, array);
        }
    }
    return 0;
}
I got the following results after execution
0.000073 1 1024
0.000069 2 1024
0.000078 4 1024
0.000099 8 1024
0.000140 16 1024
0.000220 32 1024
0.000432 64 1024
0.000682 128 1024
0.001354 256 1024
0.002693 512 1024
0.005178 1024 1024
Architecture details:
Freescale p4080ds QorIQ processor
Number of active cores: 1 (configured)
L1 cache : 32kB
L2 cache : 128kB
L3 cache : 2MB
(More about the architecture is here)
The most puzzling thing for me is the last line of the results. When step=1024, the loops access arr[0] 1024 times, and that element should be in the cache after the first access. So why does this case take the longest time (0.005178 s) of all?
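For reference, the total number of array reads is the same for every power-of-two step, which is the point of the benchmark's design; the cases differ only in which addresses get touched. A small standalone sketch (not part of the original code) that verifies the count, independent of the timing:
#include <cstdio>

int main() {
    const int array = 1024;
    for (int step = 1; step <= array; step *= 2) {
        long reads = 0;
        for (int v = 1; v < step + 1; v++)
            for (int i = 0; i < array; i += step)
                ++reads;                           // one arr[i] access in the original code
        std::printf("step=%4d reads=%ld\n", step, reads);   // always 1024 here
    }
    return 0;
}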

Related

How to find median value in 2d array for each column with CUDA? [duplicate]

I found the 'vectorized/batch sort' and 'nested sort' methods at the link below: How to use Thrust to sort the rows of a matrix?
When I tried these methods on 500 rows of 1000 elements each, the results were:
vectorized/batch sort : 66ms
nested sort : 3290ms
I am using a 1080 Ti HOF to do this operation, but it takes too long compared to your case.
In the link below, however, it could be less than 10 ms, almost 100 microseconds.
(How to find median value in 2d array for each column with CUDA?)
Could you recommend how to optimize this method to reduce the operation time?
#include <thrust/device_vector.h>
#include <thrust/device_ptr.h>
#include <thrust/host_vector.h>
#include <thrust/sort.h>
#include <thrust/execution_policy.h>
#include <thrust/generate.h>
#include <thrust/equal.h>
#include <thrust/sequence.h>
#include <thrust/for_each.h>
#include <iostream>
#include <stdlib.h>

#define NSORTS 500
#define DSIZE 1000

int my_mod_start = 0;
int my_mod() {
    return (my_mod_start++) / DSIZE;
}

bool validate(thrust::device_vector<int> &d1, thrust::device_vector<int> &d2) {
    return thrust::equal(d1.begin(), d1.end(), d2.begin());
}

struct sort_functor
{
    thrust::device_ptr<int> data;
    int dsize;
    __host__ __device__
    void operator()(int start_idx)
    {
        thrust::sort(thrust::device, data + (dsize*start_idx), data + (dsize*(start_idx + 1)));
    }
};

#include <time.h>
#include <windows.h>

unsigned long long dtime_usec(LONG start) {
    SYSTEMTIME timer2;
    GetSystemTime(&timer2);
    LONG end = (timer2.wSecond * 1000) + timer2.wMilliseconds;
    return (end - start);
}

int main() {
    for (int i = 0; i < 3; i++) {
        SYSTEMTIME timer1;
        cudaDeviceSetLimit(cudaLimitMallocHeapSize, (16 * DSIZE * NSORTS));
        thrust::host_vector<int> h_data(DSIZE * NSORTS);
        thrust::generate(h_data.begin(), h_data.end(), rand);
        thrust::device_vector<int> d_data = h_data;

        // first time a loop
        thrust::device_vector<int> d_result1 = d_data;
        thrust::device_ptr<int> r1ptr = thrust::device_pointer_cast<int>(d_result1.data());
        GetSystemTime(&timer1);
        LONG time_ms1 = (timer1.wSecond * 1000) + timer1.wMilliseconds;
        for (int i = 0; i < NSORTS; i++)
            thrust::sort(r1ptr + (i * DSIZE), r1ptr + ((i + 1) * DSIZE));
        cudaDeviceSynchronize();
        time_ms1 = dtime_usec(time_ms1);
        std::cout << "loop time: " << time_ms1 << "ms" << std::endl;

        // vectorized sort
        thrust::device_vector<int> d_result2 = d_data;
        thrust::host_vector<int> h_segments(DSIZE * NSORTS);
        thrust::generate(h_segments.begin(), h_segments.end(), my_mod);
        thrust::device_vector<int> d_segments = h_segments;
        GetSystemTime(&timer1);
        time_ms1 = (timer1.wSecond * 1000) + timer1.wMilliseconds;
        thrust::stable_sort_by_key(d_result2.begin(), d_result2.end(), d_segments.begin());
        thrust::stable_sort_by_key(d_segments.begin(), d_segments.end(), d_result2.begin());
        cudaDeviceSynchronize();
        time_ms1 = dtime_usec(time_ms1);
        std::cout << "loop time: " << time_ms1 << "ms" << std::endl;
        if (!validate(d_result1, d_result2)) std::cout << "mismatch 1!" << std::endl;

        // nested sort
        thrust::device_vector<int> d_result3 = d_data;
        sort_functor f = { d_result3.data(), DSIZE };
        thrust::device_vector<int> idxs(NSORTS);
        thrust::sequence(idxs.begin(), idxs.end());
        GetSystemTime(&timer1);
        time_ms1 = (timer1.wSecond * 1000) + timer1.wMilliseconds;
        thrust::for_each(idxs.begin(), idxs.end(), f);
        cudaDeviceSynchronize();
        time_ms1 = dtime_usec(time_ms1);
        std::cout << "loop time: " << time_ms1 << "ms" << std::endl;
        if (!validate(d_result1, d_result3)) std::cout << "mismatch 2!" << std::endl;
    }
    return 0;
}
The main takeaway from your Thrust experience is that you should never compile a debug project or use the device debug switch (-G) when you are interested in performance. Compiling device debug code causes the compiler to omit many performance optimizations. The difference in your case was quite dramatic: about a 30x improvement going from debug to release code.
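For reference, the difference is just the build command; something along these lines, where app.cu is a placeholder for your source file (the -G switch is what enables device debug, while a plain optimized build is what you want when timing):
$ nvcc -G -o app_debug app.cu
$ nvcc -O3 -o app_release app.cu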
Here is a segmented cub sort, where we are launching 500 blocks and each block is handling a separate 1024 element array. The CUB code is lifted from here.
$ cat t1761.cu
#include <cub/cub.cuh>   // or equivalently <cub/block/block_radix_sort.cuh>
#include <iostream>

const int ipt = 8;
const int tpb = 128;

__global__ void ExampleKernel(int *data)
{
    // Specialize BlockRadixSort for a 1D block of 128 threads owning 8 integer items each
    typedef cub::BlockRadixSort<int, tpb, ipt> BlockRadixSort;
    // Allocate shared memory for BlockRadixSort
    __shared__ typename BlockRadixSort::TempStorage temp_storage;
    // Obtain a segment of consecutive items that are blocked across threads
    int thread_keys[ipt];
    // just create some synthetic data in descending order 1023 1022 1021 1020 ...
    for (int i = 0; i < ipt; i++) thread_keys[i] = (tpb-1-threadIdx.x)*ipt+i;
    // Collectively sort the keys
    BlockRadixSort(temp_storage).Sort(thread_keys);
    __syncthreads();
    // write results to output array
    for (int i = 0; i < ipt; i++) data[blockIdx.x*ipt*tpb + threadIdx.x*ipt+i] = thread_keys[i];
}

int main(){
    const int blks = 500;
    int *data;
    cudaMalloc(&data, blks*ipt*tpb*sizeof(int));
    ExampleKernel<<<blks,tpb>>>(data);
    int *h_data = new int[blks*ipt*tpb];
    cudaMemcpy(h_data, data, blks*ipt*tpb*sizeof(int), cudaMemcpyDeviceToHost);
    for (int i = 0; i < 10; i++) std::cout << h_data[i] << " ";
    std::cout << std::endl;
}
$ nvcc -o t1761 t1761.cu -I/path/to/cub/cub-1.8.0
$ CUDA_VISIBLE_DEVICES="2" nvprof ./t1761
==13713== NVPROF is profiling process 13713, command: ./t1761
==13713== Warning: Profiling results might be incorrect with current version of nvcc compiler used to compile cuda app. Compile with nvcc compiler 9.0 or later version to get correct profiling results. Ignore this warning if code is already compiled with the recommended nvcc version
0 1 2 3 4 5 6 7 8 9
==13713== Profiling application: ./t1761
==13713== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 60.35% 308.66us 1 308.66us 308.66us 308.66us [CUDA memcpy DtoH]
39.65% 202.79us 1 202.79us 202.79us 202.79us ExampleKernel(int*)
API calls: 98.39% 210.79ms 1 210.79ms 210.79ms 210.79ms cudaMalloc
0.72% 1.5364ms 1 1.5364ms 1.5364ms 1.5364ms cudaMemcpy
0.32% 691.15us 1 691.15us 691.15us 691.15us cudaLaunchKernel
0.28% 603.26us 97 6.2190us 400ns 212.71us cuDeviceGetAttribute
0.24% 516.56us 1 516.56us 516.56us 516.56us cuDeviceTotalMem
0.04% 79.374us 1 79.374us 79.374us 79.374us cuDeviceGetName
0.01% 13.373us 1 13.373us 13.373us 13.373us cuDeviceGetPCIBusId
0.00% 5.0810us 3 1.6930us 729ns 2.9600us cuDeviceGetCount
0.00% 2.3120us 2 1.1560us 609ns 1.7030us cuDeviceGet
0.00% 748ns 1 748ns 748ns 748ns cuDeviceGetUuid
$
(CUDA 10.2.89, RHEL 7)
Above I am running on a Tesla K20x, which has performance that is "closer" to your 1080ti than a Tesla V100. We see that the kernel execution time is ~200us. If I run the exact same code on a Tesla V100, the kernel execution time drops to ~35us:
$ CUDA_VISIBLE_DEVICES="0" nvprof ./t1761
==13814== NVPROF is profiling process 13814, command: ./t1761
0 1 2 3 4 5 6 7 8 9
==13814== Profiling application: ./t1761
==13814== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 82.33% 163.43us 1 163.43us 163.43us 163.43us [CUDA memcpy DtoH]
17.67% 35.073us 1 35.073us 35.073us 35.073us ExampleKernel(int*)
API calls: 98.70% 316.92ms 1 316.92ms 316.92ms 316.92ms cudaMalloc
0.87% 2.7879ms 1 2.7879ms 2.7879ms 2.7879ms cuDeviceTotalMem
0.19% 613.75us 97 6.3270us 389ns 205.37us cuDeviceGetAttribute
0.19% 601.61us 1 601.61us 601.61us 601.61us cudaMemcpy
0.02% 72.718us 1 72.718us 72.718us 72.718us cudaLaunchKernel
0.02% 59.905us 1 59.905us 59.905us 59.905us cuDeviceGetName
0.01% 37.886us 1 37.886us 37.886us 37.886us cuDeviceGetPCIBusId
0.00% 4.6830us 3 1.5610us 546ns 2.7850us cuDeviceGetCount
0.00% 1.9900us 2 995ns 587ns 1.4030us cuDeviceGet
0.00% 677ns 1 677ns 677ns 677ns cuDeviceGetUuid
$
You'll note there is no "input" array; I'm just synthesizing data in the kernel, since we are primarily interested in performance. If you need to handle an array size like 1000, you should probably just pad each array out to 1024 (e.g. pad with a very large number, then ignore the last numbers in the sorted result).
This code is largely lifted from external documentation. It is offered for instructional purposes. I'm not suggesting it is defect-free or suitable for any particular purpose. Use it at your own risk.
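Expanding on the padding suggestion with a minimal sketch of my own (names and sizes here are illustrative, not part of the code above): copy each 1000-element row into a 1024-wide slot pre-filled with a large sentinel, sort the padded rows, then ignore the trailing sentinels in each sorted row.
#include <thrust/device_vector.h>
#include <cuda_runtime.h>
#include <climits>

// Hypothetical helper: pad ROWS rows of ROW ints out to PADDED ints each.
// INT_MAX sentinels sort to the end of every row, so the first ROW entries of
// each sorted padded row are the sorted original values.
// Assumes d_in holds ROWS * ROW ints in row-major order.
const int ROW = 1000, PADDED = 1024, ROWS = 500;

thrust::device_vector<int> pad_rows(const thrust::device_vector<int>& d_in)
{
    thrust::device_vector<int> d_out(ROWS * PADDED, INT_MAX);
    // Strided device-to-device copy: ROW ints per row into a PADDED-int pitch.
    cudaMemcpy2D(thrust::raw_pointer_cast(d_out.data()), PADDED * sizeof(int),
                 thrust::raw_pointer_cast(d_in.data()),  ROW    * sizeof(int),
                 ROW * sizeof(int), ROWS, cudaMemcpyDeviceToDevice);
    return d_out;
}
The padded buffer could then be handed to a block-per-row sort such as the CUB kernel above (with the synthetic-data loop replaced by loads from the buffer), and only the first 1000 entries of each sorted row would be meaningful.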

CUDA unified memory and Windows 10

While using cudaMallocManaged() to allocate an array of structs with arrays inside, I'm getting an "out of memory" error even though I have enough free memory. Here's some code that replicates my problem:
#include <iostream>
#include <cstdio>    // for fprintf
#include <cuda.h>

#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

#define N 100000
#define ARR_SZ 100

struct Struct
{
    float* arr;
};

int main()
{
    Struct* struct_arr;
    gpuErrchk( cudaMallocManaged((void**)&struct_arr, sizeof(Struct)*N) );
    for(int i = 0; i < N; ++i)
        gpuErrchk( cudaMallocManaged((void**)&(struct_arr[i].arr), sizeof(float)*ARR_SZ) ); //out of memory...
    for(int i = 0; i < N; ++i)
        cudaFree(struct_arr[i].arr);
    cudaFree(struct_arr);

    /*float* f;
    gpuErrchk( cudaMallocManaged((void**)&f, sizeof(float)*N*ARR_SZ) ); //this works ok
    cudaFree(f);*/
    return 0;
}
There doesn't seem to be a problem when I call cudaMallocManaged() once to allocate a single chunk of memory, as I'm showing in the last piece of commented code.
I have a GeForce GTX 1070 Ti, and I'm using Windows 10. A friend tried to compile the same code on a PC running Linux and it worked correctly, while it showed the same issue on another PC running Windows 10. WDDM TDR is deactivated.
Any help would be appreciated. Thanks.
There is an allocation granularity.
This means that if you ask for 1 byte, or 400 bytes, what is actually used up is something like 65536 bytes (my original estimate here was 4096 bytes; see the measurement at the end of this answer). So a bunch of very small allocations will actually use up memory at a much faster rate than what you would predict based on the requested allocation size. The solution is to not make very small allocations, but instead to allocate in larger chunks.
An alternative strategy here would also be to flatten your allocation, and carve out pieces from it for each of your arrays:
#include <iostream>
#include <cstdio>

#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

#define N 100000
#define ARR_SZ 100

struct Struct
{
    float* arr;
};

int main()
{
    Struct* struct_arr;
    float* f;
    gpuErrchk( cudaMallocManaged((void**)&struct_arr, sizeof(Struct)*N) );
    gpuErrchk( cudaMallocManaged((void**)&f, sizeof(float)*N*ARR_SZ) );
    for(int i = 0; i < N; ++i)
        struct_arr[i].arr = f + i*ARR_SZ;
    cudaFree(struct_arr);
    cudaFree(f);
    return 0;
}
Because ARR_SZ is divisible by 4, the various created pointers can also be up-cast to larger vector types such as float2 or float4, if your code had any intention of doing that.
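As a small illustration of that (a sketch only; the kernel name and the row-sum operation are made up, but the alignment follows from the code above, since ARR_SZ*sizeof(float) = 400 bytes is a multiple of 16 and f itself is allocation-aligned):
// Sketch: each row pointer from the flattened allocation is 16-byte aligned,
// so it can be reinterpreted as float4 and read 4 floats at a time.
__global__ void sum_rows_vec4(const Struct* s, float* row_sums, int n_rows)
{
    int row = blockIdx.x * blockDim.x + threadIdx.x;
    if (row >= n_rows) return;
    const float4* v = reinterpret_cast<const float4*>(s[row].arr);
    float acc = 0.0f;
    for (int j = 0; j < ARR_SZ / 4; ++j) {   // ARR_SZ is divisible by 4
        float4 q = v[j];
        acc += q.x + q.y + q.z + q.w;
    }
    row_sums[row] = acc;
}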
A possible reason the original code works on Linux is that managed memory on Linux, in a proper setup, can oversubscribe the GPU physical memory. The result is that the actual allocation limit is much higher than what the GPU on-board memory would suggest. It might also be that the Linux case has a bit more free memory, or perhaps the allocation granularity on Linux is different (smaller).
Based on a question in the comments, I decided to estimate the allocation granularity, using this code:
#include <iostream>
#include <cstdio>

#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char* file, int line, bool abort = true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

#define N 100000
#define ARR_SZ 100

struct Struct
{
    float* arr;
};

int main()
{
    Struct* struct_arr;
    //float* f;
    gpuErrchk(cudaMallocManaged((void**)&struct_arr, sizeof(Struct) * N));
#if 0
    gpuErrchk(cudaMallocManaged((void**)&f, sizeof(float) * N * ARR_SZ));
    for (int i = 0; i < N; ++i)
        struct_arr[i].arr = f + i * ARR_SZ;
#else
    size_t fre, tot;
    gpuErrchk(cudaMemGetInfo(&fre, &tot));
    std::cout << "Free: " << fre << " total: " << tot << std::endl;
    for (int i = 0; i < N; ++i)
        gpuErrchk(cudaMallocManaged((void**)&(struct_arr[i].arr), sizeof(float) * ARR_SZ));
    gpuErrchk(cudaMemGetInfo(&fre, &tot));
    std::cout << "Free: " << fre << " total: " << tot << std::endl;
    for (int i = 0; i < N; ++i)
        cudaFree(struct_arr[i].arr);
#endif
    cudaFree(struct_arr);
    //cudaFree(f);
    return 0;
}
When I compile a debug project with that code and run it on a Windows 10 desktop with an RTX 2070 GPU (8GB memory, same as the GTX 1070 Ti), I get the following output:
Microsoft Windows [Version 10.0.17763.973]
(c) 2018 Microsoft Corporation. All rights reserved.
C:\Users\Robert Crovella>cd C:\Users\Robert Crovella\source\repos\test12\x64\Debug
C:\Users\Robert Crovella\source\repos\test12\x64\Debug>test12
Free: 7069866393 total: 8589934592
Free: 516266393 total: 8589934592
C:\Users\Robert Crovella\source\repos\test12\x64\Debug>test12
Free: 7069866393 total: 8589934592
Free: 516266393 total: 8589934592
C:\Users\Robert Crovella\source\repos\test12\x64\Debug>
Note that on my machine there is only 0.5GB of reported free memory left after the 100,000 allocations. So if for any reason your 8GB GPU starts out with less free memory (entirely possible) you may run into an out-of-memory error, even though I did not.
The calculation of the allocation granularity is as follows:
(7069866393 - 516266393) / 100000 = 65536 bytes per allocation(!)
So my previous estimate of 4096 bytes per allocation was way off, by at least an order of magnitude, on my machine/test setup.
The allocation granularity may vary based on:
windows or linux
WDDM or TCC
x86 or Power9
managed vs ordinary cudaMalloc
possibly other factors (e.g. CUDA version)
so my advice to future readers would be not to assume that it is always a minimum of 65536 bytes per allocation.

Does clflush instruction flush block only from Level 1 Cache?

I have a multi-core system with 4 cores, each of them having private L1 and L2 caches and a shared LLC. The caches are inclusive, meaning that the higher-level caches are supersets of the lower-level caches. Can I directly flush a block from the LLC, or does it have to go through the lower levels first?
I am trying to understand the flush+reload and flush+flush cache side-channel attacks.
clflush is architecturally required/guaranteed to evict the line from all levels of cache, making it useful for committing data to non-volatile DIMMs (e.g. battery-backed DRAM or 3D XPoint).
The wording in the manual seems pretty clear:
Invalidates from every level of the cache hierarchy in the cache coherence domain ... If that cache line contains modified data at any level of the cache hierarchy, that data is written back to memory
I think if multiple cores have a line in Shared state, clflush / clflushopt on one core has to evict it from the private caches of all cores. (This would happen anyway as part of evicting from inclusive L3 cache, but Skylake-X changed to a NINE (not-inclusive not-exclusive) L3 cache.)
Can I directly flush a block on the LLC or does it have to go through the lower level first?
Not clear what you're asking. Are you asking if you can ask the CPU to flush a block from L3 only, without disturbing L1/L2? You already know L3 is inclusive on most Intel CPUs, so the net effect would be the same as clflush. For cores to talk to L3, they have to go through their own L1d and L2.
clflush still works if the data is only present in L3 but not the private L1d or L2 of the core executing it. It's not a "hint" like a prefetch, or a local-only thing.
In future Silvermont-family CPUs, there will be a cldemote instruction that lets you flush a block to the LLC, but not all the way to DRAM. (And it's only a hint, so it doesn't force the CPU to obey it if the write-back path is busy with evictions to make room for demand-loads.)
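To connect this to flush+reload: the attack relies on exactly the property above, i.e. after clflush a reload must come from memory and is measurably slower than a cache hit. A rough sketch of that measurement (simplified; real attacks need careful serialization and threshold calibration, and the function name here is my own):
#include <cstdint>
#include <iostream>
#include <x86intrin.h>   // _mm_clflush, _mm_mfence, __rdtsc

// Time a single load of *p in TSC ticks, fenced so the measurement stays ordered.
static inline uint64_t timed_load(const volatile int* p)
{
    _mm_mfence();
    uint64_t t0 = __rdtsc();
    (void)*p;                 // volatile read: the load cannot be optimized away
    _mm_mfence();
    return __rdtsc() - t0;
}

int main()
{
    static volatile int target = 42;

    timed_load(&target);                      // warm the line into the caches
    uint64_t hit = timed_load(&target);       // should be a cache hit

    _mm_clflush((const void*)&target);        // evict from every level of the hierarchy
    _mm_mfence();
    uint64_t miss = timed_load(&target);      // now serviced from DRAM

    std::cout << "cached reload: " << hit << " ticks, "
              << "flushed reload: " << miss << " ticks\n";
}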
It can't be true that CLFLUSH always evicts from every cache level. I just wrote a little program (C++17) in which flushing cache lines always takes less than 5 ns per line on my machine (3990X):
#include <iostream>
#include <chrono>
#include <cstring>
#include <cstdint>
#include <cctype>
#include <limits>
#include <vector>
#include <charconv>
#include <sstream>
#include <cmath>
#if defined(_MSC_VER)
    #include <intrin.h>
#elif defined(__GNUC__)
    #include <x86intrin.h>
#endif

using namespace std;
using namespace chrono;

size_t parseSize( char const *str );
string blockSizeStr( size_t blockSize );

int main( int argc, char **argv )
{
    static size_t const DEFAULT_MAX_BLOCK_SIZE = (size_t)512 * 1024;
    size_t blockSize = argc < 2 ? DEFAULT_MAX_BLOCK_SIZE : parseSize( argv[1] );
    if( blockSize == -1 )
        return EXIT_FAILURE;
    blockSize = blockSize >= 4096 ? blockSize : 4096;
    vector<char> block( blockSize );
    size_t size = 4096;
    static size_t const ITERATIONS_64K = 100;
    do
    {
        uint64_t avg = 0;
        size = size <= blockSize ? size : blockSize;
        size_t iterations = (size_t)((double)0x10000 / size * ITERATIONS_64K + 0.5);
        iterations += (size_t)!iterations;
        for( size_t it = 0; it != iterations; ++it )
        {
            // make sure the cache lines get modified by writing
            // a different value on each iteration
            for( size_t i = 0; i != size; ++i )
                block[i] = (i + it) % 0x100;
            auto start = high_resolution_clock::now();
            for( char *p = &*block.begin(), *end = p + size; p < end; p += 64 )
                _mm_clflush( p );
            avg += duration_cast<nanoseconds>( high_resolution_clock::now() - start ).count();
        }
        double nsPerCl = ((double)(int64_t)avg / iterations) / (double)(ptrdiff_t)(size / 64);
        cout << blockSizeStr( size ) << " " << nsPerCl << "ns" << endl;
    } while( (size *= 2) <= blockSize );
}

size_t parseSize( char const *str )
{
    double dSize;
    from_chars_result fcr = from_chars( str, str + strlen( str ), dSize, chars_format::general );
    if( fcr.ec != errc() )
        return -1;
    if( !*(str = fcr.ptr) || str[1] )
        return -1;
    static const
    struct suffix_t
    {
        char suffix;
        size_t mult;
    } suffixes[]
    {
        { 'k', 1024 },
        { 'm', (size_t)1024 * 1024 },
        { 'g', (size_t)1024 * 1024 * 1024 }
    };
    char cSuf = tolower( *str );
    for( suffix_t const &suf : suffixes )
        if( suf.suffix == cSuf )
        {
            dSize = trunc( dSize * (ptrdiff_t)suf.mult );
            if( dSize < 1.0 || dSize >= (double)numeric_limits<ptrdiff_t>::max() )
                return -1;
            return (ptrdiff_t)dSize;
        }
    return -1;
}

string blockSizeStr( size_t blockSize )
{
    ostringstream oss;
    double dSize = (double)(ptrdiff_t)blockSize;
    if( dSize < 1024.0 )
        oss << blockSize;
    else if( dSize < 1024.0 * 1024.0 )
        oss << dSize / 1024.0 << "kB";
    else if( blockSize < (size_t)1024 * 1024 * 1024 )
        oss << dSize / (1024.0 * 1024.0) << "MB";
    else
        oss << (double)blockSize / (1024.0 * 1024.0 * 1024.0) << "GB";
    return oss.str();
}
There's no DDR-whatever memory that can handle flushing a single cacheline below 5ns.

Is there any way to reduce (sum) 100M float elements of an array in CUDA?

I'm new to CUDA, so please bear with questions that may have trivial solutions.
I am trying to find the sum of 100M float elements of an array. From the following code you can see that I've used a reduction kernel and Thrust. I suppose the kernel stores the sum in g_odata[0]. Since all the elements in g_idata are the same, the result should be n*g_idata[1]. But you can clearly see that the results are incorrect for both of them.
What am I getting wrong? How can I achieve my target?
Every reduction kernel I found is for an integer datatype, e.g. the highly recommended Optimizing Parallel Reduction in CUDA. Is there any specific reason for that?
Here is my code:
#include <iostream>
#include <math.h>
#include <stdlib.h>
#include <iomanip>
#include <thrust/reduce.h>
#include <thrust/execution_policy.h>

using namespace std;

__global__ void reduce(float *g_idata, float *g_odata) {
    __shared__ float sdata[256];
    int i = blockIdx.x*blockDim.x + threadIdx.x;
    sdata[threadIdx.x] = g_idata[i];
    __syncthreads();
    for (int s=1; s < blockDim.x; s *=2)
    {
        int index = 2 * s * threadIdx.x;
        if (index < blockDim.x)
        {
            sdata[index] += sdata[index + s];
        }
        __syncthreads();
    }
    if (threadIdx.x == 0)
        atomicAdd(g_odata, sdata[0]);
}

int main(void){
    unsigned int n = pow(10,8);
    float *g_idata, *g_odata;
    cudaMallocManaged(&g_idata, n*sizeof(float));
    cudaMallocManaged(&g_odata, n*sizeof(float));
    int blockSize = 32;
    int numBlocks = (n + blockSize - 1) / blockSize;
    for(int i=0;i<n;i++){ g_idata[i]=6.1; g_odata[i]=0; }
    reduce<<<numBlocks, blockSize>>>(g_idata, g_odata);
    cudaDeviceSynchronize();
    cout << g_odata[0] << "\t" << (float)n*g_idata[1] << "\t" << (float)n*g_idata[1]-g_odata[0] << endl;
    g_odata[0] = thrust::reduce(thrust::device, g_idata, g_idata+n);
    cout << g_odata[0] << "\t" << (float)n*g_idata[1] << "\t" << (float)n*g_idata[1]-g_odata[0] << endl;
    cudaFree(g_idata);
    cudaFree(g_odata);
}
Result:
6.0129e+08 6.1e+08 8.7097e+06
6.09986e+08 6.1e+08 13824
I am using CUDA 10. nvcc --version :
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2018 NVIDIA Corporation
Built on Sat_Aug_25_21:08:01_CDT_2018
Cuda compilation tools, release 10.0, V10.0.130
Details of my GPU DeviceQuery:
./deviceQuery Starting...
CUDA Device Query (Runtime API) version (CUDART static linking)
Detected 1 CUDA Capable device(s)
Device 0: "GeForce GTX 750"
CUDA Driver Version / Runtime Version 10.0 / 10.0
CUDA Capability Major/Minor version number: 5.0
Total amount of global memory: 1999 MBytes (2096168960 bytes)
( 4) Multiprocessors, (128) CUDA Cores/MP: 512 CUDA Cores
GPU Max Clock rate: 1110 MHz (1.11 GHz)
Memory Clock rate: 2505 Mhz
Memory Bus Width: 128-bit
L2 Cache Size: 2097152 bytes
Maximum Texture Dimension Size (x,y,z) 1D=(65536), 2D=(65536, 65536), 3D=(4096, 4096, 4096)
Maximum Layered 1D Texture Size, (num) layers 1D=(16384), 2048 layers
Maximum Layered 2D Texture Size, (num) layers 2D=(16384, 16384), 2048 layers
Total amount of constant memory: 65536 bytes
Total amount of shared memory per block: 49152 bytes
Total number of registers available per block: 65536
Warp size: 32
Maximum number of threads per multiprocessor: 2048
Maximum number of threads per block: 1024
Max dimension size of a thread block (x,y,z): (1024, 1024, 64)
Max dimension size of a grid size (x,y,z): (2147483647, 65535, 65535)
Maximum memory pitch: 2147483647 bytes
Texture alignment: 512 bytes
Concurrent copy and kernel execution: Yes with 1 copy engine(s)
Run time limit on kernels: Yes
Integrated GPU sharing Host Memory: No
Support host page-locked memory mapping: Yes
Alignment requirement for Surfaces: Yes
Device has ECC support: Disabled
Device supports Unified Addressing (UVA): Yes
Device supports Compute Preemption: No
Supports Cooperative Kernel Launch: No
Supports MultiDevice Co-op Kernel Launch: No
Device PCI Domain ID / Bus ID / location ID: 0 / 1 / 0
Compute Mode:
< Default (multiple host threads can use ::cudaSetDevice() with device simultaneously) >
deviceQuery, CUDA Driver = CUDART, CUDA Driver Version = 10.0, CUDA Runtime Version = 10.0, NumDevs = 1
Result = PASS
Thanks in advance.
I think the reason you are confused about the results here is a lack of understanding of floating point arithmetic. This whitepaper covers the topic pretty well. As a simple concept to grasp, if I have numbers represented as float quantities, and I attempt to do this:
100000000 + 1
the result will be: 100000000 (write some code and try it yourself)
This isn't unique to GPUs, CPU code will behave the same way (try it).
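For example, a minimal host-side demonstration (plain C++; no GPU involved):
#include <iostream>

int main()
{
    float big = 100000000.0f;     // 1e8 is exactly representable as float (390625 * 2^8)
    float sum = big + 1.0f;       // at this magnitude adjacent floats are 8 apart,
                                  // so the +1 is lost to rounding
    std::cout << (sum == big) << std::endl;   // prints 1: the addition had no effect
    return 0;
}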
So for very large reductions, we get to the point (often) where we are adding very large numbers to much much smaller numbers, and the results aren't accurate from a "pure math" point of view.
That is fundamentally the problem here. In your CPU code, when you decide that the correct result should be 6.1*n, that kind of multiplication problem is not subject to the limits of adding large numbers to small ones that I just described, so you get an "accurate" result from that.
One of the ways to prove this or work around it is to use double representation instead of float. This doesn't completely eliminate the problem, but it pushes the resolution to the point where it can do a much better job of representing the range of numbers here.
The following code primarily has that change. You can change the typedef to compare the behavior between float and double.
There are a few other changes in the code. None of them are the cause of the discrepancy you witnessed.
$ cat t18.cu
#include <iostream>
#include <math.h>
#include <stdlib.h>
#include <iomanip>
#include <thrust/reduce.h>
#include <thrust/execution_policy.h>

#define BLOCK_SIZE 32
typedef double ft;

using namespace std;

__device__ double my_atomicAdd(double* address, double val)
{
    unsigned long long int* address_as_ull = (unsigned long long int*)address;
    unsigned long long int old = *address_as_ull, assumed;
    do {
        assumed = old;
        old = atomicCAS(address_as_ull, assumed,
                        __double_as_longlong(val + __longlong_as_double(assumed)));
        // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
    } while (assumed != old);
    return __longlong_as_double(old);
}

__device__ float my_atomicAdd(float* addr, float val){
    return atomicAdd(addr, val);
}

__global__ void reduce(ft *g_idata, ft *g_odata, int n) {
    __shared__ ft sdata[BLOCK_SIZE];
    int i = blockIdx.x*blockDim.x + threadIdx.x;
    sdata[threadIdx.x] = (i < n) ? g_idata[i] : 0;
    __syncthreads();
    for (int s=1; s < blockDim.x; s *=2)
    {
        int index = 2 * s * threadIdx.x;
        if ((index + s) < blockDim.x)
        {
            sdata[index] += sdata[index + s];
        }
        __syncthreads();
    }
    if (threadIdx.x == 0)
        my_atomicAdd(g_odata, sdata[0]);
}

int main(void){
    unsigned int n = pow(10,8);
    ft *g_idata, *g_odata;
    cudaMallocManaged(&g_idata, n*sizeof(ft));
    cudaMallocManaged(&g_odata, sizeof(ft));
    cout << "n = " << n << endl;
    int blockSize = BLOCK_SIZE;
    int numBlocks = (n + blockSize - 1) / blockSize;
    g_odata[0] = 0;
    for(int i=0;i<n;i++){ g_idata[i]=6.1; }
    reduce<<<numBlocks, blockSize>>>(g_idata, g_odata, n);
    cudaDeviceSynchronize();
    cout << g_odata[0] << "\t" << (float)n*g_idata[1] << "\t" << (float)n*g_idata[1]-g_odata[0] << endl;
    g_odata[0] = thrust::reduce(thrust::device, g_idata, g_idata+n);
    cout << g_odata[0] << "\t" << (float)n*g_idata[1] << "\t" << (float)n*g_idata[1]-g_odata[0] << endl;
    cudaFree(g_idata);
    cudaFree(g_odata);
}
$ nvcc -o t18 t18.cu
$ cuda-memcheck ./t18
========= CUDA-MEMCHECK
n = 100000000
6.1e+08 6.1e+08 0.00527966
6.1e+08 6.1e+08 5.13792e-05
========= ERROR SUMMARY: 0 errors
$

AVX vs. SSE: expect to see a larger speedup

I expected AVX to be about 1.5x faster than SSE. All 3 arrays (3 arrays * 16384 elements *4 bytes/element = 196608 bytes) should fit in L2 cache (256KB) on an Intel Core CPU (Broadwell).
Are there any special compiler directives or flags that I should be using?
Compiler Version
$ clang --version
Apple LLVM version 9.0.0 (clang-900.0.38)
Target: x86_64-apple-darwin16.7.0
Thread model: posix
InstalledDir: /Applications/Xcode.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin
Compile line
$ make avx
clang -O3 -fno-tree-vectorize -msse -msse2 -msse3 -msse4.1 -mavx -mavx2 avx.c ; ./a.out 123
n: 123
AVX Time taken: 0 seconds 177 milliseconds
vector+vector:begin int: 1 5 127 0
SSE Time taken: 0 seconds 195 milliseconds
vector+vector:begin int: 1 5 127 0
avx.c
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <x86intrin.h>
#include <time.h>
#ifndef __cplusplus
    #include <stdalign.h>   // C11 defines _Alignas(). This header defines alignas()
#endif

#define REPS 50000
#define AR 16384

// add int vectors via AVX
__attribute__((noinline))
void add_iv_avx(__m256i *restrict a, __m256i *restrict b, __m256i *restrict out, int N) {
    __m256i *x = __builtin_assume_aligned(a, 32);
    __m256i *y = __builtin_assume_aligned(b, 32);
    __m256i *z = __builtin_assume_aligned(out, 32);
    const int loops = N / 8; // 8 is number of int32 in __m256i
    for(int i=0; i < loops; i++) {
        _mm256_store_si256(&z[i], _mm256_add_epi32(x[i], y[i]));
    }
}

// add int vectors via SSE; https://en.wikipedia.org/wiki/Restrict
__attribute__((noinline))
void add_iv_sse(__m128i *restrict a, __m128i *restrict b, __m128i *restrict out, int N) {
    __m128i *x = __builtin_assume_aligned(a, 16);
    __m128i *y = __builtin_assume_aligned(b, 16);
    __m128i *z = __builtin_assume_aligned(out, 16);
    const int loops = N / sizeof(int);
    for(int i=0; i < loops; i++) {
        //out[i]= _mm_add_epi32(a[i], b[i]); // this also works
        _mm_storeu_si128(&z[i], _mm_add_epi32(x[i], y[i]));
    }
}

// printing
void p128_as_int(__m128i in) {
    alignas(16) uint32_t v[4];
    _mm_store_si128((__m128i*)v, in);
    printf("int: %i %i %i %i\n", v[0], v[1], v[2], v[3]);
}

__attribute__((noinline))
void debug_print(int *h) {
    printf("vector+vector:begin ");
    p128_as_int(* (__m128i*) &h[0] );
}

int main(int argc, char *argv[]) {
    int n = atoi(argv[1]);
    printf("n: %d\n", n);
    int *x, *y, *z;
    if (posix_memalign((void**)&x, 32, 16384*sizeof(int))) { free(x); return EXIT_FAILURE; }
    if (posix_memalign((void**)&y, 32, 16384*sizeof(int))) { free(y); return EXIT_FAILURE; }
    if (posix_memalign((void**)&z, 32, 16384*sizeof(int))) { free(z); return EXIT_FAILURE; }
    x[0]=0; x[1]=2; x[2]=4;
    y[0]=1; y[1]=3; y[2]=n;

    // touch each 4K page in x,y,z to avoid copy-on-write optimizations
    for (int i=512; i<AR; i+= 512) { x[i]=1; y[i]=1; z[i]=1; }

    // warmup
    for(int i=0; i<REPS; ++i) { add_iv_avx((__m256i*)x, (__m256i*)y, (__m256i*)z, AR); }
    // AVX
    clock_t start = clock();
    for(int i=0; i<REPS; ++i) { add_iv_avx((__m256i*)x, (__m256i*)y, (__m256i*)z, AR); }
    int msec = (clock()-start) * 1000 / CLOCKS_PER_SEC;
    printf(" AVX Time taken: %d seconds %d milliseconds\n", msec/1000, msec%1000);
    debug_print(z);

    // warmup
    for(int i=0; i<REPS; ++i) { add_iv_sse((__m128i*)x, (__m128i*)y, (__m128i*)z, AR); }
    // SSE
    start = clock();
    for(int i=0; i<REPS; ++i) { add_iv_sse((__m128i*)x, (__m128i*)y, (__m128i*)z, AR); }
    msec = (clock()-start) * 1000 / CLOCKS_PER_SEC;
    printf("\n SSE Time taken: %d seconds %d milliseconds\n", msec/1000, msec%1000);
    debug_print(z);
    return EXIT_SUCCESS;
}
The problem is that your data doesn't fit in the L1 cache. The L1 bandwidth of Broadwell is much larger than the L2 bandwidth. The L1 bandwidth is large enough to load two 32-byte vectors every CPU cycle, so a better AVX vs. SSE speedup might be expected if your data set were much smaller. However, note that the combined L1 read/write bandwidth is less than 2*32(r)+32(w)=96 bytes per cycle. In practice 75 bytes per cycle is possible, see here.
The second graph on this page shows that the L2 bandwidth is indeed much smaller: at Test_block_size=128KB (=32KB per core) the bandwidth is 900GB/s, while at Test_block_size=1MB (=256KB per core) the bandwidth is only 300GB/s. (Note that the Haswell 4770k has more or less the same L1 and L2 cache architecture as Broadwell.)
Try reducing AR to 2000 and increasing REPS to 1000000 and see what happens to the SSE vs. AVX speedup.
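For example (a sketch of just the changed constants, assuming the rest of avx.c stays the same): with AR = 2000, the three int arrays occupy 3 * 2000 * 4 = 24,000 bytes, which fits comfortably in a 32 KB L1d.
#define REPS 1000000   // more repetitions so the shorter inner loop is still measurable
#define AR   2000      // 3 arrays * 2000 ints * 4 bytes = 24 kB, fits in the 32 kB L1d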

Resources