Nvprof reported about 200 million shared_ld_bank_conflict events and some shared_st_bank_conflict events in my SGEMM kernel. I tried the padding trick __shared__ float smem[SIZE + OFFSET];, which reduced the store bank conflicts to zero, but the load bank conflicts are still there. I don't know how to improve it further.
__global__ void sgemm(
    const float* __restrict__ A,
    const float* __restrict__ B,
    float* __restrict__ C,
    int M, int N, int K
){
    int tid = threadIdx.x;
    int gStartx = blockIdx.x * 128;
    int gStarty = blockIdx.y * 128;

    int dx = tid % 8;
    int dy = tid / 8;
    int vx = tid % 16;
    int vy = tid / 16;

    __shared__ volatile float aSM[8][128+4];
    __shared__ volatile float bSM[8][128+4];
    float aBuffer1[4];
    float bBuffer1[4];
    float aBuffer2[4];
    float bBuffer2[4];

    float cCache[8][8];
    #pragma unroll
    for (int i=0; i<8; i++)
        #pragma unroll
        for (int j=0; j<8; j++)
            cCache[i][j] = 0.f;

    // load first two tiles
    #pragma unroll
    for (int i=0; i<4; i++){
        aBuffer1[i] = A[(gStarty + dy + i*32)*K + (dx)];
        bBuffer1[i] = B[(gStartx + dy + i*32)*K + (dx)];
    }

    int nIt = (K + 8 - 1) / 8;
    #pragma unroll
    for (int itr=0; itr<nIt; itr++){
        int gStartk = itr * 8;
        int is_odd = itr & 1;
        if (is_odd == 0){
            #pragma unroll
            for (int i=0; i<4; i++){
                if (itr != (nIt - 1)){
                    // prefetch next tiles
                    aBuffer2[i] = A[(gStarty + i*32 + dy)*K + (gStartk + 8 + dx)];
                    bBuffer2[i] = B[(gStartx + i*32 + dy)*K + (gStartk + 8 + dx)];
                }
                // move current tiles to SMEM
                aSM[dx][dy+i*32] = aBuffer1[i];
                bSM[dx][dy+i*32] = bBuffer1[i];
            }
        } else {
            #pragma unroll
            for (int i=0; i<4; i++){
                if (itr != (nIt - 1)){
                    // prefetch next tiles to the other buffer
                    aBuffer1[i] = A[(gStarty + i*32 + dy)*K + (gStartk + 8 + dx)];
                    bBuffer1[i] = B[(gStartx + i*32 + dy)*K + (gStartk + 8 + dx)];
                }
                aSM[dx][dy+i*32] = aBuffer2[i];
                bSM[dx][dy+i*32] = bBuffer2[i];
            }
        }
        __syncthreads();

        float aCache[8][4];
        #pragma unroll
        for (int p=0; p<2; p++){
            #pragma unroll
            for (int ki=0; ki<8; ki++){
                #pragma unroll
                for (int mi=0; mi<4; mi++){
                    aCache[ki][mi] = aSM[ki][8*vy + 4*p + mi];
                }
            }
            #pragma unroll
            for (int ki=0; ki<8; ki++){
                #pragma unroll
                for (int ni=0; ni<8; ni++){
                    float b = bSM[ki][8*vx + ni];
                    #pragma unroll
                    for (int mi=0; mi<4; mi++){
                        float a = aCache[ki][mi];
                        cCache[mi + 4*p][ni] = fma(a, b, cCache[mi + 4*p][ni]);
                    }
                }
            }
        }
        __syncthreads();
    }

    #pragma unroll
    for (int i=0; i<8; i++){
        for (int j=0; j<8; j++){
            C[(gStarty + vy*8 + i)*N + (gStartx + vx*8 + j)] = cCache[i][j];
        }
    }
}
A (2048x2048) is row major, B (2048x2048) is column major, each block has 256 threads, each block calculates a 128x128 portion of C, and each thread calculates 8x8x8 (an 8x8 tile of C, so 256 threads x 8x8 = 128x128 outputs per block). The GPU is a Tesla P100.
OK, I found a solution: when storing to bSM, insert one padding word after every 32 words in the second dimension:
//bSM[dx][dy+i*32] = bBuffer1[i];
bSM[dx][dy+i*33] = bBuffer1[i]; // we're skipping columns 32, 65, 98, 131
When reading bSM[i][j], read it like this: bSM[i][j/32 + j]:
//float b = bSM[ki][8*vx + ni];
float b = bSM[ki][(8*vx)/32 + 8*vx + ni];
// (8*vx + ni)/32 is the same as (8*vx)/32, since ni is always less than 8
Now it's giving me 55% of the performance of cuBLAS gemm on a Tesla P4.
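To see why the remapping helps, the bank pattern of the bSM load can be modeled on the host. Below is a minimal sketch in plain C, assuming the usual 32 banks of 4-byte words; the helper name col_padded is mine. Lanes that share a vx value read the same word (which is broadcast for free), so only the 16 distinct 8*vx columns matter, and the row offset ki*(128+4) shifts all lanes by the same amount:

#include <stdio.h>

#define BANKS 32  /* 4-byte-wide shared memory banks on Pascal */

/* the remapping from above: skip one word after every 32 columns */
static int col_padded(int j) { return j / 32 + j; }

int main(void) {
    int before[BANKS] = {0}, after[BANKS] = {0};
    /* the 16 distinct columns 8*vx read from one row of bSM (ni fixed) */
    for (int vx = 0; vx < 16; vx++) {
        before[(8 * vx) % BANKS]++;
        after[col_padded(8 * vx) % BANKS]++;
    }
    for (int b = 0; b < BANKS; b++)
        if (before[b] || after[b])
            printf("bank %2d: %d -> %d\n", b, before[b], after[b]);
    /* before: banks 0, 8, 16, 24 are hit 4 times each (4-way conflict);
       after:  16 different banks are hit once each (conflict-free)     */
    return 0;
}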
The code below is based on Tim Mattson's OpenMP video tutorials on YouTube.
I would like to find out the number of threads I actually receive when calling parallel (it is possible that I requested 256 threads but only ended up with 8).
The usual omp_get_num_threads() does not work with the code below (if I try to create a code block inside, I get an "expected a for loop following OpenMP 'directive' directive" error):
void pi_with_omp() {
    int i;
    double x, pi, sum = 0.0;
    double start_time, run_time;

    step = 1.0 / (double)num_steps;
    omp_set_num_threads(NUM_THREADS);

    start_time = omp_get_wtime();
    #pragma omp parallel for reduction(+:sum) private(x)
    for (i = 0; i < num_steps; i++) {
        x = (i + 0.5) * step;
        sum += 4.0 / (1.0 + x * x);
    }
    pi = step * sum;
    run_time = omp_get_wtime() - start_time;
    printf("\n pi with %ld steps is %lf in %lf seconds", num_steps, pi, run_time);
}
The only way I have found is to rewrite the above pragma and dissect it into two, like the following:
int nthreads;
#pragma omp parallel
{
    double x;
    int id, nthrds;

    id = omp_get_thread_num();
    nthrds = omp_get_num_threads();
    if (id == 0) nthreads = nthrds;

    #pragma omp for reduction(+:sum)
    for (i = 0; i < num_steps; i++) {
        x = (i + 0.5) * step;
        sum = sum + 4.0 / (1.0 + x * x);
    }
}
This does the job, but it is not pretty. Has anyone got a better solution?
You can simplify your code, but you will still need to separate the parallel and the for.
int nthreads;
#pragma omp parallel
{
    #pragma omp single nowait
    nthreads = omp_get_num_threads();

    #pragma omp for reduction(+:sum)
    for (i = 0; i < num_steps; i++) {
        double x = (i + 0.5) * step;
        sum = sum + 4.0 / (1.0 + x * x);
    }
}
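For reference, here is a self-contained sketch of that approach folded back into the pi example (the num_steps and NUM_THREADS values are my assumptions; everything else follows the question's setup):

#include <stdio.h>
#include <omp.h>

#define NUM_THREADS 256           /* requested; the runtime may grant fewer */
static long num_steps = 100000000;
static double step;

int main(void) {
    double sum = 0.0, pi;
    int i, nthreads = 1;

    step = 1.0 / (double)num_steps;
    omp_set_num_threads(NUM_THREADS);

    #pragma omp parallel
    {
        /* one thread records the size of the team actually created */
        #pragma omp single nowait
        nthreads = omp_get_num_threads();

        #pragma omp for reduction(+:sum)
        for (i = 0; i < num_steps; i++) {
            double x = (i + 0.5) * step;
            sum += 4.0 / (1.0 + x * x);
        }
    }
    pi = step * sum;
    printf("pi = %f computed with %d threads\n", pi, nthreads);
    return 0;
}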
I want to compute the per-channel average of an image (3 channels of interest plus 1 alpha channel, ignored here) using SSE2 intrinsics. I tried this:
__m128 average = _mm_setzero_ps();
#pragma omp parallel for reduction(+:average)
for (size_t k = 0; k < roi_out->height * roi_out->width * ch; k += ch)
{
    float *in = ((float *)temp) + k;
    average += _mm_load_ps(in);
}
But I get this error with GCC: "user-defined reduction not found for average".
Is this possible with SSE2? What's wrong?
Edit
This works:
float sum[4] = { 0.0f };
#pragma omp parallel for simd reduction(+:sum[:4])
for (size_t k = 0; k < roi_out->height * roi_out->width * ch; k += ch)
{
    float *in = ((float *)temp) + k;
    for (int i = 0; i < ch; ++i) sum[i] += in[i];
}
const __m128 average = _mm_load_ps(sum) / ((float)roi_out->height * roi_out->width);
You can define a custom reduction like this:
#pragma omp declare reduction \
    (addps : __m128 : omp_out += omp_in) \
    initializer(omp_priv = _mm_setzero_ps())
And then use it like this:
#pragma omp parallel for reduction(addps : average)
for (size_t k = 0; k < size * ch; k += ch)
{
    average += _mm_loadu_ps(data + k);
}
Most importantly, OpenMP needs to know how to get a neutral element (here _mm_setzero_ps()) for your reduction.
Full working example: https://godbolt.org/z/Fpqttc
Interesting link: http://pages.tacc.utexas.edu/~eijkhout/pcse/html/omp-reduction.html#User-definedreductions
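Putting the pieces together, a minimal self-contained sketch with synthetic data (every channel of every pixel is 1.0f, so each lane of the result should equal the pixel count); it uses _mm_add_ps in the combiner instead of += so it does not rely on GCC's vector-extension operators. Compile with something like gcc -O2 -fopenmp:

#include <stdio.h>
#include <xmmintrin.h>

#pragma omp declare reduction \
    (addps : __m128 : omp_out = _mm_add_ps(omp_out, omp_in)) \
    initializer(omp_priv = _mm_setzero_ps())

int main(void) {
    enum { NPIX = 100000, CH = 4 };     /* RGBA pixels */
    static float data[NPIX * CH];
    for (int k = 0; k < NPIX * CH; k++)
        data[k] = 1.0f;                 /* every channel of every pixel = 1 */

    __m128 sum = _mm_setzero_ps();
    #pragma omp parallel for reduction(addps : sum)
    for (int k = 0; k < NPIX * CH; k += CH)
        sum = _mm_add_ps(sum, _mm_loadu_ps(data + k));

    float out[4];
    _mm_storeu_ps(out, sum);
    printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);  /* 100000 x4 */
    return 0;
}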
I have tried to implement a C QuickSelect algorithm as described in this post (3 way quicksort (C implementation)).
However, all I get is performance 5 to 10 times worse than the default qsort (even with an initial shuffle).
I tried to dig into the original qsort source code as provided here (https://github.com/lattera/glibc/blob/master/stdlib/qsort.c), but it's too complex.
Does anybody have a simpler, better algorithm?
Any idea is welcome.
Thanks.
NB: my original problem is to move the Kth smallest values of an array to the first K indices, so I planned to call quickselect K times.
EDIT 1: Here is the Cython code, as copied and adapted from the link above:
cdef void qswap(void* a, void* b, const size_t size) nogil:
    cdef char temp[size]  # C99 VLA in the original C code; use malloc otherwise
    # char serves as the type for "generic" byte arrays
    memcpy(temp, b, size)
    memcpy(b, a, size)
    memcpy(a, temp, size)

cdef void qshuffle(void* base, size_t num, size_t size) nogil:
    # Fisher-Yates shuffle
    cdef int i, j
    for i in range(num - 1, 0, -1):
        j = c_rand() % (i + 1)  # random index in [0, i]
        qswap(base + i*size, base + j*size, size)

cdef void partition3(void* base,
                     size_t *low, size_t *high, size_t size,
                     QComparator compar) nogil:
    # 3-way (Dutch national flag) partition around base[low] as the pivot
    cdef void *ptr = base
    cdef size_t lt = low[0]
    cdef size_t gt = high[0]  # lt is the pivot index
    cdef size_t i = lt + 1    # (+1!) we don't compare the pivot with itself
    cdef int c = 0
    while i <= gt:
        c = compar(ptr + i * size, ptr + lt * size)
        if c < 0:    # base[i] < base[lt] => swap(i++, lt++)
            qswap(ptr + lt * size, ptr + i * size, size)
            i += 1
            lt += 1
        elif c > 0:  # base[i] > base[lt] => swap(i, gt--)
            qswap(ptr + i * size, ptr + gt * size, size)
            gt -= 1
        else:        # base[i] == base[lt]
            i += 1
    # base := [<<<<<lt=====gt>>>>>>]
    low[0] = lt
    high[0] = gt

cdef void qselectk3(void* base, size_t lo, size_t hi,
                    size_t size, size_t k,
                    QComparator compar) nogil:
    cdef size_t low = lo
    cdef size_t high = hi
    partition3(base, &low, &high, size, compar)
    if (k - 1) < low:  # k lies in the less-than-pivot partition
        high = low - 1
        low = lo
    elif (k - 1) >= low and (k - 1) <= high:  # k lies in the equal-to-pivot partition
        qswap(base, base + size*low, size)
        return
    else:  # k > high => k lies in the greater-than-pivot partition
        low = high + 1
        high = hi
    qselectk3(base, low, high, size, k, compar)

"""
A selection algorithm to find the nth smallest elements in an unordered list;
these elements ARE placed at the first n positions of the input array.
"""
cdef void qselect(void* base, size_t num, size_t size,
                  size_t n,
                  QComparator compar) nogil:
    cdef int k
    qshuffle(base, num, size)
    for k in range(n):
        qselectk3(base + size*k, 0, num - k - 1, size, 1, compar)
I use Python's timeit to measure the performance of both methods, pyselect (with N=50) and pysort, like this:
def testPySelect():
    A = np.random.randint(16, size=(10000), dtype=np.int32)
    pyselect(A, 50)
timeit.timeit(testPySelect, number=1)

def testPySort():
    A = np.random.randint(16, size=(10000), dtype=np.int32)
    pysort(A)
timeit.timeit(testPySort, number=1)
The answer by @chqrlie is the right and final answer, yet to complete the post, I am also posting the Cython version along with the benchmark results.
In short, the proposed solution is 2 times faster than qsort on long vectors!
cdef void qswap2(void *aptr, void *bptr, size_t size) nogil:
    cdef uint8_t *ac = <uint8_t*>aptr
    cdef uint8_t *bc = <uint8_t*>bptr
    cdef uint8_t t
    while size > 0:
        t = ac[0]; ac[0] = bc[0]; bc[0] = t
        ac += 1; bc += 1; size -= 1

cdef struct qselect2_stack:
    uint8_t *base
    uint8_t *last

cdef void qselect2(void *base, size_t nmemb, size_t size,
                   size_t k, QComparator compar) nogil:
    cdef qselect2_stack stack[64]
    cdef qselect2_stack *sp = &stack[0]
    cdef uint8_t *lb
    cdef uint8_t *ub
    cdef uint8_t *p
    cdef uint8_t *i
    cdef uint8_t *j
    cdef uint8_t *top

    if nmemb < 2 or size <= 0:
        return

    top = <uint8_t *>base
    if k < nmemb:
        top += k*size
    else:
        top += nmemb*size

    sp.base = <uint8_t *>base
    sp.last = <uint8_t *>base + (nmemb - 1) * size
    sp += 1

    cdef size_t offset
    while sp > stack:
        sp -= 1
        lb = sp.base
        ub = sp.last
        while lb < ub and lb < top:
            # select middle element as pivot and exchange with 1st element
            offset = (ub - lb) >> 1
            p = lb + offset - offset % size
            qswap2(lb, p, size)
            # partition into two segments
            i = lb + size
            j = ub
            while 1:
                while i < j and compar(lb, i) > 0:
                    i += size
                while j >= i and compar(j, lb) > 0:
                    j -= size
                if i >= j:
                    break
                qswap2(i, j, size)
                i += size
                j -= size
            # move pivot where it belongs
            qswap2(lb, j, size)
            # keep processing smallest segment, and stack largest
            if j - lb <= ub - j:
                sp.base = j + size
                sp.last = ub
                sp += 1
                ub = j - size
            else:
                sp.base = lb
                sp.last = j - size
                sp += 1
                lb = j + size

cdef int int_comp(void* a, void* b) nogil:
    cdef int ai = (<int*>a)[0]
    cdef int bi = (<int*>b)[0]
    return (ai > bi) - (ai < bi)

def pyselect2(numpy.ndarray[int, ndim=1, mode="c"] na, int n):
    cdef int* a = <int*>&na[0]
    qselect2(a, len(na), sizeof(int), n, int_comp)
Here are the benchmark results (1,000 tests):
# of elements        K    qsort (s)   qselect2 (s)
        1,000       50       0.1261         0.0895
        1,000      100       0.1261         0.0910
       10,000       50       0.8113         0.4157
       10,000      100       0.8113         0.4367
       10,000    1,000       0.8113         0.4746
      100,000      100       7.5428         3.8259
      100,000    1,000       7.5428         3.8325
      100,000   10,000       7.5428         4.5727
For those who are curious, this piece of code is a jewel in the field of surface reconstruction using neural networks.
Thanks again to @chqrlie, your code is unique on the Web.
Here is a quick implementation for your purpose: qsort_select is a simple implementation of qsort with automatic pruning of unnecessary ranges.
Without && lb < top, it behaves like a regular qsort, except for pathological cases where more advanced versions have better heuristics. This extra test prevents complete sorting of ranges that lie outside the target range 0 .. k-1. The function selects the k smallest values and sorts them; the rest of the array holds the remaining values in an unspecified order.
#include <stdio.h>
#include <stdint.h>

static void exchange_bytes(uint8_t *ac, uint8_t *bc, size_t size) {
    while (size-- > 0) { uint8_t t = *ac; *ac++ = *bc; *bc++ = t; }
}

/* select and sort the k smallest elements from an array */
void qsort_select(void *base, size_t nmemb, size_t size,
                  int (*compar)(const void *a, const void *b), size_t k)
{
    struct { uint8_t *base, *last; } stack[64], *sp = stack;
    uint8_t *lb, *ub, *p, *i, *j, *top;

    if (nmemb < 2 || size <= 0)
        return;
    top = (uint8_t *)base + (k < nmemb ? k : nmemb) * size;
    sp->base = (uint8_t *)base;
    sp->last = (uint8_t *)base + (nmemb - 1) * size;
    sp++;
    while (sp > stack) {
        --sp;
        lb = sp->base;
        ub = sp->last;
        while (lb < ub && lb < top) {
            /* select middle element as pivot and exchange with 1st element */
            size_t offset = (ub - lb) >> 1;
            p = lb + offset - offset % size;
            exchange_bytes(lb, p, size);
            /* partition into two segments */
            for (i = lb + size, j = ub;; i += size, j -= size) {
                while (i < j && compar(lb, i) > 0)
                    i += size;
                while (j >= i && compar(j, lb) > 0)
                    j -= size;
                if (i >= j)
                    break;
                exchange_bytes(i, j, size);
            }
            /* move pivot where it belongs */
            exchange_bytes(lb, j, size);
            /* keep processing smallest segment, and stack largest */
            if (j - lb <= ub - j) {
                sp->base = j + size;
                sp->last = ub;
                sp++;
                ub = j - size;
            } else {
                sp->base = lb;
                sp->last = j - size;
                sp++;
                lb = j + size;
            }
        }
    }
}

int int_cmp(const void *a, const void *b) {
    int aa = *(const int *)a;
    int bb = *(const int *)b;
    return (aa > bb) - (aa < bb);
}

#define ARRAY_SIZE 50000
int array[ARRAY_SIZE];

int main(void) {
    int i;
    for (i = 0; i < ARRAY_SIZE; i++) {
        array[i] = ARRAY_SIZE - i;
    }
    qsort_select(array, ARRAY_SIZE, sizeof(*array), int_cmp, 50);
    for (i = 0; i < 50; i++) {
        printf("%d%c", array[i], i + 1 == 50 ? '\n' : ',');
    }
    return 0;
}
Hello everyone, I wanted to calculate pi with OpenMP, but something is wrong. Could you please tell me which part I did wrong?
As you can see below, the time is supposed to decrease as the number of threads goes up, but it doesn't.
#include <stdio.h>
#include <omp.h>

#define MAX_THREADS 4

static long num_steps = 100000000;
double step;

int main()
{
    int i, j;
    double pi, full_sum = 0.0;
    double start_time, run_time;
    double sum[MAX_THREADS];

    step = 1.0 / (double)num_steps;

    for (j = 1; j <= MAX_THREADS; j++){
        omp_set_num_threads(j);
        full_sum = 0.0;
        start_time = omp_get_wtime();
        #pragma omp parallel private(i)
        {
            int id = omp_get_thread_num();
            int numthreads = omp_get_num_threads();
            double x;
            double partial_sum = 0;

            #pragma omp single
            printf(" num_threads = %d", numthreads);

            for (i = id; i < num_steps; i += numthreads){
                x = (i + 0.5) * step;
                partial_sum += 4.0 / (1.0 + x * x);
            }
            #pragma omp critical
            full_sum += partial_sum;
        }
        pi = step * full_sum;
        run_time = omp_get_wtime() - start_time;
        printf("\n pi is %f in %f seconds %d threads \n", pi, run_time, j);
    }
}