OpenMP: actual number of threads

The code below is based on Tim Mattson's OpenMP video tutorials on YouTube.
I would like to find out the number of threads I actually receive when entering a parallel region (it is possible that I requested 256 threads but only ended up with 8).
The usual omp_get_num_threads() does not work here: if I try to attach a code block to the combined directive below, I get an "expected a for loop following OpenMP 'directive' directive" error:
void pi_with_omp() {
    int i;
    double x, pi, sum = 0.0;
    double start_time, run_time;

    step = 1.0 / (double)num_steps;
    omp_set_num_threads(NUM_THREADS);

    start_time = omp_get_wtime();
    #pragma omp parallel for reduction(+:sum) private(x)
    for (i = 0; i < num_steps; i++) {
        x = (i + 0.5) * step;
        sum += 4.0 / (1.0 + x * x);
    }
    pi = step * sum;
    run_time = omp_get_wtime() - start_time;

    printf("\n pi with %ld steps is %lf in %lf seconds", num_steps, pi, run_time);
}
The only way I have found is to split the combined pragma above into two, like this:
int nthreads;
#pragma omp parallel
{
    double x;
    int id, nthrds;

    id = omp_get_thread_num();
    nthrds = omp_get_num_threads();
    if (id == 0) nthreads = nthrds;

    #pragma omp for reduction(+:sum)
    for (i = 0; i < num_steps; i++) {
        x = (i + 0.5) * step;
        sum = sum + 4.0 / (1.0 + x * x);
    }
}
This does the job, but it is not pretty. Has anyone got a better solution?

You can simplify your code, but you will still need to separate the parallel and the for.
int nthreads;
#pragma omp parallel
{
    #pragma omp single nowait
    nthreads = omp_get_num_threads();

    #pragma omp for reduction(+:sum)
    for (i = 0; i < num_steps; i++) {
        double x = (i + 0.5) * step;
        sum = sum + 4.0 / (1.0 + x * x);
    }
}
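For reference, here is a sketch of how the whole function from the question might look with that pattern folded in. The values of NUM_THREADS and num_steps are assumptions (the question does not give them), and the small main() is only there to make the sketch self-contained:

#include <stdio.h>
#include <omp.h>

#define NUM_THREADS 256              /* requested; we may get fewer (assumed value) */
static long num_steps = 100000000;   /* assumed; not given in the question          */
double step;

void pi_with_omp(void) {
    double sum = 0.0;
    int nthreads = 1;

    step = 1.0 / (double)num_steps;
    omp_set_num_threads(NUM_THREADS);

    double start_time = omp_get_wtime();
    #pragma omp parallel
    {
        /* exactly one thread records how many threads we actually got */
        #pragma omp single nowait
        nthreads = omp_get_num_threads();

        #pragma omp for reduction(+:sum)
        for (long i = 0; i < num_steps; i++) {
            double x = (i + 0.5) * step;
            sum += 4.0 / (1.0 + x * x);
        }
    }
    double pi = step * sum;
    double run_time = omp_get_wtime() - start_time;

    printf("\n pi with %ld steps is %lf in %lf seconds on %d threads\n",
           num_steps, pi, run_time, nthreads);
}

int main(void) { pi_with_omp(); return 0; }

The implicit barrier at the end of the parallel region guarantees that nthreads has been written before it is read by the printf.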

Related

Reducing Shared Memory Bank Conflicts

Nvprof reports about 200 million shared_ld_bank_conflict events and some shared_st_bank_conflict events in my sgemm kernel. I tried the padding trick __shared__ float smem[SIZE + OFFSET];, which reduced the store bank conflicts to 0, but the load bank conflicts are still there. I don't know how to improve it further.
__global__ void sgemm(
    const float* __restrict__ A,
    const float* __restrict__ B,
    float* __restrict__ C,
    int M, int N, int K
){
    int tid = threadIdx.x;
    int gStartx = blockIdx.x * 128;
    int gStarty = blockIdx.y * 128;

    int dx = tid % 8;
    int dy = tid / 8;
    int vx = tid % 16;
    int vy = tid / 16;

    __shared__ volatile float aSM[8][128+4];
    __shared__ volatile float bSM[8][128+4];
    float aBuffer1[4];
    float bBuffer1[4];
    float aBuffer2[4];
    float bBuffer2[4];

    float cCache[8][8];
    #pragma unroll
    for (int i=0; i<8; i++)
        #pragma unroll
        for (int j=0; j<8; j++)
            cCache[i][j] = 0.f;

    //load first two tiles
    #pragma unroll
    for (int i=0; i<4; i++){
        aBuffer1[i] = A[(gStarty + dy + i*32)*K + (dx)];
        bBuffer1[i] = B[(gStartx + dy + i*32)*K + (dx)];
    }

    int nIt = (K + 8 - 1) / 8;
    #pragma unroll
    for (int itr=0; itr<nIt; itr++){
        int gStartk = itr * 8;
        int is_odd = itr & 1;
        if (is_odd == 0){
            #pragma unroll
            for (int i=0; i<4; i++){
                if (itr != (nIt - 1)){
                    // prefetch next tiles
                    aBuffer2[i] = A[(gStarty + i*32 + dy)*K + (gStartk + 8 + dx)];
                    bBuffer2[i] = B[(gStartx + i*32 + dy)*K + (gStartk + 8 + dx)];
                }
                //move current tiles to SMEM
                aSM[dx][dy+i*32] = aBuffer1[i];
                bSM[dx][dy+i*32] = bBuffer1[i];
            }
        } else {
            #pragma unroll
            for (int i=0; i<4; i++){
                if (itr != (nIt - 1)){
                    //prefetch next tiles to another buffer
                    aBuffer1[i] = A[(gStarty + i*32 + dy)*K + (gStartk + 8 + dx)];
                    bBuffer1[i] = B[(gStartx + i*32 + dy)*K + (gStartk + 8 + dx)];
                }
                aSM[dx][dy+i*32] = aBuffer2[i];
                bSM[dx][dy+i*32] = bBuffer2[i];
            }
        }
        __syncthreads();

        float aCache[8][4];

        #pragma unroll
        for (int p=0; p<2; p++){
            #pragma unroll
            for (int ki=0; ki<8; ki++){
                #pragma unroll
                for (int mi=0; mi<4; mi++){
                    aCache[ki][mi] = aSM[ki][8*vy + 4*p + mi];
                }
            }

            #pragma unroll
            for (int ki=0; ki<8; ki++){
                #pragma unroll
                for (int ni=0; ni<8; ni++){
                    float b = bSM[ki][8*vx + ni];
                    #pragma unroll
                    for (int mi=0; mi<4; mi++){
                        float a = aCache[ki][mi];
                        cCache[mi + 4*p][ni] = fma(a, b, cCache[mi + 4*p][ni]);
                    }
                }
            }
        }
        __syncthreads();
    }

    #pragma unroll
    for (int i=0; i<8; i++){
        for (int j=0; j<8; j++){
            C[(gStarty + vy*8 + i)*N + (gStartx + vx*8 + j)] = cCache[i][j];
        }
    }
}
The A matrix (2048x2048) is row major, B (2048x2048) is column major, each block has 256 threads, each block computes a 128x128 tile of C, and each thread computes an 8x8x8 portion. The GPU is a Tesla P100.
OK, I found a solution: when storing to bSM, insert one padding word after every 32 words in the second dimension:
//bSM[dx][dy+i*32] = bBuffer1[i];
bSM[dx][dy+i*33] = bBuffer1[i]; // we're skipping columns 32, 65, 98 and 131
When reading bSM[i][j], read it as bSM[i][j/32 + j]:
//float b = bSM[ki][8*vx + ni];
float b = bSM[ki][(8*vx) / 32 + 8*vx + ni];
// (8*vx+ni)/32 is the same as (8*vx)/32, since ni is always less than 8
Now it gives me 55% of the performance of cuBLAS GEMM on a Tesla P4.
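For what it's worth, the effect of that extra padding word can be checked on the host. The following standalone C sketch is my own illustration: it assumes 32 banks of 4-byte words (as on Pascal), ignores the row offset ki*(128+4) because it shifts every lane's bank by the same constant, and only considers the 16 distinct columns read per warp (lanes tid and tid+16 read the same address, which is a broadcast and never conflicts):

#include <stdio.h>

int main(void) {
    const int banks = 32;
    const int ni = 0;                 /* the pattern is the same for any ni in 0..7 */
    int hits_before[32] = {0}, hits_after[32] = {0};

    for (int vx = 0; vx < 16; ++vx) {
        int col_before = 8 * vx + ni;                   /* original layout */
        int col_after  = (8 * vx) / 32 + 8 * vx + ni;   /* padded layout   */
        hits_before[col_before % banks]++;
        hits_after[col_after % banks]++;
    }

    int worst_before = 0, worst_after = 0;
    for (int b = 0; b < banks; ++b) {
        if (hits_before[b] > worst_before) worst_before = hits_before[b];
        if (hits_after[b]  > worst_after)  worst_after  = hits_after[b];
    }
    printf("worst-case requests per bank: before=%d after=%d\n",
           worst_before, worst_after);   /* prints: before=4 after=1 */
    return 0;
}

With the original layout the 16 lanes hit only banks 0, 8, 16 and 24 (a 4-way load conflict); with the padding each lane lands in its own bank.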

OpenMP reduction on SSE2 vector

I want to compute the per-channel average of an image (3 channels of interest plus 1 alpha channel we ignore here) using SSE2 intrinsics. I tried this:
__m128 average = _mm_setzero_ps();
#pragma omp parallel for reduction(+:average)
for (size_t k = 0; k < roi_out->height * roi_out->width * ch; k += ch)
{
    float *in = ((float *)temp) + k;
    average += _mm_load_ps(in);
}
But I get this error with GCC: user-defined reduction not found for average.
Is that possible with SSE2? What's wrong?
Edit
This works:
float sum[4] = { 0.0f };

#pragma omp parallel for simd reduction(+:sum[:4])
for (size_t k = 0; k < roi_out->height * roi_out->width * ch; k += ch)
{
    float *in = ((float *)temp) + k;
    for (int i = 0; i < ch; ++i) sum[i] += in[i];
}

const __m128 average = _mm_load_ps(sum) / ((float)roi_out->height * roi_out->width);
You can define a custom (user-defined) reduction like this:
#pragma omp declare reduction \
    (addps : __m128 : omp_out += omp_in) \
    initializer(omp_priv = _mm_setzero_ps())
And then use it like:
#pragma omp parallel for reduction(addps:average)
for (size_t k = 0; k < size * ch; k += ch)
{
    average += _mm_loadu_ps(data + k);
}
I think, most importantly, OpenMP needs to know how to obtain a neutral element (here _mm_setzero_ps()) for your reduction.
Full working example: https://godbolt.org/z/Fpqttc
Interesting link: http://pages.tacc.utexas.edu/~eijkhout/pcse/html/omp-reduction.html#User-definedreductions
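If it helps, here is a minimal self-contained sketch of the same idea. The buffer size and fill values are made up for illustration, and the combiner is spelled out with _mm_add_ps rather than the vector-extension += shown above; compile with something like gcc -O2 -fopenmp -msse2:

#include <stdio.h>
#include <emmintrin.h>

/* tell OpenMP how to combine two partial __m128 sums
   and what the neutral element is */
#pragma omp declare reduction \
    (addps : __m128 : omp_out = _mm_add_ps(omp_out, omp_in)) \
    initializer(omp_priv = _mm_setzero_ps())

int main(void)
{
    enum { N = 1024, CH = 4 };            /* N pixels, 4 floats per pixel (assumed) */
    static float data[N * CH];
    for (int i = 0; i < N * CH; ++i)
        data[i] = (float)(i % 7);         /* arbitrary test values */

    __m128 sum = _mm_setzero_ps();
    #pragma omp parallel for reduction(addps : sum)
    for (int k = 0; k < N * CH; k += CH)
        sum = _mm_add_ps(sum, _mm_loadu_ps(data + k));

    float out[4];
    _mm_storeu_ps(out, sum);
    printf("channel sums: %f %f %f %f\n", out[0], out[1], out[2], out[3]);
    return 0;
}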

Calculating the value of Pi in parallel with OpenMP

Hello everyone, I wanted to calculate the value of pi with OpenMP, but something is wrong. Could you please tell me which part I did wrong?
As you can see below, the run time is supposed to decrease as threads are added, but it doesn't.
#include <stdio.h>
#include <omp.h>

#define MAX_THREADS 4

static long num_steps = 100000000;
double step;

int main()
{
    int i, j;
    double pi, full_sum = 0.0;
    double start_time, run_time;
    double sum[MAX_THREADS];

    step = 1.0 / (double)num_steps;

    for (j = 1; j <= MAX_THREADS; j++){
        omp_set_num_threads(j);
        full_sum = 0.0;
        start_time = omp_get_wtime();

        #pragma omp parallel private(i)
        {
            int id = omp_get_thread_num();
            int numthreads = omp_get_num_threads();
            double x;
            double partial_sum = 0;

            #pragma omp single
            printf(" num_threads = %d", numthreads);

            for (i = id; i < num_steps; i += numthreads){
                x = (i + 0.5)*step;
                partial_sum += +4.0 / (1.0 + x*x);
            }

            #pragma omp critical
            full_sum += partial_sum;
        }

        pi = step * full_sum;
        run_time = omp_get_wtime() - start_time;
        printf("\n pi is %f in %f seconds %d threads \n ", pi, run_time, j);
    }
}

How to distribute teams on GPU using OpenMP?

I'm trying to use my Nvidia GeForce GT 740M for parallel programming with OpenMP and the clang-3.8 compiler.
When the work is done in parallel on the CPU, I get the desired result. However, when it is done on the GPU, my results are almost random numbers.
Therefore, I figured that I'm not distributing my thread teams correctly and that there might be some data races. I guess I have to structure my for loops differently, but I have no idea where the mistake could be.
#include <stdio.h>
#include <stdlib.h>
#include <omp.h>

int main(int argc, char* argv[])
{
    const int n = 100; float a = 3.0f; float b = 2.0f;
    float *x = (float *) malloc(n * sizeof(float));
    float *y = (float *) malloc(n * sizeof(float));
    int i;
    int j;
    int k;
    double start;
    double end;

    start = omp_get_wtime();

    for (k = 0; k < n; k++){
        x[k] = 2.0f;
        y[k] = 3.0f;
    }

    #pragma omp target data map(to:x[0:n]) map(tofrom:y[0:n]) map(to:i) map(to:j)
    {
        #pragma omp target teams
        #pragma omp distribute
        for (i = 0; i < n; i++) {
            #pragma omp parallel for
            for (j = 0; j < n; j++){
                y[j] = a*x[j] + y[j];
            }
        }
    }

    end = omp_get_wtime();
    printf("Work took %f seconds.\n", end - start);

    free(x); free(y);
    return 0;
}
I guess that it might have something to do with the architecture of my GPU.
I'm fairly new to the topic, so thanks for your help :)
Yes, there is a race here: different teams are reading and writing the same elements of the array y. Perhaps you want something like this?
for (i = 0; i < n; i++) {
    #pragma omp target teams distribute parallel for
    for (j = 0; j < n; j++){
        y[j] = a*x[j] + y[j];
    }
}
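Folded back into the full program from the question, that fix might look roughly like this (same n, a and initialisation as above; the unused b is dropped, and whether the target region really runs on the GPU still depends on the clang offloading setup):

#include <stdio.h>
#include <stdlib.h>
#include <omp.h>

int main(void)
{
    const int n = 100;
    const float a = 3.0f;
    float *x = (float *) malloc(n * sizeof(float));
    float *y = (float *) malloc(n * sizeof(float));

    for (int k = 0; k < n; k++) {
        x[k] = 2.0f;
        y[k] = 3.0f;
    }

    double start = omp_get_wtime();
    #pragma omp target data map(to: x[0:n]) map(tofrom: y[0:n])
    {
        for (int i = 0; i < n; i++) {
            /* one kernel per outer iteration; within it, each y[j] is
               written by exactly one thread, so there is no race */
            #pragma omp target teams distribute parallel for
            for (int j = 0; j < n; j++)
                y[j] = a * x[j] + y[j];
        }
    }
    double end = omp_get_wtime();

    /* y[0] should be 3.0 + 100 * (3.0 * 2.0) = 603.0 */
    printf("Work took %f seconds, y[0] = %f\n", end - start, y[0]);

    free(x); free(y);
    return 0;
}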

OpenMP, for loop parallelization and critical section error

I am new to OpenMP and I am using it to implement the Sieve of Eratosthenes. My code is:
int check_eratothenes(int *p, int pn, int n)
{
    int count = 0;
    bool* out = new bool[int(pow(pn, 2))];
    memset(out, 0, pow(pn, 2));

    #pragma omp parallel
    for (int i = 0; i < n; i ++)
    {
        int j = floor((pn + 1) / p[i]) * p[i];
        #pragma omp critical
        while (j <= pow(pn, 2))
        {
            out[j] = 1;
            j += p[i];
        }
    }

    #pragma omp parallel
    for (int i = pn+1; i < pow(pn, 2); i ++)
    {
        #pragma omp critical
        if (out[i] == 0)
        {
            //cout << i << " ";
            count ++;
        }
    }
    return count;
}
But the OpenMP pragmas above are wrong. The code compiles, but when it runs it takes so long to produce a result that I have to press CTRL + C to stop it. I am at a loss as to how to fix it, since there are so many loops and if statements.
Thanks in advance.
