Why doesn't this parallel for loop give me speedup? - openmp

#include <omp.h>
#include <iostream>
#include <vector>

using std::vector;

int main()
{
    for (int i = 1; i < 24; i++){
        float total_time = 0;
        for (int j = 0; j < 5; j++){
            omp_set_num_threads(i);
            vector<int> A(100000000, 2);
            double start_time = omp_get_wtime();
            #pragma omp parallel for
            for (int i = 0; i < 100000000; i++){
                A[i] *= 2;
            }
            double time = omp_get_wtime() - start_time;
            total_time += time;
        }
        total_time /= 5;
        std::cout << "Number of threads: " << i << " Time(ms): " << total_time * 1000 << std::endl;
    }
    return 0;
}
The code above, which just doubles the entries in a vector of integers, is parallelized with OpenMP and timed with varying numbers of threads. On my four-core machine I observe no speedup. Given the simple nature of this loop, I would expect at least some speedup. What is the issue here, and how can I change the code to get a speedup?
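For comparison, here is a minimal sketch (not part of the original question, assuming a compiler with OpenMP enabled, e.g. -fopenmp) of a loop with more arithmetic per element. A loop that only doubles each entry streams a large array and can be limited by memory bandwidth rather than by core count, so a compute-heavier body is one way to check whether the timing harness shows any scaling at all:

#include <omp.h>
#include <cmath>
#include <iostream>
#include <vector>

int main() {
    const int N = 10000000;
    std::vector<double> A(N, 2.0);
    for (int threads = 1; threads <= 8; threads *= 2) {
        omp_set_num_threads(threads);
        double start = omp_get_wtime();
        #pragma omp parallel for
        for (int i = 0; i < N; i++) {
            // More arithmetic per element than a single multiply, so the loop
            // is less likely to be limited by memory bandwidth alone.
            A[i] = std::sin(A[i]) * std::cos(A[i]) + A[i];
        }
        double elapsed = omp_get_wtime() - start;
        std::cout << "Threads: " << threads << "  Time(ms): " << elapsed * 1000 << "\n";
    }
    return 0;
}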

Related

Thrust's exclusive_scan_by_key function takes the same amount of time as a sequential implementation?

I'm relatively new to Thrust and I'm trying to perform a segmented scan. Here is my code, which you should be able to run as-is:
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <chrono>
#include <iostream>

// Sequential scan for CPU
float* test_seqScan(float* in, int s, int m) {
    float* out = new float[s * m];
    for (unsigned int i = 0; i < s; i++) {
        out[i * m] = 0;
    }
    for (unsigned int i = 0; i < s; i++) {
        for (unsigned int j = 1; j < m; j++) {
            out[i * m + j] = out[i * m + j - 1] + in[i * m + j - 1];
        }
    }
    return out;
}

void test_sumScan(thrust::device_vector<float> dev_in, thrust::device_vector<int> dev_keys, int s, int m) {
    // Allocate device memory for output
    thrust::device_vector<float> dev_out(s * m);
    thrust::exclusive_scan_by_key(thrust::device, dev_keys.begin(), dev_keys.end(), dev_in.begin(), dev_out.begin());
}

int main(){
    int s = 100;
    int m = 100000;
    float* seq_in = new float[s * m];
    for (int i = 0; i < s; i++) {
        for (int j = 0; j < m; j++) {
            seq_in[i * m + j] = j + 1;
        }
    }
    thrust::host_vector<float> par_in(s * m);
    for (int i = 0; i < s; i++) {
        for (int j = 0; j < m; j++) {
            par_in[i * m + j] = j + 1;
        }
    }
    thrust::host_vector<int> keys(s * m);
    for (int i = 0; i < s; i++) {
        for (int j = 0; j < m; j++) {
            keys[i * m + j] = i;
        }
    }
    thrust::device_vector<float> dev_in = par_in;
    thrust::device_vector<int> dev_keys = keys;
    auto t1 = std::chrono::high_resolution_clock::now();
    test_seqScan(seq_in, s, m);
    auto t2 = std::chrono::high_resolution_clock::now();
    auto duration1 = std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1).count();
    std::cout << "Sequential duration: " << duration1 << "\n\n";
    auto t3 = std::chrono::high_resolution_clock::now();
    test_sumScan(dev_in, dev_keys, s, m);
    auto t4 = std::chrono::high_resolution_clock::now();
    auto duration2 = std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1).count();
    std::cout << "Parallel duration: " << duration2 << "\n\n";
}
My issue is that both these snippets of code take exactly the same amount of time to run regardless of how small or large I set s and m. I assume that I'm doing something wrong, but I don't know what; can anyone point out the issue?
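One detail visible in the snippet above is that duration2 is computed from t2 - t1 (the CPU interval) rather than from t4 - t3. A re-timed fragment for the end of main() is sketched below; it also adds cudaDeviceSynchronize() (a CUDA runtime API call) so that the measured interval cannot end before the device is idle. This is only a sketch against the question's own variables, not an answer from the original thread:

// Sketch only: replaces the timing of test_sumScan at the end of the question's main().
cudaDeviceSynchronize();                          // finish any pending device work first
auto t3 = std::chrono::high_resolution_clock::now();
test_sumScan(dev_in, dev_keys, s, m);
cudaDeviceSynchronize();                          // wait for the scan to actually complete
auto t4 = std::chrono::high_resolution_clock::now();
auto duration2 = std::chrono::duration_cast<std::chrono::microseconds>(t4 - t3).count();
std::cout << "Parallel duration: " << duration2 << "\n\n";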

Access a matrix as its transpose in tiled matrix multiplication in CUDA

I'm currently experimenting with CUDA and I came across this kernel from an answer about matrix multiplication: https://stackoverflow.com/a/18856054/7867026
Instead of computing A*B, I want to compute A_Transpose*A, but without storing A_Transpose (only matrix A is passed as input to the kernel). I have to set the indexes properly, but I'm confused by this matrix representation. Any help would be appreciated.
Most of what you need is here and here.
In the first link it is identified that AxAT involves taking inner products of rows of matrix A, and similarly ATxA involves taking inner products of columns of matrix A. Also note the symmetry statement. In the second link (scroll down a bit from that point in the programming guide) you will find a complete tiled matrix multiply. You just need to index into both tiles by column.
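To make that concrete, the identity being described (assuming A is ARows x ACols) is
$(A^T A)_{ij} = \sum_{k} A_{ki} A_{kj}$,
i.e. each output entry is the inner product of columns i and j of A, and the result is symmetric because swapping i and j leaves the sum unchanged.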
Here is a worked example, using the code from the SO answer you linked:
$ cat t1654.cu
#include <iostream>
#include <cstdio>
#include <cstdlib>

const int TILE_DIM = 32;

template <typename T>
__global__ void ATA(const T * __restrict__ A, T * __restrict__ C, int ARows, int ACols)
{
    T CValue = 0;
    int Row = blockIdx.y*TILE_DIM + threadIdx.y;
    int Col = blockIdx.x*TILE_DIM + threadIdx.x;
    __shared__ T As[TILE_DIM][TILE_DIM];
    __shared__ T Bs[TILE_DIM][TILE_DIM];
    for (int k = 0; k < (TILE_DIM + ARows - 1)/TILE_DIM; k++) {
        if (k*TILE_DIM + threadIdx.y < ARows && blockIdx.y*blockDim.y+threadIdx.x < ACols)
            As[threadIdx.y][threadIdx.x] = A[(k*TILE_DIM + threadIdx.y)*ACols + blockIdx.y*blockDim.y+threadIdx.x];
        else
            As[threadIdx.y][threadIdx.x] = 0.0;
        if (k*TILE_DIM + threadIdx.y < ARows && Col < ACols)
            Bs[threadIdx.y][threadIdx.x] = A[(k*TILE_DIM + threadIdx.y)*ACols + Col];
        else
            Bs[threadIdx.y][threadIdx.x] = 0.0;
        __syncthreads();
        for (int n = 0; n < TILE_DIM; ++n)
            CValue += As[n][threadIdx.y] * Bs[n][threadIdx.x];
        __syncthreads();
    }
    if (Row < ACols && Col < ACols)
        C[((blockIdx.y * blockDim.y + threadIdx.y)*ACols) +
          (blockIdx.x * blockDim.x)+ threadIdx.x] = CValue;
}

template <typename T>
__global__ void transpose_naive(const T * __restrict__ in, T * __restrict__ out, const int dim){
    int col = threadIdx.x+blockDim.x*blockIdx.x;
    int row = threadIdx.y+blockDim.y*blockIdx.y;
    if ((col < dim) && (row < dim)) out[col*dim+row] = in[row*dim+col];
}

template <typename T>
__global__ void mm_naive(const T * __restrict__ A, const T * __restrict__ B, T * __restrict__ C, const int rowA, const int colA, const int colB){
    int col = threadIdx.x+blockDim.x*blockIdx.x;
    int row = threadIdx.y+blockDim.y*blockIdx.y;
    if ((row < rowA) && (col < colB)){
        T Cval = 0;
        for (int i = 0; i < colA; i++) Cval += A[row*colA+i]*B[i*colB+col];
        C[row*colB+col] = Cval;}
}

typedef float mt;

int main(){
    mt *d_A, *d_B, *d_C, *h_A, *h_C, *h_C1;
    int m = 64;
    int n = 64;
    h_A = new mt[m*n];
    h_C = new mt[n*n];
    h_C1 = new mt[n*n];
    cudaMalloc(&d_A, m*n*sizeof(d_A[0]));
    cudaMalloc(&d_B, m*n*sizeof(d_A[0]));
    cudaMalloc(&d_C, n*n*sizeof(d_C[0]));
    // test 1
    for (int i = 0; i < m; i++)
        for (int j = 0; j < n; j++)
            h_A[i*n+j] = (i==j)?1.0f:0.0f;
    cudaMemcpy(d_A, h_A, m*n*sizeof(d_A[0]), cudaMemcpyHostToDevice);
    dim3 block(TILE_DIM, TILE_DIM);
    dim3 grid((n+block.x-1)/block.x, (n+block.y-1)/block.y);
    ATA<<<grid,block>>>(d_A, d_C, m, n);
    cudaMemcpy(h_C, d_C, n*n*sizeof(d_C[0]), cudaMemcpyDeviceToHost);
#ifdef DEBUG
    for (int i = 0; i < n; i++){
        for (int j = 0; j < n; j++)
            std::cout << h_C[i*n+j] << " ";
        std::cout << std::endl;}
    std::cout << std::endl;
#endif
    // test 2
    for (int i = 0; i < m; i++)
        for (int j = 0; j < n; j++)
            h_A[i*n+j] = rand()%10;
    cudaMemcpy(d_A, h_A, m*n*sizeof(d_A[0]), cudaMemcpyHostToDevice);
    ATA<<<grid,block>>>(d_A, d_C, m, n);
    cudaMemcpy(h_C, d_C, n*n*sizeof(d_C[0]), cudaMemcpyDeviceToHost);
#ifdef DEBUG
    for (int i = 0; i < n; i++){
        for (int j = 0; j < n; j++)
            std::cout << h_C[i*n+j] << " ";
        std::cout << std::endl;}
    std::cout << std::endl;
#endif
    transpose_naive<<<grid,block>>>(d_A, d_B, n);
    mm_naive<<<grid,block>>>(d_B, d_A, d_C, n, n, n);
    cudaMemcpy(h_C1, d_C, n*n*sizeof(d_C[0]), cudaMemcpyDeviceToHost);
#ifdef DEBUG
    for (int i = 0; i < n; i++){
        for (int j = 0; j < n; j++)
            std::cout << h_C1[i*n+j] << " ";
        std::cout << std::endl;}
    std::cout << std::endl;
#endif
    for (int i = 0; i < n*n; i++) if (h_C[i] != h_C1[i]) {std::cout << "mismatch at: " << i << " was: " << h_C[i] << " should be: " << h_C1[i] << std::endl; return 0;}
}
$ nvcc -o t1654 t1654.cu
$ cuda-memcheck ./t1654
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
$
Note that loading the Bs tile is identical in both cases. The main changes are in loading the As tile; also note the indexing change when computing CValue. These changes are necessary in order to index both tiles by column.
There may still be bugs. I have not tested the non-square case, nor the case where the matrix size is not a multiple of the block size. Furthermore, I've taken no advantage of the symmetry in the output. However, this should help with the indexing.

Why does the inner product of same-size matrices in Eigen cost quite different amounts of time?

I used Eigen to calculate the inner product of two matrices: the first is A=(B*C).eval() and the second is D=(E*F).eval(). Here B, C, E, F are the same size (1500 * 1500) but have different values. I find that the first takes about 200 ms while the second takes about 6000 ms, and I have no idea why this happens.
#include <iostream>
#include <cstdio>   // getchar
#include <time.h>
#include "Eigen/Dense"

int main() {
    clock_t start, stop;
    Eigen::MatrixXf mat_a(1200, 1500);
    Eigen::MatrixXf mat_b(1500, 1500);
    Eigen::MatrixXf mat_r(1000, 1300);
    int i, j;
    float c = 0;
    for (i = 0; i < 1200; i++) {
        for (j = 0; j < 1500; j++) {
            mat_a(i, j) = (float)(c/3 * 1.0e-40);
            //if (i % 2 == 0 && j % 2 == 0) mat_a(i, j);
            c++;
        }
    }
    //std::cout << mat_a.row(0) << std::endl;
    c = 100;
    for (i = 0; i < 1500; i++) {
        for (j = 0; j < 1500; j++) {
            mat_b(i, j) = (float)(c/3 * 0.5e-10);
            c++;
        }
    }
    //std::cout << mat_b.row(0) << std::endl;
    start = clock();
    mat_r = mat_a * mat_b;
    stop = clock();
    std::cout << stop - start << std::endl;
    getchar();
    return 0;
}
As shown in the example code above, I find this is caused by the values in the matrices: when mat_a has values around 1e-40 and mat_b has values around 1e-10, the problem occurs consistently.
Can anyone explain it?
This is because your matrix contains denormal numbers, which are slow for the CPU to deal with. You should make sure that you are using reasonable units so that those values can be considered as zeros, and then enable the flush-to-zero (FTZ) and denormals-are-zero (DAZ) flags, for instance by using your compiler's fast-math mode or by setting them at runtime; see this SO question.
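As an illustration of the runtime route, here is a minimal sketch (not from the original answer), assuming an x86 CPU with SSE; the two macros below are the standard SSE intrinsics for the MXCSR flush-to-zero and denormals-are-zero bits:

#include <xmmintrin.h>   // _MM_SET_FLUSH_ZERO_MODE
#include <pmmintrin.h>   // _MM_SET_DENORMALS_ZERO_MODE

int main() {
    // Flush denormal results to zero (FTZ) and treat denormal inputs as zero (DAZ),
    // so products involving values around 1e-40f no longer take the slow path.
    _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
    _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);

    // ... fill mat_a and mat_b and compute mat_r = mat_a * mat_b as in the question ...
    return 0;
}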

Different results when using OpenMP and FFTW

I am trying to parallelize the following loop:
#pragma omp parallel for private(j,i,mxy) firstprivate(in,out,p)
for(int j = 0; j < Ny; j++) {
    // #pragma omp parallel for private(i,mxy) firstprivate(in,my,j)
    for(int i = 0; i < Nx; i++){
        mxy = i + j*Nx;
        in[i+1] = b_2D[mxy] + I*0.0 ;
    }
    fftw_execute(p);
    for(int i = 0; i < Nx; i++){
        mxy = i + j*Nx;
        b_2D[mxy] = cimag(out[i+1]) ;
    }
}
I do get a small speedup, but I keep getting different results regardless of which variables I set to private and firstprivate. I believe I have done this correctly, but why do I get a different result than when I run the code serially?
I have tried the following:
fftw_make_planner_thread_safe();
fftw_complex *in = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * N);
fftw_complex *out = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * N);
#pragma omp parallel private(j,i,mxy) firstprivate(in,out)
{
    fftw_plan p = fftw_plan_dft_1d(N, in, out, FFTW_FORWARD, FFTW_ESTIMATE);
    for( j = 0; j < N; j++)
        in[j] = 0.0;
    #pragma omp for
    for( j = 0; j < Ny; j++) {
        for( i = 0; i < Nx; i++)
            in[i+1] = b_2D[i + j*Nx] + I*0.0;
        fftw_execute(p);
        for( i = 0; i < Nx; i++)
            b_2D[i + j*Nx] = cimag(out[i+1]) ;
    }
    fftw_destroy_plan(p);
}
fftw_free(in);
fftw_free(out);
This gives me the error "Segmentation fault: 11".
If I run this:
fftw_make_planner_thread_safe();
#pragma omp parallel private(j,i,mxy)
{
    fftw_complex *in = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * N);
    fftw_complex *out = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * N);
    fftw_plan p = fftw_plan_dft_1d(N, in, out, FFTW_FORWARD, FFTW_ESTIMATE);
    for( j = 0; j < N; j++)
        in[j] = 0.0;
    #pragma omp for
    for( j = 0; j < Ny; j++) {
        for( i = 0; i < Nx; i++)
            in[i+1] = b_2D[i + j*Nx] + I*0.0;
        fftw_execute(p);
        for( i = 0; i < Nx; i++)
            b_2D[i + j*Nx] = cimag(out[i+1]) ;
    }
    fftw_destroy_plan(p);
    fftw_free(in);
    fftw_free(out);
}
I get the "Segmentation fault: 11" error again, but when I run it another time it says:
solver(9674,0x7fff74e22000) malloc: *** error for object 0x7f8d70f00410: double free
*** set a breakpoint in malloc_error_break to debug
Abort trap: 6
You are calling FFTW with the same plan p in all threads. Since the plan includes the location of the input and output buffers (the ones supplied to the fftw_plan_dft_whatever plan constructor), all concurrent calls to fftw_execute will utilise those same buffers and not the private copies. The solution is to construct a separate plan for each thread:
#pragma omp parallel private(j,i,mxy) firstprivate(in,out)
{
    fftw_plan my_p;
    // The following OpenMP construct enforces thread-safety of the plan operations.
    // Remove it if the plan constructor is thread-safe.
    #pragma omp critical (plan_ops)
    {
        my_p = fftw_plan_dft_whatever(..., in, out, ...);
    }
    // my_p now refers to the private in and out arrays

    #pragma omp for
    for(int j = 0; j < Ny; j++) {
        for(int i = 0; i < Nx; i++){
            mxy = i + j*Nx;
            in[i+1] = b_2D[mxy] + I*0.0 ;
        }
        fftw_execute(my_p);
        for(int i = 0; i < Nx; i++){
            mxy = i + j*Nx;
            b_2D[mxy] = cimag(out[i+1]) ;
        }
    }

    // See the comment above for the constructor operation
    #pragma omp critical (plan_ops)
    fftw_destroy_plan(my_p);
}
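An alternative worth noting, sketched here and not taken from the answer above: FFTW's new-array execute function fftw_execute_dft() can reuse a single plan on per-thread buffers, provided those buffers have the same size and alignment as the ones the plan was created with (fftw_malloc provides the alignment FFTW expects). The sketch below follows the question's variables b_2D, N, Nx, Ny:

fftw_complex *in  = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * N);
fftw_complex *out = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * N);
fftw_plan p = fftw_plan_dft_1d(N, in, out, FFTW_FORWARD, FFTW_ESTIMATE);  // one plan, made serially

#pragma omp parallel
{
    fftw_complex *t_in, *t_out;
    // Allocation kept in a critical section to stay on the conservative side of
    // FFTW's thread-safety rules (only the execute functions are guaranteed thread-safe).
    #pragma omp critical (fftw_mem)
    {
        t_in  = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * N);
        t_out = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * N);
    }
    for (int k = 0; k < N; k++)
        t_in[k] = 0.0;

    #pragma omp for
    for (int j = 0; j < Ny; j++) {
        for (int i = 0; i < Nx; i++)
            t_in[i+1] = b_2D[i + j*Nx] + I*0.0;
        fftw_execute_dft(p, t_in, t_out);   // new-array execute: thread-safe, shared plan
        for (int i = 0; i < Nx; i++)
            b_2D[i + j*Nx] = cimag(t_out[i+1]);
    }

    #pragma omp critical (fftw_mem)
    {
        fftw_free(t_in);
        fftw_free(t_out);
    }
}

fftw_destroy_plan(p);
fftw_free(in);
fftw_free(out);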
The root cause is likely that this patch wasn't backported to the fftw-3.3.5 release, and I think you should apply the patch yourself. You can also refer to the discussion here.

OpenMP, for loop parallelization and critical section error

I am new to OpenMP and I am using it to implement the Sieve of Eratosthenes. My code is:
int check_eratothenes(int *p, int pn, int n)
{
    int count = 0;
    bool* out = new bool[int(pow(pn, 2))];
    memset(out, 0, pow(pn, 2));
    #pragma omp parallel
    for (int i = 0; i < n; i ++)
    {
        int j = floor((pn + 1) / p[i]) * p[i];
        #pragma omp critical
        while (j <= pow(pn, 2))
        {
            out[j] = 1;
            j += p[i];
        }
    }
    #pragma omp parallel
    for (int i = pn+1; i < pow(pn, 2); i ++)
    {
        #pragma omp critical
        if (out[i] == 0)
        {
            //cout << i << " ";
            count ++;
        }
    }
    return count;
}
But the above OpenMP pragmas are wrong. The code compiles, but when it runs it takes so long to produce a result that I have to press CTRL+C to stop it. I am at a loss as to how to solve this, since there are so many loops and if statements.
Thanks in advance.
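For reference, one common race-free restructuring is a segmented sieve over blocks of candidates: each thread marks composites only within its own block, so no critical section is needed, and the final count becomes an OpenMP reduction. A sketch follows (not from the original thread; the function name check_eratosthenes_blocked is just illustrative, and it assumes every p[i] is a prime no larger than pn):

#include <algorithm>
#include <vector>

// Sketch only: counts the numbers in (pn, pn*pn] not divisible by any p[0..n-1].
int check_eratosthenes_blocked(const int *p, int pn, int n)
{
    const long long limit = (long long)pn * pn;
    const long long block = 1 << 16;           // candidates handled per block
    int count = 0;

    #pragma omp parallel for reduction(+:count) schedule(dynamic)
    for (long long start = pn + 1; start <= limit; start += block) {
        const long long end = std::min(start + block - 1, limit);
        std::vector<char> composite(end - start + 1, 0);   // private to this block
        for (int i = 0; i < n; i++) {
            // First multiple of p[i] that is >= start; it is a proper multiple
            // because p[i] <= pn < start.
            long long j = ((start + p[i] - 1) / p[i]) * p[i];
            for (; j <= end; j += p[i])
                composite[j - start] = 1;
        }
        for (long long j = start; j <= end; j++)
            if (!composite[j - start])
                count++;
    }
    return count;
}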
