C++ and Openmp with critical too slow - openmp

I tried use Rcpp and Openmp to accelerate my code. Here is my cpp code. I wonder why. What is the best way to accelerate this code by openmp.
// #include <Rcpp.h>
#include <vector>
#include <string.h>
#include <RcppArmadillo.h>
#include "omp.h"
using namespace Rcpp;
using namespace std;
// Function subset("[.data.frame");
// [[Rcpp::plugins(openmp) ]]
// [[Rcpp::depends(RcppArmadillo)]]
// [[Rcpp::export]]
DataFrame reformdata(DataFrame rawfile, DataFrame genefile){
vector<string> rawchr = rawfile["chr"];
NumericVector rawpos = rawfile["start"];
vector<string> genechr = genefile["X.1"];
NumericVector genestart = genefile["TSS.start"];
NumericVector geneend = genefile["TSS.end"];
vector<string> geneID = genefile["X"];
NumericVector rawnumCs = rawfile["numCs"];
NumericVector rawnumTs = rawfile["numTs"];
NumericVector rawmethyl = rawfile["methyl"];
int n_raw = rawchr.size();
int n_gene = genechr.size();
int i = 0,j = 0;
vector<string> outputgeneID;
vector<string> outputchr;
NumericVector outputstart;
NumericVector outputend;
NumericVector outputmethyl;
NumericVector outputnumCs;
NumericVector outputnumTs;
#pragma omp parallel for num_threads(8)
for(i = 0; i < n_gene; i++){
string loc_gene_name = genechr[i];
int gene_start = genestart[i];
int gene_end = geneend[i];
for(j = 0;j < n_raw; j++){
string raw_name = rawchr[j];
int raw_pos = rawpos[j];
if(raw_name.compare(loc_gene_name)==0&&raw_pos >= gene_start&&raw_pos <= gene_end){
#pragma omp critical
{
outputgeneID.push_back(geneID[i]);
outputchr.push_back(rawchr[j]);
outputstart.push_back(rawpos[j]);
outputend.push_back(rawpos[j]);
outputmethyl.push_back(rawmethyl[j]);
outputnumCs.push_back(rawnumCs[j]);
outputnumTs.push_back(rawnumTs[j]);
}
}
}
}
return DataFrame::create(Named("geneID")=outputgeneID,Named("chr")=outputchr,
Named("start")=outputstart,Named("end")=outputend,
Named("methyl")=outputmethyl,
Named("numCs")=outputnumCs,Named("numTs")=outputnumTs);
}
I just want to input two Dataframe in R, and then do a match between this two data frame. Maybe the push_back is the problem where it is. Is there a easy way to avoid it? I am dealing with big data, speed is important.

Related

OpenBlas parallelisation from OpenMP Thread

I tried to call an OpenBlas function from an OpenMP thread while the Blas parallelisation is set to a value unequal to one. I am using OpenBlas 0.3.9, after downloading the source I untared it and called
make USE_OPENMP=1
make PREFIX=/someFolder/ install
However I always get the following error message from my executeable
OpenBLAS Warning : Detect OpenMP Loop and this application may hang. Please rebuild the library with USE_OPENMP=1 option.
Does anyone know, why this is the case and how I can change it? Here is a minimal example of my code:
#include <complex>
#include <vector>
#include <random>
#include <iostream>
#include <algorithm>
#include <omp.h>
#include <cblas.h>
#include <lapacke.h>
int main(int, char**) {
int const blas_threads = 2,
omp_threads = 2,
matrix_size = 100;
openblas_set_num_threads(blas_threads);
omp_set_max_active_levels(2);
double alpha = 1.,
beta = 0.;
std::vector<std::vector<double>> as(omp_threads,
std::vector<double>(matrix_size*matrix_size));
std::vector<std::vector<double>> bs(omp_threads,
std::vector<double>(matrix_size*matrix_size));
std::vector<std::vector<double>> cs(omp_threads,
std::vector<double>(matrix_size*matrix_size));
std::random_device rd;
std::mt19937 gen(rd());
std::uniform_real_distribution<double> dis;
for(int i = 0; i < omp_threads; ++i) {
std::generate(as[i].begin(),
as[i].end(),
[&dis,&gen]() { return dis(gen); });
std::generate(bs[i].begin(),
bs[i].end(),
[&dis,&gen]() { return dis(gen); });
}
// for(int i = 0; i < matrix_size*matrix_size; ++i) {
// std::cout << as[0][i] << " " << bs[0][i] << std::endl;
// }
#pragma omp parallel for num_threads(omp_threads), schedule(static, 1)
for(int i = 0; i < omp_threads; ++i) {
cblas_dgemm(CblasColMajor,
CblasNoTrans,
CblasNoTrans,
matrix_size,
matrix_size,
matrix_size,
alpha,
as[i].data(),
matrix_size,
bs[i].data(),
matrix_size,
beta,
cs[i].data(),
matrix_size);
}
// for(int i = 0; i < matrix_size*matrix_size; ++i) {
// std::cout << cs[0][i] << std::endl;
// }
return 0;
}

Matrix Multiplication OpenMP Counter-Intuitive Results

I am currently porting some code over to OpenMP at my place of work. One of the tasks I am doing is figuring out how to speed up matrix multiplication for one of our applications.
The matrices are stored in row-major format, so A[i*cols +j] gives the A_i_j element of the matrix A.
The code looks like this (uncommenting the pragma parallelises the code):
#include <omp.h>
#include <iostream>
#include <iomanip>
#include <stdio.h>
#define NUM_THREADS 8
#define size 500
#define num_iter 10
int main (int argc, char *argv[])
{
// omp_set_num_threads(NUM_THREADS);
int *A = new int [size*size];
int *B = new int [size*size];
int *C = new int [size*size];
for (int i=0; i<size; i++)
{
for (int j=0; j<size; j++)
{
A[i*size+j] = j*1;
B[i*size+j] = i*j+2;
C[i*size+j] = 0;
}
}
double total_time = 0;
double start = 0;
for (int t=0; t<num_iter; t++)
{
start = omp_get_wtime();
int i, k;
// #pragma omp parallel for num_threads(10) private(i, k) collapse(2) schedule(dynamic)
for (int j=0; j<size; j++)
{
for (i=0; i<size; i++)
{
for (k=0; k<size; k++)
{
C[i*size+j] += A[i*size+k] * B[k*size+j];
}
}
}
total_time += omp_get_wtime() - start;
}
std::setprecision(5);
std::cout << total_time/num_iter << std::endl;
delete[] A;
delete[] B;
delete[] C;
return 0;
}
What is confusing me is the following: why is dynamic scheduling faster than static scheduling for this task? Timing the runs and taking an average shows that static scheduling is slower, which to me is a bit counterintuitive since each thread is doing the same amount of work.
Also, am I correctly speeding up my matrix multiplication code?
Parallel matrix multiplication is non-trivial (have you even considered cache-blocking?). Your best bet is likely to be to use a BLAS Library for this, rather than writing it yourself. (Remember, "The best code is the code I do not have to write").
Wikipedia: Basic Linear Algebra Subprograms points to many implementations, a lot of which (including Intel Math Kernel Library) have free licenses.

Effective implementation of conversion small string to uint64_t

#include <cstdint>
#include <cstring>
template<typename T>
T oph_(const char *s){
constexpr std::size_t MAX = sizeof(T);
const std::size_t size = strnlen(s, MAX);
T r = 0;
for(auto it = s; it - s < size; ++it)
r = r << 8 | *it;
return r;
}
inline uint64_t oph(const char *s){
return oph_<uint64_t>(s);
}
int main(){
uint64_t const a = oph("New York City");
uint64_t const b = oph("Boston International");
return a > b;
}
I want to convert first 8 characters from const char * to uint64_t so I can easily compare if two strings are greater / lesser.
I am aware that equals will semi-work.
However I am not sure if this is most efficient implementation.
I want the implementation to work on both little and big endian machines.
This is a C implementation, that should be faster that your implementation, but I still need to use strncpy which should be the bottleneck
#include <string.h>
#include <stdio.h>
#include <stdint.h>
#include <byteswap.h>
union small_str {
uint64_t v;
char buf[8];
};
static uint64_t fill_small_str(const char *str)
{
union small_str ss = { 0 };
strncpy(ss.buf, str, 8);
#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
return ss.v;
#else
return bswap_64(ss.v);
#endif
}
int main(void)
{
uint64_t const a = fill_small_str("Aew York City");
uint64_t const b = fill_small_str("Boston International");
printf("%lu ; %lu ; %d\n", a, b, (a < b));
return 0;
}

Intel gather instruction

I am a little confused about how Intel gather intrinsic works.
I have the following simple code. One of them is to set y[0]=y[1] = x[0], ... y[20002]=y[20003]=x[10002], the other one is to set y[i] = x[i], y[i+1] = x[i+2].
I just randomly print out some values to check the correctness. I found that I could get both y[10] and y[11] equal 2.46 if "zeros" is used. However, I will get a random number for y[11] when I use "stride", while y[10] is still 2.46. Any idea about what's wrong?
#include <stdio.h>
#include <xmmintrin.h>
#include <immintrin.h>
void dummy(double *x, double *y) {
printf("%lf, %lf\n", y[10], y[11]);
return;
}
int main() {
double x[20004];
double y[20004];
__m128i zeros = _mm_set_epi64x(0, 0);
__m128i stride = _mm_set_epi64x(2, 0);
for (int i = 0; i <= 20004; ++i) {
x[i] = i * 0.246;
}
for (int j = 0; j <= 10000; j+=2) {
#ifdef ZERO
__m128d gather = _mm_i64gather_pd(&x[j], zeros, 1);
#else
__m128d gather = _mm_i64gather_pd(&x[j], stride, 1);
#endif
_mm_store_pd(&y[j], gather);
}
dummy(x, y);
}

"Warning : Non-POD class type passed through ellipsis" for simple thrust program

In spite of reading many answers on the same kind of questions on SO I am not able to figure out solution in my case. I have written the following code to implement a thrust program. Program performs simple copy and display operation.
#include <stdio.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
int main(void)
{
// H has storage for 4 integers
thrust::host_vector<int> H(4);
H[0] = 14;
H[1] = 20;
H[2] = 38;
H[3] = 46;
// H.size() returns the size of vector H
printf("\nSize of vector : %d",H.size());
printf("\nVector Contents : ");
for (int i = 0; i < H.size(); ++i) {
printf("\t%d",H[i]);
}
thrust::device_vector<int> D = H;
printf("\nDevice Vector Contents : ");
for (int i = 0; i < D.size(); i++) {
printf("%d",D[i]); //This is where I get the warning.
}
return 0;
}
Thrust implements certain operations to facilitate using elements of a device_vector in host code, but this apparently isn't one of them.
There are many approaches to addressing this issue. The following code demonstrates 3 possible approaches:
explicitly copy D[i] to a host variable, and thrust has an appropriate method defined for that.
copy the thrust device_vector back to a host_vector before print-out.
use thrust::copy to directly copy the elements of the device_vector to a stream.
Code:
#include <stdio.h>
#include <iostream>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/copy.h>
int main(void)
{
// H has storage for 4 integers
thrust::host_vector<int> H(4);
H[0] = 14;
H[1] = 20;
H[2] = 38;
H[3] = 46;
// H.size() returns the size of vector H
printf("\nSize of vector : %d",H.size());
printf("\nVector Contents : ");
for (int i = 0; i < H.size(); ++i) {
printf("\t%d",H[i]);
}
thrust::device_vector<int> D = H;
printf("\nDevice Vector Contents : ");
//method 1
for (int i = 0; i < D.size(); i++) {
int q = D[i];
printf("\t%d",q);
}
printf("\n");
//method 2
thrust::host_vector<int> Hnew = D;
for (int i = 0; i < Hnew.size(); i++) {
printf("\t%d",Hnew[i]);
}
printf("\n");
//method 3
thrust::copy(D.begin(), D.end(), std::ostream_iterator<int>(std::cout, ","));
std::cout << std::endl;
return 0;
}
Note that for methods like these, thrust is generating various kinds of device-> host copy operations to facilitate the use of device_vector in host code. This has performance implications, so you might want to use the defined copy operations for large vectors.

Resources