Can I speed up type conversion using intrinsics? - c++11

I am working on an application which needs to convert data to float.
The data are unsigned char or unsigned short.
I am using both AVX2 and other SIMD intrinsics in this code.
I wrote the conversion like this:
unsigned char -> float :
#ifdef __AVX2__
__m256i tmp_v =_mm256_lddqu_si256(reinterpret_cast<const __m256i*>(src+j));
v16_avx[0] = _mm256_cvtepu8_epi16(_mm256_extractf128_si256(tmp_v,0x0));
v16_avx[1] = _mm256_cvtepu8_epi16(_mm256_extractf128_si256(tmp_v,0x1));
v32_avx[0] = _mm256_cvtepi16_epi32(_mm256_extractf128_si256(v16_avx[0],0x0));
v32_avx[1] = _mm256_cvtepi16_epi32(_mm256_extractf128_si256(v16_avx[0],0x1));
v32_avx[2] = _mm256_cvtepi16_epi32(_mm256_extractf128_si256(v16_avx[1],0x0));
v32_avx[3] = _mm256_cvtepi16_epi32(_mm256_extractf128_si256(v16_avx[1],0x1));
for (int l=0; l<4; l++) {
__m256 vc1_ps = _mm256_cvtepi32_ps(_mm256_and_si256(v32_avx[l],m_lt_avx[l]));
__m256 vc2_ps = _mm256_cvtepi32_ps(_mm256_and_si256(v32_avx[l],m_ge_avx[l]));
/*
....
some processing there.
*/
}
#endif
#ifdef __SSE2__
#ifdef __SSE3__
__m128i tmp_v = _mm_lddqu_si128(reinterpret_cast<const __m128i*>(src+j));
#else
__m128i tmp_v = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src+j));
#endif
#ifdef __SSE4_1__
v16[0] = _mm_cvtepu8_epi16(tmp_v);
tmp_v = _mm_shuffle_epi8(tmp_v,mask8);
v16[1] = _mm_cvtepu8_epi16(tmp_v);
v32[0] = _mm_cvtepi16_epi32(v16[0]);
v16[0] = _mm_shuffle_epi32(v16[0],0x4E);
v32[1] = _mm_cvtepi16_epi32(v16[0]);
v32[2] = _mm_cvtepi16_epi32(v16[1]);
v16[1] = _mm_shuffle_epi32(v16[1],0x4E);
v32[3] = _mm_cvtepi16_epi32(v16[1]);
#else
__m128i tmp_v_l = _mm_slli_si128(tmp_v,8);
__m128i tmp_v_r = _mm_srli_si128(tmp_v,8);
v16[0] = _mm_unpacklo_epi8(tmp_v,tmp_v_l);
v16[1] = _mm_unpackhi_epi8(tmp_v,tmp_v_r);
tmp_v_l = _mm_srli_epi16(v16[0],8);
tmp_v_r = _mm_srai_epi16(v16[0],8);
v32[0] = _mm_unpacklo_epi16(v16[0],tmp_v_l);
v32[1] = _mm_unpackhi_epi16(v16[0],tmp_v_r);
v16[0] = _mm_unpacklo_epi8(tmp_v,tmp_v_l);
v16[1] = _mm_unpackhi_epi8(tmp_v,tmp_v_r);
tmp_v_l = _mm_srli_epi16(v16[1],8);
tmp_v_r = _mm_srai_epi16(v16[1],8);
v32[2] = _mm_unpacklo_epi16(v16[1],tmp_v_l);
v32[3] = _mm_unpackhi_epi16(v16[1],tmp_v_r);
#endif
for (int l=0; l<4; l++) {
__m128 vc1_ps = _mm_cvtepi32_ps(_mm_and_si128(v32[l],m_lt[l]));
__m128 vc2_ps = _mm_cvtepi32_ps(_mm_and_si128(v32[l],m_ge[l]));
/*
...
some processing there.
*/
}
#endif
unsigned short -> float
#ifdef __AVX2__
__m256i tmp_v = _mm256_lddqu_si256(reinterpret_cast<const __m256i*>(src+j)); // load as in the unsigned char case
v32_avx[0] = _mm256_cvtepu16_epi32(_mm256_extractf128_si256(tmp_v,0x0));
v32_avx[1] = _mm256_cvtepu16_epi32(_mm256_extractf128_si256(tmp_v,0x1));
for(int l=0;l<2;l++) {
__m256 vc1_ps = _mm256_cvtepi32_ps(_mm256_and_si256(v32_avx[l],m_lt_avx[l]));
__m256 vc2_ps = _mm256_cvtepi32_ps(_mm256_and_si256(v32_avx[l],m_ge_avx[l]));
/*
...
some processing there.
*/
}
#endif
#ifdef __SSE2__
#ifdef __SSE3__
__m128i tmp_v = _mm_lddqu_si128(reinterpret_cast<const __m128i*>(src+j));
#else
__m128i tmp_v = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src+j));
#endif
#ifdef __SSE4_1__
v32[0] = _mm_cvtepu16_epi32(tmp_v);
tmp_v = _mm_shuffle_epi32(tmp_v,0x4E);
v32[1] = _mm_cvtepu16_epi32(tmp_v);
#else
__m128i tmp_v_l = _mm_slli_si128(tmp_v,8);
__m128i tmp_v_r = _mm_srli_si128(tmp_v,8);
v32[0] = _mm_unpacklo_epi16(tmp_v,tmp_v_l);
v32[1] = _mm_unpackhi_epi16(tmp_v,tmp_v_r);
#endif
for(int l=0;l<2;l++) {
__m128 vc1_ps = _mm_cvtepi32_ps(_mm_and_si128(v32[l],m_lt[l]));
__m128 vc2_ps = _mm_cvtepi32_ps(_mm_and_si128(v32[l],m_ge[l]));
/*
...
some processing there.
*/
}
#endif
The processing in the comments has nothing to do with the conversion step.
I would like to speed up those conversions.
I read in "SSE: convert short integer to float" and in "Converting Int to Float/Float to Int using Bitwise" that it's possible to do this using bitwise operations.
Are those approaches really any faster?
I experimented with the implementation from the first link. There was almost no change in processing time; it works fine for signed short, and also for unsigned short as long as the value stays between 0 and SHRT_MAX (32767 on my system):
#include <immintrin.h>
#include <iterator>
#include <iostream>
#include <chrono>
typedef unsigned short ushort;  // non-standard typedef, spelled out so the example builds standalone
#ifndef CV_DECL_ALIGNED         // OpenCV's alignment macro; GCC/Clang stand-in for a standalone build
#define CV_DECL_ALIGNED(x) __attribute__((aligned(x)))
#endif
void convert_sse_intrinsic(const ushort *source,const int len, int *destination)
{
__m128i zero2 = _mm_setzero_si128();
for (int i = 0; i < len; i+=4)
{
__m128i value = _mm_unpacklo_epi16(_mm_set_epi64x(0,*((long long*)(source+i)) /**ps*/), zero2);
value = _mm_srai_epi32(_mm_slli_epi32(value, 16), 16);
_mm_storeu_si128(reinterpret_cast<__m128i*>(destination+i),value);
}
}
void convert_sse_intrinsic2(const ushort *source,const int len, int *destination)
{
for (int i = 0; i < len; i+=8)
{
__m128i value = _mm_loadu_si128(reinterpret_cast<const __m128i*>(source+i));
_mm_storeu_si128(reinterpret_cast<__m128i*>(destination+i),_mm_cvtepu16_epi32(value));
value = _mm_shuffle_epi32(value,0x4E);
_mm_storeu_si128(reinterpret_cast<__m128i*>(destination+i+4),_mm_cvtepu16_epi32(value));
}
}
int main(int argc, char *argv[])
{
ushort CV_DECL_ALIGNED(32) toto[16] =
{0,500,1000,5000,
10000,15000,20000,25000,
30000,35000,40000,45000,
50000,55000,60000,65000};
int CV_DECL_ALIGNED(32) tutu[16] = {0};
std::chrono::steady_clock::time_point start = std::chrono::steady_clock::now();
convert_sse_intrinsic(toto,16,tutu);
std::chrono::steady_clock::time_point stop = std::chrono::steady_clock::now();
std::cout<<"processing time 1st method : "<<std::chrono::duration_cast<std::chrono::nanoseconds>(stop-start).count()<<" : ns"<<std::endl;
std::copy(tutu,tutu+16,std::ostream_iterator<int>(std::cout," "));
std::cout<<std::endl;
start = std::chrono::steady_clock::now();
convert_sse_intrinsic2(toto,16,tutu);
stop = std::chrono::steady_clock::now();
std::cout<<"processing time 2nd method : "<<std::chrono::duration_cast<std::chrono::nanoseconds>(stop-start).count()<<" : ns"<<std::endl;
std::copy(tutu,tutu+16,std::ostream_iterator<int>(std::cout," "));
std::cout<<std::endl;
return 0;
}
Thanks in advance for any help.

Well, I don't think there is really any faster way to convert an unsigned char or an unsigned short to float than the intrinsics already there.
I tried several other ways using bitwise operators, but none was significantly faster.
So I don't think it's worth letting this topic linger any longer.
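For readers curious what such a bitwise approach looks like, here is a minimal sketch of the classic magic-number trick (the kind of thing the linked questions discuss), applied to unsigned short with plain SSE2; the function name and scaffolding are my own, not code from the question. Each 16-bit value is placed in the low half of a 32-bit lane whose high half is 0x4B00, so the lane holds the IEEE-754 pattern of 8388608.0f + value, and subtracting the bias recovers the value exactly (it is below 2^23).
#include <emmintrin.h> // SSE2
// Convert 8 unsigned shorts to 8 floats without any epi32 conversion step.
static inline void u16_to_float_sse2(const unsigned short *src, float *dst)
{
    const __m128i v     = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src));
    const __m128i magic = _mm_set1_epi16(0x4B00);
    const __m128  bias  = _mm_set1_ps(8388608.0f); // 2^23
    const __m128  lo    = _mm_sub_ps(_mm_castsi128_ps(_mm_unpacklo_epi16(v, magic)), bias);
    const __m128  hi    = _mm_sub_ps(_mm_castsi128_ps(_mm_unpackhi_epi16(v, magic)), bias);
    _mm_storeu_ps(dst,     lo);
    _mm_storeu_ps(dst + 4, hi);
}
Whether this beats _mm_cvtepi32_ps in practice depends on the surrounding code; on recent CPUs the packed conversion instructions are cheap, which is consistent with the observation above.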

Related

Convert Rcpp::List to C++ vector of vectors of const char*

I'm trying to build an Rcpp interface to an existing C++ library that uses const char* for strings. I think I need to use Rcpp::List to pass some output indices, which form a ragged 2D array of strings, but I am having trouble converting this to the C++ primitives required by the external function.
#include <Rcpp.h>
// Enable C++11 via this plugin (Rcpp 0.10.3 or later)
// [[Rcpp::plugins(cpp11)]]
// [[Rcpp::export]]
void test(Rcpp::List IndexList){
// convert list to this
const std::vector<std::vector<const char*>> Indexes;
for (int i = 0; i < IndexList.size(); i++){
Rcpp::StringVector temp = IndexList[i];
std::vector<const char*> temp2;
temp2.clear();
for (int j = 0; j < temp.size(); j++){
Rcpp::Rcout << temp[j];
temp2.push_back(temp[j]);
}
Indexes.push_back(temp2);
}
}
/*** R
test(list(c("a", "b"), c("cde")))
*/
The line Indexes.push_back(temp2); produces a compile error whichever way I try to build this object (which I need to pass to another function):
Line 19 passing 'const std::vector<std::vector<const char*> >' as 'this' argument of 'void std::vector<_Tp, _Alloc>::push_back(const value_type&) [with _Tp = std::vector<const char*>; _Alloc = std::allocator<std::vector<const char*> >; std::vector<_Tp, _Alloc>::value_type = std::vector<const char*>]' discards qualifiers [-fpermissive]
Solution
#include <Rcpp.h>
// Enable C++11 via this plugin (Rcpp 0.10.3 or later)
// [[Rcpp::plugins(cpp11)]]
// function that wants this data structure
void feedme(const std::vector<std::vector<const char *>> &Indexes){
Rcpp::Rcout << " feedme ";
for (unsigned i = 0; i < Indexes.size(); i++){
for (unsigned j = 0; j < Indexes[i].size(); j++){
Rcpp::Rcout << Indexes[i][j];
}
}
}
// [[Rcpp::export]]
void test(Rcpp::List IndexList){
// convert list to this
Rcpp::Rcout << " test ";
std::vector<std::vector<const char*>> Indexes;
for (int i = 0; i < IndexList.size(); i++){
Rcpp::StringVector temp = IndexList[i];
std::vector<const char*> temp2;
temp2.clear();
for (int j = 0; j < temp.size(); j++){
Rcpp::Rcout << temp[j];
temp2.push_back(temp[j]);
}
Indexes.push_back(temp2);
}
feedme(Indexes);
}
/*** R
test(list(c("a", "b"), c("cde")))
*/
You are declaring Indexes as const but try to change it later on:
const std::vector<std::vector<const char*>> Indexes;
// ^^^^^
// [...]
Indexes.push_back(temp2);
Remove the const qualifier to get the code to compile.
Maybe this helps. Here we pass from an Rcpp::StringVector to two vectors, one of const char * and one of char *. I was half-expecting to have to const_cast, but it just built as is.
Code
#include <Rcpp.h>
void consumeConstChar(std::vector<const char*> v) {
for (auto s : v) Rcpp::Rcout << s << std::endl;
}
void consumeChar(std::vector<char*> v) {
for (auto s : v) Rcpp::Rcout << s << std::endl;
}
// [[Rcpp::export]]
void doStuff(Rcpp::StringVector x) {
std::vector<const char*> vcc;
std::vector<char*> vc;
for (int i=0; i<x.size(); i++) {
vcc.push_back(x[i]);
vc.push_back(x[i]);
}
consumeConstChar(vcc);
consumeChar(vc);
}
/*** R
x <- c("The quick", "brown", "fox", "jumped")
doStuff(x)
*/
Demo
R> Rcpp::sourceCpp("~/git/stackoverflow/58741017/answer.cpp")
R> x <- c("The quick", "brown", "fox", "jumped")
R> doStuff(x)
The quick
brown
fox
jumped
The quick
brown
fox
jumped
R>

C++ and Openmp with critical too slow

I tried to use Rcpp and OpenMP to accelerate my code; here is my cpp code. It is still very slow and I wonder why. What is the best way to accelerate this code with OpenMP?
// #include <Rcpp.h>
#include <vector>
#include <string.h>
#include <RcppArmadillo.h>
#include "omp.h"
using namespace Rcpp;
using namespace std;
// Function subset("[.data.frame");
// [[Rcpp::plugins(openmp) ]]
// [[Rcpp::depends(RcppArmadillo)]]
// [[Rcpp::export]]
DataFrame reformdata(DataFrame rawfile, DataFrame genefile){
vector<string> rawchr = rawfile["chr"];
NumericVector rawpos = rawfile["start"];
vector<string> genechr = genefile["X.1"];
NumericVector genestart = genefile["TSS.start"];
NumericVector geneend = genefile["TSS.end"];
vector<string> geneID = genefile["X"];
NumericVector rawnumCs = rawfile["numCs"];
NumericVector rawnumTs = rawfile["numTs"];
NumericVector rawmethyl = rawfile["methyl"];
int n_raw = rawchr.size();
int n_gene = genechr.size();
int i = 0,j = 0;
vector<string> outputgeneID;
vector<string> outputchr;
NumericVector outputstart;
NumericVector outputend;
NumericVector outputmethyl;
NumericVector outputnumCs;
NumericVector outputnumTs;
#pragma omp parallel for num_threads(8)
for(i = 0; i < n_gene; i++){
string loc_gene_name = genechr[i];
int gene_start = genestart[i];
int gene_end = geneend[i];
for(j = 0;j < n_raw; j++){
string raw_name = rawchr[j];
int raw_pos = rawpos[j];
if(raw_name.compare(loc_gene_name)==0&&raw_pos >= gene_start&&raw_pos <= gene_end){
#pragma omp critical
{
outputgeneID.push_back(geneID[i]);
outputchr.push_back(rawchr[j]);
outputstart.push_back(rawpos[j]);
outputend.push_back(rawpos[j]);
outputmethyl.push_back(rawmethyl[j]);
outputnumCs.push_back(rawnumCs[j]);
outputnumTs.push_back(rawnumTs[j]);
}
}
}
}
return DataFrame::create(Named("geneID")=outputgeneID,Named("chr")=outputchr,
Named("start")=outputstart,Named("end")=outputend,
Named("methyl")=outputmethyl,
Named("numCs")=outputnumCs,Named("numTs")=outputnumTs);
}
I just want to pass in two data frames from R and then do a match between these two data frames. Maybe the push_back is the problem. Is there an easy way to avoid it? I am dealing with big data, so speed is important.
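Note that j is declared outside the parallel region in the code above, so it is shared between threads by default, which is a data race on top of the contention from the critical section. One common pattern for this kind of loop is to give each thread its own output buffer and merge the buffers after the parallel region, so no critical section is needed in the hot path. A minimal sketch of the idea on plain std::vector and std::string (my own simplification to a single output column, not a drop-in replacement for the Rcpp code above):
#include <cstddef>
#include <omp.h>
#include <string>
#include <vector>
// Each thread appends matches to its own buffer; the buffers are concatenated
// once at the end. The result order differs from the sequential version
// (results are grouped by the chunk each thread processed).
std::vector<std::string> match_gene_ids(const std::vector<std::string> &genechr,
                                        const std::vector<std::string> &geneID,
                                        const std::vector<std::string> &rawchr)
{
    std::vector<std::vector<std::string>> local;
    #pragma omp parallel
    {
        #pragma omp single
        local.resize(omp_get_num_threads()); // sized once the team size is known
        std::vector<std::string> &mine = local[omp_get_thread_num()];
        #pragma omp for nowait
        for (int i = 0; i < static_cast<int>(genechr.size()); ++i)
            for (std::size_t j = 0; j < rawchr.size(); ++j)
                if (rawchr[j] == genechr[i])
                    mine.push_back(geneID[i]); // thread-local, no lock needed
    }
    std::vector<std::string> out;
    for (const auto &v : local)
        out.insert(out.end(), v.begin(), v.end());
    return out;
}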

Effective implementation of conversion small string to uint64_t

#include <cstdint>
#include <cstring>
template<typename T>
T oph_(const char *s){
constexpr std::size_t MAX = sizeof(T);
const std::size_t size = strnlen(s, MAX);
T r = 0;
for(auto it = s; it - s < size; ++it)
r = r << 8 | *it;
return r;
}
inline uint64_t oph(const char *s){
return oph_<uint64_t>(s);
}
int main(){
uint64_t const a = oph("New York City");
uint64_t const b = oph("Boston International");
return a > b;
}
I want to convert the first 8 characters of a const char * to a uint64_t so I can easily compare whether one string is greater or less than another.
I am aware that testing for equality will only semi-work.
However, I am not sure whether this is the most efficient implementation.
I want the implementation to work on both little- and big-endian machines.
This is a C implementation that should be faster than yours, but I still need to use strncpy, which is probably the bottleneck:
#include <string.h>
#include <stdio.h>
#include <stdint.h>
#include <byteswap.h>
union small_str {
uint64_t v;
char buf[8];
};
static uint64_t fill_small_str(const char *str)
{
union small_str ss = { 0 };
strncpy(ss.buf, str, 8);
#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
return ss.v;
#else
return bswap_64(ss.v);
#endif
}
int main(void)
{
uint64_t const a = fill_small_str("Aew York City");
uint64_t const b = fill_small_str("Boston International");
printf("%lu ; %lu ; %d\n", a, b, (a < b));
return 0;
}
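As a side note, in C++ the same idea is usually written with memcpy instead of a union, since reading the inactive member of a union is well-defined in C but not in C++. A sketch of that variant (my own; the byte-swap builtin is GCC/Clang specific):
#include <cstdint>
#include <cstring>
// Copy at most 8 bytes into a zero-padded buffer, reinterpret them as a
// uint64_t via memcpy, and byte-swap on little-endian targets so that
// lexicographic order matches integer order, as in the C version above.
inline uint64_t fill_small_str_cpp(const char *str)
{
    char buf[8] = {0};
    std::strncpy(buf, str, sizeof buf);
    uint64_t v;
    std::memcpy(&v, buf, sizeof v);
#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
    return v;
#else
    return __builtin_bswap64(v);
#endif
}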

Intel gather instruction

I am a little confused about how the Intel gather intrinsics work.
I have the following simple code with two variants: one sets y[0] = y[1] = x[0], ..., y[20002] = y[20003] = x[10002]; the other sets y[j] = x[j] and y[j+1] = x[j+2].
I just randomly print out some values to check correctness. I found that both y[10] and y[11] equal 2.46 when "zeros" is used. However, I get a random number for y[11] when I use "stride", while y[10] is still 2.46. Any idea what's wrong?
#include <stdio.h>
#include <xmmintrin.h>
#include <immintrin.h>
void dummy(double *x, double *y) {
printf("%lf, %lf\n", y[10], y[11]);
return;
}
int main() {
double x[20004];
double y[20004];
__m128i zeros = _mm_set_epi64x(0, 0);
__m128i stride = _mm_set_epi64x(2, 0);
for (int i = 0; i < 20004; ++i) {
x[i] = i * 0.246;
}
for (int j = 0; j <= 10000; j+=2) {
#ifdef ZERO
__m128d gather = _mm_i64gather_pd(&x[j], zeros, 1);
#else
__m128d gather = _mm_i64gather_pd(&x[j], stride, 1);
#endif
_mm_store_pd(&y[j], gather);
}
dummy(x, y);
}
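For context, the third argument of the gather intrinsics is a byte scale: each lane loads from base_addr + index * scale, and scale must be 1, 2, 4 or 8. A minimal sketch (my own illustration, assuming AVX2 is enabled) of gathering two doubles by element index, which needs a scale of 8; with a scale of 1, an index of 2 points two bytes into x[0], i.e. at a misaligned, meaningless bit pattern:
#include <immintrin.h>
// Gather x[i0] and x[i1] into one __m128d. With scale 8 the 64-bit indices
// are element offsets into the double array rather than raw byte offsets.
static inline __m128d gather_two(const double *x, long long i0, long long i1)
{
    __m128i idx = _mm_set_epi64x(i1, i0); // lane 0 = i0, lane 1 = i1
    return _mm_i64gather_pd(x, idx, 8);   // scale in bytes: sizeof(double)
}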

Why does the left shift on an unsigned int happen from the 16th bit?

I am trying to put the values from the vector into an int.
Given the vector '1','0','1','1','1','0','1','1','1','0','1','1','1','0','1','1':
Expected output (binary representation of the variable out):
00000000000000001011101110111011
However, I am getting the following output:
10111011101110110000000000000000
Notice that the insertion seems to have begun at the 16th bit from the right end instead of beginning from the leftmost bit.
#include<vector>
#include<iostream>
int main() {
std::vector<unsigned char> test = {'1','0','1','1','1','0','1','1','1','0','1','1','1','0','1','1'};
std::vector<unsigned int> out(1);
int j = 0;
for (int i =0; i < test.size(); i++) {
out[j] = out[j] << 1;
if (test[i] == '1') {out[j] |=0x1;}
}
j++;
for (int p = 0; p < j; p++) {
for (int k = 0; k<32; k++ ) {
std::cout << !!((out[p]<<k)&0x8000);
}
std::cout << std::endl;
}
std::cout << "Size Of:" << sizeof(int);
return 0;
}
The reason this happens is that you are using the wrong constant for the mask: 0x8000 has its 16th bit set, while you probably meant 0x80000000, which has its 32nd bit set. To avoid mistakes like that, it's best to construct masks with shifts, for example
(1u << 31)
This expression is evaluated at compile time, so the result is the same as if you had computed the constant yourself.
Note that both 0x8000 and 0x80000000 are system-dependent constants; in particular, 0x80000000 assumes a 32-bit int, which is not guaranteed.
A better approach is to shift the number right instead of left and mask with 1, as sketched below.
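For illustration, a minimal sketch of that approach (my own helper, not code from the answer): walk from the most significant bit down and test each bit with a mask of 1, so no wide mask constant is needed.
#include <iostream>
// Print the 32 bits of v, most significant first, by shifting the value right
// and masking with 1 instead of shifting it left under a fixed wide mask.
// Assumes a 32-bit unsigned int, per the caveat above.
void print_bits(unsigned int v)
{
    for (int k = 31; k >= 0; --k)
        std::cout << ((v >> k) & 1u);
    std::cout << '\n';
}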
The block of code creating out[j] works just fine.
Your problem is in the output block, due to the use of 0x8000: whenever k >= 16, the low 16 bits of the shifted value are zero, so the AND with 0x8000 is guaranteed to be zero.
Your code seems overly complicated to me. Here's my version: a C program that transforms a string of 1s and 0s into an int, and one going from an int back to a string.
#include <stdlib.h>
#include <stdio.h>

int main(int argc, char **argv)
{
    char str[] = "1010101010101010";
    int x;
    int out = 0;   /* must start at 0 before OR-ing bits in */

    for (x = 0; x < 16; x++) {
        if (str[x] == '1') {
            out |= (1 << x);
        }
    }
    printf("%d", out);
    return 0;
}
and
#include <stdlib.h>
#include <stdio.h>

int main(int argc, char **argv)
{
    int x;
    int in = 21845;
    char out[17] = {0};

    for (x = 0; x < 16; x++) {
        if (in & (1 << x)) {
            out[x] = '1';
        } else {
            out[x] = '0';
        }
    }
    printf("%s", out);
    return 0;
}
