Efficient implementation of converting a small string to uint64_t - C++11

#include <cstdint>
#include <cstring>
template<typename T>
T oph_(const char *s){
    constexpr std::size_t MAX = sizeof(T);
    const std::size_t size = strnlen(s, MAX);
    T r = 0;
    for(auto it = s; it - s < size; ++it)
        r = r << 8 | *it;
    return r;
}

inline uint64_t oph(const char *s){
    return oph_<uint64_t>(s);
}

int main(){
    uint64_t const a = oph("New York City");
    uint64_t const b = oph("Boston International");
    return a > b;
}
I want to convert the first 8 characters of a const char * to a uint64_t so I can easily compare which of two strings is greater or lesser.
I am aware that equality comparison will only semi-work, since it only sees the first 8 characters.
However, I am not sure whether this is the most efficient implementation.
I want the implementation to work on both little- and big-endian machines.
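As a concrete illustration (hand-worked; not part of the original post): since the value is assembled arithmetically with shifts and ORs rather than by reinterpreting memory, the result is the same on little- and big-endian hosts, and comparing the integers agrees with comparing the 8-character prefixes byte by byte.
// "New York" is exactly 8 characters, 4E 65 77 20 59 6F 72 6B in ASCII,
// so oph("New York") == 0x4E657720596F726BULL on either endianness.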

This is a C implementation that should be faster than yours, but I still need to use strncpy, which is likely to be the bottleneck:
#include <string.h>
#include <stdio.h>
#include <stdint.h>
#include <byteswap.h>

union small_str {
    uint64_t v;
    char buf[8];
};

static uint64_t fill_small_str(const char *str)
{
    union small_str ss = { 0 };
    strncpy(ss.buf, str, 8);
#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
    return ss.v;
#else
    return bswap_64(ss.v);
#endif
}

int main(void)
{
    uint64_t const a = fill_small_str("Aew York City");
    uint64_t const b = fill_small_str("Boston International");
    printf("%lu ; %lu ; %d\n", a, b, (a < b));
    return 0;
}
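A side note, not from either snippet above: in C++ (the question's language), reading ss.v after writing ss.buf is union type punning, which is undefined behaviour in C++ even though C allows it. A memcpy-based variant is well defined in both languages, avoids the non-portable byteswap.h, and compilers typically reduce it to the same few instructions. A minimal sketch:
#include <stdint.h>
#include <string.h>

static uint64_t fill_small_str_portable(const char *str)
{
    unsigned char buf[8] = { 0 };            /* zero padding for strings shorter than 8 */
    memcpy(buf, str, strnlen(str, 8));
    uint64_t v = 0;
    for (int i = 0; i < 8; ++i)              /* assemble most-significant byte first,   */
        v = (v << 8) | buf[i];               /* so integer order matches lexicographic  */
    return v;                                /* order regardless of host endianness     */
}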

Related

How can I compute the order of elements in each row of a matrix with CUDA?

I am trying to figure out how to do an argsort with CUDA/Thrust along the rows or columns of a matrix.
That is, given a matrix such as:
A = [[ 3.4257, -1.2345, 0.6232, -0.1354],
[-1.6639, 0.1557, -0.1763, 1.0257],
[0.6863, 0.0992, 1.4487, 0.0157]].
And I need to compute the order of elements in each row, so the output is:
index = [[1, 3, 2, 0],
[0, 2, 1, 3],
[3, 1, 0, 2]]
How could I do this please?
This is possible using thrust::sort. We need a set of row indices and a set of column indices. The row indices are to make sure the sorting order is segmented among the rows. The column indices are what will give us the result, after sorting.
Zip together the value, the row index, and the column index. Create a sort functor that orders by row first, then by value. The output is the rearranged column indices.
$ cat t114.cu
#include <thrust/sort.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <iostream>

using namespace thrust::placeholders;

struct my_sort_functor
{
  template <typename T1, typename T2>
  __host__ __device__
  bool operator()(const T1 &t1, const T2 &t2){
    if (thrust::get<1>(t1) < thrust::get<1>(t2)) return true;
    if (thrust::get<1>(t1) > thrust::get<1>(t2)) return false;
    if (thrust::get<0>(t1) < thrust::get<0>(t2)) return true;
    return false;
  }
};

typedef float mt;
typedef int it;

int main(){
  mt A[] = { 3.4257, -1.2345, 0.6232, -0.1354,
            -1.6639,  0.1557, -0.1763,  1.0257,
             0.6863,  0.0992,  1.4487,  0.0157};
  const int rows = 3;
  const int cols = 4;
  thrust::device_vector<mt> d_A(A, A+rows*cols);
  thrust::device_vector<it> row_idx(d_A.size());
  thrust::device_vector<it> col_idx(d_A.size());
  thrust::sequence(row_idx.begin(), row_idx.end());
  thrust::sequence(col_idx.begin(), col_idx.end());
  thrust::transform(row_idx.begin(), row_idx.end(), row_idx.begin(), _1/cols);
  thrust::transform(col_idx.begin(), col_idx.end(), col_idx.begin(), _1%cols);
  auto my_zip_iterator = thrust::make_zip_iterator(thrust::make_tuple(d_A.begin(), row_idx.begin(), col_idx.begin()));
  thrust::sort(my_zip_iterator, my_zip_iterator+rows*cols, my_sort_functor());
  thrust::host_vector<it> h_col_idx = col_idx;
  thrust::copy_n(h_col_idx.begin(), rows*cols, std::ostream_iterator<it>(std::cout, ","));
  std::cout << std::endl;
}
$ nvcc -o t114 t114.cu
$ ./t114
1,3,2,0,0,2,1,3,3,1,0,2,
$
Here's another approach. This method does not reorder the data, but simply produces the ordering result. We need only one index array; the row index is computed on the fly, and the column index is computed only after the ordered result is determined.
$ cat t114.cu
#include <thrust/sort.h>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/copy.h>
#include <iostream>

using namespace thrust::placeholders;

typedef float mt;
typedef int it;

struct my_sort_functor
{
  mt *d;
  it cols;
  my_sort_functor(mt *_d, it _cols) : d(_d), cols(_cols) {};
  __host__ __device__
  bool operator()(const it &t1, const it &t2){
    it row1 = t1/cols;
    it row2 = t2/cols;
    if (row1 < row2) return true;
    if (row1 > row2) return false;
    if (d[t1] < d[t2]) return true;
    return false;
  }
};

int main(){
  mt A[] = { 3.4257, -1.2345, 0.6232, -0.1354,
            -1.6639,  0.1557, -0.1763,  1.0257,
             0.6863,  0.0992,  1.4487,  0.0157};
  const int rows = 3;
  const int cols = 4;
  thrust::device_vector<mt> d_A(A, A+rows*cols);
  thrust::device_vector<it> idx(d_A.size());
  thrust::sequence(idx.begin(), idx.end());
  thrust::sort(idx.begin(), idx.end(), my_sort_functor(thrust::raw_pointer_cast(d_A.data()), cols));
  thrust::transform(idx.begin(), idx.end(), idx.begin(), _1%cols);
  thrust::host_vector<it> h_idx = idx;
  thrust::copy_n(h_idx.begin(), rows*cols, std::ostream_iterator<it>(std::cout, ","));
  std::cout << std::endl;
}
$ nvcc -o t114 t114.cu
$ ./t114
1,3,2,0,0,2,1,3,3,1,0,2,
$
I would normally lean towards the second approach as being more performant, since it moves only a third as much data. However, it also does two integer divisions per comparison, so it may be a wash. We could precompute the row index in the second approach, at the expense of moving twice as much data during the sort, but avoiding the on-the-fly division ops.
If the array dimensions were small enough (say, row and column dimensions each less than 65536), we could even precompute both the row index and the column index, encoding the row in the upper bits of the index and the column in the lower bits, so that only a single index quantity is moved. Even better:
$ cat t114.cu
#include <thrust/sort.h>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/sequence.h>
#include <thrust/transform.h>
#include <iostream>

using namespace thrust::placeholders;

typedef float mt;
typedef unsigned it;

struct my_sort_functor
{
  mt *d;
  it cols;
  my_sort_functor(mt *_d, it _cols) : d(_d), cols(_cols) {};
  __host__ __device__
  bool operator()(const it &t1, const it &t2){
    it row1 = t1>>16;
    it row2 = t2>>16;
    if (row1 < row2) return true;
    if (row1 > row2) return false;
    it col1 = t1&65535;
    it col2 = t2&65535;
    it i1 = row1*cols+col1;
    it i2 = row2*cols+col2;
    if (d[i1] < d[i2]) return true;
    return false;
  }
};

struct my_transform_functor
{
  it cols;
  my_transform_functor(it _cols) : cols(_cols) {};
  __host__ __device__
  it operator()(const it &t1){
    it row = t1/cols;
    it col = t1 - row*cols;
    return (row << 16) + col;
  }
};

int main(){
  mt A[] = { 3.4257, -1.2345, 0.6232, -0.1354,
            -1.6639,  0.1557, -0.1763,  1.0257,
             0.6863,  0.0992,  1.4487,  0.0157};
  // assume rows and cols are each less than 65536
  const int rows = 3;
  const int cols = 4;
  thrust::device_vector<mt> d_A(A, A+rows*cols);
  thrust::device_vector<it> idx(d_A.size());
  thrust::sequence(idx.begin(), idx.end());
  thrust::transform(idx.begin(), idx.end(), idx.begin(), my_transform_functor(cols));
  thrust::sort(idx.begin(), idx.end(), my_sort_functor(thrust::raw_pointer_cast(d_A.data()), cols));
  thrust::host_vector<it> h_idx = idx;
  for (int i = 0; i < rows*cols; i++) std::cout << (h_idx[i]&65535) << ",";
  std::cout << std::endl;
}
$ nvcc -o t114 t114.cu
$ ./t114
1,3,2,0,0,2,1,3,3,1,0,2,
$
Only moving one quantity, and no division on the fly.
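For completeness, and as my own addition rather than part of the answer above: another common Thrust idiom for per-row argsort is two back-to-back sorts — sort everything by value, then stable-sort by row index, which regroups the rows while preserving each row's value ordering. A sketch under the same assumptions as the code above (float data, small int indices):
#include <thrust/sort.h>
#include <thrust/sequence.h>
#include <thrust/transform.h>
#include <thrust/functional.h>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/copy.h>
#include <iostream>
#include <iterator>
using namespace thrust::placeholders;
typedef float mt;
typedef int it;

int main(){
  mt A[] = { 3.4257, -1.2345, 0.6232, -0.1354,
            -1.6639,  0.1557, -0.1763,  1.0257,
             0.6863,  0.0992,  1.4487,  0.0157};
  const int rows = 3;
  const int cols = 4;
  thrust::device_vector<mt> vals(A, A+rows*cols);     // working copy of the data
  thrust::device_vector<it> row(vals.size()), col(vals.size());
  thrust::sequence(row.begin(), row.end());
  thrust::sequence(col.begin(), col.end());
  thrust::transform(row.begin(), row.end(), row.begin(), _1/cols);
  thrust::transform(col.begin(), col.end(), col.begin(), _1%cols);
  // 1) global sort by value, dragging (row, col) along as the payload
  auto rc = thrust::make_zip_iterator(thrust::make_tuple(row.begin(), col.begin()));
  thrust::sort_by_key(vals.begin(), vals.end(), rc);
  // 2) stable sort by row: regroups the rows while keeping each row's value order
  thrust::stable_sort_by_key(row.begin(), row.end(), col.begin());
  thrust::host_vector<it> h_col = col;
  thrust::copy_n(h_col.begin(), rows*cols, std::ostream_iterator<it>(std::cout, ","));
  std::cout << std::endl;
}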

Why does the left shift on an unsigned int happen from the 16th bit?

I am trying to pack the values from the vector into the int.
Given the vector '1','0','1','1','1','0','1','1','1','0','1','1','1','0','1','1':
Expected output (binary representation for the variable out):
00000000000000001011101110111011.
However, I am getting the following output:
10111011101110110000000000000000
Notice: the insertion appears to begin at the 16th bit from the right end instead of at the leftmost bit.
#include <vector>
#include <iostream>

int main() {
    std::vector<unsigned char> test = {'1','0','1','1','1','0','1','1','1','0','1','1','1','0','1','1'};
    std::vector<unsigned int> out(1);
    int j = 0;
    for (int i = 0; i < test.size(); i++) {
        out[j] = out[j] << 1;
        if (test[i] == '1') { out[j] |= 0x1; }
    }
    j++;
    for (int p = 0; p < j; p++) {
        for (int k = 0; k < 32; k++) {
            std::cout << !!((out[p] << k) & 0x8000);
        }
        std::cout << std::endl;
    }
    std::cout << "Size Of:" << sizeof(int);
    return 0;
}
The reason this happens is that you are using the wrong constant for the mask: 0x8000 has its 16th bit set, while you probably meant 0x80000000, which has the 32nd bit set. To avoid mistakes like that it's best to construct masks with shifts, for example
(1u << 31)
(note the u: shifting a 1 into the sign bit of a signed int is undefined behaviour, so do the shift on an unsigned value). This expression is evaluated at compile time, so the result is the same as if you computed the constant yourself.
Note that whether these masks actually pick out the topmost bit is system-dependent: 0x80000000 and 1u << 31 both assume a 32-bit int, which is not guaranteed.
A better approach would be shifting the number right instead of left, and masking with 1.
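A minimal sketch of that suggestion (my code, not the answerer's; it assumes a 32-bit unsigned int, as the question does): walk from the most significant bit down and test each bit with a right shift and a mask of 1.
#include <iostream>

// Print the 32 bits of v, most significant bit first.
void print_bits(unsigned int v) {
    for (int k = 31; k >= 0; --k)
        std::cout << ((v >> k) & 1u);
    std::cout << '\n';
}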
The block of code creating out[j] works just fine.
Your problem is in the output block, due to the use of 0x8000: whenever k >= 16, the low 16 bits of out[p] << k are zero, so (out[p] << k) & 0x8000 is always zero.
Your code seems overly complicated to me. Here are my versions of a C program that turns a string of 1s and 0s into an int, and one that goes from an int back to a string.
#include <stdlib.h>
#include <stdio.h>

int main(int argc, char **argv)
{
    char str[] = "1010101010101010";
    int x;
    int out = 0;               /* must be initialized before OR-ing bits into it */

    for (x = 0; x < 16; x++) {
        if (str[x] == '1') {
            out |= (1 << x);
        }
    }
    printf("%d", out);
    return 0;
}
and
#include <stdlib.h>
#include <stdio.h>

int main(int argc, char **argv)
{
    int x;
    int in = 21845;
    char out[17] = {0};

    for (x = 0; x < 16; x++) {
        if (in & (1 << x)) {
            out[x] = '1';
        } else {
            out[x] = '0';
        }
    }
    printf("%s", out);
    return 0;
}

Using C++ using: What am I doing wrong here?

The following code doesn't compile on the latest Microsoft Visual Studio. Could someone enlighten me on what I'm doing wrong here?
#include <iostream>
#include <iomanip>
#include <array>
template <typename T, std::size_t M, std::size_t N>
using Matrix = std::array<T, M * N>;
template <typename T, std::size_t M, std::size_t N>
std::ostream &operator<<(std::ostream &os, const Matrix<T, M, N> &matrix)
{
    for (auto i = 0; i < M; ++i)
    {
        for (auto j = 0; j < N; ++j)
        {
            os << std::setw(5) << matrix[i * N + j];
        }
        os << std::endl;
    }
    return os;
}

int main(int argc, const char * const argv[])
{
    Matrix<float, 2, 3> matrix{
        1.1f, 1.2f, 1.3f,
        2.1f, 2.2f, 2.3f
    };
    std::cout << matrix << std::endl;
    return 0;
}
Here is a snapshot of the compiler error:
1>main.cpp(30): error C2679: binary '<<': no operator found which takes a right-hand operand of type 'std::array<T,6>' (or there is no acceptable conversion)
1> with
1> [
1> T=float
1> ]
Edit:
The following code works though:
#include <iostream>
#include <iomanip>
#include <array>
template <typename T, std::size_t M, std::size_t N>
using Matrix = std::array<std::array<T, N>, M>;
template <typename T, std::size_t M, std::size_t N>
std::ostream &operator<<(std::ostream &os, const Matrix<T, M, N> &matrix)
{
    for (auto row : matrix)
    {
        for (auto element : row)
        {
            os << std::setw(5) << element;
        }
        os << std::endl;
    }
    return os;
}

int main(int argc, const char * const argv[])
{
    Matrix<float, 2, 3> matrix{
        1.1f, 1.2f, 1.3f,
        2.1f, 2.2f, 2.3f
    };
    std::cout << matrix << std::endl;
    return 0;
}
Bearing in mind @dyp's comment, what you have to do here is create a real type instead of an alias, so that M and N are independent template parameters that can be deduced. (With the alias, Matrix<T, M, N> is just std::array<T, M * N>, and M and N appear only inside the product M * N, which is a non-deduced context, so the operator<< overload is never found.)
So you just use aggregation, keeping the actual data as a field, like:
#include <array>
#include <cstddef>
#include <ostream>
#include <utility>

template <typename T, std::size_t M, std::size_t N>
class Matrix
{
private:
    // the actual storage: a flat array of M * N elements
    std::array<T, M * N> _data;

    // give the stream operator access to _data
    template <typename T1, std::size_t M1, std::size_t N1>
    friend std::ostream &operator<<(std::ostream &os, const Matrix<T1, M1, N1> &matrix);

public:
    // forward the constructor arguments straight into the underlying std::array
    template <typename... Args>
    Matrix(Args... args) :
        _data{{std::forward<Args>(args)...}}
    {}
};
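For reference, a possible completion of that sketch (my code, reusing the fragment's member names; the friend declaration above matches this operator's signature):
#include <iostream>
#include <iomanip>

template <typename T1, std::size_t M1, std::size_t N1>
std::ostream &operator<<(std::ostream &os, const Matrix<T1, M1, N1> &matrix)
{
    for (std::size_t i = 0; i < M1; ++i)
    {
        for (std::size_t j = 0; j < N1; ++j)
            os << std::setw(5) << matrix._data[i * N1 + j];
        os << std::endl;
    }
    return os;
}

int main()
{
    Matrix<float, 2, 3> matrix{
        1.1f, 1.2f, 1.3f,
        2.1f, 2.2f, 2.3f
    };
    std::cout << matrix << std::endl;
    return 0;
}
Since M and N are now independent parameters of a real class template, M1 and N1 are deduced directly from the argument type and the overload is found.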

Boost.Variant Vs Virtual Interface Performance

I'm trying to measure a performance difference between using Boost.Variant and using virtual interfaces. For example, suppose I want to increment different types of numbers uniformly, using Boost.Variant I would use a boost::variant over int and float and a static visitor which increments each one of them. Using class interfaces I would use a pure virtual class number and number_int and number_float classes which derive from it and implement an "increment" method.
From my testing, using interfaces is far faster than using Boost.Variant.
I ran the code at the bottom and received these results:
Virtual: 00:00:00.001028
Variant: 00:00:00.012081
Why do you suppose this difference exists? I thought Boost.Variant would be a lot faster.
Note: usually Boost.Variant uses heap allocation to guarantee that the variant is always non-empty. But I read in the Boost.Variant documentation that if boost::has_nothrow_copy is true then it doesn't use heap allocation, which should make things significantly faster. For int and float, boost::has_nothrow_copy is true.
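That trait claim is easy to check directly; a tiny sketch (my addition, not from the original post):
#include <boost/type_traits/has_nothrow_copy.hpp>

static_assert(boost::has_nothrow_copy<int>::value,   "int copies never throw");
static_assert(boost::has_nothrow_copy<float>::value, "float copies never throw");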
Here is my code for measuring the two approaches against each other.
#include <iostream>
#include <boost/variant/variant.hpp>
#include <boost/variant/static_visitor.hpp>
#include <boost/variant/apply_visitor.hpp>
#include <boost/date_time/posix_time/ptime.hpp>
#include <boost/date_time/posix_time/posix_time_types.hpp>
#include <boost/date_time/posix_time/posix_time_io.hpp>
#include <boost/format.hpp>
const int iterations_count = 100000;
// a visitor that increments a variant by N
template <int N>
struct add : boost::static_visitor<> {
    template <typename T>
    void operator() (T& t) const {
        t += N;
    }
};

// a number interface
struct number {
    virtual void increment() = 0;
};

// number interface implementation for all types
template <typename T>
struct number_ : number {
    number_(T t = 0) : t(t) {}
    virtual void increment() {
        t += 1;
    }
    T t;
};

void use_virtual() {
    number_<int> num_int;
    number* num = &num_int;
    for (int i = 0; i < iterations_count; i++) {
        num->increment();
    }
}

void use_variant() {
    typedef boost::variant<int, float, double> number;
    number num = 0;
    for (int i = 0; i < iterations_count; i++) {
        boost::apply_visitor(add<1>(), num);
    }
}

int main() {
    using namespace boost::posix_time;
    ptime start, end;
    time_duration d1, d2;

    // virtual
    start = microsec_clock::universal_time();
    use_virtual();
    end = microsec_clock::universal_time();
    // store result
    d1 = end - start;

    // variant
    start = microsec_clock::universal_time();
    use_variant();
    end = microsec_clock::universal_time();
    // store result
    d2 = end - start;

    // output
    std::cout <<
        boost::format(
            "Virtual: %1%\n"
            "Variant: %2%\n"
        ) % d1 % d2;
}
For those interested: after getting a bit frustrated, I passed the -O2 option to the compiler, and boost::variant was way faster than a virtual call.
Thanks
It is obvious that -O2 reduces the variant time, because that whole loop is optimized away. Change the implementation to return the accumulated result to the caller, so that the optimizer won't remove the loop, and you'll get the real difference:
Output:
Virtual: 00:00:00.000120 = 10000000
Variant: 00:00:00.013483 = 10000000
#include <iostream>
#include <boost/variant/variant.hpp>
#include <boost/variant/static_visitor.hpp>
#include <boost/variant/apply_visitor.hpp>
#include <boost/date_time/posix_time/ptime.hpp>
#include <boost/date_time/posix_time/posix_time_types.hpp>
#include <boost/date_time/posix_time/posix_time_io.hpp>
#include <boost/format.hpp>
const int iterations_count = 100000000;
// a visitor that increments a variant by N
template <int N>
struct add : boost::static_visitor<> {
    template <typename T>
    void operator() (T& t) const {
        t += N;
    }
};

// a visitor that extracts the variant's current value as T
template <typename T, typename V>
T get(const V& v) {
    struct getter : boost::static_visitor<T> {
        T operator() (T t) const { return t; }
    };
    return boost::apply_visitor(getter(), v);
}

// a number interface
struct number {
    virtual void increment() = 0;
};

// number interface implementation for all types
template <typename T>
struct number_ : number {
    number_(T t = 0) : t(t) {}
    virtual void increment() { t += 1; }
    T t;
};

int use_virtual() {
    number_<int> num_int;
    number* num = &num_int;
    for (int i = 0; i < iterations_count; i++) {
        num->increment();
    }
    return num_int.t;
}

int use_variant() {
    typedef boost::variant<int, float, double> number;
    number num = 0;
    for (int i = 0; i < iterations_count; i++) {
        boost::apply_visitor(add<1>(), num);
    }
    return get<int>(num);
}

int main() {
    using namespace boost::posix_time;
    ptime start, end;
    time_duration d1, d2;

    // virtual
    start = microsec_clock::universal_time();
    int i1 = use_virtual();
    end = microsec_clock::universal_time();
    // store result
    d1 = end - start;

    // variant
    start = microsec_clock::universal_time();
    int i2 = use_variant();
    end = microsec_clock::universal_time();
    // store result
    d2 = end - start;

    // output
    std::cout <<
        boost::format(
            "Virtual: %1% = %2%\n"
            "Variant: %3% = %4%\n"
        ) % d1 % i1 % d2 % i2;
}

How can I reduce page faults in this program?

I'm getting more than 1,000 page faults in this program.
Can I reduce them to some smaller value, or even to zero?
Are there any other changes that could speed up the execution?
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char* argv[])
{
    register unsigned int u, v, i;
    register unsigned int arr_size = 0;
    register unsigned int b_size = 0;
    register unsigned int c;
    register unsigned int *b;
    FILE *file;
    register unsigned int *arr;

    file = fopen(argv[1], "r");
    arr = (unsigned int *)malloc(4*10000000);
    while (!feof(file)) {
        ++arr_size;
        fscanf(file, "%u\n", &arr[arr_size-1]);
    }
    fclose(file);

    b = (unsigned int *)malloc(arr_size*4);
    if (arr_size != 0)
    {
        ++b_size;
        b[b_size-1] = 0;
        for (i = 1; i < arr_size; ++i)
        {
            if (arr[b[b_size-1]] < arr[i])
            {
                ++b_size;
                b[b_size-1] = i;
                continue;
            }
            for (u = 0, v = b_size-1; u < v;)
            {
                c = (u + v) / 2;
                if (arr[b[c]] < arr[i]) u = c+1; else v = c;
            }
            if (arr[i] < arr[b[u]])
            {
                b[u] = i;
            }
            if (i > arr_size) break;
        }
    }
    free(arr);
    free(b);
    printf("%u\n", b_size);
    return 0;
}
The line:
arr=(unsigned int *)malloc(4*10000000);
is not good programming style. Are you sure that your file is as big as 40 MB? Try not to allocate all of that memory in the first lines of your program.
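For what it's worth, a hedged sketch (my code, not the answerer's) of the same reading loop without the fixed 40 MB allocation: grow the buffer with realloc and check the return value of fscanf instead of relying on feof.
#include <stdio.h>
#include <stdlib.h>

/* Read unsigned ints from a file, growing the buffer as needed.
   Returns the array (caller frees it) and stores the element count in *n. */
static unsigned int *read_all(FILE *file, unsigned int *n)
{
    unsigned int cap = 1024, cnt = 0, v;
    unsigned int *arr = (unsigned int *)malloc(cap * sizeof *arr);
    if (!arr) return NULL;
    while (fscanf(file, "%u", &v) == 1) {
        if (cnt == cap) {                      /* out of room: double the capacity */
            unsigned int *tmp = (unsigned int *)realloc(arr, (cap *= 2) * sizeof *arr);
            if (!tmp) { free(arr); return NULL; }
            arr = tmp;
        }
        arr[cnt++] = v;
    }
    *n = cnt;
    return arr;
}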
