CUDA Thrust and sort_by_key - sorting

I’m looking for a sorting algorithm on CUDA that can sort an array A of elements (double) and returns an array of keys B for that array A.
I know the sort_by_key function in the Thrust library but I want my array of elements A to remain unchanged.
What can I do?
My code is:
// Fills P with the permutation that sorts V in ascending order, leaving V
// itself untouched.
//
//   V : input values (read only), N elements
//   P : output, N elements; on return P[k] is the index in V of the k-th
//       smallest value
//   N : number of elements; nothing is done when N <= 0
//
// The sort is performed on a scratch copy of V, so the caller's data keeps its
// original order. P is (re)initialised to 0..N-1 here, so the caller does not
// have to prepare it.
void sortCUDA(double V[], int P[], int N) {
    if (N <= 0)
        return;
    // Scratch copy that sort_by_key is allowed to reorder.
    double *Vcpy = (double*) malloc(N*sizeof(double));
    memcpy(Vcpy,V,N*sizeof(double));
    // Identity permutation; sort_by_key moves each index along with its value.
    for (int i = 0; i < N; i++)
        P[i] = i;
    thrust::sort_by_key(Vcpy, Vcpy + N, P);
    free(Vcpy);
}
I'm comparing the Thrust algorithm against others that I have on a sequential CPU:
N mergesort sortCUDA
113 0.000008 0.000010
226 0.000018 0.000016
452 0.000036 0.000020
905 0.000061 0.000034
1810 0.000135 0.000071
3621 0.000297 0.000156
7242 0.000917 0.000338
14484 0.001421 0.000853
28968 0.003069 0.001931
57937 0.006666 0.003939
115874 0.014435 0.008025
231749 0.031059 0.016718
463499 0.067407 0.039848
926999 0.148170 0.118003
1853998 0.329005 0.260837
3707996 0.731768 0.544357
7415992 1.638445 1.073755
14831984 3.668039 2.150179
115035495 39.276560 19.812200
230070990 87.750377 39.762915
460141980 200.940501 74.605219
Thrust performance is not bad, but I think that if I use OpenMP I can probably get a better CPU time quite easily.
I think this is because of the memcpy.
SOLUTION:
void thrustSort(double V[], int P[], int N)
{
thrust::device_vector<int> d_P(N);
thrust::device_vector<double> d_V(V, V + N);
thrust::sequence(d_P.begin(), d_P.end());
thrust::sort_by_key(d_V.begin(), d_V.end(), d_P.begin());
thrust::copy(d_P.begin(),d_P.end(),P);
}
where V is my array of double values to sort.

You can modify the comparison operator to sort keys instead of values. @Robert Crovella correctly pointed out that a raw device pointer cannot be assigned from the host. The modified algorithm is below:
// Index comparator: index i is ordered before index j when the value stored at
// rawA[i] is greater than the value stored at rawA[j].
struct cmp : public binary_function<int,int,bool>
{
    const double *rawA; // an array in global mem

    // Remembers the array that the indices refer to.
    cmp(const double *ptr) : rawA(ptr) { }

    __host__ __device__ bool operator()(const int i, const int j) const
    {
        return rawA[j] < rawA[i];
    }
};
void sortkeys(double *A, int n) {
// move data to the gpu
thrust::device_vector<double> devA(A, A + n);
double *rawA = thrust::raw_pointer_cast(devA.data());
thrust::device_vector<int> B(n);
// initialize keys
thrust::sequence(B.begin(), B.end());
thrust::sort(B.begin(), B.end(), cmp(rawA));
// B now contains the sorted keys
}
And here is an alternative with ArrayFire, though I am not sure which one is more efficient, since the ArrayFire solution uses two additional arrays:
// ArrayFire variant: sorts a copy of A and prints the input array followed by
// the index array produced by the sort.
void sortkeys(double *A, int n) {
    // Outputs of af::sort: the sorted values and the matching indices.
    af::array vals, indices;
    // Wrap the n host values in an af::array.
    af::array devA(n, A, af::afHost);
    af::sort(vals, indices, devA);
    std::cout << devA << "\n" << indices << "\n";
}

How large is this array? The most efficient way, in terms of speed, will likely be to just duplicate the original array before sorting, if the memory is available.

Building on the answer provided by @asm (I wasn't able to get it working), this code seemed to work for me, and it sorts only the keys. However, I believe it is limited to the case where the keys are the sequence 0, 1, 2, 3, 4, ... corresponding to the (double) values. Since this is an "index-value" sort, it could be extended to the case of an arbitrary sequence of keys, perhaps by doing an indexed copy. However, I'm not sure that generating the index sequence and then rearranging the original keys will be any faster than just copying the original value data to a new vector (for the case of arbitrary keys).
#include <iostream>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/sort.h>
using namespace std;
// Pointer to the value array, declared as a __device__ symbol; sortkeys()
// fills it in with cudaMemcpyToSymbol before sorting.
__device__ double *rawA; // an array in global mem

// Index comparator: index i comes before index j when rawA[i] < rawA[j].
struct cmp : public binary_function<int, int, bool>
{
    __host__ __device__ bool operator()(const int i, const int j) const
    {
        return rawA[j] > rawA[i];
    }
};
// Sorts the indices 0..n-1 by the values of A they refer to, then prints the
// sorted indices on one line and the values in that order on the next.
void sortkeys(double *A, int n) {
    // Upload the values into a device_vector.
    thrust::device_vector<double> devA(A, A + n);
    // Publish the vector's raw pointer through the __device__ symbol rawA,
    // which is what cmp reads.
    double *devicePtr = raw_pointer_cast(devA.data());
    cudaMemcpyToSymbol(rawA, &devicePtr, sizeof(double *));
    // Index vector 0, 1, ..., n-1, sorted with cmp.
    thrust::device_vector<int> B(n);
    thrust::sequence(B.begin(), B.end());
    thrust::sort(B.begin(), B.end(), cmp());
    // Bring the sorted indices back to the host for printing.
    thrust::host_vector<int> hostB = B;
    // First line: the indices.
    for (int k = 0; k < hostB.size(); k++) {
        std::cout << hostB[k] << " ";
    }
    std::cout << std::endl;
    // Second line: the original values visited in sorted-index order.
    for (int k = 0; k < hostB.size(); k++) {
        std::cout << A[hostB[k]] << " ";
    }
    std::cout << std::endl;
}
// Demo driver: sorts the indices of a small fixed array and prints the result.
int main(){
    double C[] = {0.7, 0.3, 0.4, 0.2, 0.6, 1.2, -0.5, 0.5, 0.0, 10.0};
    // Derive the length from the array itself; the previous hard-coded 9
    // left the last element (10.0) out of the sort.
    const int count = static_cast<int>(sizeof(C) / sizeof(C[0]));
    sortkeys(C, count);
    std::cout << std::endl;
    return 0;
}

Related

How to fix "segmentation fault (core dumped)" dependant on size

I created a class "config" that contains 12 bool values, organized in a std::array. The class has an "icing" function that returns a double value.
When I try to sort a vector of 2^12 (4096) configs with std::sort (from #include <algorithm>) using a predicate I have written, I get a segmentation fault.
Shrinking the vector to 205 elements (not one more) eliminates the error, but I don't know why.
If I make the vector 4096 elements long and sort only a small part of it, it works until that part is 175 or more elements long.
Shrinking the vector to, for example, around 1000 elements limits the partial sort to around 20 elements before it gives the segmentation fault.
#include <array>
#include <vector>
#include <algorithm>
#include <iostream>
using namespace std;
// A configuration of 12 boolean values plus a constant g. Configurations are
// ranked by icing(), which combines the number of neighbouring values that
// differ with the true/false imbalance returned by energy().
class config {
public:
    // Default configuration: all twelve values false, g = 1.
    config() : val{}, g(1) {}

    // Copies BOTH members. The earlier hand-written version copied only val
    // and left g uninitialised in every copy, so icing() returned garbage for
    // copies and std::sort saw an inconsistent ordering.
    config(const config& fro) = default;

    // Returns a copy of the twelve values.
    std::array<bool,12> get_val() const { return val; }

    // Sets value number n (0..11, not range-checked) to tf.
    void set_tf(int n, bool tf) { val[n] = tf; }

    // Sets the constant used by icing().
    void set_g(double d) { g = d; }

    // Prints "values: " followed by the twelve values as 0/1, then a newline.
    void print() const {
        std::cout << "values: ";
        for (bool b : val) { std::cout << b << " "; }
        std::cout << std::endl;
    }

    // Adds n (default 1) to the configuration read as a binary number whose
    // least significant digit is val[0]. Incrementing the all-true
    // configuration wraps around to all-false instead of writing past the end
    // of the array. Returns *this so calls can be chained.
    config& incr(int n = 1) {
        const int width = static_cast<int>(val.size());
        for (int step = 0; step < n; ++step) {
            int i = 0;
            // Carry: every leading true digit becomes false ...
            while (i < width && val[i]) {
                val[i] = false;
                ++i;
            }
            // ... and the first false digit (if there is one) becomes true.
            if (i < width) {
                val[i] = true;
            }
        }
        return *this;
    }

    // Absolute difference between the number of true and false values.
    double energy() const {
        int ct = 0;
        for (bool b : val) {
            if (b) { ++ct; }
        }
        const int cf = static_cast<int>(val.size()) - ct;
        return ct > cf ? ct - cf : cf - ct;
    }

    // Ordering value: -g * (number of adjacent pairs that differ) + energy().
    double icing() const {
        const int width = static_cast<int>(val.size());
        int n = 0;
        for (int i = 0; i + 1 < width; ++i) {
            if (val[i] != val[i + 1]) { ++n; }
        }
        return -g * n + energy();
    }

private:
    std::array<bool,12> val;
    double g;
};

// Ordering predicate for sorting: c1 goes before c2 when its icing() value is
// larger (descending order). Takes const references so no copies are made.
bool pred (const config& c1, const config& c2){ return c1.icing()>c2.icing(); }
// Sorts the whole vector in place using the ordering predicate pred.
template <typename T>
void csort (vector <T>& in){
    auto first = in.begin();
    auto last = in.end();
    sort(first, last, pred);
}
int main(){
vector<config> v;
for (int i=0; i<4096; i++){ //cicle that creates a vector of successive binaries
for(auto& c:v){
c.incr();
}
config t;
v.push_back(t);
}
sort(v.begin(), v.begin()+174, pred); //this gives seg.fault when 175+
csort(v); //this gives segmentation fault when the vec is 206 long or longer
}
I expected the code to sort the vector, but it ends in a segmentation fault.
Your program has undefined behaviour in the sort call: your predicate takes config by value, so copies are made, and the copy constructor that is called for them copies only the array val, not g.
bool pred (config c1, config c2){ return c1.icing()>c2.icing(); }
// takes by value, copy ctor is called
config (const config& fro): val(fro.val){}; // only val is copied, g HAS GARBAGE VALUE
// icing in pred uses g !! - stric weak ordering is violated because g has GARBAGE VALUE
Fix 1:
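pass config by const config& (icing() and energy() must then also be declared const, otherwise they cannot be called on a const reference):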
bool pred (const config& c1, const config& c2){ return c1.icing()>c2.icing(); }
or fix 2:
g is initialized in copy constructor:
config (const config& fro): val(fro.val), g(fro.g){};

Hashing using int array or unordered_map in STL?

Which is more efficient in terms of memory and time complexity hashing using int array or unordered_map in STL?
By hashing I mean storing elements formed by the combination of a key value and a mapped value, and fast retrieval of individual elements based on their keys.
Actually I was trying to solve this question.
Here's my solution:-
#include <bits/stdc++.h>
#define MAX 15000005
using namespace std;
/*
* author: vivekcrux
*/
// Greatest common divisor by the Euclidean algorithm.
// Returns a unchanged when b is 0.
int gcd(int a, int b)
{
    while (b != 0)
    {
        const int remainder = a % b;
        a = b;
        b = remainder;
    }
    return a;
}
int c[MAX];
int n;
int sieve()
{
bitset<MAX> m;
m.set();
int ans = 0;
for(int i=2;i<MAX;i++)
{
if(m[i])
{
int mans = 0;
for(int j=i;j<MAX;j+=i)
{
m[j]=0;
mans += c[j];
}
if(mans<n)
ans = max(ans,mans);
}
}
return ans;
}
// Reads n and n integers, divides them all by their common gcd, counts the
// resulting values in c[], and prints n minus the result of sieve(), or -1
// when sieve() returns 0.
int main()
{
    ios_base::sync_with_stdio(false);
    cin.tie(NULL);
    cout.tie(NULL);
    cin>>n;
    // std::vector instead of a variable-length array: VLAs are a compiler
    // extension in C++ and put the whole input on the stack.
    vector<int> a(n+1);
    for(int i=0;i<n;i++)
    {
        cin>>a[i];
    }
    // gcd of the whole input.
    int g = a[0];
    for(int i=1;i<n;i++)
    {
        g = gcd(g,a[i]);
    }
    // Normalise by the gcd and count every value other than 1.
    // NOTE(review): assumes every normalised value is below MAX - confirm
    // against the input limits.
    for(int i=0;i<n;i++)
    {
        a[i] /= g;
        if(a[i]!=1) c[a[i]]++;
    }
    int m = sieve();
    if(m==0)
        cout<<"-1";
    else
        cout<<n - m<<endl;
    return 0;
}
In this code if I use
unordered_map<int,int> c;
instead of
int c[MAX];
I get a "Memory limit exceeded" verdict. I have found here that unordered_map has constant time complexity on average, but no details about its space complexity are mentioned there. I wonder why I am getting MLE with unordered_map.
unordered_map uses buckets to store values. A bucket is a slot in the container's internal hash table to which elements are assigned based on the hash value of their key. Let's look at the following code in C++17.
#include <bits/stdc++.h>
using namespace std;
// Inserts three key/value pairs into an unordered_map and prints how many
// buckets the map is using afterwards.
int main() {
    unordered_map<int,int> mp;
    // {key, value} pairs, inserted in this order.
    const int entries[][2] = {{4, 1}, {41, 5}, {67, 6}};
    for (const auto& entry : entries) {
        mp[entry[0]] = entry[1];
    }
    const auto buckets = mp.bucket_count();
    cout<<buckets;
}
The output comes out to be 7 (it depends on the compiler). This is the number of buckets used in the above code. An array indexed directly by these keys would need at least 68 elements (indices 0 to 67), so it would obviously take more memory. Conversely, if the keys had been 1, 2 and 3 instead of 4, 41 and 67, the output would still have been 7, and in that case an array would have been the way to go for saving space. So it depends on the keys you are storing in the hash table. In terms of average time complexity both perform equally well, although heavy collisions in unordered_map can blow up the overall time complexity of the code. Here is the Codeforces link to the blog.

Cuda matrix addition

I have written the following code to sum two 4x4 matrices in cuda.
#include<stdio.h>
#include<stdlib.h>
#include<math.h>
// Element-wise sum c = a + b of two n x n matrices stored as flat arrays with
// n elements per row. Each thread handles one element: the x index selects the
// row and the y index selects the column.
__global__ void Matrix_add(double* a, double* b, double* c,int n)
{
    const int row = blockIdx.x * blockDim.x + threadIdx.x;
    const int col = blockIdx.y * blockDim.y + threadIdx.y;
    // Threads that fall outside the matrix have nothing to do.
    if (row >= n || col >= n)
        return;
    const int index = row * n + col;
    c[index] = a[index] + b[index];
}
int main()
{
int n=4;
double **h_a;
double **h_b;
double **h_c;
double *d_a, *d_b, *d_c;
int size = n*n*sizeof(double);
h_a = (double **) malloc(n*sizeof(double*));
h_b = (double **) malloc(n*sizeof(double*));
h_c = (double **) malloc(n*sizeof(double*));
cudaMalloc((void**)&d_a,size);
cudaMalloc((void**)&d_b,size);
cudaMalloc((void**)&d_c,size);
int t=0;
for (t=0;t<n;t++)
{
h_a[t]= (double *)malloc(n*sizeof(double));
h_b[t]= (double *)malloc(n*sizeof(double));
h_c[t]= (double *)malloc(n*sizeof(double));
}
int i=0,j=0;
for(i=0;i<n;i++)
{
for(j=0;j<n;j++)
{
h_a[i][j]=sin(i)*sin(i);
h_b[i][j]=cos(i)*cos(i);
}
}
cudaMemcpy(d_a,h_a+n,size,cudaMemcpyHostToDevice);
cudaMemcpy(d_b,h_b+n,size,cudaMemcpyHostToDevice);
dim3 dimBlock(4,4);
dim3 dimGrid(1,1);
Matrix_add<<<dimGrid, dimBlock>>>(d_a,d_b,d_c,n);
cudaMemcpy(h_c+n,d_c,size,cudaMemcpyDeviceToHost);
for(i=0;i<n;i++)
{
for( j=0;j<n;j++)
{
printf("%f",h_c[i][j]);
printf("\t");
}
printf("\n");
}
for(i=0;i<n;i++)
{
free(h_a[i]);
free(h_b[i]);
free(h_c[i]);
}
free(h_a);
free(h_b);
free(h_c);
cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
return 0;
}
The result of this addition should be a 4x4 all-ones matrix (since sin²(i) + cos²(i) = 1), but in the output all the elements of the matrix are 0. I also get this message after the result is printed:
Segmentation fault (core dumped)
Can anyone please help me to find out the problem.
Thank you
Your host arrays (h_a, h_b, h_c) are not contiguous in memory, so your initial cudaMemcpy() calls will copy garbage into GPU memory (apparently zeros in your case).
The reason is that your host arrays are not actually flat; they are represented as arrays of pointers, presumably to emulate two-dimensional arrays in C. In any case, you either need to be more careful with your cudaMemcpy() calls and copy the host arrays row by row, or use a flat representation on the host.

Conditional reduction in CUDA

I need to sum about 100000 values stored in an array, but with conditions.
Is there a way to do that in CUDA to produce fast results?
Can anyone post a small code to do that?
I think that, to perform conditional reduction, you can directly introduce the condition as a multiplication by 0 (false) or 1 (true) to the addends. In other words, suppose that the condition you would like to meet is that the addends be smaller than 10.f. In this case, borrowing the first code at Optimizing Parallel Reduction in CUDA by M. Harris, then the above would mean
// Per-block conditional sum: each block adds up those of its input elements
// that are smaller than 10 and writes the block total to
// g_odata[blockIdx.x].
//
//   g_idata : input values, one per thread
//   g_odata : output, one partial sum per block
//
// The kernel takes no length argument and does no bounds check on g_idata, and
// the reduction loop reads sdata[tid + s] unchecked.
// NOTE(review): this looks like it needs the launch to cover the input exactly
// and blockDim.x to be a power of two - confirm at the call site.
// The dynamic shared memory must hold blockDim.x ints.
__global__ void reduce0(int *g_idata, int *g_odata) {
    extern __shared__ int sdata[];
    // each thread loads one element from global to shared mem
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
    // The condition is folded in as a factor: 1 keeps the element, 0 drops it.
    const int value = g_idata[i];
    sdata[tid] = value*(value<10.f);
    __syncthreads();
    // do reduction in shared mem
    for(unsigned int s=1; s < blockDim.x; s *= 2) {
        if (tid % (2*s) == 0) {
            sdata[tid] += sdata[tid + s];
        }
        __syncthreads();
    }
    // write result for this block to global mem
    if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}
If you wish to use CUDA Thrust to perform the conditional reduction, you can do the same with thrust::transform_reduce. Alternatively, you can create a new vector d_b, copy into it all the elements of d_a that satisfy the predicate using thrust::copy_if, and then apply thrust::reduce to d_b. I haven't checked which solution performs best; perhaps the second one will perform better on sparse arrays. Below is an example implementing both approaches.
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/reduce.h>
#include <thrust/count.h>
#include <thrust/copy.h>
// --- Operator for the first approach
// Transform for the first approach: multiplies a by 1 when a < 10 and by 0
// otherwise, so elements failing the condition add nothing to a sum.
struct conditional_operator {
    __host__ __device__ float operator()(const float a) const {
        const bool keep = (a<10.f);
        return a*keep;
    }
};
// --- Operator for the second approach
// Predicate for the second approach: true exactly when a is smaller than 10.
struct is_smaller_than_10 {
    __host__ __device__ bool operator()(const float a) const {
        return 10.f > a;
    }
};
// Sums the elements smaller than 10 of a 20-element vector in two ways and
// prints both results. With the data below (two elements of 20, eighteen of 1)
// both sums are 18.
int main(void)
{
    const int N = 20;
    // --- Host side allocation and vector initialization
    thrust::host_vector<float> h_a(N,1.f);
    h_a[0] = 20.f;
    h_a[1] = 20.f;
    // --- Device side allocation and vector initialization
    thrust::device_vector<float> d_a(h_a);
    // --- First approach: zero out the rejected elements while reducing
    float sum = thrust::transform_reduce(d_a.begin(), d_a.end(), conditional_operator(), 0.f, thrust::plus<float>());
    printf("Result = %f\n",sum);
    // --- Second approach: copy the accepted elements to d_b, then reduce d_b
    int N_prime = thrust::count_if(d_a.begin(), d_a.end(), is_smaller_than_10());
    thrust::device_vector<float> d_b(N_prime);
    thrust::copy_if(d_a.begin(), d_a.begin() + N, d_b.begin(), is_smaller_than_10());
    sum = thrust::reduce(d_b.begin(), d_b.begin() + N_prime, 0.f);
    printf("Result = %f\n",sum);
    // Wait for a key press before exiting.
    getchar();
    return 0;
}

C++11: How to Get A Multidimensional Array Through vector and to Assign it to auto?

I am a lazy programmer. I want to use C++ vector to create a multidimensional array. For example, this code creates a 3x2 2D array:
int nR = 3;
int nC = 2;
vector<vector<double> > array2D(nR);
for(int c = 0; c < nC; c++)
array2D.resize(nC, 0);
However, I am too lazy to
declare array2D's data type: vector<vector<double> >
C++ auto could solve this problem.
However, I am too lazy to
write loop(s) to allocate the space(s) for each object like array2D.
Writing a function could solve this problem.
However, I am too lazy to
write each function for each N-dimensional array.
write nested N-1 loops for allocating spaces.
write each function for each data type.
The C++11 variadic template with function recursion could solve this problem.
Is it possible ...?
This is what you want. (Tested on Microsoft Visual C++ 2013 Update 1)
#include <iostream>
#include <vector>
using namespace std;
// Maps an element type plus one type per dimension to the matching nested
// vector type: no dimension types gives elemType itself, and each dimension
// type adds one std::vector layer.
template<class elemType, class... Dims> struct NDVectorType {
    typedef elemType type;
};
template<class elemType, class Dim, class... Dims>
struct NDVectorType<elemType, Dim, Dims...> {
    typedef std::vector<typename NDVectorType<elemType, Dims...>::type> type;
};

// One dimension: a vector of dim value-initialised elements (0 for arithmetic
// element types).
template<class elemType> inline std::vector<elemType> getArrayND(int dim) {
    // Allocate space and initialize all elements.
    return std::vector<elemType>(dim, elemType());
}

// Two or more dimensions: a vector of dim copies of the array built from the
// remaining dimensions.
//
// The return type is computed by NDVectorType rather than by
// decltype(getArrayND<elemType>(resDims...)): inside its own declaration this
// overload is not visible yet, so that decltype only works for two dimensions
// on conforming compilers.
template<class elemType, class... Dims>
inline typename NDVectorType<elemType, int, Dims...>::type getArrayND(
    int dim, Dims... resDims
) {
    typedef typename NDVectorType<elemType, int, Dims...>::type Nested;
    // Build the sub-array once and let the vector replicate it dim times.
    return Nested(dim, getArrayND<elemType>(resDims...));
}
// Builds a 3-D and a 4-D array, writes one element of each and prints those
// two elements back, one per line.
int main() {
    auto test3D = getArrayND<double>(2, 3, 4);
    double& cell3D = test3D[0][0][1];
    cell3D = 3;

    auto test4D = getArrayND<double>(2, 3, 4, 2);
    double& cell4D = test4D[1][2][3][1];
    cell4D = 5;

    cout << cell3D << endl;
    cout << cell4D << endl;
    return 0;
}

Resources