Unable to compile thrust code using reduce_by_key - matrix

I need to find the minimum values along the columns of a matrix, together with their row indices, using Thrust. I am using the following code (copied from Orange Owl Solutions), but I get errors while compiling. I have posted it as an issue on the corresponding Git page. The error message is huge and I don't know how to debug it. Can anyone help me with it? I am using CUDA 8.0 with Thrust 1.8.
The code:
#include <iterator>
#include <algorithm>
#include <thrust/device_vector.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/iterator/permutation_iterator.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/iterator/discard_iterator.h>
#include <thrust/reduce.h>
#include <thrust/functional.h>
#include <thrust/random.h>
using namespace thrust::placeholders;
int main()
{
const int Nrows = 6;
const int Ncols = 8;
/**************************/
/* SETTING UP THE PROBLEM */
/**************************/
// --- Random uniform integer distribution between 0 and 20
thrust::default_random_engine rng;
thrust::uniform_int_distribution<int> dist(0, 20);
// --- Matrix allocation and initialization
thrust::device_vector<double> d_matrix(Nrows * Ncols);
for (size_t i = 0; i < d_matrix.size(); i++) d_matrix[i] = (double)dist(rng);
printf("\n\nMatrix\n");
for(int i = 0; i < Nrows; i++) {
std::cout << " [ ";
for(int j = 0; j < Ncols; j++)
std::cout << d_matrix[i * Ncols + j] << " ";
std::cout << "]\n";
}
/**********************************************/
/* FIND ROW MINIMA ALONG WITH THEIR LOCATIONS */
/**********************************************/
thrust::device_vector<float> d_minima(Ncols);
thrust::device_vector<int> d_indices(Ncols);
thrust::reduce_by_key(
thrust::make_transform_iterator(thrust::make_counting_iterator((int) 0), _1 / Nrows),
thrust::make_transform_iterator(thrust::make_counting_iterator((int) 0), _1 / Nrows) + Nrows * Ncols,
thrust::make_zip_iterator(
thrust::make_tuple(thrust::make_permutation_iterator(
d_matrix.begin(),
thrust::make_transform_iterator(thrust::make_counting_iterator((int) 0), (_1 % Nrows) * Ncols + _1 / Nrows)),
thrust::make_transform_iterator(thrust::make_counting_iterator((int) 0), _1 % Nrows))),
thrust::make_discard_iterator(),
thrust::make_zip_iterator(thrust::make_tuple(d_minima.begin(), d_indices.begin())),
thrust::equal_to<int>(),
thrust::minimum<thrust::tuple<float, int> >()
);
printf("\n\n");
for (int i=0; i<Nrows; i++) std::cout << "Min position = " << d_indices[i] << "; Min value = " << d_minima[i] << "\n";
return 0;
}
Error :
/usr/local/cuda/bin/../targets/x86_64-linux/include/thrust/system/cuda/detail/bulk/algorithm/reduce_by_key.hpp(58): error: ambiguous "?" operation: second operand of type "const thrust::tuple<double, int, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type>" can be converted to third operand type "thrust::tuple<float, int, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type>", and vice versa
detected during:
instantiation of "thrust::system::cuda::detail::bulk_::detail::reduce_by_key_detail::scan_head_flags_functor<FlagType, ValueType, BinaryFunction>::result_type thrust::system::cuda::detail::bulk_::detail::reduce_by_key_detail::scan_head_flags_functor<FlagType, ValueType, BinaryFunction>::operator()(const thrust::system::cuda::detail::bulk_::detail::reduce_by_key_detail::scan_head_flags_functor<FlagType, ValueType, BinaryFunction>::first_argument_type &, const thrust::system::cuda::detail::bulk_::detail::reduce_by_key_detail::scan_head_flags_functor<FlagType, ValueType, BinaryFunction>::second_argument_type &) [with FlagType=int, ValueType=thrust::tuple<double, int, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type>, BinaryFunction=thrust::minimum<thrust::tuple<float, int, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type>>]"

I guess you are using this code.
A curious characteristic of that code is that the matrix is defined using double type, but the captured minima are stored in a float vector.
If you want to use that code as-is, according to my testing, thrust (in CUDA 10, and apparently also CUDA 8) doesn't like this line:
thrust::minimum<thrust::tuple<float, int> >()
That operator is used to compare two items and decide which is smaller. However, the compiler has decided that finding the minimum of two of those tuples is an "ambiguous" request. Part of the reason is that the operator is instantiated for a (float, int) tuple, but during the reduction it is handed, variously, a (double, int) tuple or a (float, int) tuple, and each of those tuple types converts to the other.
We can fix/work around this by passing our own functor to do the job, one that is explicit about how it handles whatever tuples it is given:
$ cat t373.cu
#include <iterator>
#include <algorithm>
#include <thrust/device_vector.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/iterator/permutation_iterator.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/iterator/discard_iterator.h>
#include <thrust/reduce.h>
#include <thrust/functional.h>
#include <thrust/random.h>
using namespace thrust::placeholders;
struct my_min
{
template <typename T1, typename T2>
__host__ __device__
T2 operator()(T1 t1, T2 t2){
if (thrust::get<0>(t1) < thrust::get<0>(t2)) return t1;
return t2;
}
};
int main()
{
const int Nrows = 6;
const int Ncols = 8;
/**************************/
/* SETTING UP THE PROBLEM */
/**************************/
// --- Random uniform integer distribution between 0 and 20
thrust::default_random_engine rng;
thrust::uniform_int_distribution<int> dist(0, 20);
// --- Matrix allocation and initialization
thrust::device_vector<double> d_matrix(Nrows * Ncols);
for (size_t i = 0; i < d_matrix.size(); i++) d_matrix[i] = (double)dist(rng);
printf("\n\nMatrix\n");
for(int i = 0; i < Nrows; i++) {
std::cout << " [ ";
for(int j = 0; j < Ncols; j++)
std::cout << d_matrix[i * Ncols + j] << " ";
std::cout << "]\n";
}
/**********************************************/
/* FIND ROW MINIMA ALONG WITH THEIR LOCATIONS */
/**********************************************/
thrust::device_vector<float> d_minima(Ncols);
thrust::device_vector<int> d_indices(Ncols);
thrust::reduce_by_key(
thrust::make_transform_iterator(thrust::make_counting_iterator((int) 0), (_1 / Nrows)),
thrust::make_transform_iterator(thrust::make_counting_iterator((int) 0), (_1 / Nrows)) + Nrows * Ncols,
thrust::make_zip_iterator(
thrust::make_tuple(thrust::make_permutation_iterator(
d_matrix.begin(),
thrust::make_transform_iterator(thrust::make_counting_iterator((int) 0), ((_1 % Nrows) * Ncols + _1 / Nrows))),
thrust::make_transform_iterator(thrust::make_counting_iterator((int) 0), (_1 % Nrows)))),
thrust::make_discard_iterator(),
thrust::make_zip_iterator(thrust::make_tuple(d_minima.begin(), d_indices.begin())),
thrust::equal_to<int>(),
my_min()
// thrust::minimum<thrust::tuple<float, int> >()
);
printf("\n\n");
for (int i=0; i<Nrows; i++) std::cout << "Min position = " << d_indices[i] << "; Min value = " << d_minima[i] << "\n";
return 0;
}
$ nvcc -o t373 t373.cu
$ ./t373
Matrix
[ 0 1 12 18 20 3 10 8 ]
[ 5 15 1 11 12 17 12 10 ]
[ 18 20 15 20 6 8 18 13 ]
[ 18 20 3 18 19 6 19 8 ]
[ 6 10 8 16 14 11 12 1 ]
[ 12 9 12 17 10 16 1 4 ]
Min position = 0; Min value = 0
Min position = 0; Min value = 1
Min position = 1; Min value = 1
Min position = 1; Min value = 11
Min position = 2; Min value = 6
Min position = 0; Min value = 3
$
I think a better fix is to just choose one or the other, either float or double. If we modify all float types to double, for example, then thrust is happy, without any other changes:
$ cat t373a.cu
#include <iterator>
#include <algorithm>
#include <thrust/device_vector.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/iterator/permutation_iterator.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/iterator/discard_iterator.h>
#include <thrust/reduce.h>
#include <thrust/functional.h>
#include <thrust/random.h>
using namespace thrust::placeholders;
int main()
{
const int Nrows = 6;
const int Ncols = 8;
/**************************/
/* SETTING UP THE PROBLEM */
/**************************/
// --- Random uniform integer distribution between 0 and 20
thrust::default_random_engine rng;
thrust::uniform_int_distribution<int> dist(0, 20);
// --- Matrix allocation and initialization
thrust::device_vector<double> d_matrix(Nrows * Ncols);
for (size_t i = 0; i < d_matrix.size(); i++) d_matrix[i] = (double)dist(rng);
printf("\n\nMatrix\n");
for(int i = 0; i < Nrows; i++) {
std::cout << " [ ";
for(int j = 0; j < Ncols; j++)
std::cout << d_matrix[i * Ncols + j] << " ";
std::cout << "]\n";
}
/**********************************************/
/* FIND ROW MINIMA ALONG WITH THEIR LOCATIONS */
/**********************************************/
thrust::device_vector<double> d_minima(Ncols);
thrust::device_vector<int> d_indices(Ncols);
thrust::reduce_by_key(
thrust::make_transform_iterator(thrust::make_counting_iterator((int) 0), (_1 / Nrows)),
thrust::make_transform_iterator(thrust::make_counting_iterator((int) 0), (_1 / Nrows)) + Nrows * Ncols,
thrust::make_zip_iterator(
thrust::make_tuple(thrust::make_permutation_iterator(
d_matrix.begin(),
thrust::make_transform_iterator(thrust::make_counting_iterator((int) 0), ((_1 % Nrows) * Ncols + _1 / Nrows))),
thrust::make_transform_iterator(thrust::make_counting_iterator((int) 0), (_1 % Nrows)))),
thrust::make_discard_iterator(),
thrust::make_zip_iterator(thrust::make_tuple(d_minima.begin(), d_indices.begin())),
thrust::equal_to<int>(),
thrust::minimum<thrust::tuple<double, int> >()
);
printf("\n\n");
for (int i=0; i<Nrows; i++) std::cout << "Min position = " << d_indices[i] << "; Min value = " << d_minima[i] << "\n";
return 0;
}
$ nvcc -o t373a t373a.cu
$ ./t373a
Matrix
[ 0 1 12 18 20 3 10 8 ]
[ 5 15 1 11 12 17 12 10 ]
[ 18 20 15 20 6 8 18 13 ]
[ 18 20 3 18 19 6 19 8 ]
[ 6 10 8 16 14 11 12 1 ]
[ 12 9 12 17 10 16 1 4 ]
Min position = 0; Min value = 0
Min position = 0; Min value = 1
Min position = 1; Min value = 1
Min position = 1; Min value = 11
Min position = 2; Min value = 6
Min position = 0; Min value = 3
$
I think this latter solution of using consistent types is the more sensible solution.

Related

Where did torch::jit::load go?

Namespace "torch::jit" is missing member "load"
The official reference says it is there, but I can't use it.
It's not just an IntelliSense problem; when I build it, it throws an error saying that there is no "jit::load".
Why?
source code
#include <torch/torch.h>
#include <iostream>
#include <Windows.h>
#include <gdiplus.h>
#include <gdipluspixelformats.h> // PixelFormat24bppRGB
#include <vector>
#include <cstdlib>
#pragma comment(lib, "gdiplus.lib")
int main()
{
Gdiplus::GdiplusStartupInput input;
ULONG_PTR token;
Gdiplus::GdiplusStartup(&token, &input, NULL);
std::vector<uint8_t> pixels;
Gdiplus::BitmapData bmpData;
LARGE_INTEGER freq, start, end;
QueryPerformanceFrequency(&freq);
std::wstring path = L"C:\\Users\\baiji\\Documents\\triggerBot\\data2\\0.jpg";
std::wstring path2 = L"D:\\screenshot\\result\\output.bmp";
auto image = Gdiplus::Bitmap::FromFile(path2.c_str());
QueryPerformanceCounter(&start);
int bWidth = image->GetWidth();
int bHeight = image->GetHeight();
std::cout << bWidth << std::endl;
std::cout << bHeight << std::endl;
auto stride = 3 * bWidth;
pixels.resize(stride * bHeight);
Gdiplus::Rect rect(0, 0, bWidth, bHeight);
image->LockBits(&rect, Gdiplus::ImageLockModeRead, PixelFormat24bppRGB, &bmpData);
for (int y = 0; y < bHeight; ++y) {
memcpy(pixels.data() + y * stride, (byte*)bmpData.Scan0 + y * bmpData.Stride, stride);
}
image->UnlockBits(&bmpData);
Gdiplus::GdiplusShutdown(token);
uint8_t buf1, buf2;
for (int i = 2;i < pixels.size(); i += 3) {
buf1 = pixels[i - 2];
buf2 = pixels[i];
pixels[i-2] = buf2;
pixels[i] = buf1;
}
std::cout << "要素数: " << pixels.size() << "\n";
torch::Tensor tsr = torch::tensor(torch::ArrayRef<uint8_t>(pixels)).to(torch::kFloat64) / 256;
torch::Tensor input = torch::reshape(tsr, { bWidth,bHeight,3 });
torch::jit::script::Module module;
module = torch::jit::load("model to path/traced_model.pt");
QueryPerformanceCounter(&end);
double time = static_cast<double>(end.QuadPart - start.QuadPart) * 1000.0 / freq.QuadPart;
std::cout << time << "ms\n";
//system("PAUSE");
return 1;
}
How can I run torch::jit::load?
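One likely cause, offered here as a guess since it is not confirmed in the thread: <torch/torch.h> alone does not declare the TorchScript loading API; torch::jit::load comes from <torch/script.h>. A minimal sketch (the model path is a placeholder):
#include <torch/script.h> // declares torch::jit::load and torch::jit::script::Module
#include <iostream>
int main()
{
    try {
        // "traced_model.pt" is a placeholder path to a TorchScript module
        torch::jit::script::Module module = torch::jit::load("traced_model.pt");
        std::cout << "model loaded\n";
    }
    catch (const c10::Error& e) {
        std::cerr << "error loading the model: " << e.what() << "\n";
        return 1;
    }
    return 0;
}
If the error only appears at link time, the project is probably not linking against the LibTorch libraries.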

Using the lightest carriage to move n objects in specific days

I have to solve this problem but I can't think of any algorithm for it. Any help is appreciated :)
Problem: we want to move n objects with different weights to another location using a carriage, and we have a limited number of days to do so. We can use the carriage only once a day, and because it costs a lot we want a carriage that can move all of our items within the given days while paying the minimum amount for it; the objective is therefore to choose the carriage with the least capacity that will do the job (the price of the carriage increases greatly as the capacity goes up). Also, the items must be moved in order of their weight, smallest items first.
Given data: n items, the i-th object weighs Wi, d days
Wanted: c, the minimum capacity of the chosen carriage
Example :
10 items
weights : 1 2 3 4 5 6 7 8 9 10
5 days
answer : 15
day 1 -> 1,2,3,4
day 2 -> 5,6
day 3 -> 7,8
day 4 -> 9
day 5 -> 10
To solve this problem efficiently, you should use binary search: binary-search on the carriage capacity and check, for each candidate capacity, whether all items can be shipped within the given number of days; the answer is the smallest capacity for which the check succeeds. You can see my C++ solution below for better understanding.
#include <bits/stdc++.h>
using namespace std;
typedef long long ll;

const ll N = 200005;
ll ara[N], d, n;

// Can all items (taken in the given order, smallest first) be moved
// within d days using a carriage of capacity carriageCapacity?
bool check(ll carriageCapacity) {
    ll days = 1;
    ll s = 0;
    for (ll i = 1; i <= n; i++) {
        s += ara[i];
        if (s > carriageCapacity) { // today's load is full, start a new day
            days += 1;
            s = ara[i];
        }
    }
    return days <= d;
}

int main() {
    cin >> n;
    ll sum = 0;
    for (ll i = 1; i <= n; i++) {
        cin >> ara[i];
        sum += ara[i];
    }
    cin >> d;
    // The capacity must be at least the heaviest item (the weights are given
    // smallest first, so ara[n] is the maximum) and at most the total weight.
    ll lo = ara[n], hi = sum, ans = sum;
    while (lo <= hi) {
        ll mid = (lo + hi) >> 1;
        if (check(mid)) { // feasible: remember it and try a smaller capacity
            ans = mid;
            hi = mid - 1;
        } else {          // infeasible: need a larger capacity
            lo = mid + 1;
        }
    }
    cout << ans << endl;
}
A Python solution would look like this. It uses binary search to find the smallest container size that satisfies the given condition; the complexity is O(n log S), where S is the sum of the weights.
from typing import List

def shipWithinDays(weights: List[int], D: int) -> int:
    def feasible(capacity) -> bool:
        days = 1
        total = 0
        for weight in weights:
            total += weight
            if total > capacity:  # too heavy, wait for the next day
                total = weight
                days += 1
                if days > D:  # cannot ship within D days
                    return False
        return True

    left, right = max(weights), sum(weights)
    while left < right:
        mid = left + (right - left) // 2
        if feasible(mid):
            right = mid
        else:
            left = mid + 1
    return left
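For the example in the question (weights 1 through 10 and 5 days), this returns 15, matching the expected answer above.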

Dealing with matrices in CUDA: understanding basic concepts

I'm building a CUDA kernel to compute the numerical N*N Jacobian of a function, using finite differences; in the example I provided, it is the square function (each entry of the vector is squared). The host code allocates linear memory, while I'm using 2-dimensional indexing inside the kernel.
My issue is that I haven't found a way to add a value to the diagonal of the matrices I cudaMalloc'ed. My attempt was to use the condition threadIdx.x == blockIdx.x for the diagonal, but it evaluates to true only when both are 0.
Here is the kernel. EDIT: I posted the whole code as an answer, based on the suggestions in the comments (the main() is basically the same, while the kernel is not).
template <typename T>
__global__ void jacobian_kernel (
T * J,
const T t0,
const T tn,
const T h,
const T * u0,
const T * un,
const T * un_old)
{
T cgamma = 2 - sqrtf(2);
const unsigned int t = threadIdx.x;
const unsigned int b = blockIdx.x;
const unsigned int tid = t + b * blockDim.x;
/*__shared__*/ T temp_sx[BLOCK_SIZE][BLOCK_SIZE];
/*__shared__*/ T temp_dx[BLOCK_SIZE][BLOCK_SIZE];
__shared__ T sm_temp_du[BLOCK_SIZE];
T* temp_du = &sm_temp_du[0];
if (tid < N )
{
temp_sx[b][t] = un[t];
temp_dx[b][t] = un[t];
if ( t == b )
{
if ( tn == t0 )
{
temp_du[t] = u0[t]*0.001;
temp_sx[b][t] += temp_du[t]; //(*)
temp_dx[b][t] -= temp_du[t];
temp_sx[b][t] += ( abs( temp_sx[b][t] ) < 10e-6 ? 0.1 : 0 );
temp_dx[b][t] += ( abs( temp_dx[b][t] ) < 10e-6 ? 0.1 : 0 );
temp_sx[b][t] = ( temp_sx[b][t] == 0 ? 0.1 : temp_sx[b][t] );
temp_dx[b][t] = ( temp_dx[b][t] == 0 ? 0.1 : temp_dx[b][t] );
}
else
{
temp_du[t] = MAX( un[t] - un_old[t], 10e-6 );
temp_sx[b][t] += temp_du[t];
temp_dx[b][t] -= temp_du[t];
}
}
__syncthreads();
//J = f(tn, un + du)
d_func(tn, (temp_sx[b]), (temp_sx[b]), 1.f);
d_func(tn, (temp_dx[b]), (temp_dx[b]), 1.f);
__syncthreads();
J[tid] = (temp_sx[b][t] - temp_dx[b][t]) * powf((2 * temp_du[t]), -1);
//J[tid]*= - h*cgamma/2;
//J[tid]+= ( t == b ? 1 : 0);
//J[tid] = temp_J[tid];
}
}
The general procedure for computing the jacobian is
Copy un into every row of temp_sx and temp_dx
Compute du as a 0.01 magnitude from u0
Sum du to the diagonal of temp_sx, subtract du from the diagonal of temp_dx
Compute the square function on each entry of temp_sx and temp_dx
Subtract them and divide every entry by 2*du
This procedure can be summarized as (f(un + du*e_i) - f(un - du*e_i)) / (2*du).
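Written out per entry (standard central differences, with e_i the i-th unit vector and du_i the per-component step used in the code):
J_{ji} \approx \frac{f_j(u_n + \Delta u_i\, e_i) - f_j(u_n - \Delta u_i\, e_i)}{2\,\Delta u_i}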
My problem is adding du to the diagonal of the matrices temp_sx and temp_dx, as I tried at (*). How can I achieve that?
EDIT: I now launch a 1D grid of 1D blocks; in fact, the .y axis wasn't used at all in the kernel. I'm calling the kernel with a fixed amount of shared memory.
Note that in int main() I'm calling the kernel with
#define REAL sizeof(float)
#define N 32
#define BLOCK_SIZE 16
#define NUM_BLOCKS ((N*N + BLOCK_SIZE - 1)/ BLOCK_SIZE)
...
dim3 dimGrid(NUM_BLOCKS);
dim3 dimBlock(BLOCK_SIZE);
size_t shm_size = N*N*REAL;
jacobian_kernel <<< dimGrid, dimBlock, shm_size >>> (...);
so that I attempt to deal with splitting the work across blocks. To add to the diagonal inside the kernel I used if (threadIdx.x == blockIdx.x) {...}. Why isn't this correct? I ask because, while debugging and printing from the kernel, the condition only evaluates to true when both are 0. Thus du[0] is the only numerical value and the matrix becomes nan. Note that this approach worked with the first code I built, where I called the kernel with
jacobian_kernel <<< N, N >>> (...)
So that when threadIdx.x == blockIdx.x the element is on the diagonal. This approach doesn't fit anymore though, since now I need to deal with larger N (possibly larger than 1024, which is the maximum number of threads per block).
What statement should I put there that works even if the matrices are split into blocks and threads?
Let me know if I should share some other info.
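For reference, here is a sketch of how the diagonal test is usually written when the N*N entries are covered by a 1D grid of 1D blocks (this is not the poster's code; it assumes row-major storage and a per-component step vector du): recover the row and column from the global index and compare them.
template <typename T>
__global__ void add_to_diagonal(T* J, const T* du, int N)
{
    const unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < N * N) {
        const int row = tid / N; // row of this matrix entry
        const int col = tid % N; // column of this matrix entry
        if (row == col)          // true exactly on the diagonal, whatever the block size
            J[tid] += du[row];
    }
}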
Here is how I managed to solve my problem, based on the suggestions in the comments on the answer. The example is compilable, provided you put helper_cuda.h and helper_string.h in the same directory or add a -I directive pointing to the CUDA samples' include path (installed along with the CUDA toolkit). The relevant changes are only in the kernel; there's a minor change in main() though, since I was requesting double the resources to execute the kernel, but the .y axis of the grid of thread blocks wasn't used at all, so it didn't generate any error.
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <math.h>
#include <assert.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include "helper_cuda.h"
#include "helper_string.h"
#include <fstream>
#ifndef MAX
#define MAX(a,b) ((a > b) ? a : b)
#endif
#define REAL sizeof(float)
#define N 128
#define BLOCK_SIZE 128
#define NUM_BLOCKS ((N*N + BLOCK_SIZE - 1)/ BLOCK_SIZE)
template <typename T>
inline void printmatrix( T mat, int rows, int cols);
template <typename T>
__global__ void jacobian_kernel ( const T * A, T * J, const T t0, const T tn, const T h, const T * u0, const T * un, const T * un_old);
template<typename T>
__device__ void d_func(const T t, const T u[], T res[], const T h = 1);
int main ()
{
float t0 = 0.; //float tn = 0.;
float h = 0.1;
float* u0 = (float*)malloc(REAL*N); for(int i = 0; i < N; ++i){u0[i] = i+1;}
float* un = (float*)malloc(REAL*N); memcpy(un, u0, REAL*N);
float* un_old = (float*)malloc(REAL*N); memcpy(un_old, u0, REAL*N);
float* J = (float*)malloc(REAL*N*N);
float* A = (float*)malloc(REAL*N*N); host_heat_matrix(A);
float *d_u0;
float *d_un;
float *d_un_old;
float *d_J;
float *d_A;
checkCudaErrors(cudaMalloc((void**)&d_u0, REAL*N)); //printf("1: %p\n", d_u0);
checkCudaErrors(cudaMalloc((void**)&d_un, REAL*N)); //printf("2: %p\n", d_un);
checkCudaErrors(cudaMalloc((void**)&d_un_old, REAL*N)); //printf("3: %p\n", d_un_old);
checkCudaErrors(cudaMalloc((void**)&d_J, REAL*N*N)); //printf("4: %p\n", d_J);
checkCudaErrors(cudaMalloc((void**)&d_A, REAL*N*N)); //printf("4: %p\n", d_J);
checkCudaErrors(cudaMemcpy(d_u0, u0, REAL*N, cudaMemcpyHostToDevice)); assert(d_u0 != NULL);
checkCudaErrors(cudaMemcpy(d_un, un, REAL*N, cudaMemcpyHostToDevice)); assert(d_un != NULL);
checkCudaErrors(cudaMemcpy(d_un_old, un_old, REAL*N, cudaMemcpyHostToDevice)); assert(d_un_old != NULL);
checkCudaErrors(cudaMemcpy(d_J, J, REAL*N*N, cudaMemcpyHostToDevice)); assert(d_J != NULL);
checkCudaErrors(cudaMemcpy(d_A, A, REAL*N*N, cudaMemcpyHostToDevice)); assert(d_A != NULL);
dim3 dimGrid(NUM_BLOCKS); std::cout << "NUM_BLOCKS \t" << dimGrid.x << "\n";
dim3 dimBlock(BLOCK_SIZE); std::cout << "BLOCK_SIZE \t" << dimBlock.x << "\n";
size_t shm_size = N*REAL; //std::cout << shm_size << "\n";
//HERE IS A RELEVANT CHANGE OF THE MAIN, SINCE I WAS CALLING
//THE KERNEL WITH A 2D GRID BUT WITHOUT USING THE .y AXIS,
//WHILE NOW THE GRID IS 1D
jacobian_kernel <<< dimGrid, dimBlock, shm_size >>> (d_A, d_J, t0, t0, h, d_u0, d_un, d_un_old);
checkCudaErrors(cudaMemcpy(J, d_J, REAL*N*N, cudaMemcpyDeviceToHost)); //printf("4: %p\n", d_J);
printmatrix( J, N, N);
checkCudaErrors(cudaDeviceReset());
free(u0);
free(un);
free(un_old);
free(J);
}
template <typename T>
__global__ void jacobian_kernel (
const T * A,
T * J,
const T t0,
const T tn,
const T h,
const T * u0,
const T * un,
const T * un_old)
{
T cgamma = 2 - sqrtf(2);
const unsigned int t = threadIdx.x;
const unsigned int b = blockIdx.x;
const unsigned int tid = t + b * blockDim.x;
/*__shared__*/ T temp_sx[BLOCK_SIZE][BLOCK_SIZE];
/*__shared__*/ T temp_dx[BLOCK_SIZE][BLOCK_SIZE];
__shared__ T sm_temp_du;
T* temp_du = &sm_temp_du;
//HERE IS A RELEVANT CHANGE (*)
if ( t < BLOCK_SIZE && b < NUM_BLOCKS )
{
temp_sx[b][t] = un[t]; //printf("temp_sx[%d] = %f\n", t,(temp_sx[b][t]));
temp_dx[b][t] = un[t];
//printf("t = %d, b = %d, t + b * blockDim.x = %d \n",t, b, tid);
//HERE IS A NOTE (**)
if ( t == b )
{
//printf("t = %d, b = %d \n",t, b);
if ( tn == t0 )
{
*temp_du = u0[t]*0.001;
temp_sx[b][t] += *temp_du;
temp_dx[b][t] -= *temp_du;
temp_sx[b][t] += ( abs( temp_sx[b][t] ) < 10e-6 ? 0.1 : 0 );
temp_dx[b][t] += ( abs( temp_dx[b][t] ) < 10e-6 ? 0.1 : 0 );
temp_sx[b][t] = ( temp_sx[b][t] == 0 ? 0.1 : temp_sx[b][t] );
temp_dx[b][t] = ( temp_dx[b][t] == 0 ? 0.1 : temp_dx[b][t] );
}
else
{
*temp_du = MAX( un[t] - un_old[t], 10e-6 );
temp_sx[b][t] += *temp_du;
temp_dx[b][t] -= *temp_du;
}
;
}
//printf("du[%d] %f\n", tid, (*temp_du));
__syncthreads();
//printf("temp_sx[%d][%d] = %f\n", b, t, temp_sx[b][t]);
//printf("temp_dx[%d][%d] = %f\n", b, t, temp_dx[b][t]);
//d_func(tn, (temp_sx[b]), (temp_sx[b]), 1.f);
//d_func(tn, (temp_dx[b]), (temp_dx[b]), 1.f);
matvec_dev( tn, A, (temp_sx[b]), (temp_sx[b]), N, N, 1.f );
matvec_dev( tn, A, (temp_dx[b]), (temp_dx[b]), N, N, 1.f );
__syncthreads();
//printf("temp_sx_later[%d][%d] = %f\n", b, t, (temp_sx[b][t]));
//printf("temp_sx_later[%d][%d] - temp_dx_later[%d][%d] = %f\n", b,t,b,t, (temp_sx[b][t] - temp_dx[b][t]) / 2 * *temp_du);
//if (t == b ) printf( "2du[%d]^-1 = %f\n",t, powf((2 * *temp_du), -1));
J[tid] = (temp_sx[b][t] - temp_dx[b][t]) / (2 * *temp_du);
}
}
template<typename T>
__device__ void d_func(const T t, const T u[], T res[], const T h )
{
__shared__ float temp_u;
temp_u = u[threadIdx.x];
res[threadIdx.x] = h*powf( (temp_u), 2);
}
template <typename T>
inline void printmatrix( T mat, int rows, int cols)
{
std::ofstream matrix_out;
matrix_out.open( "heat_matrix.txt", std::ofstream::out);
for( int i = 0; i < rows; i++)
{
for( int j = 0; j <cols; j++)
{
double next = mat[i + N*j];
matrix_out << ( (next >= 0) ? " " : "") << next << " ";
}
matrix_out << "\n";
}
}
The relevant change is at (*). Before, I used if (tid < N), which has two downsides:
First, it is wrong: it should be tid < N*N, since my data is 2D and tid is a global index that runs over all of it.
Even with tid < N*N, since I'm splitting the work into blocks, the condition t < BLOCK_SIZE && b < NUM_BLOCKS seems clearer to me with respect to how the indexing is arranged in the code.
Moreover, the statement t == b at (**) is indeed the right one to operate on the diagonal elements of the matrix. The fact that it evaluated to true only at 0 was caused by the error described right above.
Thanks for the suggestions!

How to find a random bit 1 (or 0) in a dynamic_bitset?

How to find a random bit 1 (or 0) in a boost::dynamic_bitset?
Example:
bitset: 101101
index : 012345
find_random_bit_1() return 3
find_random_bit_0() return 4
The simplest thing to do would be to keep selecting bits until you find the one you're after:
template <typename T, typename Alloc, typename Rnd = boost::mt19937>
size_t select_random_bit(boost::dynamic_bitset<T, Alloc> const& bs, Rnd& random, bool target = true) {
boost::uniform_int<size_t> pick(0,bs.size()-1);
if (bs.empty() || (bs.all() && !target) || (bs.none() && target))
throw std::range_error("select_random_bit");
while(true) {
auto index = pick(random);
if (bs[index] == target)
return index;
}
throw std::logic_error("select_random_bit");
}
The most important part is the precondition checks that avoid infinite loops. Of course, the performance could be bad for non-uniform data, but it's the simplest way to arrive at a fair distribution.
Live On Coliru
#include <boost/dynamic_bitset.hpp>
#include <boost/random.hpp>
#include <iostream>
#include <stdexcept>
template <typename T, typename Alloc, typename Rnd = boost::mt19937>
size_t select_random_bit(boost::dynamic_bitset<T, Alloc> const& bs, Rnd& random, bool target = true) {
boost::uniform_int<size_t> pick(0,bs.size()-1);
if (bs.empty() || (bs.all() && !target) || (bs.none() && target))
throw std::range_error("select_random_bit");
while(true) {
auto index = pick(random);
if (bs[index] == target)
return index;
}
throw std::logic_error("select_random_bit");
}
boost::dynamic_bitset<> generate_testdata(boost::mt19937& rng) {
boost::dynamic_bitset<> bs(1024+rng()%1024); // [1024,2048) bits
boost::uniform_smallint<uint8_t> gen(0, 1);
for(size_t i = 0; i < bs.size(); ++i)
bs[i] = gen(rng);
return bs;
}
int main() {
using namespace boost;
mt19937 rng(42); // seed it
auto data = generate_testdata(rng);
std::cout << data.count() << " out of " << data.size() << " bits are set\n";
std::cout << "\nTrue: ";
for (int i = 0; i <10; ++i)
std::cout << select_random_bit(data, rng/*, true*/) << " ";
std::cout << "\nFalse: ";
for (int i = 0; i <10; ++i)
std::cout << select_random_bit(data, rng, false) << " ";
}
Prints 10 true and 10 false bits, e.g.:
562 out of 1126 bits are set
True: 1104 394 684 716 624 492 102 817 392 616
False: 335 589 971 785 1069 948 865 290 51 652
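Not part of the original answer: if the bitset is heavily skewed (say only a handful of bits are set), the rejection loop above can take many iterations. A sketch of a rank-based alternative that never rejects, at the cost of a walk over the bitset:
#include <boost/dynamic_bitset.hpp>
#include <boost/random.hpp>
#include <stdexcept>
// Pick a uniformly random set bit by drawing a random rank among the set bits
// and walking to it with find_first()/find_next().
size_t select_random_set_bit_by_rank(boost::dynamic_bitset<> const& bs, boost::mt19937& rng) {
    const size_t ones = bs.count();
    if (ones == 0)
        throw std::range_error("select_random_set_bit_by_rank");
    boost::uniform_int<size_t> pick(0, ones - 1);
    size_t rank = pick(rng);
    size_t pos = bs.find_first();
    while (rank--)
        pos = bs.find_next(pos);
    return pos;
}
For 0-bits the same idea can be applied to a flipped copy (~bs), at the cost of copying the bitset.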

Concurrently sorting many arrays with CUDA Thrust

I need to sort 20+ arrays, already on the GPU, each of the same length, by the same keys. I can not use sort_by_key() directly since it sorts the keys as well (making them useless to sort the next array). Here is what I tried instead:
thrust::device_vector<int> indices(N);
thrust::sequence(indices.begin(),indices.end());
thrust::sort_by_key(keys.begin(),keys.end(),indices.begin());
thrust::gather(indices.begin(),indices.end(),a_01,a_01);
thrust::gather(indices.begin(),indices.end(),a_02,a_02);
...
thrust::gather(indices.begin(),indices.end(),a_20,a_20);
This does not seem to work since gather() expects a different array for the output than for the input, i.e. this works:
thrust::gather(indices.begin(),indices.end(),a_01,o_01);
...
However, I would prefer not to allocate 20+ extra arrays for this task. I know there is a solution using a thrust::tuple, thrust::zip_iterator and thrust::sort_by_key(), similar to here. However, I can only combine up to 10 arrays in a tuple, so I would need to duplicate the key vector again. How would you tackle this task?
I think that the classical way to sort multiple arrays is the so-called back-to-back approach, which uses thrust::stable_sort_by_key twice. You need to create a keys vector such that elements belonging to the same array have the same key. For example:
Elements: 10.5 4.3 -2.3 0. 55. 24. 66.
Keys: 0 0 0 1 1 1 1
In this case we have two arrays, the first with 3 elements and the second with 4 elements.
You first need to call thrust::stable_sort_by_key having the matrix values as the keys like
thrust::stable_sort_by_key(d_matrix.begin(),
d_matrix.end(),
d_keys.begin(),
thrust::less<float>());
After that, you have
Elements: -2.3 0 4.3 10.5 24. 55. 66.
Keys: 0 1 0 0 1 1 1
which means that the array elements are ordered, while the keys are not. Then you need a second call to thrust::stable_sort_by_key
thrust::stable_sort_by_key(d_keys.begin(),
d_keys.end(),
d_matrix.begin(),
thrust::less<int>());
which performs a sort according to the keys. After that step, you have
Elements: -2.3 4.3 10.5 0 24. 55. 66.
Keys: 0 0 0 1 1 1 1
which is the final desired result.
Below is a full working example that considers the following problem: sorting each row of a matrix separately. This is a particular case in which all the arrays have the same length, but the approach works for arrays of possibly different lengths.
#include <cublas_v2.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/generate.h>
#include <thrust/sort.h>
#include <thrust/functional.h>
#include <thrust/random.h>
#include <thrust/sequence.h>
#include <stdio.h>
#include <iostream>
#include "Utilities.cuh"
/**************************************************************/
/* CONVERT LINEAR INDEX TO ROW INDEX - NEEDED FOR APPROACH #1 */
/**************************************************************/
template <typename T>
struct linear_index_to_row_index : public thrust::unary_function<T,T> {
T Ncols; // --- Number of columns
__host__ __device__ linear_index_to_row_index(T Ncols) : Ncols(Ncols) {}
__host__ __device__ T operator()(T i) { return i / Ncols; }
};
/********/
/* MAIN */
/********/
int main()
{
const int Nrows = 5; // --- Number of rows
const int Ncols = 8; // --- Number of columns
// --- Random uniform integer distribution between 10 and 99
thrust::default_random_engine rng;
thrust::uniform_int_distribution<int> dist(10, 99);
// --- Matrix allocation and initialization
thrust::device_vector<float> d_matrix(Nrows * Ncols);
for (size_t i = 0; i < d_matrix.size(); i++) d_matrix[i] = (float)dist(rng);
// --- Print result
printf("Original matrix\n");
for(int i = 0; i < Nrows; i++) {
std::cout << "[ ";
for(int j = 0; j < Ncols; j++)
std::cout << d_matrix[i * Ncols + j] << " ";
std::cout << "]\n";
}
/*************************/
/* BACK-TO-BACK APPROACH */
/*************************/
thrust::device_vector<float> d_keys(Nrows * Ncols);
// --- Generate row indices
thrust::transform(thrust::make_counting_iterator(0),
thrust::make_counting_iterator(Nrows*Ncols),
thrust::make_constant_iterator(Ncols),
d_keys.begin(),
thrust::divides<int>());
// --- Back-to-back approach
thrust::stable_sort_by_key(d_matrix.begin(),
d_matrix.end(),
d_keys.begin(),
thrust::less<float>());
thrust::stable_sort_by_key(d_keys.begin(),
d_keys.end(),
d_matrix.begin(),
thrust::less<int>());
// --- Print result
printf("\n\nSorted matrix\n");
for(int i = 0; i < Nrows; i++) {
std::cout << "[ ";
for(int j = 0; j < Ncols; j++)
std::cout << d_matrix[i * Ncols + j] << " ";
std::cout << "]\n";
}
return 0;
}
Well, you really only need to allocate one extra array if you are OK with manipulating pointers to device_vector instead:
thrust::device_vector<int> indices(N);
thrust::sequence(indices.begin(),indices.end());
thrust::sort_by_key(keys.begin(),keys.end(),indices.begin());
thrust::device_vector<int> temp(N);
thrust::device_vector<int> *sorted = &temp;
thrust::device_vector<int> *pa_01 = &a_01;
thrust::device_vector<int> *pa_02 = &a_02;
...
thrust::device_vector<int> *pa_20 = &a_20;
thrust::gather(indices.begin(), indices.end(), pa_01->begin(), sorted->begin());
pa_01 = sorted; sorted = &a_01;
thrust::gather(indices.begin(), indices.end(), pa_02->begin(), sorted->begin());
pa_02 = sorted; sorted = &a_02;
...
thrust::gather(indices.begin(), indices.end(), pa_20->begin(), sorted->begin());
pa_20 = sorted; sorted = &a_20;
Or something like that should work anyway. You would need to fix it so the temp device vector is not automatically deallocated when it goes out of scope -- I suggest allocating the CUDA device pointers using cudaMalloc and then wrapping them with device_ptr instead of using automatic device_vectors.
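A simpler variant of the same one-extra-buffer idea, sketched here (not from the original answer): gather each array through the sorted index map into a single scratch vector and copy the result back, trading one extra device-to-device copy per array for not having to juggle pointers. The function and array names are placeholders.
#include <thrust/copy.h>
#include <thrust/device_vector.h>
#include <thrust/gather.h>
#include <thrust/sequence.h>
#include <thrust/sort.h>
#include <vector>

void sort_arrays_by_key(thrust::device_vector<int>& keys,
                        std::vector<thrust::device_vector<int>*> arrays)
{
    const int N = keys.size();
    thrust::device_vector<int> indices(N);
    thrust::sequence(indices.begin(), indices.end());
    thrust::sort_by_key(keys.begin(), keys.end(), indices.begin());

    thrust::device_vector<int> scratch(N); // the single extra array
    for (auto* arr : arrays) {
        // permute into the scratch buffer, then copy back in place
        thrust::gather(indices.begin(), indices.end(), arr->begin(), scratch.begin());
        thrust::copy(scratch.begin(), scratch.end(), arr->begin());
    }
}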
