R encountered a fatal error while running a parallel program [duplicate] - parallel-processing

I wrote the following code to train myself to use RcppParallel. It is just a toy example.
// [[Rcpp::depends(RcppParallel)]]
#include <Rcpp.h>
#include <RcppParallel.h>
#include <iostream>
using namespace Rcpp;
using namespace RcppParallel;

struct Lapin : public Worker {
    // input pars
    const NumericVector input;
    const size_t dim;
    // outputs a matrix
    NumericMatrix output;
    // two constructors
    Lapin(const NumericVector input, const int dim)
        : input(input), dim(dim), output(NumericMatrix(dim, dim)) {}
    Lapin(const Lapin & jeannot, Split)
        : input(jeannot.input), dim(jeannot.dim), output(NumericMatrix(dim, dim)) {}
    // the working operator
    void operator()(size_t begin, size_t end) {
        for (size_t k = begin; k < end; k++) {
            for (size_t i = 0; i < dim; i++) {
                for (size_t j = 0; j < dim; j++) {
                    output(i,j) += input(k) + i + j;
                }
            }
        }
    }
    // the join
    void join(const Lapin & peter) {
        output += peter.output;
    }
};

// [[Rcpp::export]]
NumericMatrix f(NumericVector A, size_t dim) {
    Lapin groumf(A, dim);
    parallelReduce(0, A.length(), groumf);
    return groumf.output;
}
Here is what happens in R, after sourceCpp-ing it:
> f(rep(1,1100), 5)
[,1] [,2] [,3] [,4] [,5]
[1,] 1100 2200 3300 4400 5500
[2,] 2200 3300 4400 5500 6600
[3,] 3300 4400 5500 6600 7700
[4,] 4400 5500 6600 7700 8800
[5,] 5500 6600 7700 8800 9900
> sourceCpp("parallel-matrix-reduce.cpp")
> f(rep(1,1100), 5)
Warning: stack imbalance in '.Call', 6 then 11
[,1] [,2] [,3] [,4] [,5]
[1,] 1100 2200 3300 4400 5500
[2,] 2200 3300 4400 5500 6600
[3,] 3300 4400 5500 6600 7700
[4,] 4400 5500 6600 7700 8800
[5,] 5500 6600 7700 8800 9900
Note that the behavior is erratic: sometimes I get no warning at all, sometimes it appears on the first run... I guess my session info may be useful here:
> sessionInfo()
R version 3.1.2 (2014-10-31)
Platform: x86_64-redhat-linux-gnu (64-bit)
locale:
[1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
[3] LC_TIME=fr_FR.UTF-8 LC_COLLATE=en_US.UTF-8
[5] LC_MONETARY=fr_FR.UTF-8 LC_MESSAGES=en_US.UTF-8
[7] LC_PAPER=fr_FR.UTF-8 LC_NAME=C
[9] LC_ADDRESS=C LC_TELEPHONE=C
[11] LC_MEASUREMENT=fr_FR.UTF-8 LC_IDENTIFICATION=C
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] Rcpp_0.11.3
loaded via a namespace (and not attached):
[1] RcppParallel_4.3.3 tools_3.1.2
I thank you all in advance for your answers and comments.
EDIT: As Dirk explains below, this is due to the use of R types in the Worker, which confuses the garbage collector. I settled the issue by using Armadillo matrices instead (I was a bit confused by RMatrix). Here is the corrected code:
// [[Rcpp::depends(RcppParallel)]]
// [[Rcpp::depends(RcppArmadillo)]]
#include <RcppArmadillo.h>
#include <RcppParallel.h>
#include <iostream>
using namespace Rcpp;
using namespace RcppParallel;

struct Lapin : public Worker {
    // input pars
    const arma::vec input;
    const size_t dim;
    // outputs a matrix
    arma::mat output;
    // two constructors
    Lapin(const arma::vec input, const int dim)
        : input(input), dim(dim), output(arma::mat(dim, dim)) {
        output.zeros();
    }
    Lapin(const Lapin & jeannot, Split)
        : input(jeannot.input), dim(jeannot.dim), output(arma::mat(dim, dim)) {
        output.zeros();
    }
    // the working operator
    void operator()(size_t begin, size_t end) {
        for (size_t k = begin; k < end; k++) {
            for (size_t i = 0; i < dim; i++) {
                for (size_t j = 0; j < dim; j++) {
                    output(i,j) += input(k) + i + j;
                }
            }
        }
    }
    // the join
    void join(const Lapin & peter) {
        output += peter.output;
    }
};

// [[Rcpp::export]]
arma::mat f(arma::vec & A, size_t dim) {
    Lapin groumf(A, dim);
    parallelReduce(0, A.size(), groumf);
    return groumf.output;
}

Look more closely at an example such as the parallel distance from the Rcpp Gallery.
It does not use NumericMatrix but rather RMatrix<double>. I would do the same here, and I have generally advocated avoiding contact with R types while running parallel code segments.
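For completeness, here is a minimal sketch of that approach, written as my own untested adaptation (the names LapinSafe and fSafe are made up): the input is wrapped in the thread-safe RVector<double> accessor, and the per-thread accumulator is a plain std::vector<double>, so no R object is allocated or touched off the main thread.
// [[Rcpp::depends(RcppParallel)]]
#include <Rcpp.h>
#include <RcppParallel.h>
#include <algorithm>
#include <vector>
using namespace Rcpp;
using namespace RcppParallel;

struct LapinSafe : public Worker {
    const RVector<double> input;   // thread-safe, read-only view of the R vector
    const std::size_t dim;
    std::vector<double> output;    // plain C++ buffer, safe to use off the main thread

    LapinSafe(const NumericVector input, std::size_t dim)
        : input(input), dim(dim), output(dim * dim, 0.0) {}
    LapinSafe(const LapinSafe & other, Split)
        : input(other.input), dim(other.dim), output(other.dim * other.dim, 0.0) {}

    void operator()(std::size_t begin, std::size_t end) {
        for (std::size_t k = begin; k < end; k++)
            for (std::size_t i = 0; i < dim; i++)
                for (std::size_t j = 0; j < dim; j++)
                    output[i * dim + j] += input[k] + i + j;
    }

    void join(const LapinSafe & rhs) {
        for (std::size_t i = 0; i < output.size(); i++)
            output[i] += rhs.output[i];
    }
};

// [[Rcpp::export]]
NumericMatrix fSafe(NumericVector A, int dim) {
    LapinSafe w(A, dim);
    parallelReduce(0, A.length(), w);
    NumericMatrix out(dim, dim);   // build the R object on the main thread only
    std::copy(w.output.begin(), w.output.end(), out.begin());
    return out;
}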

Related

Profilers (nvvp and nvprof) not showing "Page Fault" information

I am profiling test code presented in Unified Memory for CUDA Beginners on NVIDIA's developer forum.
Code:
#include <iostream>
#include <math.h>

// CUDA kernel to add elements of two arrays
__global__
void add(int n, float* x, float* y)
{
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;
    for (int i = index; i < n; i += stride)
        y[i] = x[i] + y[i];
}

int main(void)
{
    int N = 1 << 20;
    float *x, *y;
    // Allocate Unified Memory -- accessible from CPU or GPU
    cudaMallocManaged(&x, N * sizeof(float));
    cudaMallocManaged(&y, N * sizeof(float));
    // initialize x and y arrays on the host
    for (int i = 0; i < N; i++) {
        x[i] = 1.0f;
        y[i] = 2.0f;
    }
    // Launch kernel on 1M elements on the GPU
    int blockSize = 256;
    int numBlocks = (N + blockSize - 1) / blockSize;
    add<<<numBlocks, blockSize>>>(N, x, y);
    // Wait for GPU to finish before accessing on host
    cudaDeviceSynchronize();
    // Check for errors (all values should be 3.0f)
    float maxError = 0.0f;
    for (int i = 0; i < N; i++)
        maxError = fmax(maxError, fabs(y[i] - 3.0f));
    std::cout << "Max error: " << maxError << std::endl;
    // Free memory
    cudaFree(x);
    cudaFree(y);
    return 0;
}
QUESTION: The results of the profiling presented by the author shows information about "Page Faults" but when I run the nvprof and nvvp profilers, I do not get any information about page faults. Is there any flag or something that needs to be explicitly set to get that information?
My nvprof output:
==20160== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities : 100.00 % 60.513us 1 60.513us 60.513us 60.513us add(int, float*, float*)
API calls : 81.81 % 348.14ms 2 174.07ms 1.5933ms 346.54ms cudaMallocManaged
16.10 % 68.511ms 1 68.511ms 68.511ms 68.511ms cuDevicePrimaryCtxRelease
1.34 % 5.7002ms 1 5.7002ms 5.7002ms 5.7002ms cudaLaunchKernel
0.66 % 2.8192ms 2 1.4096ms 1.0669ms 1.7523ms cudaFree
0.07 % 277.80us 1 277.80us 277.80us 277.80us cudaDeviceSynchronize
0.01 % 33.500us 3 11.166us 3.5000us 16.400us cuModuleUnload
0.00 % 19.800us 1 19.800us 19.800us 19.800us cuDeviceTotalMem
0.00 % 16.700us 101 165ns 100ns 900ns cuDeviceGetAttribute
0.00 % 9.2000us 3 3.0660us 200ns 8.2000us cuDeviceGetCount
0.00 % 3.1000us 1 3.1000us 3.1000us 3.1000us cuDeviceGetName
0.00 % 2.1000us 2 1.0500us 300ns 1.8000us cuDeviceGet
0.00 % 300ns 1 300ns 300ns 300ns cuDeviceGetLuid
0.00 % 200ns 1 200ns 200ns 200ns cuDeviceGetUuid
==20160== Unified Memory profiling result:
Device "GeForce GTX 1070 (0)"
Count Avg Size Min Size Max Size Total Size Total Time Name
64 128.00KB 128.00KB 128.00KB 8.000000MB 3.217900ms Host To Device
146 84.164KB 32.000KB 1.0000MB 12.00000MB 68.17800ms Device To Host
My nvvp profiling result: [screenshot omitted]
The operating system matters.
You are on Windows, and the CUDA Unified Memory (UM) system works quite a bit differently on Windows as compared to Linux when Pascal or newer devices are in view.
On Windows, page faults are not the mechanism the UM system uses to determine when to migrate data, so they are not reported in or by the profiler.

How to find median value in 2d array for each column with CUDA? [duplicate]

I found the 'vectorized/batch sort' and 'nested sort' methods at the link below: How to use Thrust to sort the rows of a matrix?
When I tried these methods on 500 rows of 1000 elements each, the results were:
vectorized/batch sort : 66ms
nested sort : 3290ms
I am using a 1080 Ti HOF to do this operation, but it takes too long compared to your case.
In the link below, it could take less than 10 ms, almost 100 microseconds:
(How to find median value in 2d array for each column with CUDA?)
Could you recommend how to optimize this method to reduce the operation time?
#include <thrust/device_vector.h>
#include <thrust/device_ptr.h>
#include <thrust/host_vector.h>
#include <thrust/sort.h>
#include <thrust/execution_policy.h>
#include <thrust/generate.h>
#include <thrust/equal.h>
#include <thrust/sequence.h>
#include <thrust/for_each.h>
#include <iostream>
#include <stdlib.h>

#define NSORTS 500
#define DSIZE 1000

int my_mod_start = 0;
int my_mod() {
    return (my_mod_start++) / DSIZE;
}

bool validate(thrust::device_vector<int> &d1, thrust::device_vector<int> &d2) {
    return thrust::equal(d1.begin(), d1.end(), d2.begin());
}

struct sort_functor
{
    thrust::device_ptr<int> data;
    int dsize;
    __host__ __device__
    void operator()(int start_idx)
    {
        thrust::sort(thrust::device, data + (dsize*start_idx), data + (dsize*(start_idx + 1)));
    }
};

#include <time.h>
#include <windows.h>
unsigned long long dtime_usec(LONG start) {
    SYSTEMTIME timer2;
    GetSystemTime(&timer2);
    LONG end = (timer2.wSecond * 1000) + timer2.wMilliseconds;
    return (end - start);
}

int main() {
    for (int i = 0; i < 3; i++) {
        SYSTEMTIME timer1;
        cudaDeviceSetLimit(cudaLimitMallocHeapSize, (16 * DSIZE*NSORTS));
        thrust::host_vector<int> h_data(DSIZE*NSORTS);
        thrust::generate(h_data.begin(), h_data.end(), rand);
        thrust::device_vector<int> d_data = h_data;
        // first time a loop
        thrust::device_vector<int> d_result1 = d_data;
        thrust::device_ptr<int> r1ptr = thrust::device_pointer_cast<int>(d_result1.data());
        GetSystemTime(&timer1);
        LONG time_ms1 = (timer1.wSecond * 1000) + timer1.wMilliseconds;
        for (int i = 0; i < NSORTS; i++)
            thrust::sort(r1ptr + (i*DSIZE), r1ptr + ((i + 1)*DSIZE));
        cudaDeviceSynchronize();
        time_ms1 = dtime_usec(time_ms1);
        std::cout << "loop time: " << time_ms1 << "ms" << std::endl;
        // vectorized sort
        thrust::device_vector<int> d_result2 = d_data;
        thrust::host_vector<int> h_segments(DSIZE*NSORTS);
        thrust::generate(h_segments.begin(), h_segments.end(), my_mod);
        thrust::device_vector<int> d_segments = h_segments;
        GetSystemTime(&timer1);
        time_ms1 = (timer1.wSecond * 1000) + timer1.wMilliseconds;
        thrust::stable_sort_by_key(d_result2.begin(), d_result2.end(), d_segments.begin());
        thrust::stable_sort_by_key(d_segments.begin(), d_segments.end(), d_result2.begin());
        cudaDeviceSynchronize();
        time_ms1 = dtime_usec(time_ms1);
        std::cout << "loop time: " << time_ms1 << "ms" << std::endl;
        if (!validate(d_result1, d_result2)) std::cout << "mismatch 1!" << std::endl;
        // nested sort
        thrust::device_vector<int> d_result3 = d_data;
        sort_functor f = { d_result3.data(), DSIZE };
        thrust::device_vector<int> idxs(NSORTS);
        thrust::sequence(idxs.begin(), idxs.end());
        GetSystemTime(&timer1);
        time_ms1 = (timer1.wSecond * 1000) + timer1.wMilliseconds;
        thrust::for_each(idxs.begin(), idxs.end(), f);
        cudaDeviceSynchronize();
        time_ms1 = dtime_usec(time_ms1);
        std::cout << "loop time: " << time_ms1 << "ms" << std::endl;
        if (!validate(d_result1, d_result3)) std::cout << "mismatch 2!" << std::endl;
    }
    return 0;
}
The main takeaway from your thrust experience is that you should never compile a debug project, or with the device debug switch (-G), when you are interested in performance. Compiling device debug code causes the compiler to omit many performance optimizations. The difference in your case was quite dramatic: about a 30x improvement going from debug to release code.
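For reference (file names here are illustrative), the difference is only in the build invocation; -G turns on device debug and disables most device code optimizations:
$ nvcc -G -o t1761_debug t1761.cu   # device debug build: expect much slower kernels
$ nvcc -o t1761 t1761.cu            # release build: optimized device code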
Here is a segmented cub sort, where we are launching 500 blocks and each block is handling a separate 1024 element array. The CUB code is lifted from here.
$ cat t1761.cu
#include <cub/cub.cuh>   // or equivalently <cub/block/block_radix_sort.cuh>
#include <iostream>

const int ipt = 8;
const int tpb = 128;

__global__ void ExampleKernel(int *data)
{
    // Specialize BlockRadixSort for a 1D block of 128 threads owning 8 integer items each
    typedef cub::BlockRadixSort<int, tpb, ipt> BlockRadixSort;
    // Allocate shared memory for BlockRadixSort
    __shared__ typename BlockRadixSort::TempStorage temp_storage;
    // Obtain a segment of consecutive items that are blocked across threads
    int thread_keys[ipt];
    // just create some synthetic data in descending order 1023 1022 1021 1020 ...
    for (int i = 0; i < ipt; i++) thread_keys[i] = (tpb-1-threadIdx.x)*ipt+i;
    // Collectively sort the keys
    BlockRadixSort(temp_storage).Sort(thread_keys);
    __syncthreads();
    // write results to output array
    for (int i = 0; i < ipt; i++) data[blockIdx.x*ipt*tpb + threadIdx.x*ipt+i] = thread_keys[i];
}

int main(){
    const int blks = 500;
    int *data;
    cudaMalloc(&data, blks*ipt*tpb*sizeof(int));
    ExampleKernel<<<blks,tpb>>>(data);
    int *h_data = new int[blks*ipt*tpb];
    cudaMemcpy(h_data, data, blks*ipt*tpb*sizeof(int), cudaMemcpyDeviceToHost);
    for (int i = 0; i < 10; i++) std::cout << h_data[i] << " ";
    std::cout << std::endl;
}
$ nvcc -o t1761 t1761.cu -I/path/to/cub/cub-1.8.0
$ CUDA_VISIBLE_DEVICES="2" nvprof ./t1761
==13713== NVPROF is profiling process 13713, command: ./t1761
==13713== Warning: Profiling results might be incorrect with current version of nvcc compiler used to compile cuda app. Compile with nvcc compiler 9.0 or later version to get correct profiling results. Ignore this warning if code is already compiled with the recommended nvcc version
0 1 2 3 4 5 6 7 8 9
==13713== Profiling application: ./t1761
==13713== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 60.35% 308.66us 1 308.66us 308.66us 308.66us [CUDA memcpy DtoH]
39.65% 202.79us 1 202.79us 202.79us 202.79us ExampleKernel(int*)
API calls: 98.39% 210.79ms 1 210.79ms 210.79ms 210.79ms cudaMalloc
0.72% 1.5364ms 1 1.5364ms 1.5364ms 1.5364ms cudaMemcpy
0.32% 691.15us 1 691.15us 691.15us 691.15us cudaLaunchKernel
0.28% 603.26us 97 6.2190us 400ns 212.71us cuDeviceGetAttribute
0.24% 516.56us 1 516.56us 516.56us 516.56us cuDeviceTotalMem
0.04% 79.374us 1 79.374us 79.374us 79.374us cuDeviceGetName
0.01% 13.373us 1 13.373us 13.373us 13.373us cuDeviceGetPCIBusId
0.00% 5.0810us 3 1.6930us 729ns 2.9600us cuDeviceGetCount
0.00% 2.3120us 2 1.1560us 609ns 1.7030us cuDeviceGet
0.00% 748ns 1 748ns 748ns 748ns cuDeviceGetUuid
$
(CUDA 10.2.89, RHEL 7)
Above I am running on a Tesla K20x, which has performance that is "closer" to your 1080ti than a Tesla V100. We see that the kernel execution time is ~200us. If I run the exact same code on a Tesla V100, the kernel execution time drops to ~35us:
$ CUDA_VISIBLE_DEVICES="0" nvprof ./t1761
==13814== NVPROF is profiling process 13814, command: ./t1761
0 1 2 3 4 5 6 7 8 9
==13814== Profiling application: ./t1761
==13814== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 82.33% 163.43us 1 163.43us 163.43us 163.43us [CUDA memcpy DtoH]
17.67% 35.073us 1 35.073us 35.073us 35.073us ExampleKernel(int*)
API calls: 98.70% 316.92ms 1 316.92ms 316.92ms 316.92ms cudaMalloc
0.87% 2.7879ms 1 2.7879ms 2.7879ms 2.7879ms cuDeviceTotalMem
0.19% 613.75us 97 6.3270us 389ns 205.37us cuDeviceGetAttribute
0.19% 601.61us 1 601.61us 601.61us 601.61us cudaMemcpy
0.02% 72.718us 1 72.718us 72.718us 72.718us cudaLaunchKernel
0.02% 59.905us 1 59.905us 59.905us 59.905us cuDeviceGetName
0.01% 37.886us 1 37.886us 37.886us 37.886us cuDeviceGetPCIBusId
0.00% 4.6830us 3 1.5610us 546ns 2.7850us cuDeviceGetCount
0.00% 1.9900us 2 995ns 587ns 1.4030us cuDeviceGet
0.00% 677ns 1 677ns 677ns 677ns cuDeviceGetUuid
$
You'll note there is no "input" array; I'm just synthesizing data in the kernel, since we are primarily interested in performance. If you need to handle an array size like 1000, you should probably just pad each array out to 1024 (e.g. pad with a very large number, then ignore the last numbers in the sorted result).
This code is largely lifted from external documentation. It is offered for instructional purposes. I'm not suggesting it is defect-free or suitable for any particular purpose. Use it at your own risk.
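As one illustration of the padding idea, here is my own sketch (not part of the original answer) of how the kernel's synthetic-data loop could become a guarded load; in is a hypothetical device array holding 500 concatenated 1000-element segments, and the 24 leftover slots per block are filled with INT_MAX so the padding sorts to the end:
#include <cub/cub.cuh>
#include <climits>

const int ipt2 = 8;
const int tpb2 = 128;
const int seg_len = 1000;   // real elements per segment

__global__ void PaddedSortKernel(const int *in, int *out)
{
    // each block sorts one segment in 1024 (= tpb2*ipt2) slots
    typedef cub::BlockRadixSort<int, tpb2, ipt2> BlockRadixSort;
    __shared__ typename BlockRadixSort::TempStorage temp_storage;
    int thread_keys[ipt2];
    // guarded load: slots seg_len..1023 get INT_MAX padding
    for (int i = 0; i < ipt2; i++) {
        int idx = threadIdx.x * ipt2 + i;
        thread_keys[i] = (idx < seg_len) ? in[blockIdx.x * seg_len + idx] : INT_MAX;
    }
    BlockRadixSort(temp_storage).Sort(thread_keys);
    __syncthreads();
    // write back all 1024 sorted values per block; the trailing padding can simply be ignored
    for (int i = 0; i < ipt2; i++)
        out[blockIdx.x * tpb2 * ipt2 + threadIdx.x * ipt2 + i] = thread_keys[i];
}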

Unable to compile thrust code using reduce_by_key

I need to find the minimum values along the columns of a matrix, along with their row indices, using thrust. I use the following code (copied from the Orange Owl solutions site); however, I get errors while compiling. I have posted it as an issue on the corresponding git page. The error message is huge and I don't know how to debug it. Can anyone help me with it? I am using cuda-8.0, thrust version 1.8.
The code:
#include <iterator>
#include <algorithm>
#include <thrust/device_vector.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/iterator/permutation_iterator.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/iterator/discard_iterator.h>
#include <thrust/reduce.h>
#include <thrust/functional.h>
#include <thrust/random.h>

using namespace thrust::placeholders;

int main()
{
    const int Nrows = 6;
    const int Ncols = 8;
    /**************************/
    /* SETTING UP THE PROBLEM */
    /**************************/
    // --- Random uniform integer distribution between 0 and 100
    thrust::default_random_engine rng;
    thrust::uniform_int_distribution<int> dist(0, 20);
    // --- Matrix allocation and initialization
    thrust::device_vector<double> d_matrix(Nrows * Ncols);
    for (size_t i = 0; i < d_matrix.size(); i++) d_matrix[i] = (double)dist(rng);
    printf("\n\nMatrix\n");
    for (int i = 0; i < Nrows; i++) {
        std::cout << " [ ";
        for (int j = 0; j < Ncols; j++)
            std::cout << d_matrix[i * Ncols + j] << " ";
        std::cout << "]\n";
    }
    /**********************************************/
    /* FIND ROW MINIMA ALONG WITH THEIR LOCATIONS */
    /**********************************************/
    thrust::device_vector<float> d_minima(Ncols);
    thrust::device_vector<int> d_indices(Ncols);
    thrust::reduce_by_key(
        thrust::make_transform_iterator(thrust::make_counting_iterator((int) 0), _1 / Nrows),
        thrust::make_transform_iterator(thrust::make_counting_iterator((int) 0), _1 / Nrows) + Nrows * Ncols,
        thrust::make_zip_iterator(
            thrust::make_tuple(thrust::make_permutation_iterator(
                                   d_matrix.begin(),
                                   thrust::make_transform_iterator(thrust::make_counting_iterator((int) 0), (_1 % Nrows) * Ncols + _1 / Nrows)),
                               thrust::make_transform_iterator(thrust::make_counting_iterator((int) 0), _1 % Nrows))),
        thrust::make_discard_iterator(),
        thrust::make_zip_iterator(thrust::make_tuple(d_minima.begin(), d_indices.begin())),
        thrust::equal_to<int>(),
        thrust::minimum<thrust::tuple<float, int> >()
    );
    printf("\n\n");
    for (int i = 0; i < Nrows; i++) std::cout << "Min position = " << d_indices[i] << "; Min value = " << d_minima[i] << "\n";
    return 0;
}
Error:
/usr/local/cuda/bin/../targets/x86_64-linux/include/thrust/system/cuda/detail/bulk/algorithm/reduce_by_key.hpp(58): error: ambiguous "?" operation: second operand of type "const thrust::tuple<double, int, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type>" can be converted to third operand type "thrust::tuple<float, int, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type>", and vice versa
detected during:
instantiation of "thrust::system::cuda::detail::bulk_::detail::reduce_by_key_detail::scan_head_flags_functor<FlagType, ValueType, BinaryFunction>::result_type thrust::system::cuda::detail::bulk_::detail::reduce_by_key_detail::scan_head_flags_functor<FlagType, ValueType, BinaryFunction>::operator()(const thrust::system::cuda::detail::bulk_::detail::reduce_by_key_detail::scan_head_flags_functor<FlagType, ValueType, BinaryFunction>::first_argument_type &, const thrust::system::cuda::detail::bulk_::detail::reduce_by_key_detail::scan_head_flags_functor<FlagType, ValueType, BinaryFunction>::second_argument_type &) [with FlagType=int, ValueType=thrust::tuple<double, int, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type>, BinaryFunction=thrust::minimum<thrust::tuple<float, int, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type>>]"
I guess you are using this code.
A curious characteristic of that code is that the matrix is defined using double type, but the captured minima are stored in a float vector.
If you want to use that code as-is, according to my testing, thrust (in CUDA 10, and apparently also CUDA 8) doesn't like this line:
thrust::minimum<thrust::tuple<float, int> >()
That operator is being used to compare two items to determine which is smaller, and it is templated to accept different kinds of items. However, the compiler has decided that finding the minimum of two of those tuples is an "ambiguous" request. Part of the reason is that the operator returns a (float, int) tuple but is variously given a (double, int) tuple or a (float, int) tuple.
We can fix/work around this by passing our own functor to do the job, one that is explicit in terms of how it handles the tuples passed to it:
$ cat t373.cu
#include <iterator>
#include <algorithm>
#include <thrust/device_vector.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/iterator/permutation_iterator.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/iterator/discard_iterator.h>
#include <thrust/reduce.h>
#include <thrust/functional.h>
#include <thrust/random.h>

using namespace thrust::placeholders;

struct my_min
{
    template <typename T1, typename T2>
    __host__ __device__
    T2 operator()(T1 t1, T2 t2){
        if (thrust::get<0>(t1) < thrust::get<0>(t2)) return t1;
        return t2;
    }
};

int main()
{
    const int Nrows = 6;
    const int Ncols = 8;
    /**************************/
    /* SETTING UP THE PROBLEM */
    /**************************/
    // --- Random uniform integer distribution between 0 and 100
    thrust::default_random_engine rng;
    thrust::uniform_int_distribution<int> dist(0, 20);
    // --- Matrix allocation and initialization
    thrust::device_vector<double> d_matrix(Nrows * Ncols);
    for (size_t i = 0; i < d_matrix.size(); i++) d_matrix[i] = (double)dist(rng);
    printf("\n\nMatrix\n");
    for (int i = 0; i < Nrows; i++) {
        std::cout << " [ ";
        for (int j = 0; j < Ncols; j++)
            std::cout << d_matrix[i * Ncols + j] << " ";
        std::cout << "]\n";
    }
    /**********************************************/
    /* FIND ROW MINIMA ALONG WITH THEIR LOCATIONS */
    /**********************************************/
    thrust::device_vector<float> d_minima(Ncols);
    thrust::device_vector<int> d_indices(Ncols);
    thrust::reduce_by_key(
        thrust::make_transform_iterator(thrust::make_counting_iterator((int) 0), (_1 / Nrows)),
        thrust::make_transform_iterator(thrust::make_counting_iterator((int) 0), (_1 / Nrows)) + Nrows * Ncols,
        thrust::make_zip_iterator(
            thrust::make_tuple(thrust::make_permutation_iterator(
                                   d_matrix.begin(),
                                   thrust::make_transform_iterator(thrust::make_counting_iterator((int) 0), ((_1 % Nrows) * Ncols + _1 / Nrows))),
                               thrust::make_transform_iterator(thrust::make_counting_iterator((int) 0), (_1 % Nrows)))),
        thrust::make_discard_iterator(),
        thrust::make_zip_iterator(thrust::make_tuple(d_minima.begin(), d_indices.begin())),
        thrust::equal_to<int>(),
        my_min()
        // thrust::minimum<thrust::tuple<float, int> >()
    );
    printf("\n\n");
    for (int i = 0; i < Nrows; i++) std::cout << "Min position = " << d_indices[i] << "; Min value = " << d_minima[i] << "\n";
    return 0;
}
$ nvcc -o t373 t373.cu
$ ./t373
Matrix
[ 0 1 12 18 20 3 10 8 ]
[ 5 15 1 11 12 17 12 10 ]
[ 18 20 15 20 6 8 18 13 ]
[ 18 20 3 18 19 6 19 8 ]
[ 6 10 8 16 14 11 12 1 ]
[ 12 9 12 17 10 16 1 4 ]
Min position = 0; Min value = 0
Min position = 0; Min value = 1
Min position = 1; Min value = 1
Min position = 1; Min value = 11
Min position = 2; Min value = 6
Min position = 0; Min value = 3
$
I think a better fix is to just choose one or the other, either float or double. If we modify all float types to double, for example, then thrust is happy, without any other changes:
$ cat t373a.cu
#include <iterator>
#include <algorithm>
#include <thrust/device_vector.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/iterator/permutation_iterator.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/iterator/discard_iterator.h>
#include <thrust/reduce.h>
#include <thrust/functional.h>
#include <thrust/random.h>

using namespace thrust::placeholders;

int main()
{
    const int Nrows = 6;
    const int Ncols = 8;
    /**************************/
    /* SETTING UP THE PROBLEM */
    /**************************/
    // --- Random uniform integer distribution between 0 and 100
    thrust::default_random_engine rng;
    thrust::uniform_int_distribution<int> dist(0, 20);
    // --- Matrix allocation and initialization
    thrust::device_vector<double> d_matrix(Nrows * Ncols);
    for (size_t i = 0; i < d_matrix.size(); i++) d_matrix[i] = (double)dist(rng);
    printf("\n\nMatrix\n");
    for (int i = 0; i < Nrows; i++) {
        std::cout << " [ ";
        for (int j = 0; j < Ncols; j++)
            std::cout << d_matrix[i * Ncols + j] << " ";
        std::cout << "]\n";
    }
    /**********************************************/
    /* FIND ROW MINIMA ALONG WITH THEIR LOCATIONS */
    /**********************************************/
    thrust::device_vector<double> d_minima(Ncols);
    thrust::device_vector<int> d_indices(Ncols);
    thrust::reduce_by_key(
        thrust::make_transform_iterator(thrust::make_counting_iterator((int) 0), (_1 / Nrows)),
        thrust::make_transform_iterator(thrust::make_counting_iterator((int) 0), (_1 / Nrows)) + Nrows * Ncols,
        thrust::make_zip_iterator(
            thrust::make_tuple(thrust::make_permutation_iterator(
                                   d_matrix.begin(),
                                   thrust::make_transform_iterator(thrust::make_counting_iterator((int) 0), ((_1 % Nrows) * Ncols + _1 / Nrows))),
                               thrust::make_transform_iterator(thrust::make_counting_iterator((int) 0), (_1 % Nrows)))),
        thrust::make_discard_iterator(),
        thrust::make_zip_iterator(thrust::make_tuple(d_minima.begin(), d_indices.begin())),
        thrust::equal_to<int>(),
        thrust::minimum<thrust::tuple<double, int> >()
    );
    printf("\n\n");
    for (int i = 0; i < Nrows; i++) std::cout << "Min position = " << d_indices[i] << "; Min value = " << d_minima[i] << "\n";
    return 0;
}
$ nvcc -o t373a t373a.cu
$ ./t373a
Matrix
[ 0 1 12 18 20 3 10 8 ]
[ 5 15 1 11 12 17 12 10 ]
[ 18 20 15 20 6 8 18 13 ]
[ 18 20 3 18 19 6 19 8 ]
[ 6 10 8 16 14 11 12 1 ]
[ 12 9 12 17 10 16 1 4 ]
Min position = 0; Min value = 0
Min position = 0; Min value = 1
Min position = 1; Min value = 1
Min position = 1; Min value = 11
Min position = 2; Min value = 6
Min position = 0; Min value = 3
$
I think this latter solution of using consistent types is the more sensible solution.

Rcpp Parallel or openmp for matrixvector product

I am trying to program the naive parallel version of conjugate gradient, so I started with the simple Wikipedia algorithm, and I want to replace the dot products and matrix-vector products with their appropriate parallel versions. The RcppParallel documentation has the code for the dot product using parallelReduce; I think I'm going to use that version for my code, but I'm trying to make the matrix-vector multiplication work and I haven't achieved good results compared to base R (no parallel).
Some versions of parallel matrix-vector multiplication: using OpenMP, RcppParallel, a serial version, a serial version with Armadillo, and the benchmark:
// [[Rcpp::depends(RcppParallel)]]
#include <Rcpp.h>
#include <RcppParallel.h>
#include <numeric>
// #include <cstddef>
// #include <cstdio>
#include <iostream>
using namespace RcppParallel;
using namespace Rcpp;

struct InnerProduct : public Worker
{
    // source vectors
    const RVector<double> x;
    const RVector<double> y;
    // product that I have accumulated
    double product;
    // constructors
    InnerProduct(const NumericVector x, const NumericVector y)
        : x(x), y(y), product(0) {}
    InnerProduct(const InnerProduct& innerProduct, Split)
        : x(innerProduct.x), y(innerProduct.y), product(0) {}
    // process just the elements of the range I've been asked to
    void operator()(std::size_t begin, std::size_t end) {
        product += std::inner_product(x.begin() + begin,
                                      x.begin() + end,
                                      y.begin() + begin,
                                      0.0);
    }
    // join my value with that of another InnerProduct
    void join(const InnerProduct& rhs) {
        product += rhs.product;
    }
};

struct MatrixMultiplication : public Worker
{
    // source matrix
    const RMatrix<double> A;
    // source vector
    const RVector<double> x;
    // destination matrix
    RMatrix<double> out;
    // initialize with source and destination
    MatrixMultiplication(const NumericMatrix A, const NumericVector x, NumericMatrix out)
        : A(A), x(x), out(out) {}
    // compute the inner product of each requested row with x
    void operator()(std::size_t begin, std::size_t end) {
        for (std::size_t i = begin; i < end; i++) {
            // row we will operate on
            RMatrix<double>::Row rowi = A.row(i);
            //double res = std::inner_product(rowi.begin(), rowi.end(), x.begin(), 0.0);
            //Rcout << "res" << res << std::endl;
            out(i,1) = std::inner_product(rowi.begin(), rowi.end(), x.begin(), 0.0);
            //Rcout << "res" << out(i,1) << std::endl;
        }
    }
};

// [[Rcpp::export]]
double parallelInnerProduct(NumericVector x, NumericVector y) {
    // declare the InnerProduct instance that takes a pointer to the vector data
    InnerProduct innerProduct(x, y);
    // call parallelReduce to start the work
    parallelReduce(0, x.length(), innerProduct);
    // return the computed product
    return innerProduct.product;
}

// library(rbenchmark)
// [[Rcpp::export]]
NumericVector matrixXvectorRcppParallel(NumericMatrix A, NumericVector x) {
    int nrows = A.nrow();
    NumericVector out(nrows);
    for (int i = 0; i < nrows; i++)
    {
        out(i) = parallelInnerProduct(A(i,_), x);
    }
    // return the computed product
    return out;
}

// [[Rcpp::export]]
arma::rowvec matrixXvectorParallel(arma::mat A, arma::colvec x){
    arma::rowvec y = A.row(0)*0;
    int filas = A.n_rows;
    int columnas = A.n_cols;
    #pragma omp parallel for
    for (int j = 0; j < columnas; j++)
    {
        //y(j) = A.row(j)*x(j))
        y(j) = dotproduct(A.row(j), x);
    }
    return y;
}

arma::mat matrixXvector2(arma::mat A, arma::mat x){
    //arma::rowvec y = A.row(0)*0;
    //y = A*x;
    return A*x;
}

arma::rowvec matrixXvectorParallel2(arma::mat A, arma::colvec x){
    arma::rowvec y = A.row(0)*0;
    int filas = A.n_rows;
    int columnas = A.n_cols;
    #pragma omp parallel for
    for (int j = 0; j < columnas; j++){
        double result = 0;
        for (int i = 0; i < filas; i++){
            result += x(i)*A(j,i);
        }
        y(j) = result;
    }
    return y;
}
Benchmark
test replications elapsed relative user.self sys.self user.child sys.child
1 M %*% a 20 0.026 1.000 0.140 0.060 0 0
2 matrixXvector2(M, as.matrix(a)) 20 0.040 1.538 0.101 0.217 0 0
4 matrixXvectorParallel2(M, a) 20 0.063 2.423 0.481 0.000 0 0
3 matrixXvectorParallel(M, a) 20 0.146 5.615 0.745 0.398 0 0
5 matrixXvectorRcppParallel(M, a) 20 0.335 12.885 2.305 0.079 0 0
My latest attempt was using parallelFor with RcppParallel, but I'm getting memory errors and I have no idea where the problem is:
// [[Rcpp::export]]
NumericVector matrixXvectorRcppParallel2(NumericMatrix A, NumericVector x) {
    int nrows = A.nrow();
    NumericMatrix out(nrows, 1);   // allocate memory for the output vector
    // create the worker
    MatrixMultiplication matrixMultiplication(A, x, out);
    parallelFor(0, A.nrow(), matrixMultiplication);
    // return the computed product
    return out;
}
What I notice when I use htop to check how the processors are working is that the conventional base-R matrix-vector multiplication uses all of them. So does matrix multiplication run in parallel by default? Because in theory only one processor should be working if it is the serial version.
If someone knows which is the better path, OpenMP or RcppParallel, or another way that gives me better performance than the apparently serial version of base R, please let me know.
The serial code for conjugate gradient at the moment:
// [[Rcpp::export]]
arma::colvec ConjugateGradient(arma::mat A, arma::colvec xini, arma::colvec b, int num_iteraciones){
    //arma::colvec xnew = xini*0 // initialize with 0's
    arma::colvec x = xini;
    arma::colvec rkold = b - A*xini;
    arma::colvec rknew = b*0;
    arma::colvec pk = rkold;
    int k = 0;
    double alpha_k = 0;
    double betak = 0;
    double normak = 0.0;
    for (k = 0; k < num_iteraciones; k++){
        Rcout << "iteration number " << k << std::endl;
        // sum() of the one-element result to cast it to double
        alpha_k = sum(rkold.t() * rkold) / sum(pk.t()*A*pk);
        x = x + alpha_k * pk;
        rknew = rkold - alpha_k*A*pk;
        normak = sum(rknew.t()*rknew);
        if (normak < 0.000001){
            break;
        }
        betak = sum(rknew.t()*rknew) / sum(rkold.t() * rkold);
        // update values for the next iteration
        pk = rknew + betak*pk;
        rkold = rknew;
    }
    return x;
}
I wasn't aware of the use of BLAS in R; thanks Hong Ooi and tim18. So here is the new benchmark, using options(matprod = "internal") and options(matprod = "blas"):
options(matprod = "internal")
res<-benchmark(M%*%a,matrixXvector2(M,as.matrix(a)),matrixXvectorParallel(M,a),matrixXvectorParallel2(M,a),matrixXvectorRcppParallel(M,a),order="relative",replications = 20)
res
test replications elapsed relative user.self sys.self user.child sys.child
2 matrixXvector2(M, as.matrix(a)) 20 0.043 1.000 0.107 0.228 0 0
4 matrixXvectorParallel2(M, a) 20 0.069 1.605 0.530 0.000 0 0
1 M %*% a 20 0.072 1.674 0.071 0.000 0 0
3 matrixXvectorParallel(M, a) 20 0.140 3.256 0.746 0.346 0 0
5 matrixXvectorRcppParallel(M, a) 20 0.343 7.977 2.272 0.175 0 0
options(matprod="blas")
options(matprod = "blas")
res<-benchmark(M%*%a,matrixXvector2(M,as.matrix(a)),matrixXvectorParallel(M,a),matrixXvectorParallel2(M,a),matrixXvectorRcppParallel(M,a),order="relative",replications = 20)
res
test replications elapsed relative user.self sys.self user.child sys.child
1 M %*% a 20 0.021 1.000 0.093 0.054 0 0
2 matrixXvector2(M, as.matrix(a)) 20 0.092 4.381 0.177 0.464 0 0
5 matrixXvectorRcppParallel(M, a) 20 0.328 15.619 2.143 0.109 0 0
4 matrixXvectorParallel2(M, a) 20 0.438 20.857 3.036 0.000 0 0
3 matrixXvectorParallel(M, a) 20 0.546 26.000 3.667 0.127 0 0
As you already found out, the base R matrix multiplication can be multi-threaded, if a multi-threaded BLAS implementation is used. This is the case for the rocker/* docker images, which typically use OpenBLAS.
In addition, (Rcpp)Armadillo already uses the BLAS library used by R (in this case multi-threaded OpenBLAS) as well as OpenMP. So your "serial" version is actually multi-threaded. You can verify this in htop with a large enough matrix as input.
BTW, what you are trying to do looks like premature optimization to me.

finding shortest path of all nodes from a given node using BFS

When I increase or decrease the INF value, the output behaves unexpectedly. I think INF should not have any effect on the output. The length of each edge is 6.
For the input:
1
4 2
1 2
1 3
1
the output is 6 6 -1.
When I change INF to 1e8, the output is 0 0 0.
#include <iostream>
#include <vector>
#include <queue>
#include <cstring>
using namespace std;
#define MAX 2000
#define INF 1000000

vector<int> adj[MAX];
int d[MAX];
bool visited[MAX];

void initialise(){
    for (int i = 0; i <= MAX; i++){
        visited[i] = false;
    }
}

void shortestPathBfs(int start){
    queue<int> q;
    q.push(start);
    visited[start] = true;
    d[start] = 0;
    while (!q.empty()){
        int p = q.front();
        q.pop();
        for (int i = 0; i < adj[p].size(); i++){
            int v = adj[p][i];
            if (!visited[v] && d[v] > d[p] + 6){
                d[v] = d[p] + 6;
                q.push(v);
                visited[v] = true;
            }
        }
    }
}

int main(){
    int T, N, M, S, x, y;
    cin >> T;
    while (T--){
        cin >> N >> M;
        for (int i = 0; i < M; i++){
            cin >> x >> y;
            adj[x].push_back(y);
            adj[y].push_back(x);
        }
        cin >> S;
        initialise();
        memset(d, INF, sizeof(d));
        shortestPathBfs(S);
        for (int i = 1; i <= N; i++) {
            if (i == S)
                continue;
            if (d[i] >= INF)
                cout << "-1" << " ";
            else
                cout << d[i] << " ";
        }
    }
}
The problem is with
memset(d, INF, sizeof(d));
memset() only fills memory with a byte value. Here it winds up filling the array with the least significant byte of the int value INF. With INF = 1000000 that byte is 0x40, so each element becomes 0x40404040, which still happens to act like a large distance; with INF = 1e8 the low byte is 0x00, so the whole array is zeroed and the BFS can never relax any distance (hence the 0 0 0 output). To fix it, write an explicit for loop or use std::fill() instead.
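A minimal self-contained sketch of the std::fill() fix:
#include <algorithm>   // std::fill
#include <iostream>
#define MAX 2000
#define INF 1000000
int d[MAX];

int main() {
    // memset(d, INF, sizeof(d)) would set every *byte* to 0x40 (the low byte
    // of 1000000); std::fill assigns the whole int value to each element
    std::fill(d, d + MAX, INF);
    std::cout << d[0] << "\n";   // prints 1000000
    return 0;
}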
