I tried to run a very simple sparse matrix-vector product (V0) on my laptop, but it is very slow. A naive implementation (V1) is much faster:
>> g++ -march=native -O3 -ftree-vectorize -funroll-loops -ffast-math -fstrict-aliasing -o matVecProdV0.exe matVecProdV0.cpp -I /path/to/eigen-eigen-5a0156e40feb/local/include/eigen3 -mavx -fopenmp
>> g++ -march=native -O3 -ftree-vectorize -funroll-loops -ffast-math -fstrict-aliasing -o matVecProdV1.exe matVecProdV1.cpp
>> ./matVecProdV0.exe 10000 100
134536 ms
>> ./matVecProdV1.exe 10000 100
498 ms
What did I miss?
My laptop has 4 logical processors (2 cores + hyperthreading). I am using the latest Debian/testing with g++-7.2.
>> cat /proc/cpuinfo
model name : Intel(R) Core(TM) i7-3687U CPU @ 2.10GHz
flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx rdtscp lm constant_tsc arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc cpuid aperfmperf pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 ssse3 cx16 xtpr pdcm pcid sse4_1 sse4_2 x2apic popcnt tsc_deadline_timer aes xsave avx f16c rdrand lahf_lm cpuid_fault epb tpr_shadow vnmi flexpriority ept vpid fsgsbase smep erms xsaveopt dtherm ida arat pln pts
The code is:
>> more *.cpp
::::::::::::::
matVecProdV0.cpp
::::::::::::::
#include <iostream>
#include <string>
#include <sstream>
#include <chrono>
#include <cmath>
#include <cstdlib> // rand.
#include <Eigen/Sparse>
#include <vector>
using namespace std;
#define AVE 75000
int main(int argc, char ** argv) {
if (argc != 3 || !argv) return 1;
size_t n = 0; stringstream sn(argv[1]); sn >> n; if (n <= 0) return 1;
size_t p = 0; stringstream sp(argv[2]); sp >> p; if (p <= 0 || p > n) return 1;
vector<Eigen::Triplet<double>> ijAij;
ijAij.reserve(n*p); // We have p values per col.
for (size_t i = 0; i < n; i++) {
for (size_t j = 0; j < p; j++) { // We have p values per col.
ijAij.push_back(Eigen::Triplet<double> (i, rand()%n, 1.)); // Get column in the range 0 to n-1.
}
}
Eigen::SparseMatrix<double> mat(n, n);
mat.reserve(n*p); // We have p values per col.
mat.setFromTriplets(ijAij.begin(), ijAij.end());
Eigen::VectorXd vec(n);
for (size_t i = 0; i < n; i++) vec(i) = 1.;
Eigen::VectorXd res(n);
for (size_t i = 0; i < n; i++) res(i) = 0.;
auto start = chrono::high_resolution_clock::now();
for (size_t a = 0; a < AVE; a++) { // Average.
res += mat*vec;
}
auto end = chrono::high_resolution_clock::now();
cout << chrono::duration_cast<chrono::milliseconds>(end-start).count() << " ms" << flush;
return 0;
}
::::::::::::::
matVecProdV1.cpp
::::::::::::::
#include <iostream>
#include <string>
#include <sstream>
#include <chrono>
#include <cmath>
#include <cstdlib> // rand.
using namespace std;
#define AVE 75000
int main(int argc, char ** argv) {
if (argc != 3 || !argv) return 1;
size_t n = 0; stringstream sn(argv[1]); sn >> n; if (n <= 0) return 1;
size_t p = 0; stringstream sp(argv[2]); sp >> p; if (p <= 0 || p > n) return 1;
int * pMatIr = new int[n+1]; pMatIr[0] = 0;
int nnz = n*p; // Number of non null values: p values per col * n cols.
int * pMatJc = new int[nnz];
double * pMatVal = new double[nnz];
size_t s = 0; // Scan.
for (size_t i = 0; i < n; i++) {
pMatIr[i+1] = p; // We have p values per col.
for (size_t j = 0; j < p; j++) {
pMatJc[s] = rand()%n; // Get column in the range 0 to n-1.
pMatVal[s] = 1.;
s++;
}
}
double * pVec = new double[n];
for (size_t i = 0; i < n; i++) pVec[i] = 1.;
double * pRes = new double[n];
for (size_t i = 0; i < n; i++) pRes[i] = 0.;
auto start = chrono::high_resolution_clock::now();
for (size_t a = 0; a < AVE; a++) { // Average.
for (size_t i = 0; i < n; i++) {
int startJc = pMatIr[i];
size_t nbJc = pMatIr[i+1] - startJc;
for (size_t j = 0; j < nbJc; j++) {
pRes[i] += pMatVal[pMatJc[startJc+j]]*pVec[i];
}
}
}
auto end = chrono::high_resolution_clock::now();
cout << chrono::duration_cast<chrono::milliseconds>(end-start).count() << " ms" << flush;
if (pMatIr) {delete [] pMatIr; pMatIr = NULL;}
if (pMatJc) {delete [] pMatJc; pMatJc = NULL;}
if (pMatVal) {delete [] pMatVal; pMatVal = NULL;}
if (pVec) {delete [] pVec; pVec = NULL;}
if (pRes) {delete [] pRes; pRes = NULL;}
return 0;
}
Why is Eigen slower than the naive implementation?
Franck
If I understood your naive implementation correctly, the line pMatIr[i+1] = p; should be pMatIr[i+1] = pMatIr[i] + p;, and pVec in the inner loop should be indexed by the stored column index (pMatJc[startJc+j]) rather than by i. Once fixed, the Eigen version actually runs faster on my system (by roughly 40% or more).
Note that the wrong code is fast because the for(j) loop performs almost no work: pMatIr[i+1] - startJc is zero for all i > 0.
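For reference, a minimal sketch of the corrected CSR kernel (names follow the code above; it assumes pMatIr was built cumulatively, i.e. pMatIr[i+1] = pMatIr[i] + p):

#include <cstddef>

// Standard CSR sparse mat-vec: for row i, walk its slice of the value and
// column arrays and gather the vector entry at each stored column index.
void csrMatVec(const int *pMatIr, const int *pMatJc, const double *pMatVal,
               const double *pVec, double *pRes, std::size_t n) {
    for (std::size_t i = 0; i < n; i++) {
        int startJc = pMatIr[i];
        std::size_t nbJc = pMatIr[i + 1] - startJc; // now p for every row
        for (std::size_t j = 0; j < nbJc; j++) {
            // value taken at position startJc+j (not at the column index),
            // vector taken at the stored column index
            pRes[i] += pMatVal[startJc + j] * pVec[pMatJc[startJc + j]];
        }
    }
}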
I have been trying to implement some code that calls reduce on a thrust::device_ptr, and the results are not consistent with the CPU implementation when dealing with large values. I have to deal with large values, so is there a way around this?
My code:
#include <cuda_runtime_api.h>
#include <stdio.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <iostream>
#define NZ 412
#define NX 402
using namespace std;
using real =double;
void allocate_array_2d(real**& preal, const int dim1, const int dim2) {
// Contiguous allocation of 2D arrays
preal = new real * [dim1];
preal[0] = new real[dim1 * dim2];
for (int i = 1; i < dim1; i++) preal[i] = preal[i - 1] + dim2;
for (int i = 0; i < dim1; i++) {
for (int j = 0; j < dim2; j++) {
preal[i][j] = 0;
}
}
}
#define cudaCheckError(code) \
{ \
if ((code) != cudaSuccess) { \
fprintf(stderr, "Cuda failure %s:%d: '%s' \n", __FILE__, __LINE__, \
cudaGetErrorString(code)); \
} \
}
int main()
{
real** a;
std::cout.precision(30);
allocate_array_2d(a, NZ, NX);//input array
for (int i = 0; i < NZ; i++) {
for (int j = 0; j < NX; j++) {
a[i][j] = 2.14748e+09;
}
}
real* da;
cudaCheckError(cudaMalloc(&da, NZ * NX * sizeof(real)));
cudaCheckError(cudaMemcpy(da,a[0], NZ * NX * sizeof(real),cudaMemcpyHostToDevice));
///************************
//CUDA KERNELS ARE HERE
// REMOVED FOR CLEAR QUESTION
///*************************
real sum1=0;
thrust::device_ptr<real> dev_ptr = thrust::device_pointer_cast(da);
sum1 = thrust::reduce(dev_ptr, dev_ptr+NZ*NX, 0, thrust::plus<real>());
cout<<" \nsum gpu "<< sum1<<"\n";
real sum2=0;
////////CPU PART DOING SAME THING//////
for (int i = 0; i < NZ; i++) {
for (int j = 0; j < NX; j++) {
sum2 += a[i][j];
}
}
cout<<"\nsum cpu "<< sum2<<"\n";
if((sum2-sum1)<0.001)
std::cout << "\nSUCCESS " << "\n";
else
std::cout << "\nFailure & by "<<sum2-sum1<< "\n";
}
The compiler I am using is nvcc, and my graphics card is an NVIDIA 1650 with compute capability 7.5.
According to the documentation, thrust expects the type for summation to be reflected in the init value:
sum1 = thrust::reduce(dev_ptr, dev_ptr+NZ*NX, 0, thrust::plus<real>());
^
The type of that constant you have is an integral type. If you change that to a double-precision constant:
sum1 = thrust::reduce(dev_ptr, dev_ptr+NZ*NX, 0.0, thrust::plus<real>());
you get matching results between the CPU and GPU, according to my testing. (You could alternatively cast your constant to the real type, (real)0, and use that; there are other ways to address this as well, such as dropping the use of the init value and the binary op.)
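A small self-contained sketch of those two alternatives: cast the init constant to real, or drop init and the binary op entirely (the two-argument overload of thrust::reduce defaults to value_type() and thrust::plus). The vector shape and fill value mirror the question:

#include <thrust/device_vector.h>
#include <thrust/reduce.h>
#include <thrust/functional.h>
#include <iostream>

using real = double;

int main() {
    // same shape and value as in the question: 412 x 402 entries of 2.14748e+09
    thrust::device_vector<real> d(412 * 402, 2.14748e+09);
    real s1 = thrust::reduce(d.begin(), d.end(), (real)0, thrust::plus<real>());
    real s2 = thrust::reduce(d.begin(), d.end()); // init and binary op dropped
    std::cout.precision(30);
    std::cout << s1 << "\n" << s2 << "\n";        // both now accumulate in double
    return 0;
}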
I am trying to use OpenMP to parallelize my loop to make it faster. The problem is that the parallelized version is not faster than the sequential version:
#include <Rcpp.h>
#include <iostream>
// [[Rcpp::plugins(openmp)]]
#include <omp.h>
#include "test.h"
using namespace std;
// [[Rcpp::export]]
std::vector<double> parallel_random_sum(int n, int ncores) {
std::vector<double> res(n);
#pragma omp parallel num_threads(ncores)
{
#pragma omp for
for (int j = 0; j < n; ++j) {
double lres(0);
// cout << "j = "<<j <<" test = " << lres<<endl;
lres += j;
res[j] = lres / n;
}
}
return res;
}
// [[Rcpp::export]]
std::vector<double> not_parallel_random_sum(int n) {
std::vector<double> res(n);
for (int j = 0; j < n; ++j) {
double lres(0);
// cout << "j = "<<j <<" test = " << lres<<endl;
lres += j;
res[j] = lres / n;
}
return res;
}
/*** R
microbenchmark::microbenchmark(
parallel_random_sum(1e7, 8),
not_parallel_random_sum(1e7),
times = 20
)
*/
Result:
parallel_random_sum(1e+07,8) 62.02360 milliseconds
not_parallel_random_sum(1e+07) 65.56082 milliseconds
The code you are trying to parallelize is just not expensive enough, making the overhead of the parallelization comparable to the gain. If you add some artificial workload in the loop by sleeping for a short amount of time, you can see the performance gain:
#include <chrono>
#include <thread>
#include <Rcpp.h>
// [[Rcpp::plugins(openmp)]]
#include <omp.h>
// [[Rcpp::depends(RcppParallel)]]
#include <RcppParallel.h>
// [[Rcpp::export]]
Rcpp::NumericVector parallel_sleep(int n, int ncores) {
Rcpp::NumericVector res_(n);
RcppParallel::RVector<double> res(res_);
#pragma omp parallel num_threads(ncores)
{
#pragma omp for
for (int j = 0; j < n; ++j) {
double lres(0);
std::this_thread::sleep_for(std::chrono::microseconds(10));
lres += j;
res[j] = lres / n;
}
}
return res_;
}
// [[Rcpp::export]]
Rcpp::NumericVector not_parallel_sleep(int n) {
Rcpp::NumericVector res(n);
for (int j = 0; j < n; ++j) {
double lres(0);
std::this_thread::sleep_for(std::chrono::microseconds(10));
lres += j;
res[j] = lres / n;
}
return res;
}
/*** R
N <- 1e4
bench::mark(
parallel_sleep(N, 8),
not_parallel_sleep(N)
)
*/
Result:
# A tibble: 2 x 14
expression min mean median max `itr/sec` mem_alloc n_gc n_itr total_time result memory time gc
<chr> <bch:tm> <bch:tm> <bch:tm> <bch> <dbl> <bch:byt> <dbl> <int> <bch:tm> <list> <list> <lis> <list>
1 parallel_sle… 73.2ms 81.3ms 82.3ms 87ms 12.3 80.7KB 0 7 569ms <dbl [1… <Rprofme… <bch… <tibbl…
2 not_parallel… 667.8ms 667.8ms 667.8ms 668ms 1.50 80.7KB 0 1 668ms <dbl [1… <Rprofme… <bch… <tibbl…
Note that I am also using data structures from RcppParallel to avoid the need for a deep copy when returning the data (cf. the comment by @coatless).
I'm currently working on an MPI program and I'm trying to send blocks of a matrix to all processes with MPI_Scatterv.
Process description
The matrix is given as an array.
First, I produce a datatype with MPI_Type_vector to create the necessary block out of the original array.
Second, I create an MPI_Type_struct that should hold rows of blocks.
#include <math.h>
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>
#define n 16
int main(int argc, char *argv[])
{
MPI_Init(&argc, &argv);
MPI_Comm comm = MPI_COMM_WORLD;
int p,r;
MPI_Comm_size(comm, &p);
MPI_Comm_rank(comm, &r);
int *arr;
arr = NULL;
if (r == 0){
arr = (int *) malloc(n * n * sizeof(int));
for (int i = 0; i < n * n; i++) arr[i] = i;
for (int i = 0; i < n; i++){
printf("\n");
for (int j = 0; j < n; j++)
printf("%4d", arr[i * n + j]);
}
}
printf("\n");
int ps = sqrt(p);
int ns = n / ps;
if (r == 0) {
printf("ps: %d ns: %d\n", ps, ns);
}
/* create datatype */
MPI_Datatype block;
MPI_Type_vector(ns, ns, n, MPI_INT, &block);
int blocks[ps];
MPI_Aint displs[ps];
for (int i = 0; i < ps; i++) {
blocks[i] = 1;
displs[i] = i * sizeof(int);
}
MPI_Datatype types[ps];
//for (int i = 0; i < ps - 1; i++) types[i] = block;
//types[ps - 1] = MPI_UB;
types[0] = block;
for (int i = 1; i < ps; i++) types[i] = MPI_UB;
//types[0] = block;
//types[1] = MPI_UB;
if (r == 0) {
printf("displs:\n");
for(int i = 0; i < ps; i++) printf("%3ld", displs[i]);
printf("\n");
}
MPI_Datatype row;
MPI_Type_struct(ps, blocks, displs, types, &row);
MPI_Type_commit(&row);
/* prepare scatter */
int sdispl[p]; int sendcounts[p];
for (int i = 0; i < p; i++) {
sdispl[i] = (i % ps) + (i / ps) * (ns * ps);
sendcounts[i] = 1;
}
if (r == 0) {
printf("sdispl: \n");
for (int i = 0; i < 4; i++) printf("%3d", sdispl[i]);
printf("\n");
}
int rcv[ns * ns];
MPI_Scatterv(arr, sendcounts, sdispl, row, rcv, ns * ns, MPI_INT, 0, comm);
int result = 1;
if (r == result) {
printf("result for %d:\n", result);
for (int i = 0; i < ns * ns; i++) {
printf("%4d", rcv[i]);
if ((i+1) % ns == 0) printf("\n");
}
}
if (arr != NULL) free(arr);
MPI_Finalize();
return 0;
}
So far the structure of the blocks is correct.
The problem
The block that was sent to process r = 1 starts with 3 instead of 4. The block for process r = 2 starts with 6, and the one for process r = 3 starts with 9.
For r == 4 it jumps to 48.
What it should do
r start
0 0
1 4
2 8
3 12
4 64
5 68
6 ...
15 204
The help I would need
I think that I'm making some mistake with displs and sdispl.
Compiling and Running the example
The code is compiled with the following command:
mpicc -o main main.c -lm
I run the code with:
mpirun -np 16 ./main
Thanks for any help in advance!
With the hint from Zulan I was able to solve my problem.
The following code is based on the excellent answer about subarrays.
#include <math.h>
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>
#define n 8
void print_arr(int *arr, int x) {
printf("\n");
for (int i = 0; i < x*x; i++){
if (i % x == 0) printf("\n");
printf("%4d", arr[i]);
}
printf("\n");
}
int main(int argc, char *argv[])
{
MPI_Init(&argc, &argv);
MPI_Comm comm = MPI_COMM_WORLD;
int p, r;
MPI_Comm_size(comm, &p);
MPI_Comm_rank(comm, &r);
/* number of proceses in dim x and dim y */
int ps = sqrt(p);
/* number of elements in dim x and dim y in sarr */
int ns = n/ps;
/* array of data - distributed by process 0 */
int *arr = NULL;
if (r==0) {
arr = (int *) malloc(n * n * sizeof(int));
for (int i = 0; i < n*n; i++) arr[i] = i;
print_arr(arr, n);
}
MPI_Datatype type, resizedtype;
int sizes[2] = {n,n};
int subsizes[2] = {ns,ns};
int starts[2] = {0,0};
MPI_Type_create_subarray(2, sizes, subsizes, starts, MPI_ORDER_C, MPI_INT, &type);
MPI_Type_create_resized(type, 0, ns*sizeof(int), &resizedtype);
MPI_Type_commit(&resizedtype);
int counts[p];
for (int i = 0; i < p; i++) counts[i] = 1;
int displs[p];
for (int i = 0; i < p; i++) displs[i] = i%ps + i/ps * ns * ps;
/* subarray to store distributed data */
int sarr[ns * ns];
/* send submatrices to all processes */
MPI_Scatterv(arr, counts, displs, resizedtype, sarr, ns*ns, MPI_INT, 0, comm);
/* print received data for process pr */
int pr = 3;
if (r == pr)
print_arr(sarr, ns);
/* free arr */
if (arr != NULL) free(arr);
MPI_Finalize();
return 0;
}
You can compile the example with
mpicc -o main main.c
and run it with
mpirun -np 4 ./main
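The key point in the code above is that MPI_Type_create_resized shrinks the extent of the subarray type to ns*sizeof(int), so the displacements passed to MPI_Scatterv are counted in units of ns ints. A hypothetical helper (not part of the program above), just to show the arithmetic for the n = 8, ps = 2, ns = 4 case:

#include <stdio.h>

#define n  8
#define ps 2            /* processes per dimension */
#define ns (n / ps)     /* block edge length */

int main(void) {
    for (int i = 0; i < ps * ps; i++) {
        int displ  = i % ps + i / ps * ns * ps; /* what the code above computes */
        int offset = displ * ns;                /* element where block i starts */
        printf("rank %2d: displ %2d -> element %2d\n", i, displ, offset);
    }
    /* prints element offsets 0, 4, 32, 36: the top-left corners of the four 4x4 blocks */
    return 0;
}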
I have a task that is to be accelerated with OpenACC. I need to do dynamic memory allocation within a kernel computation. I've built a simpler demo for it, as follows.
#include <iostream>
using namespace std;
#pragma acc routine seq
int *routine(int init) {
int *ptr;
#pragma acc data create(ptr[:10])
for (int i = 0; i < 10; ++i) {
ptr[i] = init + i;
}
return ptr;
}
void print_array(int *arr) {
for (int i = 0; i < 10; ++i) {
cout << arr[i] << " ";
}
cout << endl;
}
int main(void) {
int *arrs[5];
#pragma acc kernels
for (int i = 0; i < 5; ++i) {
arrs[i] = routine(i);
}
for (int i = 0; i < 5; ++i) {
print_array(arrs[i]);
}
return 0;
}
In this demo, I'm trying to call the routine while running inside a kernels construct. The routine wants to create some data on the GPU and put some values into it.
The code compiles, but it reports runtime problems as follows.
lisanhu#lisanhu-XPS-15-9550:create_and_copyout$ pgc++ -o test main.cc -acc -Minfo=accel
routine(int):
6, Generating acc routine seq
main:
23, Generating implicit copyout(arrs[:])
26, Accelerator restriction: size of the GPU copy of arrs is unknown
Loop is parallelizable
Generating implicit copy(arrs[:][:])
Accelerator kernel generated
Generating Tesla code
26, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */
lisanhu#lisanhu-XPS-15-9550:create_and_copyout$ ./test
call to cuStreamSynchronize returned error 715: Illegal instruction
I'm wondering what I should do to accomplish this task (dynamically allocating memory while processing a kernels construct). I'd really appreciate it if you could help.
This is untested, and probably very slow, but this might do what you need it to.
int main() {
const int num = 20;
int a[10] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 0};
int* sizes = (int *)malloc(num * sizeof(int));
int *ptrs[num];
int* temp, *temp2;
int sum;
int* finished = (int *)malloc(num * sizeof(int));
for (int x = 0; x < num; ++x){
finished[x] = 0;
}
#pragma acc kernels copyin(a[0:10]) copyout(ptrs[:num][:1]) async(num*2+1)
{
#pragma acc loop private(temp)
for (int i = 0; i < num; ++i){
#pragma acc loop seq async(i)
for (int j = 0; j < 1; ++j){
temp = ptrs[x];
sizes[i] = ...
}
while (ptrs[x] != x);
ptrs[x] = routine(a, sizes[i]);
}
}
while (true){
sum = 0;
for (int x = 0; x < num; ++x){
sum += finished[x];
}
if (sum == num){
break;
}
for (int x = 0; x < num; ++x){
if (acc_async_test(x) != 0 && finished[x] == 0){
finished[x] = 1;
#pragma acc update host(sizes[x:1])
temp = (int *)malloc(sizes[x] * sizeof(int));
#pragma acc enter data copyin(temp[0:x])
temp2 = acc_deviceptr(temp);
ptrs[x] = temp2;
#pragma acc update device(ptrs[x:1][0:1])
}
}
}
}
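For the fixed-size buffers used in the original demo, a simpler alternative (a sketch, not what the code above does) is to avoid device-side allocation entirely: pre-allocate a buffer whose shape the compiler knows and let the parallel region copy it out. This only covers the case where each routine's size is known up front, as in the demo:

#include <iostream>

int main() {
    int arrs[5][10]; // one fixed-size row per "routine" call from the demo

    // Fill everything on the device; copyout works because the shape is known.
    #pragma acc parallel loop copyout(arrs)
    for (int i = 0; i < 5; ++i) {
        for (int j = 0; j < 10; ++j) {
            arrs[i][j] = i + j; // same values the demo's routine(i) produces
        }
    }

    for (int i = 0; i < 5; ++i) {
        for (int j = 0; j < 10; ++j) std::cout << arrs[i][j] << " ";
        std::cout << std::endl;
    }
    return 0;
}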
I implemented a recursive scan (prefix sum) algorithm, which I've included below. Here, I simply generate random lists whose sizes are powers of two, up to 2^27, checking against a simple sequential scan for accuracy. It works.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <math.h>
#include <mkl.h>
int *pscan(int *x, int n, int z, int chunk_size);
int reduce(int *x, int n);
int main(int argc, char **argv)
{
int n;
int i, j, k;
int *x, *seq, *r;
double begin, end;
srand48(time(0));
/* Randomly generate array of size n. */
for (k = 2; k < 28; k++) {
n = (int) pow(2, k);
seq = (int *) malloc(sizeof(int) * n);
x = (int *) malloc(sizeof(int) * n);
for (i = 0; i < n; i++) {
x[i] = lrand48() % 100 - 50;
seq[i] = x[i];
}
/* Parallel scan. */
begin = dsecnd();
r = pscan(x, n, 0, 2);
end = dsecnd();
printf("%d %lf\n", n, end - begin);
/* Sequential check. */
for (i = 1; i < n; i++) {
seq[i] = seq[i - 1] + seq[i];
}
for (i = 0; i < n; i++) {
if (r[i] != seq[i]) {
fprintf(stderr, "AGGGHHH!!! ERROR. Found with vector: \n");
for (j = 0; j < n; j++) {
printf("%d ", x[j]);
}
printf("\n");
exit(1);
}
}
free(r);
free(x);
free(seq);
}
return 0;
}
/* Perform parallel scan. */
int *pscan(int *x, int n, int z, int chunk_size)
{
int i, j;
int *sums, *sumscan, *scan, **fsum, *rv;
/* Base case, serially scan a chunk. */
if (n <= chunk_size) {
scan = (int *) malloc(sizeof(int) * n);
scan[0] = x[0] + z;
for (i = 1; i < n; i++) {
scan[i] = x[i] + scan[i - 1];
}
return scan;
}
sums = (int *) malloc(sizeof(int) * (n / chunk_size));
/* Reduce each chunk of the array. */
for (i = 0; i < n / chunk_size; i++) {
sums[i] = reduce(&x[i * chunk_size], chunk_size);
}
/* Perform a scan on the sums. */
sumscan = pscan(sums, n / chunk_size, 0, chunk_size);
free(sums);
fsum = (int **) malloc(sizeof(int *) * (n / chunk_size));
/* Perform a recursive scan on each chunk, using
the appropriate offset from the sums scan. */
for (i = 0; i < n / chunk_size; i++) {
if (i > 0) {
fsum[i] = pscan(&x[i * chunk_size], chunk_size, sumscan[i - 1], chunk_size);
} else {
fsum[i] = pscan(&x[i * chunk_size], chunk_size, 0, chunk_size);
}
}
free(sumscan);
rv = (int *) malloc(sizeof(int) * n);
/* Join the arrays. */
for (i = 0; i < n / chunk_size; i++) {
for (j = 0; j < chunk_size; j++) {
rv[i * chunk_size + j] = fsum[i][j];
}
}
for (i = 0; i < n / chunk_size; i++) {
free(fsum[i]);
}
free(fsum);
return rv;
}
/* Serial reduction. */
int reduce(int *x, int n)
{
int i;
int sum;
sum = 0;
for (i = 0; i < n; i++) {
sum += x[i];
}
return sum;
}
Now, I'd like to parallelize it. Because I'm feeling a little hipster-ish, I've hacked up a Cilk implementation. I just replaced the two main for loops to parallelize 1) the reduction and 2) the recursive scan of each chunk, using the appropriate scan of the chunk reductions as an offset. It looks like so.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <math.h>
#include <cilk/cilk.h>
#include <mkl.h>
int *pscan(int *x, int n, int z, int chunk_size);
int reduce(int *x, int n);
int main(int argc, char **argv)
{
int n;
int i, j, k;
int *x, *seq, *r;
double begin, end;
srand48(time(0));
/* Randomly generate array of size n. */
for (k = 2; k < 28; k++) {
n = (int) pow(2, k);
seq = (int *) malloc(sizeof(int) * n);
x = (int *) malloc(sizeof(int) * n);
for (i = 0; i < n; i++) {
x[i] = lrand48() % 100 - 50;
seq[i] = x[i];
}
/* Parallel scan. */
begin = dsecnd();
r = pscan(x, n, 0, 2);
end = dsecnd();
printf("%d %lf\n", n, end - begin);
/* Sequential check. */
for (i = 1; i < n; i++) {
seq[i] = seq[i - 1] + seq[i];
}
for (i = 0; i < n; i++) {
if (r[i] != seq[i]) {
fprintf(stderr, "AGGGHHH!!! ERROR. Found with vector: \n");
for (j = 0; j < n; j++) {
printf("%d ", x[j]);
}
printf("\n");
exit(1);
}
}
free(r);
free(x);
free(seq);
}
return 0;
}
/* Perform parallel scan. */
int *pscan(int *x, int n, int z, int chunk_size)
{
int i, j;
int *sums, *sumscan, *scan, **fsum, *rv;
/* Base case, serially scan a chunk. */
if (n <= chunk_size) {
scan = (int *) malloc(sizeof(int) * n);
scan[0] = x[0] + z;
for (i = 1; i < n; i++) {
scan[i] = x[i] + scan[i - 1];
}
return scan;
}
sums = (int *) malloc(sizeof(int) * (n / chunk_size));
/* Reduce each chunk of the array. */
cilk_for (i = 0; i < n / chunk_size; i++) {
sums[i] = reduce(&x[i * chunk_size], chunk_size);
}
/* Perform a scan on the sums. */
sumscan = pscan(sums, n / chunk_size, 0, chunk_size);
free(sums);
fsum = (int **) malloc(sizeof(int *) * (n / chunk_size));
/* Perform a recursive scan on each chunk, using
the appropriate offset from the sums scan. */
cilk_for (i = 0; i < n / chunk_size; i++) {
if (i > 0) {
fsum[i] = pscan(&x[i * chunk_size], chunk_size, sumscan[i - 1], chunk_size);
} else {
fsum[i] = pscan(&x[i * chunk_size], chunk_size, 0, chunk_size);
}
}
free(sumscan);
rv = (int *) malloc(sizeof(int) * n);
/* Join the arrays. */
for (i = 0; i < n / chunk_size; i++) {
for (j = 0; j < chunk_size; j++) {
rv[i * chunk_size + j] = fsum[i][j];
}
}
for (i = 0; i < n / chunk_size; i++) {
free(fsum[i]);
}
free(fsum);
return rv;
}
/* Serial reduction. */
int reduce(int *x, int n)
{
int i;
int sum;
sum = 0;
for (i = 0; i < n; i++) {
sum += x[i];
}
return sum;
}
And it works! Well, it returns correct results. However, it doesn't achieve the performance I had hoped for. The original performance was:
4 0.000004
8 0.000001
16 0.000002
32 0.000003
64 0.000005
128 0.000011
256 0.000019
512 0.000035
1024 0.000068
2048 0.000130
4096 0.000257
8192 0.000512
16384 0.001129
32768 0.002262
65536 0.004519
131072 0.009065
262144 0.018297
524288 0.037416
1048576 0.078307
2097152 0.157448
4194304 0.313855
8388608 0.625689
16777216 1.251949
33554432 2.589439
67108864 5.084731
134217728 10.402186
for the single-threaded application, but the Cilk version performed worse, with the following runtimes:
4 0.005383
8 0.000011
16 0.000009
32 0.000111
64 0.000055
128 0.000579
256 0.000339
512 0.000544
1024 0.000701
2048 0.001086
4096 0.001265
8192 0.001742
16384 0.002283
32768 0.003891
65536 0.005398
131072 0.009255
262144 0.020736
524288 0.058156
1048576 0.103893
2097152 0.215460
4194304 0.419988
8388608 0.749368
16777216 1.650938
33554432 2.960451
67108864 5.799836
134217728 11.294398
I have a 24-core machine, so we're obviously not seeing the speed-up we would hope for here. My first thought was that Cilk is mishandling the recursion, causing oversubscription, but Cilk is specifically supposed to handle recursion well. Any tips on how to implement this properly? I tried adding cilk_for to the bottom for loop (freeing everything) and the inner for-loop of the penultimate set of loops (joining the array), but that slowed performance down even more.
Any advice is well-appreciated.
However, please don't tell me to switch to Blelloch's parallel scan algorithm discussed here. I already implemented that in Cilk, and it worked quite well. I'd like to see if I can match its performance with this recursive solution.
I fixed my performance problems by finding the optimal chunk size for each problem. At that chunk size, the (same) parallel version performs better than the sequential version.
In summary, there were a few things wrong with both my general approach and particularly the chunk size of two:
My benchmarking approach. In a code with a tuning parameter, it doesn't make much sense to plot runtime vs. problem size using the same value for the tuning parameter because the optimal value is dependent on problem size.
A chunk size of two was likely problematic because, while it maximizes parallelism, it also maximizes the number of levels of recursion and, likewise, the overhead that comes along with it.
A chunk size of two prevents vectorization.
As Leeor suggested, a chunk size of two probably also leads to false sharing in the cache.
Props to Leeor for leading me in the right direction.
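A minimal sketch of that tuning step (assuming the same pscan() interface and the MKL dsecnd() timer used above): sweep power-of-two chunk sizes for a given n and keep the fastest. Larger chunks mean fewer recursion levels and better vectorization; smaller chunks mean more parallelism.

#include <stdio.h>
#include <stdlib.h>
#include <mkl.h>

int *pscan(int *x, int n, int z, int chunk_size); /* same prototype as above */

/* Hypothetical helper: time pscan() over power-of-two chunk sizes and
   report the fastest one for this particular n. */
int tune_chunk_size(int *x, int n)
{
    int best_chunk = 2;
    double best_time = 1e30;
    for (int chunk = 2; chunk <= n; chunk *= 2) {
        double begin = dsecnd();
        int *r = pscan(x, n, 0, chunk);
        double end = dsecnd();
        free(r);
        if (end - begin < best_time) {
            best_time = end - begin;
            best_chunk = chunk;
        }
    }
    printf("n = %d: best chunk size = %d (%lf s)\n", n, best_chunk, best_time);
    return best_chunk;
}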