A Quickselect C Algorithm faster than C Qsort

I have tried to implement a C quickselect algorithm as described in this post (3-way quicksort (C implementation)).
However, the performance I get is 5 to 10 times worse than the default qsort (even with an initial shuffle).
I tried to dig into the original qsort source code as provided here (https://github.com/lattera/glibc/blob/master/stdlib/qsort.c), but it's too complex.
Does anybody have a simpler and better algorithm?
Any idea is welcome.
Thanks,
NB: my original problem is to move the K smallest values of an array to the first K indices, so I planned to call quickselect K times.
EDIT 1: Here is the Cython Code as copied and adapted from the link above
cdef void qswap(void* a, void* b, const size_t size) nogil:
    cdef char temp[size]  # C99, use malloc otherwise
    # char serves as the type for "generic" byte arrays
    memcpy(temp, b, size)
    memcpy(b, a, size)
    memcpy(a, temp, size)

cdef void qshuffle(void* base, size_t num, size_t size) nogil:  # implementation of Fisher-Yates
    cdef int i, j, tmp  # local variables to hold values for the shuffle
    for i in range(num - 1, 0, -1):  # loop to shuffle
        j = c_rand() % (i + 1)  # randomise j for the Fisher-Yates swap
        qswap(base + i*size, base + j*size, size)
cdef void partition3(void* base,
                     size_t *low, size_t *high, size_t size,
                     QComparator compar) nogil:
    # Modified median-of-three and pivot selection.
    cdef void *ptr = base
    cdef size_t lt = low[0]
    cdef size_t gt = high[0]  # lt is the pivot
    cdef size_t i = lt + 1    # (+1!) we don't compare the pivot with itself
    cdef int c = 0
    while (i <= gt):
        c = compar(ptr + i * size, ptr + lt * size)
        if (c < 0):    # base[i] < base[lt] => swap(i++, lt++)
            qswap(ptr + lt * size, ptr + i * size, size)
            i += 1
            lt += 1
        elif (c > 0):  # base[i] > base[lt] => swap(i, gt--)
            qswap(ptr + i * size, ptr + gt * size, size)
            gt -= 1
        else:          # base[i] == base[lt]
            i += 1
    # base := [<<<<<lt=====gt>>>>>>]
    low[0] = lt
    high[0] = gt
cdef void qselectk3(void* base, size_t lo, size_t hi,
                    size_t size, size_t k,
                    QComparator compar) nogil:
    cdef size_t low = lo
    cdef size_t high = hi
    partition3(base, &low, &high, size, compar)
    if ((k - 1) < low):  # k lies in the less-than-pivot partition
        high = low - 1
        low = lo
    elif ((k - 1) >= low and (k - 1) <= high):  # k lies in the equals-to-pivot partition
        qswap(base, base + size*low, size)
        return
    else:  # k > high => k lies in the greater-than-pivot partition
        low = high + 1
        high = hi
    qselectk3(base, low, high, size, k, compar)
"""
A selection algorithm to find the nth smallest elements in an unordered list.
these elements ARE placed at the nth positions of the input array
"""
cdef void qselect(void* base, size_t num, size_t size,
size_t n,
QComparator compar) nogil:
cdef int k
qshuffle(base, num, size)
for k in range(n):
qselectk3(base + size*k, 0, num - k - 1, size, 1, compar)
I use Python's timeit to get the performance of both methods, pyselect (with N=50) and pysort, like this:
def testPySelect():
    A = np.random.randint(16, size=(10000), dtype=np.int32)
    pyselect(A, 50)
timeit.timeit(testPySelect, number=1)

def testPySort():
    A = np.random.randint(16, size=(10000), dtype=np.int32)
    pysort(A)
timeit.timeit(testPySort, number=1)

The answer by @chqrlie is the right and final answer; to complete the post, I am posting the Cython version along with the benchmarking results.
In short, the proposed solution is twice as fast as qsort on long vectors!
cdef void qswap2(void *aptr, void *bptr, size_t size) nogil:
    cdef uint8_t* ac = <uint8_t*>aptr
    cdef uint8_t* bc = <uint8_t*>bptr
    cdef uint8_t t
    while (size > 0): t = ac[0]; ac[0] = bc[0]; bc[0] = t; ac += 1; bc += 1; size -= 1

cdef struct qselect2_stack:
    uint8_t *base
    uint8_t *last

cdef void qselect2(void *base, size_t nmemb, size_t size,
                   size_t k, QComparator compar) nogil:
    cdef qselect2_stack stack[64]
    cdef qselect2_stack *sp = &stack[0]
    cdef uint8_t *lb
    cdef uint8_t *ub
    cdef uint8_t *p
    cdef uint8_t *i
    cdef uint8_t *j
    cdef uint8_t *top
    if (nmemb < 2 or size <= 0):
        return
    top = <uint8_t *>base
    if (k < nmemb):
        top += k*size
    else:
        top += nmemb*size
    sp.base = <uint8_t *>base
    sp.last = <uint8_t *>base + (nmemb - 1) * size
    sp += 1
    cdef size_t offset
    while (sp > stack):
        sp -= 1
        lb = sp.base
        ub = sp.last
        while (lb < ub and lb < top):
            # select middle element as pivot and exchange with 1st element
            offset = (ub - lb) >> 1
            p = lb + offset - offset % size
            qswap2(lb, p, size)
            # partition into two segments
            i = lb + size
            j = ub
            while 1:
                while (i < j and compar(lb, i) > 0):
                    i += size
                while (j >= i and compar(j, lb) > 0):
                    j -= size
                if (i >= j):
                    break
                qswap2(i, j, size)
                i += size
                j -= size
            # move pivot where it belongs
            qswap2(lb, j, size)
            # keep processing smallest segment, and stack largest
            if (j - lb <= ub - j):
                sp.base = j + size
                sp.last = ub
                sp += 1
                ub = j - size
            else:
                sp.base = lb
                sp.last = j - size
                sp += 1
                lb = j + size

cdef int int_comp(void* a, void* b) nogil:
    cdef int ai = (<int*>a)[0]
    cdef int bi = (<int*>b)[0]
    return (ai > bi) - (ai < bi)

def pyselect2(numpy.ndarray[int, ndim=1, mode="c"] na, int n):
    cdef int* a = <int*>&na[0]
    qselect2(a, len(na), sizeof(int), n, int_comp)
Here are the benchmark results (1,000 tests):
# of elements   K        qsort (s)   qselect2 (s)
1,000           50       0.1261      0.0895
1,000           100      0.1261      0.0910
10,000          50       0.8113      0.4157
10,000          100      0.8113      0.4367
10,000          1,000    0.8113      0.4746
100,000         100      7.5428      3.8259
100,000         1,000    7.5428      3.8325
100,000         10,000   7.5428      4.5727
For those who are curious, this piece of code is a jewel in the field of surface reconstruction using neural networks.
Thanks again to @chqrlie; your code is unique on the Web.

Here is a quick implementation for your purpose: qsort_select is a simple implementation of qsort with automatic pruning of unnecessary ranges.
Without && lb < top, it behaves like the regular qsort except for pathological cases where more advanced versions have better heuristics. This extra test prevents complete sorting of ranges that are outside the target 0 .. (k-1). The function selects the k smallest values and sorts them; the rest of the array holds the remaining values in an unspecified order.
#include <stdio.h>
#include <stdint.h>
static void exchange_bytes(uint8_t *ac, uint8_t *bc, size_t size) {
while (size-- > 0) { uint8_t t = *ac; *ac++ = *bc; *bc++ = t; }
}
/* select and sort the k smallest elements from an array */
void qsort_select(void *base, size_t nmemb, size_t size,
int (*compar)(const void *a, const void *b), size_t k)
{
struct { uint8_t *base, *last; } stack[64], *sp = stack;
uint8_t *lb, *ub, *p, *i, *j, *top;
if (nmemb < 2 || size <= 0)
return;
top = (uint8_t *)base + (k < nmemb ? k : nmemb) * size;
sp->base = (uint8_t *)base;
sp->last = (uint8_t *)base + (nmemb - 1) * size;
sp++;
while (sp > stack) {
--sp;
lb = sp->base;
ub = sp->last;
while (lb < ub && lb < top) {
/* select middle element as pivot and exchange with 1st element */
size_t offset = (ub - lb) >> 1;
p = lb + offset - offset % size;
exchange_bytes(lb, p, size);
/* partition into two segments */
for (i = lb + size, j = ub;; i += size, j -= size) {
while (i < j && compar(lb, i) > 0)
i += size;
while (j >= i && compar(j, lb) > 0)
j -= size;
if (i >= j)
break;
exchange_bytes(i, j, size);
}
/* move pivot where it belongs */
exchange_bytes(lb, j, size);
/* keep processing smallest segment, and stack largest */
if (j - lb <= ub - j) {
sp->base = j + size;
sp->last = ub;
sp++;
ub = j - size;
} else {
sp->base = lb;
sp->last = j - size;
sp++;
lb = j + size;
}
}
}
}
int int_cmp(const void *a, const void *b) {
int aa = *(const int *)a;
int bb = *(const int *)b;
return (aa > bb) - (aa < bb);
}
#define ARRAY_SIZE 50000
int array[ARRAY_SIZE];
int main(void) {
int i;
for (i = 0; i < ARRAY_SIZE; i++) {
array[i] = ARRAY_SIZE - i;
}
qsort_select(array, ARRAY_SIZE, sizeof(*array), int_cmp, 50);
for (i = 0; i < 50; i++) {
printf("%d%c", array[i], i + 1 == 50 ? '\n' : ',');
}
return 0;
}
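As a quick sanity check of the selection property (not part of the original answer), one can compare the first k elements produced by qsort_select against a fully qsort-ed copy of the same input; a minimal helper along those lines:

#include <stdlib.h>
#include <string.h>

/* comparator from the listing above */
int int_cmp(const void *a, const void *b);

/* Returns 1 if `selected` holds the k smallest values of `original`
 * (in sorted order) in its first k slots, 0 otherwise. */
static int check_k_smallest(const int *selected, const int *original,
                            size_t nmemb, size_t k)
{
    int *copy = malloc(nmemb * sizeof(*copy));
    int ok;
    if (copy == NULL)
        return 0;
    memcpy(copy, original, nmemb * sizeof(*copy));
    qsort(copy, nmemb, sizeof(*copy), int_cmp);   /* full reference sort */
    ok = memcmp(copy, selected, k * sizeof(*copy)) == 0;
    free(copy);
    return ok;
}

This obviously requires keeping an untouched copy of the input around, since qsort_select permutes the array in place.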

Related

Some CUDA computations fail with larger block dimension (< 1024)

I am learning CUDA with a GTX 960 4GB. I wrote a program which performs an element-wise matrix multiplication. When I increase the block dimensions for x and y to, let's say, (32 x 32), in combination with a large matrix (let's say 15000 x 15000 elements), some but not all multiplication results are wrong (value 0 instead of 6).
When I then decrease the block dimensions to e.g. (8 x 8), all results are right again. When I decrease the matrix size, the results are right again, too.
So in the case of this example, there seem to be combinations of total threads and threads per block that do not work.
I am surprised I can't find any threads regarding this topic. All I can find is about increasing performance and occupancy, but not about cases where some, but not all, calculations are aborted.
The grid dimensions are calculated as follows:
dim3 blocks(ceil<int>(COLS / threads.x), ceil<int>(ROWS / threads.y));
Why do some multiplications fail while others are successful?
Some Examples
Block dim : (8, 8)
Matrix shape : (15000, 15000)
Verification : 0 elements have failed, total length 225000000, shape: (15000, 15000)
Block dim : (16, 16)
Matrix shape : (15000, 15000)
Verification : 239936 elements have failed, total length 225000000, shape: (15000, 15000)
Block dim : (32, 32)
Matrix shape : (15000, 15000)
Verification : 719424 elements have failed, total length 225000000, shape: (15000, 15000).
Block dim : (32, 32)
Matrix shape : (10000, 10000)
Verification : 0 elements have failed, total length 100000000, shape: (10000, 10000).
Driver Version
$ cat /proc/driver/nvidia/version
NVRM version: NVIDIA UNIX x86_64 Kernel Module 470.82.00 Thu Oct 14 10:24:40 UTC 2021
Complete Code
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <assert.h>
#include <cuda.h>
#include <cuda_runtime.h>
#define ROWS 10000
#define COLS 10000
#define MAX_ERR 1e-6
typedef struct {
int width;
int height;
float* elements;
} Matrix;
size_t ij(int i, int j){
return j * ROWS + i;
}
__global__ void matrix_multi_elemwise(const Matrix OUT, const Matrix A, const Matrix B) {
int col = blockIdx.x * blockDim.x + threadIdx.x;
int row = blockIdx.y * blockDim.y + threadIdx.y;
if (col < A.width && row < A.height) {
int index = row * A.height + col; // linearisation of index
OUT.elements[index] = A.elements[index] * B.elements[index];
}
}
int main(){
Matrix A, B, OUT;
Matrix dev_A, dev_B, dev_OUT;
size_t SIZE = ROWS * COLS * sizeof(float);
// Allocate host memory
A.elements = (float*) malloc(SIZE);
B.elements = (float*) malloc(SIZE);
OUT.elements = (float*) malloc(SIZE);
// Initialize host matrices
A.height = ROWS; A.width = COLS;
B.height = ROWS; B.width = COLS;
OUT.height = ROWS; OUT.width = COLS;
for (int j = 0; j < ROWS; j++) {
for(int i = 0; i < COLS; i++){
A.elements[ij(i, j)] = 2.0f;
B.elements[ij(i, j)] = 3.0f;
}
}
// Allocate device memory
cudaMalloc((void**) &dev_A.elements, SIZE);
cudaMalloc((void**) &dev_B.elements, SIZE);
cudaMalloc((void**) &dev_OUT.elements, SIZE);
dev_A.height = A.height; dev_A.width = A.width;
dev_B.height = A.height; dev_B.width = B.width;
dev_OUT.height = A.height; dev_OUT.width = OUT.width;
// Transfer data from host to device memory
cudaMemcpy(dev_A.elements, A.elements, SIZE, cudaMemcpyHostToDevice);
cudaMemcpy(dev_B.elements, B.elements, SIZE, cudaMemcpyHostToDevice);
// Executing kernel
dim3 threads(16, 16);
dim3 blocks(ceil<int>(COLS / threads.x), ceil<int>(ROWS / threads.y));
matrix_multi_elemwise<<<blocks, threads>>>(dev_OUT, dev_A, dev_B);
cudaError_t err = cudaGetLastError();
if(err != cudaSuccess) {
printf("CUDA Runtime API Error reported : %s in file %s on line.\n", cudaGetErrorString(err), __FILE__);
}
// Wait for GPU to finish before accessing on host
cudaDeviceSynchronize();
// Transfer data back to host memory
cudaMemcpy(OUT.elements, dev_OUT.elements, SIZE, cudaMemcpyDeviceToHost);
// Verification
int count = 0, length = 0, i = 0, j = 0;
for (j = 0; j < ROWS; j++) {
for(i = 0; i < COLS; i++){
//assert(fabs(OUT.elements[ij(i, j)] / A.elements[ij(i, j)] - B.elements[ij(i, j)]) < MAX_ERR);
if (fabs(OUT.elements[ij(i, j)] / A.elements[ij(i, j)] - B.elements[ij(i, j)]) > MAX_ERR) {
count++;
}
length++;
}
}
printf("Verification: %i elements have failed, total length %i, shape: (%i, %i).\n", count, length, i, j);
// Deallocate device memory
cudaFree(dev_A.elements);
cudaFree(dev_B.elements);
cudaFree(dev_OUT.elements);
// Deallocate host memory
free(A.elements);
free(B.elements);
free(OUT.elements);
}
The number of blocks is wrong. Indeed, COLS and threads.x are both integers, so the result of the division is a truncated integer, and ceil<int> cannot round up a result that has already been truncated. This causes some blocks not to be computed: 15000 is divisible by 8 but not by 16. You need to either cast COLS to a floating-point number or compute the ceiling manually (safer). Here is an example:
dim3 blocks((COLS + threads.x - 1) / threads.x, (ROWS + threads.y - 1) / threads.y);
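To see the truncation concretely, here is a tiny host-side check (my own illustration, not part of the original answer) comparing the two formulas for the failing 16 x 16 configuration:

#include <stdio.h>

int main(void) {
    const int COLS = 15000, tx = 16;
    int truncated  = COLS / tx;                /* integer division: 937  */
    int rounded_up = (COLS + tx - 1) / tx;     /* ceiling division: 938  */
    printf("truncated:  %d blocks -> %d threads (misses %d columns)\n",
           truncated, truncated * tx, COLS - truncated * tx);
    printf("rounded up: %d blocks -> %d threads (covers all %d columns)\n",
           rounded_up, rounded_up * tx, COLS);
    return 0;
}

With truncation, 937 * 16 = 14992 threads are launched per direction, so the last 8 columns (and rows) are never touched. This matches the observation that 15000 works with 8 x 8 blocks (15000 is a multiple of 8) but not with 16 x 16 or 32 x 32.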
As pointed out in the comments, note that row * A.height + col is wrong: it should be row * A.width + col instead. This causes issues for non-square matrices.

Efficient weighted cross-product in Armadillo

I'm trying to find an efficient way to compute X^T * W * X, where X is a dense mat of size e.g. 10,000 x 10 and W is a diagonal matrix (I store only the diagonal in a vec).
For now, I use this function
arma::mat& getXtW(const arma::mat& covar,
const arma::vec& w,
arma::mat& tcovar,
size_t n, size_t K) {
size_t i, k;
for (i = 0; i < n; i++) {
for (k = 0; k < K; k++) {
tcovar(k, i) = covar(i, k) * w(i);
}
}
return tcovar;
}
and compute
tcovar = getXtW(covar, w, tcovar, n, K);
cprod = tcovar * covar;
Yet, this does not seem optimal.
PS: You can see the whole code there.
Edit1: Seems I can use covar.t() * (covar.each_col() % w), but this doesn't seem to be much faster.
Edit2: If I implement it myself with loops in Rcpp:
arma::mat testProdW2(const arma::mat& x, const arma::vec& w) {
int n = x.n_rows;
int K = x.n_cols;
arma::mat res(K, K);
double tmp;
for (int k = 0; k < K; k++) {
for (int j = k; j < K; j++) {
tmp = 0;
for (int i = 0; i < n; i++) {
tmp += x(i, j) * w[i] * x(i, k);
}
res(j, k) = tmp;
}
}
for (int k = 0; k < K; k++) {
for (int j = 0; j < k; j++) {
res(j, k) = res(k, j);
}
}
return res;
}
This is slower than the first implementation.
According to BLAS matrix by matrix transpose multiply, there is no BLAS routine that does this directly; instead, the suggestion there is to loop over the rows of X and use dsyr. I found this an interesting question, since I know how to link BLAS in Rcpp but have not done so using RcppArmadillo. Stack Overflow knows an answer for that as well: Rcpparmadillo: can't call Fortran routine "dgebal"?. Note: I have not checked, but I expect that dsyr is not part of the BLAS subset that comes with R, so this will only work if your R is linked to a full BLAS implementation.
Combining this we get:
// [[Rcpp::depends(RcppArmadillo)]]
#include <RcppArmadillo.h>
#include <Rcpp/Benchmark/Timer.h>
#ifdef ARMA_USE_LAPACK
#if !defined(ARMA_BLAS_CAPITALS)
#define arma_dsyr dsyr
#else
#define arma_dsyr DSYR
#endif
extern "C"
void arma_fortran(arma_dsyr)(const char* uplo, const int* n, const double* alpha, const double* X, const int* incX, const double* A, const int* ldA);
#endif
// [[Rcpp::export]]
Rcpp::NumericVector getXtWX(const arma::mat& X, const arma::vec& w) {
Rcpp::Timer timer;
timer.step("start");
arma::mat result1 = X.t() * (X.each_col() % w);
timer.step("Armadillo result");
const int n = X.n_rows;
const int k = X.n_cols;
arma::mat result(k, k, arma::fill::zeros);
for (size_t i = 0; i < n; ++i) {
F77_CALL(dsyr)("U", &k, &w(i), &X(i,0), &n, result.memptr(), &k);
}
result = arma::symmatu(result);
timer.step("BLAS result");
Rcpp::NumericVector res(timer);
return res;
}
/*** R
n <- 10000
k <- 10
X <- matrix(runif(n*k), n, k)
w <- runif(n)
Reduce(rbind, lapply(1:6, function(x) diff(getXtWX(X, w))/1e6))
*/
However, for me the BLAS solution is quite a bit slower:
> Reduce(rbind, lapply(1:6, function(x) diff(getXtWX(X, w))/1e6))
Armadillo result BLAS result
init 1.291243 6.666026
1.176143 6.623282
1.102111 6.644165
1.094917 6.612596
1.098619 6.588431
1.069286 6.615529
I tried to improve this by first transposing the matrix, in the hope that memory access would be faster when looping over column matrices, but this did not make a difference on my (low-powered) system.
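For reference, the per-row dsyr call above is just a symmetric rank-1 update: it accumulates w(i) * x_i * x_i^T into the k x k result, where x_i is the i-th row of X. Written out in plain C (a sketch for illustration only, not a replacement for the code above), it amounts to:

#include <stddef.h>

/* Accumulates result += sum_i w[i] * x_i * x_i^T, where x_i is the i-th row
 * of the n-by-k matrix X stored in column-major order (as Armadillo does).
 * Only the upper triangle of the k-by-k result is filled; mirror it afterwards. */
void xtwx_upper(const double *X, const double *w, size_t n, size_t k,
                double *result /* k*k, column-major, zero-initialized */)
{
    for (size_t i = 0; i < n; ++i) {
        for (size_t c = 0; c < k; ++c) {          /* column index of the result */
            double xc = X[c * n + i];             /* X(i, c) */
            for (size_t r = 0; r <= c; ++r) {     /* rows of the upper triangle */
                result[c * k + r] += w[i] * X[r * n + i] * xc;
            }
        }
    }
}

Seeing it spelled out also suggests why the BLAS route is not automatically faster here: with k = 10 each per-row update is tiny, so the n = 10,000 calls are mostly overhead, whereas the single Armadillo expression presumably maps to one large matrix product after the element-wise scaling.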

How Should I Implement Cilk Parallelism with a Recursive Scan Algorithm?

I implemented a recursive scan (prefix sum) algorithm, which I've included below. Here, I simply generate random lists with sizes that are powers of two up to 2^27, checking against a simple sequential scan for accuracy. It works.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <math.h>
#include <mkl.h>
int *pscan(int *x, int n, int z, int chunk_size);
int reduce(int *x, int n);
int main(int argc, char **argv)
{
int n;
int i, j, k;
int *x, *seq, *r;
double begin, end;
srand48(time(0));
/* Randomly generate array of size n. */
for (k = 2; k < 28; k++) {
n = (int) pow(2, k);
seq = (int *) malloc(sizeof(int) * n);
x = (int *) malloc(sizeof(int) * n);
for (i = 0; i < n; i++) {
x[i] = lrand48() % 100 - 50;
seq[i] = x[i];
}
/* Parallel scan. */
begin = dsecnd();
r = pscan(x, n, 0, 2);
end = dsecnd();
printf("%d %lf\n", n, end - begin);
/* Sequential check. */
for (i = 1; i < n; i++) {
seq[i] = seq[i - 1] + seq[i];
}
for (i = 0; i < n; i++) {
if (r[i] != seq[i]) {
fprintf(stderr, "AGGGHHH!!! ERROR. Found with vector: \n");
for (j = 0; j < n; j++) {
printf("%d ", x[i]);
}
printf("\n");
exit(1);
}
}
free(r);
free(x);
free(seq);
}
return 0;
}
/* Perform parallel scan. */
int *pscan(int *x, int n, int z, int chunk_size)
{
int i, j;
int *sums, *sumscan, *scan, **fsum, *rv;
/* Base case, serially scan a chunk. */
if (n <= chunk_size) {
scan = (int *) malloc(sizeof(int) * n);
scan[0] = x[0] + z;
for (i = 1; i < n; i++) {
scan[i] = x[i] + scan[i - 1];
}
return scan;
}
sums = (int *) malloc(sizeof(int) * (n / chunk_size));
/* Reduce each chunk of the array. */
for (i = 0; i < n / chunk_size; i++) {
sums[i] = reduce(&x[i * chunk_size], chunk_size);
}
/* Perform a scan on the sums. */
sumscan = pscan(sums, n / chunk_size, 0, chunk_size);
free(sums);
fsum = (int **) malloc(sizeof(int *) * (n / chunk_size));
/* Perform a recursive scan on each chunk, using
the appropriate offset from the sums scan. */
for (i = 0; i < n / chunk_size; i++) {
if (i > 0) {
fsum[i] = pscan(&x[i * chunk_size], chunk_size, sumscan[i - 1], chunk_size);
} else {
fsum[i] = pscan(&x[i * chunk_size], chunk_size, 0, chunk_size);
}
}
free(sumscan);
rv = (int *) malloc(sizeof(int) * n);
/* Join the arrays. */
for (i = 0; i < n / chunk_size; i++) {
for (j = 0; j < chunk_size; j++) {
rv[i * chunk_size + j] = fsum[i][j];
}
}
for (i = 0; i < n / chunk_size; i++) {
free(fsum[i]);
}
free(fsum);
return rv;
}
/* Serial reduction. */
int reduce(int *x, int n)
{
int i;
int sum;
sum = 0;
for (i = 0; i < n; i++) {
sum += x[i];
}
return sum;
}
Now, I'd like to parallelize it. Because I'm feeling a little hipster-ish, I've hacked up a Cilk implementation. I just replaced the two main for loops to parallelize 1) the reduction and 2) the recursive scan of each chunk, using the appropriate scan of the chunk reductions as an offset. It looks like so.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <math.h>
#include <cilk/cilk.h>
#include <mkl.h>
int *pscan(int *x, int n, int z, int chunk_size);
int reduce(int *x, int n);
int main(int argc, char **argv)
{
int n;
int i, j, k;
int *x, *seq, *r;
double begin, end;
srand48(time(0));
/* Randomly generate array of size n. */
for (k = 2; k < 28; k++) {
n = (int) pow(2, k);
seq = (int *) malloc(sizeof(int) * n);
x = (int *) malloc(sizeof(int) * n);
for (i = 0; i < n; i++) {
x[i] = lrand48() % 100 - 50;
seq[i] = x[i];
}
/* Parallel scan. */
begin = dsecnd();
r = pscan(x, n, 0, 2);
end = dsecnd();
printf("%d %lf\n", n, end - begin);
/* Sequential check. */
for (i = 1; i < n; i++) {
seq[i] = seq[i - 1] + seq[i];
}
for (i = 0; i < n; i++) {
if (r[i] != seq[i]) {
fprintf(stderr, "AGGGHHH!!! ERROR. Found with vector: \n");
for (j = 0; j < n; j++) {
printf("%d ", x[i]);
}
printf("\n");
exit(1);
}
}
free(r);
free(x);
free(seq);
}
return 0;
}
/* Perform parallel scan. */
int *pscan(int *x, int n, int z, int chunk_size)
{
int i, j;
int *sums, *sumscan, *scan, **fsum, *rv;
/* Base case, serially scan a chunk. */
if (n <= chunk_size) {
scan = (int *) malloc(sizeof(int) * n);
scan[0] = x[0] + z;
for (i = 1; i < n; i++) {
scan[i] = x[i] + scan[i - 1];
}
return scan;
}
sums = (int *) malloc(sizeof(int) * (n / chunk_size));
/* Reduce each chunk of the array. */
cilk_for (i = 0; i < n / chunk_size; i++) {
sums[i] = reduce(&x[i * chunk_size], chunk_size);
}
/* Perform a scan on the sums. */
sumscan = pscan(sums, n / chunk_size, 0, chunk_size);
free(sums);
fsum = (int **) malloc(sizeof(int *) * (n / chunk_size));
/* Perform a recursive scan on each chunk, using
the appropriate offset from the sums scan. */
cilk_for (i = 0; i < n / chunk_size; i++) {
if (i > 0) {
fsum[i] = pscan(&x[i * chunk_size], chunk_size, sumscan[i - 1], chunk_size);
} else {
fsum[i] = pscan(&x[i * chunk_size], chunk_size, 0, chunk_size);
}
}
free(sumscan);
rv = (int *) malloc(sizeof(int) * n);
/* Join the arrays. */
for (i = 0; i < n / chunk_size; i++) {
for (j = 0; j < chunk_size; j++) {
rv[i * chunk_size + j] = fsum[i][j];
}
}
for (i = 0; i < n / chunk_size; i++) {
free(fsum[i]);
}
free(fsum);
return rv;
}
/* Serial reduction. */
int reduce(int *x, int n)
{
int i;
int sum;
sum = 0;
for (i = 0; i < n; i++) {
sum += x[i];
}
return sum;
}
And it works! Well, it returns correct results. It doesn't achieve the performance I had hoped for, though. The original (serial) performance was
4 0.000004
8 0.000001
16 0.000002
32 0.000003
64 0.000005
128 0.000011
256 0.000019
512 0.000035
1024 0.000068
2048 0.000130
4096 0.000257
8192 0.000512
16384 0.001129
32768 0.002262
65536 0.004519
131072 0.009065
262144 0.018297
524288 0.037416
1048576 0.078307
2097152 0.157448
4194304 0.313855
8388608 0.625689
16777216 1.251949
33554432 2.589439
67108864 5.084731
134217728 10.402186
for the single-threaded application, but the Cilk version performed worse, with the following runtimes:
4 0.005383
8 0.000011
16 0.000009
32 0.000111
64 0.000055
128 0.000579
256 0.000339
512 0.000544
1024 0.000701
2048 0.001086
4096 0.001265
8192 0.001742
16384 0.002283
32768 0.003891
65536 0.005398
131072 0.009255
262144 0.020736
524288 0.058156
1048576 0.103893
2097152 0.215460
4194304 0.419988
8388608 0.749368
16777216 1.650938
33554432 2.960451
67108864 5.799836
134217728 11.294398
I have a 24-core machine, so we're obviously not seeing the speed-up we would hope for here. My first thought was that Cilk is mishandling the recursion, causing oversubscription, but Cilk is specifically supposed to handle recursion well. Any tips on how to implement this properly? I tried adding cilk_for to the bottom for loop (freeing everything) and the inner for-loop of the penultimate set of loops (joining the array), but that slowed performance down even more.
Any advice is well-appreciated.
However, please don't tell me to switch to Blelloch's parallel scan algorithm discussed here. I already implemented that in Cilk, and it worked quite well. I'd like to see if I can match its performance with this recursive solution.
I fixed my performance problems by finding the optimal chunk size for each problem. At that chunk size, the (same) parallel version performs better than the sequential version.
In summary, there were a few things wrong with both my general approach and particularly the chunk size of two:
My benchmarking approach. In a code with a tuning parameter, it doesn't make much sense to plot runtime vs. problem size using the same value for the tuning parameter because the optimal value is dependent on problem size.
A chunk size of two was likely problematic because, while it maximizes parallelism, it also maximizes the number of levels of recursion and, likewise, the overhead that comes along with it.
A chunk size of two prevents vectorization.
As Leeor suggested, a chunk size of two probably also leads to false sharing in the cache.
Props to Leeor for leading me in the right direction.
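For completeness, one way to avoid hard-coding the chunk size is to tie it to the problem size and the worker count; the sketch below is illustrative only (the constants are untuned guesses, not values from my measurements):

#include <stddef.h>

/* Illustrative chunk-size heuristic: large enough to amortize spawn overhead
 * and allow vectorization of the serial base case, small enough to leave
 * roughly 8 chunks per worker for load balancing. MIN_CHUNK and the factor 8
 * are guesses to be tuned per machine. */
#define MIN_CHUNK 1024

static int pick_chunk_size(int n, int workers)
{
    int chunk = n / (8 * workers);
    return chunk < MIN_CHUNK ? MIN_CHUNK : chunk;
}

With Cilk Plus the worker count can be queried at runtime (via __cilkrts_get_nworkers() from <cilk/cilk_api.h>, if I recall the API correctly), so the call to pscan() can pass pick_chunk_size(n, workers) instead of 2.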

finding the sum of mod operations in a range

If we have two numbers, say a and b, how can we find the value of the sum of b%i, where i ranges from 1 to a?
One way is to iterate through all values from 1 to a, but is there a more efficient method?
(better than O(n)?)
E.g.: if a = 4 and b = 5, then the required answer = 5%1 + 5%2 + 5%3 + 5%4 = 4.
Thanks.
For i > b, we have b % i == b, so that part of the sum is easily calculated in constant time ((a-b)*b, if a >= b, 0 otherwise).
The part for i <= b remains to be calculated (i == b gives 0, thus may be ignored). You can do that in O(sqrt(b)) steps:
For i <= sqrt(b), calculate b % i and add to sum
For i > sqrt(b), let k = floor(b/i), then b % i == b - k*i, and k < sqrt(b). So for k = 1 to ceiling(sqrt(b))-1, let hi = floor(b/k) and lo = floor(b/(k+1)). There are hi - lo numbers i such that k*i <= b < (k+1)*i, the sum of b % i for them is sum_{ lo < i <= hi } (b - k*i) = (hi - lo)*b - k*(hi-lo)*(hi+lo+1)/2.
If a <= sqrt(b), only the first bullet applies, stopping at a. If sqrt(b) < a < b, in the second bullet, run from k = floor(b/a) to ceiling(sqrt(b))-1 and adjust the upper limit for the smallest k to a.
Overall complexity O(min(a,sqrt(b))).
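The closed form in the second bullet is just the arithmetic series written out:

\sum_{lo < i \le hi} (b - k\,i)
    = (hi - lo)\,b - k \sum_{lo < i \le hi} i
    = (hi - lo)\,b - k\,\frac{(hi - lo)(hi + lo + 1)}{2}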
Code (C):
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
unsigned long long usqrt(unsigned long long n);
unsigned long long modSum(unsigned long long a, unsigned long long b);
int main(int argc, char *argv[]){
unsigned long long a, b;
b = (argc > 1) ? strtoull(argv[argc-1],NULL,0) : 10000;
a = (argc > 2) ? strtoull(argv[1],NULL,0) : b;
printf("Sum of moduli %llu %% i for 1 <= i <= %llu: %llu\n",b,a,modSum(a,b));
return EXIT_SUCCESS;
}
unsigned long long usqrt(unsigned long long n){
unsigned long long r = (unsigned long long)sqrt(n);
while(r*r > n) --r;
while(r*(r+2) < n) ++r;
return r;
}
unsigned long long modSum(unsigned long long a, unsigned long long b){
if (a < 2 || b == 0){
return 0;
}
unsigned long long sum = 0, i, l, u, r = usqrt(b);
if (b < a){
sum += (a-b)*b;
}
u = (a < r) ? a : r;
for(i = 2; i <= u; ++i){
sum += b%i;
}
if (r < a){
u = (a < b) ? a : (b-1);
i = b/u;
l = b/(i+1);
do{
sum += (u-l)*b;
sum -= i*(u-l)*(u+l+1)/2;
++i;
u = l;
l = b/(i+1);
}while(u > r);
}
return sum;
}
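To sanity-check modSum() against the brute-force approach from the question, a naive O(a) reference can be added to the same file (my addition, for testing on small inputs only):

/* Naive O(a) reference: sum of b % i for 1 <= i <= a.
 * Only intended to cross-check modSum() on small inputs. */
unsigned long long modSumNaive(unsigned long long a, unsigned long long b){
    unsigned long long sum = 0, i;
    for(i = 1; i <= a; ++i){
        sum += b % i;
    }
    return sum;
}

For example, asserting modSum(a, b) == modSumNaive(a, b) for all a, b up to a few thousand exercises every branch of the fast version.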

Sum of subsequences

Consider two integers: the first is A, where A[i] equals the i-th digit of A (0-based indexing, from right to left), and the second is B, where B[i] equals the i-th digit of B (0-based indexing, from right to left).
The lucky sum of A and B is equal to C, where C[i] = max(A[i], B[i]). If i is greater than or equal to the number of digits of an integer, its i-th digit is taken to be 0.
For example,
the lucky sum of 47 and 729 is
max(7,9)=9
max(4,2)=4
max(0,7)=7
answer = 749
Similarly, for the lucky sum of W = (74, 92, 477):
max(4,2) = 4
max(7,9) = 9
so the lucky sum of 74 and 92 is 94.
The lucky sum of W is then the lucky sum of (94, 477),
which is
max(4,7) = 7
max(9,7) = 9
max(0,4) = 4
So the lucky sum of W is 497.
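In code, the definition boils down to a digit-wise maximum. A direct C translation of the examples above (my own illustration, not part of the problem statement):

#include <stdio.h>

/* Lucky sum of two non-negative integers: take the larger digit at
 * each decimal position (missing digits count as 0). */
unsigned lucky_sum(unsigned a, unsigned b)
{
    unsigned result = 0, place = 1;
    while (a > 0 || b > 0) {
        unsigned da = a % 10, db = b % 10;
        result += (da > db ? da : db) * place;
        place *= 10;
        a /= 10;
        b /= 10;
    }
    return result;
}

/* A number is "lucky" if its decimal representation uses only 4s and 7s. */
int is_lucky(unsigned x)
{
    if (x == 0)
        return 0;
    while (x > 0) {
        unsigned d = x % 10;
        if (d != 4 && d != 7)
            return 0;
        x /= 10;
    }
    return 1;
}

int main(void)
{
    printf("%u\n", lucky_sum(47, 729));                 /* 749 */
    printf("%u\n", lucky_sum(lucky_sum(74, 92), 477));  /* 497 */
    printf("%d %d\n", is_lucky(744), is_lucky(467));    /* 1 0 */
    return 0;
}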
The task: we are given an array W containing n (1 <= n <= 50) integers.
We have to find the number of non-empty subsequences of W such that the lucky sum of the integers in the subsequence is a lucky number (lucky numbers are positive integers whose decimal representation contains only the lucky digits 4 and 7; for example, 47, 744 and 4 are lucky, and 5, 17 and 467 are not).
Constraint: 0 < W[i] < 1e9
Examples:
W = {4,7}: answer = 3
W = {43, 87 ,44}: answer = 2
Can this problem be solved by dynamic programming?
How can this problem be solved efficiently in C++?
Here's what I can think of (unfinished yet):
Use DP with a bitmask. We now represent a number in the following way: every digit is categorized into five kinds:
(0) -> 0
(1,2,3) -> 1
(4) -> 2
(5,6) -> 3
(7) -> 4
(8,9) -> -1
As we can easily see, whenever a number contains the digit 8 or 9, it can never be part of a valid solution. Now we represent the number with a base-5 mask over its digits, which takes 5^8 states.
So we let f[i][s] denote the total number of ways we can choose a subset of the first i numbers whose lucky sum has mask s.
Here is the code I just wrote again.
Three things remain:
use __int64 or long long instead of int for f[][].
use a queue to accelerate the enumeration, since there are a lot of impossible states (i.e. f[][s] == 0) if we enumerate with for (i = 0; i < MAXS; i++).
use f[0..1][MAXS] to reduce memory cost.
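Before the sample code, here is how a single digit of the packed base-5 state s can be read and raised (a small standalone illustration of the encoding; exp5 mirrors the exp[] powers-of-5 table used below):

/* exp5[loc] = 5^loc, as in the exp[] table of the sample code below */
static const int exp5[] = {1, 5, 25, 125, 625, 3125, 15625, 78125, 390625};

/* read the category (0..4) stored at digit position loc of state s */
static int get_cat(int s, int loc)
{
    return s % exp5[loc + 1] / exp5[loc];
}

/* return the state with position loc raised to category c, if c is larger */
static int set_cat_max(int s, int loc, int c)
{
    int cur = get_cat(s, loc);
    return (c > cur) ? s + (c - cur) * exp5[loc] : s;
}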
The sample code:
#include <queue>
#include <cstdio>
#include <cstring>
#include <algorithm>
#define MAXN 51
#define MAXS 390625 //5^8
using namespace std;
const int exp[] = {1, 5, 25, 125, 625, 3125, 15625, 78125, 390625};
int n;
int w[MAXN];
struct node{
int i;
int stat;
node(int x, int y):i(x),stat(y){}
};
queue<node> q;
__int64 f[MAXN][MAXS];
bool inq[MAXN][MAXS];
int main(){
//freopen("test.txt","r",stdin);
memset(f,0,sizeof(f));
memset(inq,0,sizeof(inq));
scanf("%d",&n);
for (int i = 0;i < n;i++) scanf("%d",&w[i]);
while (!q.empty()) q.pop();
f[0][0] = 1;
for (int i = 0;i < n;i++)
for (int j = 0;j < MAXS;j++)
if (f[i][j] > 0){
f[i + 1][j] += f[i][j];
int stat = j;
int loc = 0;
int k = 0;
for (int p = w[i];p > 0;p /= 10){
k = p % 10;
if (k <= 0) k = 0;
else if (k <= 3) k = 1;
else if (k <= 4) k = 2;
else if (k <= 6) k = 3;
else if (k <= 7) k = 4;
else k = -1;
if (k < 0) break;
int bit = stat % exp[loc + 1] / exp[loc];
if (k < bit) k = bit;
stat = stat - (bit - k) * exp[loc];
loc++;
}
if (k < 0) continue;
f[i + 1][stat] += f[i][j];
}
int ans = 0;
for (int i = 0;i < MAXS;i++){
bool flag = false;
for (int loc = 7;loc >= 0;loc--){
int bit = i % exp[loc + 1] / exp[loc];
if (bit > 0) flag = true;
if (flag == true && (bit != 2 && bit != 4)){
flag = false;
break;
}
}
if (flag == true) ans += f[n][i];
}
printf("%d\n",ans);
return 0;
}
Since every digit of the answer is independent, we can update each digit separately, and the whole algorithm takes O(n*log10(w)).
Here's the code I just wrote:
#include <cstdio>
#include <cstring>
#include <algorithm>
#define MAXL 15
using namespace std;
int n;
int ans[MAXL];
int main(){
int i,j,w;
scanf("%d",&n);
memset(ans,0,sizeof(ans));
while (n--){
scanf("%d",&w);
i = 0;
while (w>0){
j = w % 10;
ans[i] = max(ans[i], j);
i++;
w /= 10;
}
}
bool flag = false;
for (i=MAXL-1;i>=0;i--){
if (ans[i] > 0) flag = true;
if (flag) printf("%d",ans[i]);
}
printf("\n");
return 0;
}
