So I want to use SIMD instructions in C++ to compare values from a uint32_t array and store the matching values in a new array of the same type.
It works more or less fine, but I am still using 4 if-clauses on the values I get after the SIMD instructions to decide whether to write the values back.
is there a way to do this with SIMD instructions?
The function allocateAlignedBuffer does what the name says and is working correctly.
uint32_t* testFunc(uint32_t* arr, uint32_t num, uint32_t* cnt) {
    // Writes the index of every element arr[i] >= 10 into a freshly
    // allocated aligned buffer and returns it; *cnt receives how many
    // indices were written.  Assumes num is a multiple of 4 and arr is
    // 16-byte aligned (required by _mm_load_si128).
    uint32_t numcnt = 4;
    uint32_t * resArr = allocateAlignedBuffer<uint32_t>(num, true);
    uint32_t * resPos = resArr;
    *cnt = 0;
    // BUG FIX: the original used _mm_cmpge_ps, a *float* compare whose
    // __m128 result does not even type-check against __m128i.  SSE2 has
    // no cmpge for epi32, so "x >= 10" is done as the signed "x > 9".
    // NOTE(review): values >= 2^31 would compare as negative under the
    // signed compare -- confirm the data range allows this.
    __m128i comp2 = _mm_set1_epi32(10 - 1);
    for (uint32_t i = 0; i < num; i += 4) {
        __m128i positions = _mm_set_epi32(i + 3, i + 2, i + 1, i);
        __m128i vec = _mm_load_si128(reinterpret_cast<const __m128i*>(&arr[i]));
        __m128i simdAnd2 = _mm_cmpgt_epi32(vec, comp2);
        int comp = _mm_movemask_epi8(simdAnd2);
        if (comp == 0x0000) {
            //std::cout << "nothing found\n";
            continue;
        }
        // Spill lane masks and positions through arrays instead of
        // type-punning the __m128i objects directly (undefined behavior).
        uint32_t mask[4], pos[4];
        _mm_storeu_si128(reinterpret_cast<__m128i*>(mask), simdAnd2);
        _mm_storeu_si128(reinterpret_cast<__m128i*>(pos), positions);
        if (comp < 0xFFFF) {  // some, but not all, lanes matched
            if (mask[0]) {
                std::cout << "first byte not 0\n";
                resPos[0] = pos[0];
                resPos++;
                (*cnt)++;  // BUG FIX: "*cnt++" incremented the pointer, not the count
            }
            if (mask[1]) {
                std::cout << "second byte not 0\n";
                resPos[0] = pos[1];
                resPos++;
                (*cnt)++;
            }
            if (mask[2]) {
                std::cout << "3rd byte not 0\n";
                resPos[0] = pos[2];
                resPos++;
                (*cnt)++;
            }
            if (mask[3]) {
                std::cout << "4th byte not 0\n";
                resPos[0] = pos[3];
                resPos++;
                (*cnt)++;
            }
        }
        else { // all four lanes matched
            resPos[0] = pos[0];
            resPos[1] = pos[1];
            resPos[2] = pos[2];
            resPos[3] = pos[3];
            resPos += numcnt;
            *cnt += numcnt;
        }
    }
    std::cout << "cnt "<<*cnt<<"\n";
    return resArr;
}
Also there is probably a lot to optimize I believe.
Another variant with using shuffles:
// Shuffle-control table for left-packing selected 32-bit lanes.
// Index is the 4-bit mask from movmskps: bit k set means lane k was
// selected; the entry moves the selected lanes' 4-byte groups to the
// front of the vector.
// NOTE(review): unselected trailing positions are filled from byte 0
// (the table uses 0, not 0x80), so the tail of the shuffled vector is
// garbage -- consumers must only read the first g_steps[mask] entries.
// Also note: _mm_setr_epi8 at namespace scope is dynamic initialization.
__m128i g_shuffles[16] =
{
_mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
_mm_setr_epi8(0, 1, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
_mm_setr_epi8(4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
_mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0),
_mm_setr_epi8(8, 9, 10, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
_mm_setr_epi8(0, 1, 2, 3, 8, 9, 10, 11, 0, 0, 0, 0, 0, 0, 0, 0),
_mm_setr_epi8(4, 5, 6, 7, 8, 9, 10, 11, 0, 0, 0, 0, 0, 0, 0, 0),
_mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 0, 0, 0),
_mm_setr_epi8(12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
_mm_setr_epi8(0, 1, 2, 3, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0),
_mm_setr_epi8(4, 5, 6, 7, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0),
_mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 0, 0, 0, 0),
_mm_setr_epi8(8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0),
_mm_setr_epi8(0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0),
_mm_setr_epi8(4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0),
_mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
};
// Number of selected lanes for each 4-bit mask (a popcount lookup table).
uint32_t g_steps[16] = { 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4 };
// Scans arr[0..num) and writes the index of every element with
// arr[i] > 9 (i.e. arr[i] >= 10, under a *signed* compare) into an
// aligned result buffer; *cnt receives how many indices were written.
// Assumes num is a multiple of 4.  Caller frees the result via _mm_free.
// NOTE(review): _mm_cmpgt_epi32 is signed -- values >= 2^31 would
// compare as negative; confirm the data range.
uint32_t * testFunc2(uint32_t* arr, uint32_t num, uint32_t * cnt)
{
uint32_t * resArr = (uint32_t*)_mm_malloc(num*sizeof(uint32_t), 16);
uint32_t * resPos = resArr;
*cnt = 0;
// "x >= 10" expressed as "x > 9" because SSE has no cmpge for epi32.
__m128i threshold = _mm_set1_epi32(10 - 1);
__m128i positions = _mm_setr_epi32(0, 1, 2, 3);
__m128i _4 = _mm_set1_epi32(4);
__m128i _1 = _mm_set1_epi32(1);
__m128i _cnt = _mm_setzero_si128(); // four per-lane match counters
for (int i = 0; i < num; i += 4)
{
__m128i _arr = _mm_loadu_si128((__m128i*)(arr + i));
__m128i comparemask = _mm_cmpgt_epi32(_arr, threshold);
// Branchless counting: each matching lane contributes 1 to its counter.
_cnt = _mm_add_epi32(_cnt, _mm_and_si128(comparemask, _1));
// One bit per 32-bit lane; selects the shuffle that packs the
// matching positions to the front of the vector.
int index = _mm_movemask_ps(_mm_castsi128_ps(comparemask));
__m128i storePositions = _mm_shuffle_epi8(positions, g_shuffles[index]);
// Always store all 16 bytes; only the first g_steps[index] entries are
// valid, and the write pointer advances past those alone.
_mm_storeu_si128((__m128i*)resPos, storePositions);
resPos += g_steps[index];
positions = _mm_add_epi32(positions, _4);
}
// Horizontal sum of the four per-lane counters.
uint32_t cnts[4];
_mm_storeu_si128((__m128i*)cnts, _cnt);
*cnt = cnts[0] + cnts[1] + cnts[2] + cnts[3];
std::cout << "cnt " << *cnt << "\n";
return resArr;
}
Here's a version with a pshufb trick to do the compaction, not tested though and the shuffle masks shouldn't really be local.
// pshufb-based compaction: writes the index of every element with
// arr[i] > 10 into an aligned result buffer; *cnt receives the count.
// NOTE(review): _mm_cmplt_epi32(comp2, vec) selects arr[i] > 10, not
// arr[i] >= 10 as the question asked -- confirm the intended threshold.
// Assumes num is a multiple of 4; caller frees the result via _mm_free.
uint32_t* testFunc(uint32_t* arr, uint32_t num, uint32_t * cnt)
{
uint32_t numcnt = 4;   // NOTE(review): unused
uint32_t * resArr = (uint32_t*)_mm_malloc(num*sizeof(uint32_t), 16);
uint32_t * resPos = resArr;
*cnt = 0;
__m128i comp2 = _mm_set1_epi32(10);
__m128i positions = _mm_setr_epi32(0, 1, 2, 3);
__m128i _4 = _mm_set1_epi32(4);
__m128i _1 = _mm_set1_epi32(1);   // NOTE(review): unused
int count = 0;
// 0x80 bytes make pshufb write zero into that destination byte, so
// unselected tail lanes come out as 0 here (unlike g_shuffles above).
const int X = 0x80808080;
// compaction_masks[m]: shuffle control that packs the lanes named by
// the 4-bit movmskps mask m to the front of the vector.
__m128i compaction_masks[16];
compaction_masks[0] = _mm_set1_epi8(0x80);
compaction_masks[1] = _mm_set_epi32(X, X, X, 0x03020100);
compaction_masks[2] = _mm_set_epi32(X, X, X, 0x07060504);
compaction_masks[3] = _mm_set_epi32(X, X, 0x07060504, 0x03020100);
compaction_masks[4] = _mm_set_epi32(X, X, X, 0x0B0A0908);
compaction_masks[5] = _mm_set_epi32(X, X, 0x0B0A0908, 0x03020100);
compaction_masks[6] = _mm_set_epi32(X, X, 0x0B0A0908, 0x07060504);
compaction_masks[7] = _mm_set_epi32(X, 0x0B0A0908, 0x07060504, 0x03020100);
compaction_masks[8] = _mm_set_epi32(X, X, X, 0x0F0E0D0C);
compaction_masks[9] = _mm_set_epi32(X, X, 0x0F0E0D0C, 0x03020100);
compaction_masks[10] = _mm_set_epi32(X, X, 0x0F0E0D0C, 0x07060504);
compaction_masks[11] = _mm_set_epi32(X, 0x0F0E0D0C, 0x07060504, 0x03020100);
compaction_masks[12] = _mm_set_epi32(X, X, 0x0F0E0D0C, 0x0B0A0908);
compaction_masks[13] = _mm_set_epi32(X, 0x0F0E0D0C, 0x0B0A0908, 0x03020100);
compaction_masks[14] = _mm_set_epi32(X, 0x0F0E0D0C, 0x0B0A0908, 0x07060504);
compaction_masks[15] = _mm_set_epi32(0x0F0E0D0C, 0x0B0A0908, 0x07060504, 0x03020100);
for (int i = 0; i < num; i += 4)
{
__m128i vec = _mm_loadu_si128((__m128i*)(arr + i));
__m128i simdAnd2 = _mm_cmplt_epi32(comp2, vec);
// One bit per 32-bit lane -> index into the compaction table.
int comp = _mm_movemask_ps(_mm_castsi128_ps(simdAnd2));
__m128i shufmask = compaction_masks[comp];
vec = _mm_shuffle_epi8(positions, shufmask);
// Full 16-byte store; only the first popcount(comp) entries are kept.
_mm_storeu_si128((__m128i*)resPos, vec);
resPos += __builtin_popcount(comp);
count += __builtin_popcount(comp);
positions = _mm_add_epi32(positions, _4);
}
*cnt = count;
return resArr;
}
The idea here is that every individual case could of course be shuffled into place, the 16 cases are distinguished by loading the shuffle mask corresponding to the case-index, which is given by movmskps. With AVX2 you can do a similar thing using vpermd.
I have made some changes, which should lead to a performance increase:
#include <immintrin.h>
#include <memory.h>
// Hybrid variant: vectorized counting plus a fast path for the
// "all four lanes match" case; partial matches still use scalar writes.
// Assumes num is a multiple of 4; caller frees the result via _mm_free.
uint32_t* testFunc(uint32_t* arr, uint32_t num, uint32_t * cnt)
{
uint32_t numcnt = 4;   // NOTE(review): unused
uint32_t * resArr = (uint32_t*)_mm_malloc(num*sizeof(uint32_t), 16);
uint32_t * resPos = resArr;
*cnt = 0;
__m128i comp2 = _mm_set1_epi32(10);
__m128i positions = _mm_setr_epi32(0, 1, 2, 3);
__m128i _4 = _mm_set1_epi32(4);
__m128i _1 = _mm_set1_epi32(1);
__m128i _cnt = _mm_setzero_si128();   // four per-lane match counters
for (int i = 0; i < num; i += 4)
{
__m128i vec = _mm_loadu_si128((__m128i*)(arr + i));
__m128i simdAnd2 = _mm_cmplt_epi32(comp2, vec);// comp2 < arr[i], i.e. arr[i] > 10 -- NOTE(review): NOT arr >= 10 as the question asked
// Branchless counting: each matching lane adds 1 to its counter.
_cnt = _mm_add_epi32(_cnt, _mm_and_si128(simdAnd2, _1));
int comp = _mm_movemask_epi8(simdAnd2);
if (comp == 65535)
{
// All four lanes matched: store the whole position vector at once.
_mm_storeu_si128((__m128i*)resPos, positions);
resPos += 4;
}
else if (comp < 65535)
{
// Partial match: scalar per-lane writes.
// NOTE(review): reading lanes via pointer casts type-puns the
// __m128i objects -- works on common compilers but is UB in ISO C++.
if (((uint32_t *)&simdAnd2)[0]) {
std::cout << "first byte not 0\n";
resPos[0] = ((uint32_t *)&positions)[0];
resPos++;
}
if (((uint32_t *)&simdAnd2)[1]) {
std::cout << "second byte not 0\n";
resPos[0] = ((uint32_t *)&positions)[1];
resPos++;
}
if (((uint32_t *)&simdAnd2)[2]) {
std::cout << "3rd byte not 0\n";
resPos[0] = ((uint32_t *)&positions)[2];
resPos++;
}
if (((uint32_t *)&simdAnd2)[3]) {
std::cout << "4th byte not 0\n";
resPos[0] = ((uint32_t *)&positions)[3];
resPos++;
}
}
positions = _mm_add_epi32(positions, _4);
}
// Horizontal sum of the per-lane counters.
uint32_t cnts[4];
_mm_storeu_si128((__m128i*)cnts, _cnt);
*cnt = cnts[0] + cnts[1] + cnts[2] + cnts[3];
std::cout << "cnt " << *cnt << "\n";
return resArr;
}
Of course, it will be good if all scalar instructions in cycle are changed to vector instructions.
Related
Im trying to find all the possible variations of a number in the form of:
'1_2_3_4' where _ is a number between 0 to 9.
I was wondering what is the best approach to this problem.
This seems like the simplest method:
// Prints every number of the form 1_2_3_4 (each blank a digit 0-9),
// in ascending order: 1020304, 1020314, ..., 1929394.
static void printPerms()
{
    final int base = 1020304;
    for (int a = 0; a <= 9; a++)
        for (int b = 0; b <= 9; b++)
            for (int c = 0; c <= 9; c++)
                System.out.println(base + a * 100000 + b * 1000 + c * 10);
}
Or even this, which has a lovely symmetry:
// Prints every number of the form 1_2_3_4 (each blank a digit 0-9) by
// stepping a single running value; while-loop form of the original's
// symmetric for-loops.
static void printPerms()
{
    int n = 1020304;
    int iEnd = n + 900000;
    while (n <= iEnd) {
        int jEnd = n + 9000;
        while (n <= jEnd) {
            int kEnd = n + 90;
            while (n <= kEnd) {
                System.out.println(n);
                n += 10;
            }
            n += 900;
        }
        n += 90000;
    }
}
import java.util.*;
public class Solution {
    public static void main(String[] args){
        // The -1 slots (indices 1, 3, 5) are the blanks in 1_2_3_4;
        // try every digit 0-9 in each and print the resulting array.
        int[] digits = {1,-1,2,-1,3,-1,4};
        for(int a=0;a<=9;++a){
            digits[1] = a;
            for(int b=0;b<=9;++b){
                digits[3] = b;
                for(int c=0;c<=9;++c){
                    digits[5] = c;
                    System.out.println(Arrays.toString(digits));
                }
            }
        }
    }
}
OUTPUT:
[1, 0, 2, 0, 3, 0, 4]
[1, 0, 2, 0, 3, 1, 4]
[1, 0, 2, 0, 3, 2, 4]
[1, 0, 2, 0, 3, 3, 4]
[1, 0, 2, 0, 3, 4, 4]
[1, 0, 2, 0, 3, 5, 4]
[1, 0, 2, 0, 3, 6, 4]
.
.
.
.
Time Complexity: O(10^n) where n is the number of places to fill in. If the 3 empty places are fixed, then it is O(1).
Space Complexity: O(1)
Note: There is no better way to do this. You have to go through each and every combination.
Python style, assuming ASCII code representation:
n= "1020304"
while True:
n[5]+= 1
if n[5] == ':':
n[5]= '0'
n[3]+= 1
if n[3] == ':':
n[3]= '0'
n[1]+= 1
if n[1] == ':':
break
I'm going to find lines in an image, so I use hough transform to do it.
But now I'm trying to find the longest line in the image (there must exist a longest line in my image); is there any method to do it without sacrificing computation speed?
using namespace std;
using namespace cv;
int main()
{
VideoCapture cap("D:\\DataBox\\v0.avi");
if (!cap.isOpened())
cout << "fail to open!" << endl; //return -1;
else
cout << "Video Load Succeed" << endl;
while (true)
{
cout << "----------------------------------------------------" << endl;
Mat src;
cap >> src;
pyrDown(src, src, Size(src.cols / 2, src.rows / 2));
pyrDown(src, src, Size(src.cols / 2, src.rows / 2));
cvtColor(src, src, CV_BGR2GRAY);
Mat tsrc;
threshold(src, tsrc, 90, 255, THRESH_BINARY_INV);
Mat grad_x, grad_y;
Mat abs_grad_x, abs_grad_y;
Mat sobel;
int scale = 1;
int delta = 0;
int ddepth = CV_16S;
Sobel(tsrc, grad_x, ddepth, 1, 0, 3, scale, delta, BORDER_DEFAULT);
convertScaleAbs(grad_x, abs_grad_x);
Sobel(tsrc, grad_y, ddepth, 0, 1, 3, scale, delta, BORDER_DEFAULT);
convertScaleAbs(grad_y, abs_grad_y);
addWeighted(abs_grad_x, 0.5, abs_grad_y, 0.5, 0, sobel);
vector<Vec2f> lines;
int threshold = 250;
HoughLines(sobel , lines, 1, CV_PI / 180, threshold, 0, 0);
Mat cdst;
cvtColor(sobel, cdst, CV_GRAY2BGR);
for (size_t i = 0; i < lines.size(); i++)
{
float rho = lines[i][0], theta = lines[i][1];
Point pt1, pt2;
double a = cos(theta), b = sin(theta);
double x0 = a*rho, y0 = b*rho;
pt1.x = cvRound(x0 + 1000 * (-b));
pt1.y = cvRound(y0 + 1000 * (a));
pt2.x = cvRound(x0 - 1000 * (-b));
pt2.y = cvRound(y0 - 1000 * (a));
line(cdst, pt1, pt2, Scalar(0, 0, 255), 3, CV_AA);
}
imshow("Video", cdst);
waitKey(30);
}
}
I would like to move the MATCH macro and bitmap to a separate file since I use these many places, and I would like to avoid repeating code. How may that be done?
require 'inline'
# Class to calculate the Levenshtein distance between two
# given strings.
# http://en.wikipedia.org/wiki/Levenshtein_distance
# Computes the Levenshtein (edit) distance between two strings, with the
# O(|s|*|t|) dynamic-programming loop implemented in C via RubyInline.
class Levenshtein
# Bytes per C "unsigned int": the Ruby strings v0/v1 below are sized so
# the C side can reinterpret their buffers as unsigned-int score vectors.
BYTES_IN_INT = 4
# Returns the edit distance between s and t.  Trivial cases (equal
# strings, an empty side) are answered in Ruby without entering C.
def self.distance(s, t)
return 0 if s == t;
return t.length if s.length == 0;
return s.length if t.length == 0;
# Two zero-filled DP rows, passed to C as mutable string buffers.
v0 = "\0" * (t.length + 1) * BYTES_IN_INT
v1 = "\0" * (t.length + 1) * BYTES_IN_INT
l = self.new
l.distance_C(s, t, s.length, t.length, v0, v1)
end
# >>>>>>>>>>>>>>> RubyInline C code <<<<<<<<<<<<<<<
inline do |builder|
# Macro for matching nucleotides including ambiguity codes.
builder.prefix %{
#define MATCH(A,B) ((bitmap[A] & bitmap[B]) != 0)
}
# Bitmap for matching nucleotides including ambiguity codes.
# For each value bits are set from the left: bit pos 1 for A,
# bit pos 2 for T, bit pos 3 for C, and bit pos 4 for G.
# (Indexed by ASCII code; upper- and lowercase rows are identical.)
builder.prefix %{
char bitmap[256] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 1,14, 4,11, 0, 0, 8, 7, 0, 0,10, 0, 5,15, 0,
0, 0, 9,12, 2, 2,13, 3, 0, 6, 0, 0, 0, 0, 0, 0,
0, 1,14, 4,11, 0, 0, 8, 7, 0, 0,10, 0, 5,15, 0,
0, 0, 9,12, 2, 2,13, 3, 0, 6, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
}
# Three-way minimum helper used by the DP recurrence.
builder.prefix %{
unsigned int min(unsigned int a, unsigned int b, unsigned int c)
{
unsigned int m = a;
if (m > b) m = b;
if (m > c) m = c;
return m;
}
}
# Two-row Wagner-Fischer dynamic program; v0/v1 are the previous and
# current rows, reusing the Ruby-allocated buffers.
builder.c %{
VALUE distance_C(
VALUE _s, // string
VALUE _t, // string
VALUE _s_len, // string length
VALUE _t_len, // string length
VALUE _v0, // score vector
VALUE _v1 // score vector
)
{
char *s = (char *) StringValuePtr(_s);
char *t = (char *) StringValuePtr(_t);
unsigned int s_len = FIX2UINT(_s_len);
unsigned int t_len = FIX2UINT(_t_len);
unsigned int *v0 = (unsigned int *) StringValuePtr(_v0);
unsigned int *v1 = (unsigned int *) StringValuePtr(_v1);
unsigned int i = 0;
unsigned int j = 0;
unsigned int cost = 0;
for (i = 0; i < t_len + 1; i++)
v0[i] = i;
for (i = 0; i < s_len; i++)
{
v1[0] = i + 1;
for (j = 0; j < t_len; j++)
{
cost = (MATCH(s[i], t[j])) ? 0 : 1;
v1[j + 1] = min(v1[j] + 1, v0[j + 1] + 1, v0[j] + cost);
}
for (j = 0; j < t_len + 1; j++)
v0[j] = v1[j];
}
return UINT2NUM(v1[t_len]);
}
}
end
end
builder.prefix is just a method call, so you could create a Ruby method which called it with your macro and character array, and then add it to any class which wanted to use those C snippets inline. e.g.
# Shared RubyInline C snippets.  A class using `inline do |builder|`
# can `extend MixinCommonC` and call these helpers with its builder to
# inject common C code without duplicating it.
module MixinCommonC
# Adds the MATCH(A,B) nucleotide-comparison macro to the given
# RubyInline builder.
# NOTE(review): the macro references a `bitmap` table that must be added
# to the same compilation unit separately -- confirm callers do so.
def add_match_macro inline_builder
inline_builder.prefix %{
#define MATCH(A,B) ((bitmap[A] & bitmap[B]) != 0)
}
end
end
In your example code you make the following changes to use it:
At the start of the class
class Levenshtein
extend MixinCommonC
(It's extend and not include, because the inline method is being called against the class, so can only access class methods inside the block)
Where you currently have the call to builder.prefix:
add_match_macro( builder )
I Want to optimize the following function using SIMD (SSE2 & such):
// Returns the sum of (N / i) * p[i] over i = 1 .. size-1
// (integer division; index 0 is intentionally skipped).
int64_t fun(int64_t N, int size, int* p)
{
    int64_t total = 0;
    int idx = 1;
    while (idx < size) {
        total += (N / idx) * p[idx];
        ++idx;
    }
    return total;
}
This seems like an eminently vectorizable task, except that the needed instructions just aren't there ...
We can assume that N is very large (10^12 to 10^18) and size~sqrt(N). We can also assume that p can only take values of -1, 0, and 1; so we don't need a real multiplication, the (N/i)*p[i] can be done with four instructions (pcmpgt, pxor, psub, pand), if we could just somehow compute N/i.
This is as close as I could get to vectorizing that code. I don't really expect it to be faster. I was just trying my hand at writting SIMD code.
#include <stdint.h>
// Reference scalar implementation: sum of (N / i) * p[i] for i in
// [1, size).  Iterates downward; exact integer addition commutes, so
// the result is identical to the ascending version.
int64_t fun(int64_t N, int size, const int* p)
{
    int64_t acc = 0;
    for (int k = size - 1; k >= 1; --k) {
        acc += (N / k) * p[k];
    }
    return acc;
}
// Two int64 lanes via GCC vector extensions.  SSE2 has no 64-bit
// integer divide, so the compiler scalarizes v_N / v_i; this mostly
// demonstrates the notation rather than real speedup.
typedef int64_t v2sl __attribute__ ((vector_size (2*sizeof(int64_t))));
// Same contract as fun(): returns sum over i in [1, size) of (N/i)*p[i].
int64_t fun_simd(int64_t N, int size, const int* p)
{
int64_t sum = 0;
int i;
v2sl v_2 = { 2, 2 };   // index step per iteration (two lanes)
v2sl v_N = { N, N };
v2sl v_i = { 1, 2 };   // current pair of indices
union { v2sl v; int64_t a[2]; } v_sum;   // union lets us read lanes back
v_sum.a[0] = 0;
v_sum.a[1] = 0;
// Main loop processes indices two at a time.
for(i=1; i<size-1; i+=2) {
v2sl v_p = { p[i], p[i+1] };
v_sum.v += (v_N / v_i) * v_p;
v_i += v_2;
}
// Combine the two lane sums, then finish any odd tail element scalar.
sum = v_sum.a[0] + v_sum.a[1];
for(; i<size; i++) {
sum += (N/i)*p[i];
}
return sum;
}
// Two double lanes via GCC vector extensions; SSE2 does have a packed
// double divide, so this variant can genuinely vectorize the division.
typedef double v2df __attribute__ ((vector_size (2*sizeof(double))));
// Same contract as fun(), computed in double precision.
// NOTE(review): doubles carry ~53 bits of integer precision and divide
// with rounding rather than truncation, so for the question's large N
// (10^12..10^18) the result can differ from the integer version --
// confirm the accuracy requirement before relying on this.
int64_t fun_simd_double(int64_t N, int size, const int* p)
{
int64_t sum = 0;
int i;
v2df v_2 = { 2, 2 };   // index step per iteration (two lanes)
v2df v_N = { N, N };
v2df v_i = { 1, 2 };   // current pair of indices
union { v2df v; double a[2]; } v_sum;   // union lets us read lanes back
v_sum.a[0] = 0;
v_sum.a[1] = 0;
for(i=1; i<size-1; i+=2) {
v2df v_p = { p[i], p[i+1] };
v_sum.v += (v_N / v_i) * v_p;
v_i += v_2;
}
// Lane sums are doubles; the assignment truncates to int64_t.
sum = v_sum.a[0] + v_sum.a[1];
// Scalar integer tail for an odd element count.
for(; i<size; i++) {
sum += (N/i)*p[i];
}
return sum;
}
#include <stdio.h>
#include <inttypes.h>
// Test pattern for p[]: period-4 cycle 1, 0, -1, 0 -- the values the
// question says p may take.
static const int test_array[] = {
1, 0, -1, 0, 1, 0, -1, 0, 1, 0, -1, 0, 1, 0, -1, 0,
1, 0, -1, 0, 1, 0, -1, 0, 1, 0, -1, 0, 1, 0, -1, 0,
1, 0, -1, 0, 1, 0, -1, 0, 1, 0, -1, 0, 1, 0, -1, 0,
1, 0, -1, 0, 1, 0, -1, 0, 1, 0, -1, 0, 1, 0, -1, 0,
1, 0, -1, 0, 1, 0, -1, 0, 1, 0, -1, 0, 1, 0, -1, 0,
1, 0, -1, 0, 1, 0, -1, 0, 1, 0, -1, 0, 1, 0, -1, 0,
1, 0, -1, 0, 1, 0, -1, 0, 1, 0, -1, 0, 1, 0, -1, 0,
1, 0, -1, 0, 1, 0, -1, 0, 1, 0, -1, 0, 1, 0, -1, 0,
1, 0, -1, 0, 1, 0, -1, 0, 1, 0, -1, 0, 1, 0, -1, 0,
1, 0, -1, 0, 1, 0, -1, 0, 1, 0, -1, 0, 1, 0, -1, 0,
1, 0, -1, 0, 1, 0, -1, 0, 1, 0, -1, 0, 1, 0, -1, 0,
1, 0, -1, 0, 1, 0, -1, 0, 1, 0, -1, 0, 1, 0, -1, 0,
1, 0, -1, 0, 1, 0, -1, 0, 1, 0, -1, 0, 1, 0, -1, 0,
1, 0, -1, 0, 1, 0, -1, 0, 1, 0, -1, 0, 1, 0, -1, 0,
1, 0, -1, 0, 1, 0, -1, 0, 1, 0, -1, 0, 1, 0, -1, 0,
1, 0, -1, 0, 1, 0, -1, 0, 1, 0, -1, 0, 1, 0, -1, 0,
1, 0, -1, 0, 1, 0, -1, 0, 1, 0, -1, 0, 1, 0, -1, 0,
1, 0, -1, 0, 1, 0, -1, 0, 1, 0, -1, 0, 1, 0, -1, 0,
1, 0, -1, 0, 1, 0, -1, 0, 1, 0, -1, 0, 1, 0, -1, 0,
1, 0, -1, 0, 1, 0, -1, 0, 1, 0, -1, 0, 1, 0, -1, 0,
1, 0, -1, 0, 1, 0, -1, 0, 1, 0, -1, 0, 1, 0, -1, 0
};
// Element count of test_array.
#define test_array_len (sizeof(test_array)/sizeof(int))
// N = 2^30: small enough for a quick run (the question's real N is
// 10^12..10^18).
#define big_N (1024 * 1024 * 1024)
// Smoke test: checks the vector-extension type arithmetic, then runs
// all three fun() variants on the same inputs so their results can be
// compared by eye.
int main(int argc, char *argv[]) {
    int64_t res1;
    int64_t res2;
    int64_t res3;
    v2sl a = { 123, 456 };
    v2sl b = { 100, 200 };
    union { v2sl v; int64_t a[2]; } tmp;
    a = a + b;
    tmp.v = a;
    // BUG FIX: int64_t is not "long" on every ABI (32-bit, Windows), so
    // %ld was undefined behavior there -- use PRId64.  Likewise %zd is
    // for ssize_t; size_t needs %zu.
    printf("a = { %" PRId64 ", %" PRId64 " }\n", tmp.a[0], tmp.a[1]);
    printf("test_array size = %zu\n", test_array_len);
    res1 = fun(big_N, test_array_len, test_array);
    printf("fun() = %" PRId64 "\n", res1);
    res2 = fun_simd(big_N, test_array_len, test_array);
    printf("fun_simd() = %" PRId64 "\n", res2);
    res3 = fun_simd_double(big_N, test_array_len, test_array);
    printf("fun_simd_double() = %" PRId64 "\n", res3);
    return 0;
}
The derivative of 1/x is -1/x^2, which means as x gets bigger, N/x==N/(x + 1).
For a known value of N/x (let's call that value r), we can determine the next value of x (let's call that value x' such that N/x'<r:
x'= N/(r - 1)
And since we are dealing with integers:
x'= ceiling(N/(r - 1))
So, the loop becomes something like this:
// Exploits the long runs where N/i is constant: r holds the current
// quotient and s is the first index at which the quotient drops below r,
// so the inner loop adds r*p[i] without any division.
int64_t sum = 0;
int i = 1;
int64_t r = N;   // BUG FIX: was "int r = N" -- overflows for N up to 10^18
while (i < size)
{
    int64_t s;
    if (r > 1)
        s = (N + r - 1 - 1) / (r - 1);   // ceil(N / (r - 1))
    else
        // BUG FIX: r == 1 divided by zero; the quotient stays 1 for all
        // remaining i <= N (and size ~ sqrt(N) < N), so run to the end.
        s = size;
    while (i < s && i < size)
    {
        sum += r * p[i];
        ++i;
    }
    r = N / s;
}
return sum;
For sufficiently large N, you will have many many runs of identical values for N/i. Granted, you will hit a divide by zero if you aren't careful.
I suggest you do this with floating point SIMD operations - either single or double precision depending on your accuracy requirements. Conversion from int to float or double is relatively fast using SSE.
The cost is concentrated in computing the divisions. There is no opcode in SSE2 for integral divisions, so you would have to implement a division algorithm yourself, bit by bit. I do not think it would be worth the effort: SSE2 allow you to perform two instances in parallel (you use 64-bit numbers, and SSE2 registers are 128-bit) but I find it likely that a handmade division algorithm would be at least twice as slow as the CPU idiv opcode.
(By the way, do you compile in 32-bit or 64-bit mode ? The latter will be more comfortable with 64-bit integers.)
Reducing the overall number of divisions looks like a more promising way. One may note that for positive integers x and y, then floor(x/(2y)) = floor(floor(x/y)/2). In C terminology, once you have computed N/i (truncated division) then you just have to shift it right by one bit to obtain N/(2*i). Used properly, this makes half of your divisions almost free (that "properly" also includes accessing the billions of p[i] values in a way which does not wreak havoc with the caches, so it does not seem very easy).
I have been looking for an algorithm to perform a transitive reduction on a graph, but without success. There's nothing in my algorithms bible (Introduction To Algorithms by Cormen et al) and whilst I've seen plenty of transitive closure pseudocode, I haven't been able to track down anything for a reduction. The closest I've got is that there is one in "Algorithmische Graphentheorie" by Volker Turau (ISBN:978-3-486-59057-9), but unfortunately I don't have access to this book! Wikipedia is unhelpful and Google is yet to turn up anything. :^(
Does anyone know of an algorithm for performing a transitive reduction?
See Harry Hsu. "An algorithm for finding a minimal equivalent graph of a digraph.", Journal of the ACM, 22(1):11-16, January 1975. The simple cubic algorithm below (using an N x N path matrix) suffices for DAGs, but Hsu generalizes it to cyclic graphs.
// reflexive reduction
// Clear self-loops first so they are not treated as paths below.
for (int i = 0; i < N; ++i)
m[i][i] = false;
// transitive reduction
// If i reaches j, then every edge i -> k with j -> k is redundant
// (reachable through j) and is removed.
// NOTE(review): per the surrounding text, m must be the *path* matrix
// (transitive closure) of a DAG for this to be correct.
for (int j = 0; j < N; ++j)
for (int i = 0; i < N; ++i)
if (m[i][j])
for (int k = 0; k < N; ++k)
if (m[j][k])
m[i][k] = false;
The basic gist of the transitive reduction algorithm I used is
foreach x in graph.vertices
foreach y in graph.vertices
foreach z in graph.vertices
delete edge xz if edges xy and yz exist
The transitive closure algorithm I used in the same script is very similar but the last line is
add edge xz if edges xy and yz OR edge xz exist
Based on the reference provided by Alan Donovan, which says you should use the path matrix (which has a 1 if there is a path from node i to node j) instead of the adjacency matrix (which has a 1 only if there is an edge from node i to node j).
Some sample python code follows below to show the differences between the solutions
def prima(m, title=None):
    """ Prints a matrix to the terminal """
    # Python 2 code (print statement; siblings use xrange).
    if title:
        print title
    # One comma-separated line per matrix row, then a blank separator.
    for row in m:
        print ', '.join([str(x) for x in row])
    print ''
def path(m):
    """ Returns a path matrix """
    # Work on a row-wise copy so the caller's matrix stays untouched.
    p = [list(row) for row in m]
    n = len(p)
    for i in xrange(0, n):
        for j in xrange(0, n):
            if i == j:
                continue
            if p[j][i]:
                # j reaches i, so j also reaches everything i reaches.
                for k in xrange(0, n):
                    if p[j][k] == 0:
                        p[j][k] = p[i][k]
    return p
def hsu(m):
    """ Transforms a given directed acyclic graph into its minimal equivalent """
    # In-place transitive reduction (Hsu 1975): when i reaches j, every
    # edge i -> k that j also reaches is redundant and is cleared.
    # Per the surrounding text, m should be a *path* matrix for a
    # correct reduction.
    n = len(m)
    for j in xrange(n):
        for i in xrange(n):
            if m[i][j]:
                for k in xrange(n):
                    if m[j][k]:
                        m[i][k] = 0
# 5-vertex example DAG as an adjacency matrix
# (edges 0->1, 0->2, 2->3, 2->4, 3->4, 4->1).
m = [ [0, 1, 1, 0, 0],
[0, 0, 0, 0, 0],
[0, 0, 0, 1, 1],
[0, 0, 0, 0, 1],
[0, 1, 0, 0, 0]]
# First: run Hsu directly on the adjacency matrix (mutates m in place).
prima(m, 'Original matrix')
hsu(m)
prima(m, 'After Hsu')
# Then: build the path matrix and reduce that instead, to show the
# difference the reference describes.
p = path(m)
prima(p, 'Path matrix')
hsu(p)
prima(p, 'After Hsu')
Output:
Original matrix
0, 1, 1, 0, 0
0, 0, 0, 0, 0
0, 0, 0, 1, 1
0, 0, 0, 0, 1
0, 1, 0, 0, 0
After Hsu
0, 1, 1, 0, 0
0, 0, 0, 0, 0
0, 0, 0, 1, 0
0, 0, 0, 0, 1
0, 1, 0, 0, 0
Path matrix
0, 1, 1, 1, 1
0, 0, 0, 0, 0
0, 1, 0, 1, 1
0, 1, 0, 0, 1
0, 1, 0, 0, 0
After Hsu
0, 0, 1, 0, 0
0, 0, 0, 0, 0
0, 0, 0, 1, 0
0, 0, 0, 0, 1
0, 1, 0, 0, 0
The Wikipedia article on transitive reduction points to an implementation within GraphViz (which is open source). Not exactly pseudocode, but maybe someplace to start?
LEDA includes a transitive reduction algorithm. I don't have a copy of the LEDA book anymore, and this function might have been added after the book was published. But if it's in there, then there will be a good description of the algorithm.
Google points to an algorithm that somebody suggested for inclusion in Boost. I didn't try to read it, so maybe not correct?
Also, this might be worth a look.
The algorithm of "girlwithglasses" forgets that a redundant edge could span a chain of three edges. To correct, compute Q = R x R+ where R+ is the transitive closure and then delete all edges from R that show up in Q. See also the Wikipedia article.
Depth-first algorithm in pseudo-python:
for vertex0 in vertices:
done = set()
for child in vertex0.children:
df(edges, vertex0, child, done)
df = function(edges, vertex0, child0, done)
if child0 in done:
return
for child in child0.children:
edges.discard((vertex0, child))
df(edges, vertex0, child, done)
done.add(child0)
The algorithm is sub-optimal, but deals with the multi-edge-span problem of the previous solutions. The results are very similar to what tred from graphviz produces.
ported to java / jgrapht, the python sample on this page from #Michael Clerx:
import java.util.ArrayList;
import java.util.List;
import java.util.Set;
import org.jgrapht.DirectedGraph;
/**
 * Computes the transitive reduction of a directed acyclic graph via its
 * path (reachability) matrix, following Hsu (JACM 1975).
 */
public class TransitiveReduction<V, E> {
    // Fixes the index <-> vertex mapping used by the matrices below.
    final private List<V> vertices;
    // pathMatrix[i][j] != 0 iff a path exists from vertices[i] to vertices[j].
    final private int [][] pathMatrix;
    private final DirectedGraph<V, E> graph;
    /**
     * Builds the adjacency matrix from the graph's edges and converts it
     * in place into a path matrix.
     */
    public TransitiveReduction(DirectedGraph<V, E> graph) {
        super();
        this.graph = graph;
        this.vertices = new ArrayList<V>(graph.vertexSet());
        int n = vertices.size();
        // int arrays are zero-initialized, so only edges need setting.
        int[][] original = new int[n][n];
        Set<E> edges = graph.edgeSet();
        for (E edge : edges) {
            V v1 = graph.getEdgeSource(edge);
            V v2 = graph.getEdgeTarget(edge);
            int v_1 = vertices.indexOf(v1);
            int v_2 = vertices.indexOf(v2);
            original[v_1][v_2] = 1;
        }
        this.pathMatrix = original;
        transformToPathMatrix(this.pathMatrix);
    }
    // In-place conversion of an adjacency matrix into a path matrix.
    // (package visible for unit testing)
    static void transformToPathMatrix(int[][] matrix) {
        // If j reaches i, then j also reaches everything i reaches.
        for (int i = 0; i < matrix.length; i++) {
            for (int j = 0; j < matrix.length; j++) {
                if (i == j) {
                    continue;
                }
                if (matrix[j][i] > 0 ){
                    for (int k = 0; k < matrix.length; k++) {
                        if (matrix[j][k] == 0) {
                            matrix[j][k] = matrix[i][k];
                        }
                    }
                }
            }
        }
    }
    // In-place transitive reduction of a path matrix (Hsu's algorithm):
    // an edge i->k is redundant if some j with i->j also reaches k.
    // (package visible for unit testing)
    static void transitiveReduction(int[][] pathMatrix) {
        for (int j = 0; j < pathMatrix.length; j++) {
            for (int i = 0; i < pathMatrix.length; i++) {
                if (pathMatrix[i][j] > 0){
                    for (int k = 0; k < pathMatrix.length; k++) {
                        if (pathMatrix[j][k] > 0) {
                            pathMatrix[i][k] = 0;
                        }
                    }
                }
            }
        }
    }
    /**
     * Removes every edge of the wrapped graph that is not part of its
     * transitive reduction.
     */
    public void reduce() {
        int n = pathMatrix.length;
        int[][] transitivelyReducedMatrix = new int[n][n];
        // BUG FIX: System.arraycopy on an int[][] copies only the row
        // *references*, so the reduction would have mutated pathMatrix
        // through the shared rows.  Copy each row's contents instead.
        for (int i = 0; i < n; i++) {
            System.arraycopy(pathMatrix[i], 0, transitivelyReducedMatrix[i], 0, n);
        }
        transitiveReduction(transitivelyReducedMatrix);
        for (int i = 0; i <n; i++) {
            for (int j = 0; j < n; j++) {
                if (transitivelyReducedMatrix[i][j] == 0) {
                    // System.out.println("removing "+vertices.get(i)+" -> "+vertices.get(j));
                    graph.removeEdge(graph.getEdge(vertices.get(i), vertices.get(j)));
                }
            }
        }
    }
}
unit test :
import java.util.Arrays;
import org.junit.Assert;
import org.junit.Test;
public class TransitiveReductionTest {
#Test
public void test() {
int[][] matrix = new int[][] {
{0, 1, 1, 0, 0},
{0, 0, 0, 0, 0},
{0, 0, 0, 1, 1},
{0, 0, 0, 0, 1},
{0, 1, 0, 0, 0}
};
int[][] expected_path_matrix = new int[][] {
{0, 1, 1, 1, 1},
{0, 0, 0, 0, 0},
{0, 1, 0, 1, 1},
{0, 1, 0, 0, 1},
{0, 1, 0, 0, 0}
};
int[][] expected_transitively_reduced_matrix = new int[][] {
{0, 0, 1, 0, 0},
{0, 0, 0, 0, 0},
{0, 0, 0, 1, 0},
{0, 0, 0, 0, 1},
{0, 1, 0, 0, 0}
};
System.out.println(Arrays.deepToString(matrix) + " original matrix");
int n = matrix.length;
// calc path matrix
int[][] path_matrix = new int[n][n];
{
System.arraycopy(matrix, 0, path_matrix, 0, matrix.length);
TransitiveReduction.transformToPathMatrix(path_matrix);
System.out.println(Arrays.deepToString(path_matrix) + " path matrix");
Assert.assertArrayEquals(expected_path_matrix, path_matrix);
}
// calc transitive reduction
{
int[][] transitively_reduced_matrix = new int[n][n];
System.arraycopy(path_matrix, 0, transitively_reduced_matrix, 0, matrix.length);
TransitiveReduction.transitiveReduction(transitively_reduced_matrix);
System.out.println(Arrays.deepToString(transitively_reduced_matrix) + " transitive reduction");
Assert.assertArrayEquals(expected_transitively_reduced_matrix, transitively_reduced_matrix);
}
}
}
test ouput
[[0, 1, 1, 0, 0], [0, 0, 0, 0, 0], [0, 0, 0, 1, 1], [0, 0, 0, 0, 1], [0, 1, 0, 0, 0]] original matrix
[[0, 1, 1, 1, 1], [0, 0, 0, 0, 0], [0, 1, 0, 1, 1], [0, 1, 0, 0, 1], [0, 1, 0, 0, 0]] path matrix
[[0, 0, 1, 0, 0], [0, 0, 0, 0, 0], [0, 0, 0, 1, 0], [0, 0, 0, 0, 1], [0, 1, 0, 0, 0]] transitive reduction