Comparing arrays for "similarity"? - algorithm

I am trying to compare same size arrays.
Given the following array:
I am looking for an algorithm to tell me the most "similar" array to the input. I understand that the word "similar" is not very specific but I don't know how to be more specific.
For example the following is very similar to the input.
The following is somewhat similar.
The following is very different.

You could apply a smoothing kernel to the array, and then compute the L2 norm (Euclidean distance) on it.
This is often used to compare e.g. neural spike trains or other continuous signals.
http://www.cs.utah.edu/~suresh/papers/kerneld/kerneld.pdf
You didn't specify a language...I happen to have code in C++ (may not be the most efficient).
First, you do a smoothing of the vector based on your desired kernel width and parameterize it depending on the scale/desired amount of "blur", etc.. For example:
Output of code below (behaves as expected):
riveale#rv-mba:~/tmpdir$ g++ -std=c++11 test.cpp -o test.exe
riveale#rv-mba:~/tmpdir$ ./test.exe
Distance [1] to [2]: [31.488026] (should be far)
Distance [2] to [3]: [26.591297] (should be far)
Distance [1] to [3]: [12.468342] (should be closer)
And code (test.cpp):
#include <vector>
#include <cstdlib>
#include <cstdio>
#include <cmath>
double gauss_kernel_funct(const size_t& sourcetime, const size_t& thistime)
{
const double tauval = 5.0; //width of kernel
double dist = ((sourcetime-thistime)/tauval); //distance between the points in the vector
double retval = exp(-1 * dist*dist); //exponential decay away from center of that point, squared....
return retval;
}
std::vector<double> convolvegauss( const std::vector<double>& v1)
{
std::vector<double> convolved( v1.size(), 0.0 );
for(size_t t=0; t<v1.size(); ++t)
{
for(size_t u=0; u<v1.size(); ++u)
{
double coeff = gauss_kernel_funct(u, t);
convolved[t]+=v1[u] * coeff;
}
}
return (convolved);
}
double eucliddist( const std::vector<double>& v1, const std::vector<double>& v2 )
{
if(v1.size() != v2.size()) { fprintf(stderr, "ERROR v1!=v2 sizes\n"); exit(1); }
double sum=0.0;
for(size_t x=0; x<v1.size(); ++x)
{
double tmp = (v1[x] - v2[x]);
sum += tmp*tmp; //sum += distance of this dimension squared
}
return (sqrt( sum ));
}
double vectdist( const std::vector<double>& v1, const std::vector<double>& v2 )
{
std::vector<double> convolved1 = convolvegauss( v1 );
std::vector<double> convolved2 = convolvegauss( v2 );
return (eucliddist( convolved1, convolved2 ));
}
int main()
{
//Original 3 vectors. (1 and 3) are closer than (1 and 2) or (2 and 3)...like your example.
std::vector<double> myvector1 = {1.0, 32.0, 10.0, 5.0, 2.0};
std::vector<double> myvector2 = {2.0, 3.0, 10.0, 22.0, 2.0};
std::vector<double> myvector3 = {2.0, 20.0, 17.0, 1.0, 2.0};
//Now run the vectdist on each, which convolves each vector with the gaussian kernel, and takes the euclid distance between the convovled vectors)
fprintf(stdout, "Distance [%d] to [%d]: [%lf] (should be far)\n", 1, 2, vectdist(myvector1, myvector2) );
fprintf(stdout, "Distance [%d] to [%d]: [%lf] (should be far)\n", 2, 3, vectdist(myvector2, myvector3) );
fprintf(stdout, "Distance [%d] to [%d]: [%lf] (should be closer)\n", 1, 3, vectdist(myvector1, myvector3) );
return 0;
}

Related

How to fill a fixed rectangle with square pieces entirely?

Is this knapsack algorithm or bin packing? I couldn't find an exact solution but basically I have a fixed rectangle area that I want to fill with perfect squares that represents my items where each have a different weight which will influence their size relative to others.
They will be sorted from large to smaller from top left to bottom right.
Also even though I need perfect squares, in the end some non-uniform scaling is allowed to fill the entire space as long as they still retain their relative area, and the non-uniform scaling is done with the least possible amount.
What algorithm I can use to achieve this?
There's a fast approximation algorithm due to Hiroshi Nagamochi and Yuusuke Abe. I implemented it in C++, taking care to obtain a worst-case O(n log n)-time implementation with worst-case recursive depth O(log n). If n ≤ 100, these precautions are probably unnecessary.
#include <algorithm>
#include <iostream>
#include <random>
#include <vector>
namespace {
struct Rectangle {
double x;
double y;
double width;
double height;
};
Rectangle Slice(Rectangle &r, const double beta) {
const double alpha = 1 - beta;
if (r.width > r.height) {
const double alpha_width = alpha * r.width;
const double beta_width = beta * r.width;
r.width = alpha_width;
return {r.x + alpha_width, r.y, beta_width, r.height};
}
const double alpha_height = alpha * r.height;
const double beta_height = beta * r.height;
r.height = alpha_height;
return {r.x, r.y + alpha_height, r.width, beta_height};
}
void LayoutRecursive(const std::vector<double> &reals, const std::size_t begin,
std::size_t end, double sum, Rectangle rectangle,
std::vector<Rectangle> &dissection) {
while (end - begin > 1) {
double suffix_sum = reals[end - 1];
std::size_t mid = end - 1;
while (mid > begin + 1 && suffix_sum + reals[mid - 1] <= 2 * sum / 3) {
suffix_sum += reals[mid - 1];
mid -= 1;
}
LayoutRecursive(reals, mid, end, suffix_sum,
Slice(rectangle, suffix_sum / sum), dissection);
end = mid;
sum -= suffix_sum;
}
dissection.push_back(rectangle);
}
std::vector<Rectangle> Layout(std::vector<double> reals,
const Rectangle rectangle) {
std::sort(reals.begin(), reals.end());
std::vector<Rectangle> dissection;
dissection.reserve(reals.size());
LayoutRecursive(reals, 0, reals.size(),
std::accumulate(reals.begin(), reals.end(), double{0}),
rectangle, dissection);
return dissection;
}
std::vector<double> RandomReals(const std::size_t n) {
std::vector<double> reals(n);
std::exponential_distribution<> dist;
std::default_random_engine gen;
for (double &real : reals) {
real = dist(gen);
}
return reals;
}
} // namespace
int main() {
const std::vector<Rectangle> dissection =
Layout(RandomReals(100), {72, 72, 6.5 * 72, 9 * 72});
std::cout << "%!PS\n";
for (const Rectangle &r : dissection) {
std::cout << r.x << " " << r.y << " " << r.width << " " << r.height
<< " rectstroke\n";
}
std::cout << "showpage\n";
}
Ok so lets assume integer positions and sizes (no float operations). To evenly divide rectangle into regular square grid (as big squares as possible) the size of the cells will be greatest common divisor GCD of the rectangle sizes.
However you want to have much less squares than that so I would try something like this:
try all square sizes a from 1 to smaller size of rectangle
for each a compute the naive square grid size of the rest of rectangle once a*a square is cut of it
so its simply GCD again on the 2 rectangles that will be created once a*a square is cut of. If the min of all 3 sizes a and GCD for the 2 rectangles is bigger than 1 (ignoring zero area rectangles) then consider a as valid solution so remember it.
after the for loop use last found valida
so simply add a*a square to your output and recursively do this whole again for the 2 rectangles that will remain from your original rectangle after a*a square was cut off.
Here simple C++/VCL/OpenGL example for this:
//---------------------------------------------------------------------------
#include <vcl.h>
#pragma hdrstop
#include "Unit1.h"
#include "gl_simple.h"
//---------------------------------------------------------------------------
#pragma package(smart_init)
#pragma resource "*.dfm"
TForm1 *Form1;
//---------------------------------------------------------------------------
class square // simple square
{
public:
int x,y,a; // corner 2D position and size
square(){ x=y=a=0.0; }
square(int _x,int _y,int _a){ x=_x; y=_y; a=_a; }
~square(){}
void draw()
{
glBegin(GL_LINE_LOOP);
glVertex2i(x ,y);
glVertex2i(x+a,y);
glVertex2i(x+a,y+a);
glVertex2i(x ,y+a);
glEnd();
}
};
int rec[4]={20,20,760,560}; // x,y,a,b
const int N=1024; // max square number
int n=0; // number of squares
square s[N]; // squares
//---------------------------------------------------------------------------
int gcd(int a,int b) // slow euclid GCD
{
if(!b) return a;
return gcd(b, a % b);
}
//---------------------------------------------------------------------------
void compute(int x0,int y0,int xs,int ys)
{
if ((xs==0)||(ys==0)) return;
const int x1=x0+xs;
const int y1=y0+ys;
int a,b,i,x,y;
square t;
// try to find biggest square first
for (a=1,b=0;(a<=xs)&&(a<=ys);a++)
{
// sizes for the rest of the rectangle once a*a square is cut of
if (xs==a) x=0; else x=gcd(a,xs-a);
if (ys==a) y=0; else y=gcd(a,ys-a);
// min of all sizes
i=a;
if ((x>0)&&(i>x)) i=x;
if ((y>0)&&(i>y)) i=y;
// if divisible better than by 1 remember it as better solution
if (i>1) b=a;
} a=b;
// bigger square not found so use naive square grid division
if (a<=1)
{
t.a=gcd(xs,ys);
for (t.y=y0;t.y<y1;t.y+=t.a)
for (t.x=x0;t.x<x1;t.x+=t.a)
if (n<N){ s[n]=t; n++; }
}
// bigest square found so add it to result and recursively process the rest
else{
t=square(x0,y0,a);
if (n<N){ s[n]=t; n++; }
compute(x0+a,y0,xs-a,a);
compute(x0,y0+a,xs,ys-a);
}
}
//---------------------------------------------------------------------------
void gl_draw()
{
glClear(GL_COLOR_BUFFER_BIT);
glDisable(GL_DEPTH_TEST);
glDisable(GL_TEXTURE_2D);
// set view to 2D [pixel] units
glMatrixMode(GL_PROJECTION);
glLoadIdentity();
glMatrixMode(GL_MODELVIEW);
glLoadIdentity();
glTranslatef(-1.0,-1.0,0.0);
glScalef(2.0/float(xs),2.0/float(ys),1.0);
// render input rectangle
glColor3f(0.2,0.2,0.2);
glBegin(GL_QUADS);
glVertex2i(rec[0] ,rec[1]);
glVertex2i(rec[0]+rec[2],rec[1]);
glVertex2i(rec[0]+rec[2],rec[1]+rec[3]);
glVertex2i(rec[0] ,rec[1]+rec[3]);
glEnd();
// render output squares
glColor3f(0.2,0.5,0.9);
for (int i=0;i<n;i++) s[i].draw();
glFinish();
SwapBuffers(hdc);
}
//---------------------------------------------------------------------------
__fastcall TForm1::TForm1(TComponent* Owner):TForm(Owner)
{
// Init of program
gl_init(Handle); // init OpenGL
n=0; compute(rec[0],rec[1],rec[2],rec[3]);
Caption=n;
}
//---------------------------------------------------------------------------
void __fastcall TForm1::FormDestroy(TObject *Sender)
{
// Exit of program
gl_exit();
}
//---------------------------------------------------------------------------
void __fastcall TForm1::FormPaint(TObject *Sender)
{
// repaint
gl_draw();
}
//---------------------------------------------------------------------------
void __fastcall TForm1::FormResize(TObject *Sender)
{
// resize
gl_resize(ClientWidth,ClientHeight);
}
//---------------------------------------------------------------------------
And preview for the actually hardcoded rectangle:
The number 8 in Caption of the window is the number of squares produced.
Beware this is just very simple startup example for this problem. I Have not tested it extensively so there is possibility once prime sizes or just unlucky aspect ratios are involved then this might result in really high number of squares... for example if GCD of the rectangle size is 1 (primes) ... In such case you should tweak the initial rectangle size (+/-1 or whatever)
The important thing in the code is just the compute() function and the global variables holding the output squares s[n]... beware the compute() does not clear the list (in order to allow recursion) so you need to set n=0 before its non recursive call.
To keep this simple I avoided to use any libs or dynamic allocations or containers for the compute itself...

How can I minimize two integers so that their product is less than a certain value?

Given two integers, how can I minimize them so that their product is less than some other value, while maintaining their relative ratio?
That's the formal problem. The practical problem is this: I have width/height pixel resolutions containing random values (anywhere from 1 to 8192 for either dimension). I want to adjust the pairs of values, such that their product doesn't exceed some total number of pixels (ex: 1000000), and I need to ensure that the aspect ratio of the adjusted resolution remains the same (ex: 1.7777).
The naive approach is to just run a loop where I subtract 1 from the width each time, adjusting the height to match the aspect ratio, until their product is under the threshold. Ex:
int wid = 1920;
int hei = 1080;
float aspect = wid / (float)hei;
int maxPixels = 1000000;
while (wid * hei > maxPixels)
{
wid -= 1;
hei = wid / aspect;
}
Surely there must be a more analytical approach to this problem though?
Edit: Misread the original question.
Another way to word your question is with a W and H what is the largest a and b such that a/b = W/H and a*b < C where C is your limit on .
To do so, find the D = gcd(W,H) or greatest common divisor of W and H. Greatest common denominator is commonly found using the Euclidean Algorithm.
Set x = W/D and y = H/D, this is minimal solution with the same ratio.
To produce the maxima under C, start with the inequality of F*x*F*y <= C where F will be our scale factor for x and y
Algebra:
F^2 <= C/(x*y)
F <= sqrt(C/(x*y))
Since we want F to be a whole number and strictly less than the above,
F = floor(sqrt(C/(x*y)))
This will give you a new solution A = x*F and B = y*F where A*B < C and A/B = W/H.
mascoj came up with the answer, but here is an interpretation in code form:
#include <utility>
#include <numeric>
#include <cmath>
#include <iostream>
// Mathsy stuff
std::pair<uint64_t, uint64_t> ReduceRatio(const uint64_t W, const uint64_t H)
{
const double D = std::gcd(W, H);
return {W/D, H/D};
}
std::pair<uint64_t, uint64_t> Maximise(const uint64_t C, const uint64_t W, const uint64_t H)
{
const auto [x, y] = ReduceRatio(W, H);
const uint64_t F = std::floor(std::sqrt(C/double(x*y)));
const uint64_t A = x*F;
const uint64_t B = y*F;
return {A, B};
}
// Test harness
void Test(const uint64_t MaxProduct, const uint64_t W, const uint64_t H)
{
const auto [NewW, NewH] = Maximise(MaxProduct, W, H);
std::cout << W << "\u00D7" << H << " (" << (W*H) << " pixels)";
if (NewW > W)
std::cout << '\n';
else
std::cout << " => " << NewW << "\u00D7" << NewH << " (" << (NewW * NewH) << " pixels)\n";
}
int main()
{
Test(100000, 1024, 768);
Test(100000, 1920, 1080);
Test(500000, 1920, 1080);
Test(1000000, 1920, 1080);
Test(2000000, 1920, 1080);
Test(4000000, 1920, 1080);
}
// g++ -std=c++17 -O2 -Wall -pedantic -pthread main.cpp && ./a.out
// 1024×768 (786432 pixels) => 364×273 (99372 pixels)
// 1920×1080 (2073600 pixels) => 416×234 (97344 pixels)
// 1920×1080 (2073600 pixels) => 928×522 (484416 pixels)
// 1920×1080 (2073600 pixels) => 1328×747 (992016 pixels)
// 1920×1080 (2073600 pixels) => 1872×1053 (1971216 pixels)
// 1920×1080 (2073600 pixels)

Binary Matrix Reduction in CUDA

I have to traverse all cells of an imaginary matrix m * n and add + 1 for all cells that meet a certain condition.
My naive solution was as follows:
#include <stdio.h>
__global__ void calculate_pi(int center, int *count) {
int x = threadIdx.x;
int y = blockIdx.x;
if (x*x + y*y <= center*center) {
*count++;
}
}
int main() {
int interactions;
printf("Enter the number of interactions: ");
scanf("%d", &interactions);
int l = sqrt(interactions);
int h_count = 0;
int *d_count;
cudaMalloc(&d_count, sizeof(int));
cudaMemcpy(&d_count, &h_count, sizeof(int), cudaMemcpyHostToDevice);
calculate_pi<<<l,l>>>(l/2, d_count);
cudaMemcpy(&h_count, d_count, sizeof(int), cudaMemcpyDeviceToHost);
cudaFree(d_count);
printf("Sum: %d\n", h_count);
return 0;
}
In my use case, the value of interactions can be very large, making it impossible to allocate l * l of space.
Can someone help me? Any suggestions are welcome.
There are at least 2 problems with your code:
Your kernel code will not work correctly with an ordinary add here:
*count++;
this is because multiple threads are trying to do this at the same time, and CUDA does not automatically sort that out for you. For the purpose of this explanation, we will fix this with an atomicAdd(), although other methods are possible.
The ampersand doesn't belong here:
cudaMemcpy(&d_count, &h_count, sizeof(int), cudaMemcpyHostToDevice);
^
I assume that is just a typo, since you did it correctly on the subsequent cudaMemcpy operation:
cudaMemcpy(&h_count, d_count, sizeof(int), cudaMemcpyDeviceToHost);
This methodology (effectively creating a square array of threads using threadIdx.x for one dimension and blockIdx.x for the other) will only work up to an interactions value that leads to an l value of 1024, or less, because CUDA threadblocks are limited to 1024 threads, and you are using l as the size of the threadblock in your kernel launch. To fix this you would want to learn how to create a CUDA 2D grid of arbitrary dimensions, and adjust your kernel launch and in-kernel indexing calculations appropriately. For now we will just make sure that the calculated l value is in range for your code design.
Here's an example addressing the above issues:
$ cat t1590.cu
#include <stdio.h>
__global__ void calculate_pi(int center, int *count) {
int x = threadIdx.x;
int y = blockIdx.x;
if (x*x + y*y <= center*center) {
atomicAdd(count, 1);
}
}
int main() {
int interactions;
printf("Enter the number of interactions: ");
scanf("%d", &interactions);
int l = sqrt(interactions);
if ((l > 1024) || (l < 1)) {printf("Error: interactions out of range\n"); return 0;}
int h_count = 0;
int *d_count;
cudaMalloc(&d_count, sizeof(int));
cudaMemcpy(d_count, &h_count, sizeof(int), cudaMemcpyHostToDevice);
calculate_pi<<<l,l>>>(l/2, d_count);
cudaMemcpy(&h_count, d_count, sizeof(int), cudaMemcpyDeviceToHost);
cudaFree(d_count);
cudaError_t err = cudaGetLastError();
if (err == cudaSuccess){
printf("Sum: %d\n", h_count);
printf("fraction satisfying test: %f\n", h_count/(float)interactions);
}
else
printf("CUDA error: %s\n", cudaGetErrorString(err));
return 0;
}
$ nvcc -o t1590 t1590.cu
$ ./t1590
Enter the number of interactions: 1048576
Sum: 206381
fraction satisfying test: 0.196820
$
We see that the code indicates a calculated fraction of about 0.2. Does this appear to be correct? I claim that it does appear to be correct based on your test. You are effectively creating a grid that represents dimensions of lxl. Your test is asking, effectively, "which points in that grid are within a circle, with the center at the origin (corner) of the grid, and radius l/2 ?"
Pictorially, that looks something like this:
and it is reasonable to assume the red shaded area is somewhat less than 0.25 of the total area, so 0.2 is a reasonable estimate of that area.
As a bonus, here is a version of the code that reduces the restriction listed in item 3 above:
#include <stdio.h>
__global__ void calculate_pi(int center, int *count) {
int x = threadIdx.x+blockDim.x*blockIdx.x;
int y = threadIdx.y+blockDim.y*blockIdx.y;
if (x*x + y*y <= center*center) {
atomicAdd(count, 1);
}
}
int main() {
int interactions;
printf("Enter the number of interactions: ");
scanf("%d", &interactions);
int l = sqrt(interactions);
int h_count = 0;
int *d_count;
const int bs = 32;
dim3 threads(bs, bs);
dim3 blocks((l+threads.x-1)/threads.x, (l+threads.y-1)/threads.y);
cudaMalloc(&d_count, sizeof(int));
cudaMemcpy(d_count, &h_count, sizeof(int), cudaMemcpyHostToDevice);
calculate_pi<<<blocks,threads>>>(l/2, d_count);
cudaMemcpy(&h_count, d_count, sizeof(int), cudaMemcpyDeviceToHost);
cudaFree(d_count);
cudaError_t err = cudaGetLastError();
if (err == cudaSuccess){
printf("Sum: %d\n", h_count);
printf("fraction satisfying test: %f\n", h_count/(float)interactions);
}
else
printf("CUDA error: %s\n", cudaGetErrorString(err));
return 0;
}
This is launching a 2D grid based on l, and should work up to at least 1 billion interactions .

Estimating an Affine Transform between Two Images

I have a sample image:
I apply the affine transform with the following warp matrix:
[[ 1.25 0. -128 ]
[ 0. 2. -192 ]]
and crop a 128x128 part from the result to get an output image:
Now, I want to estimate the warp matrix and crop size/location from just comparing the sample and output image. I detect feature points using SURF, and match them by brute force:
There are many matches, of which I'm keeping the best three (by distance), since that is the number required to estimate the affine transform. I then use those 3 keypoints to estimate the affine transform using getAffineTransform. However, the transform it returns is completely wrong:
-0.00 1.87 -6959230028596648489132997794229911552.00
0.00 -1.76 -0.00
What am I doing wrong? Source code is below.
Perform affine transform (Python):
"""Apply an affine transform to an image."""
import cv
import sys
import numpy as np
if len(sys.argv) != 10:
print "usage: %s in.png out.png x1 y1 width height sx sy flip" % __file__
sys.exit(-1)
source = cv.LoadImage(sys.argv[1])
x1, y1, width, height, sx, sy, flip = map(float, sys.argv[3:])
X, Y = cv.GetSize(source)
Xn, Yn = int(sx*(X-1)), int(sy*(Y-1))
if flip:
arr = np.array([[-sx, 0, sx*(X-1)-x1], [0, sy, -y1]])
else:
arr = np.array([[sx, 0, -x1], [0, sy, -y1]])
print arr
warp = cv.fromarray(arr)
cv.ShowImage("source", source)
dest = cv.CreateImage((Xn, Yn), source.depth, source.nChannels)
cv.WarpAffine(source, dest, warp)
cv.SetImageROI(dest, (0, 0, int(width), int(height)))
cv.ShowImage("dest", dest)
cv.SaveImage(sys.argv[2], dest)
cv.WaitKey(0)
Estimate affine transform from two images (C++):
#include <stdio.h>
#include <iostream>
#include <opencv2/core/core.hpp>
#include <opencv2/features2d/features2d.hpp>
#include <opencv2/calib3d/calib3d.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/nonfree/nonfree.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#include <algorithm>
using namespace cv;
void readme();
bool cmpfun(DMatch a, DMatch b) { return a.distance < b.distance; }
/** #function main */
int main( int argc, char** argv )
{
if( argc != 3 )
{
return -1;
}
Mat img_1 = imread( argv[1], CV_LOAD_IMAGE_GRAYSCALE );
Mat img_2 = imread( argv[2], CV_LOAD_IMAGE_GRAYSCALE );
if( !img_1.data || !img_2.data )
{
return -1;
}
//-- Step 1: Detect the keypoints using SURF Detector
int minHessian = 400;
SurfFeatureDetector detector( minHessian );
std::vector<KeyPoint> keypoints_1, keypoints_2;
detector.detect( img_1, keypoints_1 );
detector.detect( img_2, keypoints_2 );
//-- Step 2: Calculate descriptors (feature vectors)
SurfDescriptorExtractor extractor;
Mat descriptors_1, descriptors_2;
extractor.compute( img_1, keypoints_1, descriptors_1 );
extractor.compute( img_2, keypoints_2, descriptors_2 );
//-- Step 3: Matching descriptor vectors with a brute force matcher
BFMatcher matcher(NORM_L2, false);
std::vector< DMatch > matches;
matcher.match( descriptors_1, descriptors_2, matches );
double max_dist = 0;
double min_dist = 100;
//-- Quick calculation of max and min distances between keypoints
for( int i = 0; i < descriptors_1.rows; i++ )
{ double dist = matches[i].distance;
if( dist < min_dist ) min_dist = dist;
if( dist > max_dist ) max_dist = dist;
}
printf("-- Max dist : %f \n", max_dist );
printf("-- Min dist : %f \n", min_dist );
//-- Draw only "good" matches (i.e. whose distance is less than 2*min_dist )
//-- PS.- radiusMatch can also be used here.
sort(matches.begin(), matches.end(), cmpfun);
std::vector< DMatch > good_matches;
vector<Point2f> match1, match2;
for (int i = 0; i < 3; ++i)
{
good_matches.push_back( matches[i]);
Point2f pt1 = keypoints_1[matches[i].queryIdx].pt;
Point2f pt2 = keypoints_2[matches[i].trainIdx].pt;
match1.push_back(pt1);
match2.push_back(pt2);
printf("%3d pt1: (%.2f, %.2f) pt2: (%.2f, %.2f)\n", i, pt1.x, pt1.y, pt2.x, pt2.y);
}
//-- Draw matches
Mat img_matches;
drawMatches( img_1, keypoints_1, img_2, keypoints_2, good_matches, img_matches,
Scalar::all(-1), Scalar::all(-1), vector<char>(), DrawMatchesFlags::NOT_DRAW_SINGLE_POINTS);
//-- Show detected matches
imshow("Matches", img_matches );
imwrite("matches.png", img_matches);
waitKey(0);
Mat fun = getAffineTransform(match1, match2);
for (int i = 0; i < fun.rows; ++i)
{
for (int j = 0; j < fun.cols; j++)
{
printf("%.2f ", fun.at<float>(i,j));
}
printf("\n");
}
return 0;
}
/** #function readme */
void readme()
{
std::cout << " Usage: ./SURF_descriptor <img1> <img2>" << std::endl;
}
The cv::Mat getAffineTransform returns is made of doubles, not of floats. The matrix you get probably is fine, you just have to change the printf command in your loops to
printf("%.2f ", fun.at<double>(i,j));
or even easier: Replace this manual output with
std::cout << fun << std::endl;
It's shorter and you don't have to care about data types yourself.

CUDA Thrust and sort_by_key

I’m looking for a sorting algorithm on CUDA that can sort an array A of elements (double) and returns an array of keys B for that array A.
I know the sort_by_key function in the Thrust library but I want my array of elements A to remain unchanged.
What can I do?
My code is:
void sortCUDA(double V[], int P[], int N) {
real_t *Vcpy = (double*) malloc(N*sizeof(double));
memcpy(Vcpy,V,N*sizeof(double));
thrust::sort_by_key(V, V + N, P);
free(Vcpy);
}
i'm comparing the thrust algorithm against others that i have on sequencial cpu
N mergesort sortCUDA
113 0.000008 0.000010
226 0.000018 0.000016
452 0.000036 0.000020
905 0.000061 0.000034
1810 0.000135 0.000071
3621 0.000297 0.000156
7242 0.000917 0.000338
14484 0.001421 0.000853
28968 0.003069 0.001931
57937 0.006666 0.003939
115874 0.014435 0.008025
231749 0.031059 0.016718
463499 0.067407 0.039848
926999 0.148170 0.118003
1853998 0.329005 0.260837
3707996 0.731768 0.544357
7415992 1.638445 1.073755
14831984 3.668039 2.150179
115035495 39.276560 19.812200
230070990 87.750377 39.762915
460141980 200.940501 74.605219
Thrust performance is not bad, but I think if I use OMP can probably get easily a better CPU time
I think this is because to memcpy
SOLUTION:
void thrustSort(double V[], int P[], int N)
{
thrust::device_vector<int> d_P(N);
thrust::device_vector<double> d_V(V, V + N);
thrust::sequence(d_P.begin(), d_P.end());
thrust::sort_by_key(d_V.begin(), d_V.end(), d_P.begin());
thrust::copy(d_P.begin(),d_P.end(),P);
}
where V is a my double values to sort
You can modify comparison operator to sort keys instead of values. #Robert Crovella correctly pointed that a raw device pointer cannot be assigned from the host. The modified algorithm is below:
struct cmp : public binary_function<int,int,bool>
{
cmp(const double *ptr) : rawA(ptr) { }
__host__ __device__ bool operator()(const int i, const int j) const
{return rawA[i] > rawA[j];}
const double *rawA; // an array in global mem
};
void sortkeys(double *A, int n) {
// move data to the gpu
thrust::device_vector<double> devA(A, A + n);
double *rawA = thrust::raw_pointer_cast(devA.data());
thrust::device_vector<int> B(n);
// initialize keys
thrust::sequence(B.begin(), B.end());
thrust::sort(B.begin(), B.end(), cmp(rawA));
// B now contains the sorted keys
}
And here is alternative with arrayfire. Though I am not sure which one is more efficient since arrayfire solution uses two additional arrays:
void sortkeys(double *A, int n) {
af::array devA(n, A, af::afHost);
af::array vals, indices;
// sort and populate vals/indices arrays
af::sort(vals, indices, devA);
std::cout << devA << "\n" << indices << "\n";
}
How large is this array? The most efficient way, in terms of speed, will likely be to just duplicate the original array before sorting, if the memory is available.
Building on the answer provided by #asm (I wasn't able to get it working), this code seemed to work for me, and does sort only the keys. However, I believe it is limited to the case where the keys are in sequence 0, 1, 2, 3, 4 ... corresponding to the (double) values. Since this is a "index-value" sort, it could be extended to the case of an arbitrary sequence of keys, perhaps by doing an indexed copy. However I'm not sure the process of generating the index sequence and then rearranging the original keys will be any faster than just copying the original value data to a new vector (for the case of arbitrary keys).
#include <iostream>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/sort.h>
using namespace std;
__device__ double *rawA; // an array in global mem
struct cmp : public binary_function<int, int, bool>
{
__host__ __device__ bool operator()(const int i, const int j) const
{return ( rawA[i] < rawA[j]);}
};
void sortkeys(double *A, int n) {
// move data to the gpu
thrust::device_vector<double> devA(A, A + n);
// rawA = thrust::raw_pointer_cast(&(devA[0]));
double *test = raw_pointer_cast(devA.data());
cudaMemcpyToSymbol(rawA, &test, sizeof(double *));
thrust::device_vector<int> B(n);
// initialize keys
thrust::sequence(B.begin(), B.end());
thrust::sort(B.begin(), B.end(), cmp());
// B now contains the sorted keys
thrust::host_vector<int> hostB = B;
for (int i=0; i<hostB.size(); i++)
std::cout << hostB[i] << " ";
std::cout<<std::endl;
for (int i=0; i<hostB.size(); i++)
std::cout << A[hostB[i]] << " ";
std::cout<<std::endl;
}
int main(){
double C[] = {0.7, 0.3, 0.4, 0.2, 0.6, 1.2, -0.5, 0.5, 0.0, 10.0};
sortkeys(C, 9);
std::cout << std::endl;
return 0;
}

Resources