c code is running to slow from nested for loops - performance

my c program is running to slow (right now it is around 40 seconds without parallelization). I have tried using openmp which has brought the timing down significantly but I am looking to use simple and natural ways to make my code run faster other than using parallel for loops. The basic structure of the code is that is takes some command line arguments as inputs and then saves those inputs as variables. Then it recursively computes a variable called Rplus1 using the math.h library and the complex.h library. The problem of the code and where it is taking most of it's time is at the bottom where there are nested for loops. My goal is to get the whole code running in under 5 seconds but as of now it runs in about 40 seconds without using parallel for loops. Please Help!
#include "time.h"
#include "stdio.h"
#include "stdlib.h"
#include "complex.h"
#include "math.h"
#include "string.h"
#include "unistd.h"
#include "omp.h"
#define PI 3.14159265
int main (int argc, char *argv[]){
if(argc >= 8){
double start1 = omp_get_wtime();
// command line arguments are aligned in the following order: [theta] [number of layers in superlattice] [material_1] [lat const_1] [number of unit cells_1] [material_2] [lat const_2] [number of unit cells_2] .... [material_N] [lat const_N] [number of unit cells_N] [Log/Linear] [number of repeating superlattice layers] [yes/no]
int N;
sscanf(argv[2],"%d",&N); // Number of layers in superlattice specified by second input argument
if(strcmp(argv[argc-1],"yes") == 0) //If the substrate is included then add one more layer to the N variable
{
N = N+1;
}
int total;
sscanf(argv[argc-2],"%d",&total); // Number of repeating superlattice layers specified by second to last argument
double layers[N][6], horizangle[1001], vertangle[1001];
double complex (*F_hkl)[1001][1001] = malloc(N*1001*1001*sizeof(complex double)), (*F_0)[1001][1001] = malloc(N*1001*1001*sizeof(complex double)), (*g)[1001][1001] = malloc(N*1001*1001*sizeof(complex double)), (*g_0)[1001][1001] = malloc(N*1001*1001*sizeof(complex double)),SF_table[10];// this array will hold the unit cell structure factors for all of the materials selected for each wavevector in the beam spectrum
double real, real2, lam, c_light = 299792458, h_pl = 4.135667516e-15,E = 10e3, r_0 = 2.818e-15, Lccd = 1.013;// just a few variables to hold values through calculations and constants, speed of light, plancks const, photon energy, and detector distance from sample
double angle;
double complex z;// just a variable to hold complex numbers throughout calculations
int i,j,m,n,t; // integers to index through arrays
lam = (h_pl*c_light)/E;
sscanf(argv[1],"%lf",&angle); //first argument is the angle of incidence, read it
angle = angle*(PI/180.0);
angle2 = -angle;
double (*table)[10] = malloc(10*9*sizeof(double)); // this array holds all the coefficients to calculate the atomic scattering factor below
double (*table2)[10] = malloc(10*2*sizeof(double));
FILE*datfile1 = fopen("/home/vhosts/xraydev.engr.wisc.edu/data/coef_table.bin","rb"); // read the binary file containg all the coefficients
fread(table,sizeof(double),90,datfile1);
fclose(datfile1);
FILE*datfile2 = fopen("/home/vhosts/xraydev.engr.wisc.edu/data/dispersioncs.bin","rb");
fread(table2,sizeof(double),20,datfile2);
fclose(datfile2);
// Calculate scattering factors for all elements
double a,b;
double k_z = (sin(angle)/lam)*1e-10; // incorporate angular dependence of SF but neglect 0.24 degree divergence because of approximation
for(i = 0;i<10;i++) // for each element...
{
SF_table[i] = 0;
for(j = 0;j<4;j++) // summation
{
a = table[2*j][i];
b = table[2*j+1][i];
SF_table[i] = SF_table[i] + a * exp(-b*k_z*k_z);
}
SF_table[i] = SF_table[i] + table[8][i] + table2[0][i] + table2[1][i]*I;
}
free(table);
double mm = 4.0, (*phi)[1001][1001] = malloc(N*1001*1001*sizeof(double));
for(i = 1; i < N+1; i++) // for each layer of material...
{
sscanf(argv[i*3+1],"%lf",&layers[i-1][1]); // get out of plane lattice constant
sscanf(argv[i*3+2],"%lf",&layers[i-1][2]); // get the number of unit cells in the layer
layers[i-1][1] = layers[i-1][1]*1e-10; // convert lat const input to meters
// Define reciprocal space positions at the incident angle h, k, l
layers[i-1][3] = 0; // h
layers[i-1][4] = 0; // k
double l; // l calculated for each wavevector in the spectrum because l changes with angle of incidence
for (m = 0; m < 1001; m++)
{
for (n = 0; n <1001; n++)
{
l = 4;
phi[i-1][m][n] = 2*PI*layers[i-1][1]*sin(angle)/lam; // Caculate phi for each layer
if(strcmp(argv[i*3],"GaAs") == 0)
{
F_hkl[i-1][m][n] = (2+2*cexp(I*PI*l))*(SF_table[2]+SF_table[3]*cexp(I*PI*l/2));
F_0[i-1][m][n] = 0.5*8.0*(31 + table2[0][2] + table2[1][2]*I) + 0.5*8.0*(33 + table2[0][3] + table2[1][3]*I);
g[i-1][m][n] = 2*r_0*F_hkl[i-1][m][n]/mm/layers[i-1][1]*cos(2*angle[m][n]);
g_0[i-1][m][n] = 2*r_0*F_0[i-1][m][n]/mm/layers[i-1][1];
}
if(strcmp(argv[i*3],"AlGaAs") == 0)
{
F_hkl[i-1][m][n] = (2+2*cexp(I*PI*l))*((0.76*SF_table[2]+ 0.24*SF_table[4])+SF_table[3]*cexp(I*PI*l/2));
F_0[i-1][m][n] = 0.24*4.0*(13 + table2[0][4] + table2[1][4]*I) + 0.76*4.0*(31 + table2[0][2] + table2[1][2]*I) + 4.0*(33 + table2[0][3] + table2[1][3]*I);
g[i-1][m][n] = 2*r_0*F_hkl[i-1][m][n]/mm/layers[i-1][1]*cos(2*angle[m][n]);
g_0[i-1][m][n] = 2*r_0*F_0[i-1][m][n]/mm/layers[i-1][1];
}
}
}
}
double complex (*Rplus1)[1001] = malloc(1001*1001*sizeof(double complex));
for (m = 0; m < 1001; m++)
{
for (n = 0; n <1001; n++)
{
Rplus1[m][n] = 0.0;
}
}
double stop1 = omp_get_wtime();
for(i=1;i<N;i++) // For each layer of the film
{
for(j=0;j<layers[i][2];j++) // For each unit cell
{
for (m = 0; m < 1001; m++) // For each row of the diffraction pattern
{
for (n = 0; n <1001; n++) // For each column of the diffraction pattern
{
Rplus1[m][n] = -I*g[i][m][n] + ((1-I*g_0[i][m][n])*(1-I*g_0[i][m][n]))/(I*g[i][m][n] + (cos(-2*phi[i][m][n])+I*sin(-2*phi[i][m][n]))/Rplus1[m][n]);
}
}
}
}
double stop2 = omp_get_wtime();
double elapsed1 = (double)(stop1 - start1);// Second user defined function to use Durbin and Follis recursive formula
double elapsed2 = (double)(stop2 - start1);// Second user defined function to use Durbin and Follis recursive formula
printf("main() through before diffraction function took %f seconds to run\n\n",elapsed1);
printf("main() through after diffraction function took %f seconds to run\n\n",elapsed2);
}
}

Related

Extracting patterns from time-series

I have a time-series, which essentially amounts to some instrument recording the current time whenever it makes a "detection". The sampling rate is therefore not in constant time, however we can treat it as such by "re-sampling", relying on the fact that the detections are made reliably and we can simply insert 0's to "fill in" the gaps. This will be important later...
The instrument should detect the "signals" sent by another, nearby instrument. This second instrument emits a signal at some unknown period, T (e.g. 1 signal per second), with a "jitter" likely on the order of a few tenths of a percent of the period.
My goal is to determine this period (or frequency, if you like) using only the timestamps recorded by the "detecting" instrument. Unfortunately, however, the detector is flooded with noise, and a significant amount (I estimate 97-98%) of "detections" (and therefore "points" in the time-series) are due to noise. Therefore, extracting the period will require more careful analysis.
My first thought was to simply feed the time series into an FFT algorithm (I'm using FFTW/DHT), however this wasn't particularly enlightening. I've also tried my own (admittedly rather crude) algorithm, which simply computed a cross-correlation of the series with "clean" series of increasing period. I didn't get very far with this, either, and there are quite a handful of details to consider (phase, etc).
It occurs to me that something like this must've been done before, and surely there's a "nice" way to accomplish it.
Here's my approach. Given a period, we can score it using a dynamic program to find the subsequence of detection times that includes the first and last detection and maximizes the sum of gap log-likelihoods, where the gap log-likelihood is defined as minus the square of the difference of the gap and the period (Gaussian jitter model).
If we have approximately the right period, then we can get a very good gap sequence (some weirdness at the beginning and end and wherever there is a missed detection, but this is OK).
If we have the wrong period, then we end up with basically exponential jitter, which has low log-likelihood.
The C++ below generates fake detection times with a planted period and then searches over periods. Scores are normalized by a (bad) estimate of the score for Poisson noise, so wrong periods score about 0.4. See the plot below.
#include <algorithm>
#include <cmath>
#include <iostream>
#include <limits>
#include <random>
#include <vector>
namespace {
static constexpr double kFalseNegativeRate = 0.01;
static constexpr double kCoefficientOfVariation = 0.003;
static constexpr int kSignals = 6000;
static constexpr int kNoiseToSignalRatio = 50;
template <class URNG>
std::vector<double> FakeTimes(URNG &g, const double period) {
std::vector<double> times;
std::bernoulli_distribution false_negative(kFalseNegativeRate);
std::uniform_real_distribution<double> phase(0, period);
double signal = phase(g);
std::normal_distribution<double> interval(period,
kCoefficientOfVariation * period);
std::uniform_real_distribution<double> noise(0, kSignals * period);
for (int i = 0; i < kSignals; i++) {
if (!false_negative(g)) {
times.push_back(signal);
}
signal += interval(g);
for (double j = 0; j < kNoiseToSignalRatio; j++) {
times.push_back(noise(g));
}
}
std::sort(times.begin(), times.end());
return times;
}
constexpr double Square(const double x) { return x * x; }
struct Subsequence {
double score;
int previous;
};
struct Result {
double score = std::numeric_limits<double>::quiet_NaN();
double median_interval = std::numeric_limits<double>::quiet_NaN();
};
Result Score(const std::vector<double> &times, const double period) {
if (times.empty() || !std::is_sorted(times.begin(), times.end())) {
return {};
}
std::vector<Subsequence> bests;
bests.reserve(times.size());
bests.push_back({0, -1});
for (int i = 1; i < times.size(); i++) {
Subsequence best = {std::numeric_limits<double>::infinity(), -1};
for (int j = i - 1; j > -1; j--) {
const double difference = times[i] - times[j];
const double penalty = Square(difference - period);
if (difference >= period && penalty >= best.score) {
break;
}
const Subsequence candidate = {bests[j].score + penalty, j};
if (candidate.score < best.score) {
best = candidate;
}
}
bests.push_back(best);
}
std::vector<double> intervals;
int i = bests.size() - 1;
while (true) {
int previous_i = bests[i].previous;
if (previous_i < 0) {
break;
}
intervals.push_back(times[i] - times[previous_i]);
i = previous_i;
}
if (intervals.empty()) {
return {};
}
const double duration = times.back() - times.front();
// The rate is doubled because we can look for a time in either direction.
const double rate = 2 * (times.size() - 1) / duration;
// Mean of the square of an exponential distribution with the given rate.
const double mean_square = 2 / Square(rate);
const double score = bests.back().score / (intervals.size() * mean_square);
const auto median_interval = intervals.begin() + intervals.size() / 2;
std::nth_element(intervals.begin(), median_interval, intervals.end());
return {score, *median_interval};
}
} // namespace
int main() {
std::default_random_engine g;
const auto times = FakeTimes(g, std::sqrt(2));
for (int i = 0; i < 2000; i++) {
const double period = std::pow(1.001, i) / 3;
const Result result = Score(times, period);
std::cout << period << ' ' << result.score << ' ' << result.median_interval
<< std::endl;
}
}

Cuckoo Search with Levy Flight in Java

I'm trying to learn concept about cuckoo search algorithm. Honestly, i'm learning cuckoo search using java source code. we know that cuckoo search using levy distribution for random walk. I have java source code impelements cuckoo search optimation , but i'm doubt that source code using levy distribution. Can anyone help me to inspect wheteher my source code using levy distribution or not?.
this is method that implements random walk..
public CSSolution randomWalk (OptimizationProblem prob, String distribution) {
int n = prob.getNumVar(); //2
// creates a neighborhood of size 1 times the scaling factor
double distanceSquared = Math.pow(rand.nextDouble() *
prob.getScalingFactor(),2);
System.out.println("distance Squared : "+distanceSquared);
// creates an ArrayList from 0 to n-1 (for indexing purposes only)
ArrayList<Integer> varIndices = new ArrayList<Integer>(n);
for (int i = 0; i < n; i++) {
varIndices.add(i, i);
}
ArrayList<Double> vars = this.getVars();
CSSolution newSol = new CSSolution(this.numVars);
newSol.initializeWithNull();
ArrayList<Double> newVars = newSol.getVars();
for (int i = 0; i < n; i++) {
/* Chooses a random variable index from the indices
* of the remaining/unwalked variables. */
int index = rand.nextInt(varIndices.size());
// Finds the variable value that this index corresponds to.
int varIndex = varIndices.get(index);
// System.out.println("varIndicesSize:"+varIndices.size()+" index: "+index+" varIndex: "+varIndex);
double curVar = vars.get(varIndex);
// use correct distribution to generate random double [0,1)
double r;
if (distribution == "weibull"){
r = weibull.random(1.5, 1, new uniform());
}
else if(distribution == "levy"){
r = 0.0;
}
else
r = rand.nextDouble();
// alters this variable coefficient by adding a random step between (-distance,distance)
double distance = Math.sqrt(distanceSquared);
System.out.println("distance : "+distance+" distance Squared : "+distanceSquared);
double varStep = r*distance*2-distance;
double newVar = curVar + varStep;
// System.out.println("x"+varIndex+" : "+curVar+" to "+newVar);
newVars.set(varIndex, newVar);
// removes the variable that has already been visited
varIndices.remove(index);
// updates distance for next for loop
distanceSquared -= Math.pow(varStep, 2);
}
// System.out.println("");
return newSol;
}
source code : https://github.com/cloudrave/Optimizer

Mandelbrot optimization in openmp

Well i have to paralellisize the mandelbrot program in C. I think i have done it well and i cant get better times. My question if someone has an idea to improve the code, ive been thinking perhaps in nested parallel regions between the outer and insider for...
Also i have doubts if its more elegant or recommended to put all the pragmas in a single line or to write separate pragmas ( one for omp parallel and shared and private variables and a conditional, and another pragma with omp for and schedule dynamic).
Ive the doubt if constants can be used as private variables because i think is cleaner to have constants instead of defined variables.
Also i have written a conditional ( if numcpu >1) it has no sense to use parallel region and make a normal sequential execution.
Finally as i have read the dynamic chunk it depends on hardware and your system configuration... so i have left it as a constant, so it can be easily changed.
Also i adapt the number of threads to the number of processors available..
int main(int argc, char *argv[])
{
omp_set_dynamic(1);
int xactual, yactual;
//each iteration, it calculates: newz = oldz*oldz + p, where p is the current pixel, and oldz stars at the origin
double pr, pi; //real and imaginary part of the pixel p
double newRe, newIm, oldRe, oldIm; //real and imaginary parts of new and old z
double zoom = 1, moveX = -0.5, moveY = 0; //you can change these to zoom and change position
pixel_t *pixels = malloc(sizeof(pixel_t)*IMAGEHEIGHT*IMAGEWIDTH);
clock_t begin, end;
double time_spent;
begin=clock();
int numcpu;
numcpu = omp_get_num_procs();
//FILE * fp;
printf("El nĂºmero de procesadores que utilizaremos es: %d", numcpu);
omp_set_num_threads(numcpu);
#pragma omp parallel shared(pixels, moveX, moveY, zoom) private(xactual, yactual, pr, pi, newRe, newIm) (if numcpu>1)
{
//int xactual=0;
// int yactual=0;
#pragma omp for schedule(dynamic, CHUNK)
//loop through every pixel
for(yactual = 0; yactual < IMAGEHEIGHT; yactual++)
for(xactual = 0; xactual < IMAGEWIDTH; xactual++)
{
//calculate the initial real and imaginary part of z, based on the pixel location and zoom and position values
pr = 1.5 * (xactual - IMAGEWIDTH / 2) / (0.5 * zoom * IMAGEWIDTH) + moveX;
pi = (yactual - IMAGEHEIGHT / 2) / (0.5 * zoom * IMAGEHEIGHT) + moveY;
newRe = newIm = oldRe = oldIm = 0; //these should start at 0,0
//"i" will represent the number of iterations
int i;
//start the iteration process
for(i = 0; i < ITERATIONS; i++)
{
//remember value of previous iteration
oldRe = newRe;
oldIm = newIm;
//the actual iteration, the real and imaginary part are calculated
newRe = oldRe * oldRe - oldIm * oldIm + pr;
newIm = 2 * oldRe * oldIm + pi;
//if the point is outside the circle with radius 2: stop
if((newRe * newRe + newIm * newIm) > 4) break;
}
// color(i % 256, 255, 255 * (i < maxIterations));
if(i == ITERATIONS)
{
//color(0, 0, 0); // black
pixels[yactual*IMAGEWIDTH+xactual][0] = 0;
pixels[yactual*IMAGEWIDTH+xactual][1] = 0;
pixels[yactual*IMAGEWIDTH+xactual][2] = 0;
}
else
{
double z = sqrt(newRe * newRe + newIm * newIm);
int brightness = 256 * log2(1.75 + i - log2(log2(z))) / log2((double)ITERATIONS);
//color(brightness, brightness, 255)
pixels[yactual*IMAGEWIDTH+xactual][0] = brightness;
pixels[yactual*IMAGEWIDTH+xactual][1] = brightness;
pixels[yactual*IMAGEWIDTH+xactual][2] = 255;
}
}
} //end of parallel region
end= clock();
time_spent = (double)(end - begin) / CLOCKS_PER_SEC;
fprintf(stderr, "Elapsed time: %.2lf seconds.\n", time_spent);
You could extend the implementation to leverage SIMD extensions. As far as I know the latest OpenMP standard includes vector constructs. Check out this article that describes the new capabilities.
This whitepaper explains how SSE3 can be used when calculating the Mandelbrot set.

improper mandelbrot set output plotting

i am trying to write a code to display Mandelbrot set for the numbers between
(-3,-3) to (2,2) on my terminal.
The main function generates & feeds a complex number to analyze function.
The analyze function returns character "*" for the complex number Z within the set and "." for the numbers which lie outside the set.
The code:
#define MAX_A 2 // upperbound on real
#define MAX_B 2 // upper bound on imaginary
#define MIN_A -3 // lowerbnd on real
#define MIN_B -3 // lower bound on imaginary
#define NX 300 // no. of points along x
#define NY 200 // no. of points along y
#define max_its 50
int analyze(double real,double imag);
void main()
{
double a,b;
int x,x_arr,y,y_arr;
int array[NX][NY];
int res;
for(y=NY-1,x_arr=0;y>=0;y--,x_arr++)
{
for(x=0,y_arr++;x<=NX-1;x++,y_arr++)
{
a= MIN_A+ ( x/( (double)NX-1)*(MAX_A-MIN_A) );
b= MIN_B+ ( y/( (double)NY-1 )*(MAX_B-MIN_B) );
//printf("%f+i%f ",a,b);
res=analyze(a,b);
if(res>49)
array[x][y]=42;
else
array[x][y]=46;
}
// printf("\n");
}
for(y=0;y<NY;y++)
{
for(x=0;x<NX;x++)
printf("%2c",array[x][y]);
printf("\n");
}
}
The analyze function accepts the coordinate on imaginary plane ;
and computes (Z^2)+Z 50 times ; and while computing if the complex number explodes, then function returns immidiately else the function returns after finishing 50 iterations;
int analyze(double real,double imag)
{
int iter=0;
double r=4.0;
while(iter<50)
{
if ( r < ( (real*real) + (imag*imag) ) )
{
return iter;
}
real= ( (real*real) - (imag*imag) + real);
imag= ( (2*real*imag)+ imag);
iter++;
}
return iter;
}
So, i am analyzing 60000 (NX * NY) numbers & displaying it on the terminal
considering 3:2 ratio (300,200) , i even tried 4:3 (NX:NY) , but the output remains same and the generated shape is not even close to the mandlebrot set :
hence, the output appears inverted ,
i browsed & came across lines like:
(x - 400) / ZOOM;
(y - 300) / ZOOM;
on many mandelbrot codes , but i am unable to understand how this line may rectify my output.
i guess i am having trouble in mapping output to the terminal!
(LB_Real,UB_Imag) --- (UB_Real,UB_Imag)
| |
(LB_Real,LB_Imag) --- (UB_Real,LB_Imag)
Any Hint/help will be very useful
The Mandelbrot recurrence is zn+1 = zn2 + c.
Here's your implementation:
real= ( (real*real) - (imag*imag) + real);
imag= ( (2*real*imag)+ imag);
Problem 1. You're updating real to its next value before you've used the old value to compute the new imag.
Problem 2. Assuming you fix problem 1, you're computing zn+1 = zn2 + zn.
Here's how I'd do it using double:
int analyze(double cr, double ci) {
double zr = 0, zi = 0;
int r;
for (r = 0; (r < 50) && (zr*zr + zi*zi < 4.0); ++r) {
double zr1 = zr*zr - zi*zi + cr;
double zi1 = 2 * zr * zi + ci;
zr = zr1;
zi = zi1;
}
return r;
}
But it's easier to understand if you use the standard C99 support for complex numbers:
#include <complex.h>
int analyze(double cr, double ci) {
double complex c = cr + ci * I;
double complex z = 0;
int r;
for (r = 0; (r < 50) && (cabs(z) < 2); ++r) {
z = z * z + c;
}
return r;
}

Find local maxima in grayscale image using OpenCV

Does anybody know how to find the local maxima in a grayscale IPL_DEPTH_8U image using OpenCV? HarrisCorner mentions something like that but I'm actually not interested in corners ...
Thanks!
A pixel is considered a local maximum if it is equal to the maximum value in a 'local' neighborhood. The function below captures this property in two lines of code.
To deal with pixels on 'plateaus' (value equal to their neighborhood) one can use the local minimum property, since plateaus pixels are equal to their local minimum. The rest of the code filters out those pixels.
void non_maxima_suppression(const cv::Mat& image, cv::Mat& mask, bool remove_plateaus) {
// find pixels that are equal to the local neighborhood not maximum (including 'plateaus')
cv::dilate(image, mask, cv::Mat());
cv::compare(image, mask, mask, cv::CMP_GE);
// optionally filter out pixels that are equal to the local minimum ('plateaus')
if (remove_plateaus) {
cv::Mat non_plateau_mask;
cv::erode(image, non_plateau_mask, cv::Mat());
cv::compare(image, non_plateau_mask, non_plateau_mask, cv::CMP_GT);
cv::bitwise_and(mask, non_plateau_mask, mask);
}
}
Here's a simple trick. The idea is to dilate with a kernel that contains a hole in the center. After the dilate operation, each pixel is replaced with the maximum of it's neighbors (using a 5 by 5 neighborhood in this example), excluding the original pixel.
Mat1b kernelLM(Size(5, 5), 1u);
kernelLM.at<uchar>(2, 2) = 0u;
Mat imageLM;
dilate(image, imageLM, kernelLM);
Mat1b localMaxima = (image > imageLM);
Actually after I posted the code above I wrote a better and very very faster one ..
The code above suffers even for a 640x480 picture..
I optimized it and now it is very very fast even for 1600x1200 pic.
Here is the code :
void localMaxima(cv::Mat src,cv::Mat &dst,int squareSize)
{
if (squareSize==0)
{
dst = src.clone();
return;
}
Mat m0;
dst = src.clone();
Point maxLoc(0,0);
//1.Be sure to have at least 3x3 for at least looking at 1 pixel close neighbours
// Also the window must be <odd>x<odd>
SANITYCHECK(squareSize,3,1);
int sqrCenter = (squareSize-1)/2;
//2.Create the localWindow mask to get things done faster
// When we find a local maxima we will multiply the subwindow with this MASK
// So that we will not search for those 0 values again and again
Mat localWindowMask = Mat::zeros(Size(squareSize,squareSize),CV_8U);//boolean
localWindowMask.at<unsigned char>(sqrCenter,sqrCenter)=1;
//3.Find the threshold value to threshold the image
//this function here returns the peak of histogram of picture
//the picture is a thresholded picture it will have a lot of zero values in it
//so that the second boolean variable says :
// (boolean) ? "return peak even if it is at 0" : "return peak discarding 0"
int thrshld = maxUsedValInHistogramData(dst,false);
threshold(dst,m0,thrshld,1,THRESH_BINARY);
//4.Now delete all thresholded values from picture
dst = dst.mul(m0);
//put the src in the middle of the big array
for (int row=sqrCenter;row<dst.size().height-sqrCenter;row++)
for (int col=sqrCenter;col<dst.size().width-sqrCenter;col++)
{
//1.if the value is zero it can not be a local maxima
if (dst.at<unsigned char>(row,col)==0)
continue;
//2.the value at (row,col) is not 0 so it can be a local maxima point
m0 = dst.colRange(col-sqrCenter,col+sqrCenter+1).rowRange(row-sqrCenter,row+sqrCenter+1);
minMaxLoc(m0,NULL,NULL,NULL,&maxLoc);
//if the maximum location of this subWindow is at center
//it means we found the local maxima
//so we should delete the surrounding values which lies in the subWindow area
//hence we will not try to find if a point is at localMaxima when already found a neighbour was
if ((maxLoc.x==sqrCenter)&&(maxLoc.y==sqrCenter))
{
m0 = m0.mul(localWindowMask);
//we can skip the values that we already made 0 by the above function
col+=sqrCenter;
}
}
}
The following listing is a function similar to Matlab's "imregionalmax". It looks for at most nLocMax local maxima above threshold, where the found local maxima are at least minDistBtwLocMax pixels apart. It returns the actual number of local maxima found. Notice that it uses OpenCV's minMaxLoc to find global maxima. It is "opencv-self-contained" except for the (easy to implement) function vdist, which computes the (euclidian) distance between points (r,c) and (row,col).
input is one-channel CV_32F matrix, and locations is nLocMax (rows) by 2 (columns) CV_32S matrix.
int imregionalmax(Mat input, int nLocMax, float threshold, float minDistBtwLocMax, Mat locations)
{
Mat scratch = input.clone();
int nFoundLocMax = 0;
for (int i = 0; i < nLocMax; i++) {
Point location;
double maxVal;
minMaxLoc(scratch, NULL, &maxVal, NULL, &location);
if (maxVal > threshold) {
nFoundLocMax += 1;
int row = location.y;
int col = location.x;
locations.at<int>(i,0) = row;
locations.at<int>(i,1) = col;
int r0 = (row-minDistBtwLocMax > -1 ? row-minDistBtwLocMax : 0);
int r1 = (row+minDistBtwLocMax < scratch.rows ? row+minDistBtwLocMax : scratch.rows-1);
int c0 = (col-minDistBtwLocMax > -1 ? col-minDistBtwLocMax : 0);
int c1 = (col+minDistBtwLocMax < scratch.cols ? col+minDistBtwLocMax : scratch.cols-1);
for (int r = r0; r <= r1; r++) {
for (int c = c0; c <= c1; c++) {
if (vdist(Point2DMake(r, c),Point2DMake(row, col)) <= minDistBtwLocMax) {
scratch.at<float>(r,c) = 0.0;
}
}
}
} else {
break;
}
}
return nFoundLocMax;
}
The first question to answer would be what is "local" in your opinion. The answer may well be a square window (say 3x3 or 5x5) or circular window of a certain radius. You can then scan over the entire image with the window centered at each pixel and pick the highest value in the window.
See this for how to access pixel values in OpenCV.
This is very fast method. It stored founded maxima in a vector of
Points.
vector <Point> GetLocalMaxima(const cv::Mat Src,int MatchingSize, int Threshold, int GaussKernel )
{
vector <Point> vMaxLoc(0);
if ((MatchingSize % 2 == 0) || (GaussKernel % 2 == 0)) // MatchingSize and GaussKernel have to be "odd" and > 0
{
return vMaxLoc;
}
vMaxLoc.reserve(100); // Reserve place for fast access
Mat ProcessImg = Src.clone();
int W = Src.cols;
int H = Src.rows;
int SearchWidth = W - MatchingSize;
int SearchHeight = H - MatchingSize;
int MatchingSquareCenter = MatchingSize/2;
if(GaussKernel > 1) // If You need a smoothing
{
GaussianBlur(ProcessImg,ProcessImg,Size(GaussKernel,GaussKernel),0,0,4);
}
uchar* pProcess = (uchar *) ProcessImg.data; // The pointer to image Data
int Shift = MatchingSquareCenter * ( W + 1);
int k = 0;
for(int y=0; y < SearchHeight; ++y)
{
int m = k + Shift;
for(int x=0;x < SearchWidth ; ++x)
{
if (pProcess[m++] >= Threshold)
{
Point LocMax;
Mat mROI(ProcessImg, Rect(x,y,MatchingSize,MatchingSize));
minMaxLoc(mROI,NULL,NULL,NULL,&LocMax);
if (LocMax.x == MatchingSquareCenter && LocMax.y == MatchingSquareCenter)
{
vMaxLoc.push_back(Point( x+LocMax.x,y + LocMax.y ));
// imshow("W1",mROI);cvWaitKey(0); //For gebug
}
}
}
k += W;
}
return vMaxLoc;
}
Found a simple solution.
In this example, if you are trying to find 2 results of a matchTemplate function with a minimum distance from each other.
cv::Mat result;
matchTemplate(search, target, result, CV_TM_SQDIFF_NORMED);
float score1;
cv::Point displacement1 = MinMax(result, score1);
cv::circle(result, cv::Point(displacement1.x+result.cols/2 , displacement1.y+result.rows/2), 10, cv::Scalar(0), CV_FILLED, 8, 0);
float score2;
cv::Point displacement2 = MinMax(result, score2);
where
cv::Point MinMax(cv::Mat &result, float &score)
{
double minVal, maxVal;
cv::Point minLoc, maxLoc, matchLoc;
minMaxLoc(result, &minVal, &maxVal, &minLoc, &maxLoc, cv::Mat());
matchLoc.x = minLoc.x - result.cols/2;
matchLoc.y = minLoc.y - result.rows/2;
return minVal;
}
The process is:
Find global Minimum using minMaxLoc
Draw a filled white circle around global minimum using min distance between minima as radius
Find another minimum
The the scores can be compared to each other to determine, for example, the certainty of the match,
To find more than just the global minimum and maximum try using this function from skimage:
http://scikit-image.org/docs/dev/api/skimage.feature.html#skimage.feature.peak_local_max
You can parameterize the minimum distance between peaks, too. And more. To find minima, use negated values (take care of the array type though, 255-image could do the trick).
You can go over each pixel and test if it is a local maxima. Here is how I would do it.
The input is assumed to be type CV_32FC1
#include <vector>//std::vector
#include <algorithm>//std::sort
#include "opencv2/imgproc/imgproc.hpp"
#include "opencv2/core/core.hpp"
//structure for maximal values including position
struct SRegionalMaxPoint
{
SRegionalMaxPoint():
values(-FLT_MAX),
row(-1),
col(-1)
{}
float values;
int row;
int col;
//ascending order
bool operator()(const SRegionalMaxPoint& a, const SRegionalMaxPoint& b)
{
return a.values < b.values;
}
};
//checks if pixel is local max
bool isRegionalMax(const float* im_ptr, const int& cols )
{
float center = *im_ptr;
bool is_regional_max = true;
im_ptr -= (cols + 1);
for (int ii = 0; ii < 3; ++ii, im_ptr+= (cols-3))
{
for (int jj = 0; jj < 3; ++jj, im_ptr++)
{
if (ii != 1 || jj != 1)
{
is_regional_max &= (center > *im_ptr);
}
}
}
return is_regional_max;
}
void imregionalmax(
const cv::Mat& input,
std::vector<SRegionalMaxPoint>& buffer)
{
//find local max - top maxima
static const int margin = 1;
const int rows = input.rows;
const int cols = input.cols;
for (int i = margin; i < rows - margin; ++i)
{
const float* im_ptr = input.ptr<float>(i, margin);
for (int j = margin; j < cols - margin; ++j, im_ptr++)
{
//Check if pixel is local maximum
if ( isRegionalMax(im_ptr, cols ) )
{
cv::Rect roi = cv::Rect(j - margin, i - margin, 3, 3);
cv::Mat subMat = input(roi);
float val = *im_ptr;
//replace smallest value in buffer
if ( val > buffer[0].values )
{
buffer[0].values = val;
buffer[0].row = i;
buffer[0].col = j;
std::sort(buffer.begin(), buffer.end(), SRegionalMaxPoint());
}
}
}
}
}
For testing the code you can try this:
cv::Mat temp = cv::Mat::zeros(15, 15, CV_32FC1);
temp.at<float>(7, 7) = 1;
temp.at<float>(3, 5) = 6;
temp.at<float>(8, 10) = 4;
temp.at<float>(11, 13) = 7;
temp.at<float>(10, 3) = 8;
temp.at<float>(7, 13) = 3;
vector<SRegionalMaxPoint> buffer_(5);
imregionalmax(temp, buffer_);
cv::Mat debug;
cv::cvtColor(temp, debug, cv::COLOR_GRAY2BGR);
for (auto it = buffer_.begin(); it != buffer_.end(); ++it)
{
circle(debug, cv::Point(it->col, it->row), 1, cv::Scalar(0, 255, 0));
}
This solution does not take plateaus into account so it is not exactly the same as matlab's imregionalmax()
I think you want to use the
MinMaxLoc(arr, mask=NULL)-> (minVal, maxVal, minLoc, maxLoc)
Finds global minimum and maximum in array or subarray
function on you image

Resources