Goertzel complex signal - algorithm

I have an implementation of the Goertzel algorithm in C, and it works correctly for real (non-complex) signals.
float goertzel(int numSamples,int TARGET_FREQUENCY,int SAMPLING_RATE, cufftDoubleComplex* modData,bool Im)
{
double Z = M_PI * 2. * (double(TARGET_FREQUENCY) / double(SAMPLING_RATE));
double constantaA = 2 * cos(Z);
double TMPVAR_v0 = 0;
double TMPVAR_v1 = 0;
double TMPVAR_v2 = 0;
double resultat;
for(int n =0; n < numSamples; n++) {
if(!Im)
TMPVAR_v0 = (modData[n].x + constantaA * TMPVAR_v1 - TMPVAR_v2);
else
TMPVAR_v0 = (modData[n].y + constantaA * TMPVAR_v1 - TMPVAR_v2);
TMPVAR_v2=TMPVAR_v1;
TMPVAR_v1=TMPVAR_v0;
}
resultat = (TMPVAR_v1*TMPVAR_v1 + TMPVAR_v2*TMPVAR_v2 - constantaA * TMPVAR_v1 * TMPVAR_v2);
return resultat;
}
But I cannot work out how to use it for a complex signal.
Please help me.

Related

use ctypes to call a dll file in Python

I wrote a DLL that uses OpenMP and I call it from Python through ctypes, but it runs extremely slowly. When I write the same code as a standalone C program, it takes only a few seconds to finish.
The main part of the DLL is:
double martini(double a[], double b[], double d_title[], double c[], double d[], double A[], int num_element)
{
double nu_start = 0;
double mu_start = 0;
double z_start = 0;
double step_nu = 2 * 3.1415926 / 100;
double step_mu = 3.1415926 / 100;
double step_z = 0;
double nu = 0;
double mu = 0;
double z = 0;
double integral_first = 0;
double d_uv = 0;
int i = 0;
int j = 0;
int k = 0;
int loop = 0;
#pragma omp parallel for default(none) shared(a, d_title, b, c, nu_start, mu_start, z_start, step_nu, step_mu) private( i,j,k,mu, nu, step_z, z, d_uv) reduction(+:integral_first)
for (loop = 0; loop < num_element; loop++)
{
for (i = 0; i < 100; i++)
{
mu = mu_start + (i + 1) * step_mu;
for (j = 0; j < 100; j++)
{
nu = nu_start + (j + 1) * step_nu;
for (k = 0; k < 1500; k++)
{
d_uv = (sin(mu) * sin(mu) * cos(nu) * cos(nu) + sin(mu) * sin(mu) * (a[loop] * sin(nu) - d_title[loop] * cos(nu)) * (a[loop] * sin(nu) - d_title[loop] * cos(nu)) + b[loop] * b[loop] * cos(mu) * cos(mu)) / (c[loop] * c[loop]);
step_z = 20 / (d_uv * 1500);
z = z_start + (k + 1) * step_z;
integral_first = integral_first + 1 / 2 * A[loop] * sin(mu) * (1 - 3 * sin(mu) * sin(mu) * cos(nu) * cos(nu)) * exp(-d_uv * z) * log(1 + z * z) * step_z * step_mu * step_nu / (c[loop] * c[loop]);
}
}
}
}
return integral_first;
}
I use CDLL() in Python to load this DLL and then call the function, but it is very slow.
I also wrote a separate C program in which a, b, d, ... are each filled with 500 random numbers; it produces the result in seconds.
Python code:
# Load the compiled library and look up the exported integration routine.
martini = CDLL('C:/Users/38914/source/repos/Dll2/x64/Release/Dll2.dll')
integral = martini.martini
integral.restype = c_double  # declare the return type as a C double

# One C double buffer of num_element entries per input sequence.
DoubleArray = c_double * num_element
a_func = DoubleArray()
b_func = DoubleArray()
d_title_func = DoubleArray()
c_func = DoubleArray()
d_func = DoubleArray()
A_func = DoubleArray()

# Copy the Python sequences into the C buffers, in the same order as before.
for target, values in (
    (a_func, a),
    (b_func, b),
    (d_title_func, d_title),
    (c_func, c),
    (d_func, d),
    (A_func, A),
):
    target[:] = values

num_element_func = c_int(num_element)
out = integral(a_func, b_func, d_title_func, c_func, d_func, A_func, num_element_func)
a, b, c, d, d_title and A are arrays that I read from a text file in Python.

Is there a way to avoid CUDA atomicAdd in my situation?

I'm performing the operation shown in the figure below.
Here is my kernel.
As shown in the figure, I build a small matrix from each of about one million vectors and accumulate it into a large, preallocated matrix.
I need an idea that improves performance without exceeding 8 GB of GPU global memory.
How can I avoid the atomic operations? I use a GTX 1080. The existing kernel takes about 250 ms.
/**
 * Maps a window-relative index to the index used for block placement.
 *
 * mw_width - 2 and mw_width - 1 are both mapped to mw_width - 3; every other
 * value is returned unchanged. With a row length of 6 * (mw_width + 1), a
 * 24-wide block starting at 6 * (mw_width - 3) ends on the last column.
 */
static __device__ int clampFrameSlot(int slot, int mw_width)
{
    if (slot == mw_width - 2)
    {
        return slot - 1;
    }
    if (slot == mw_width - 1)
    {
        return slot - 2;
    }
    return slot;
}

/**
 * Accumulates one src entry per thread into the matrix A and the vector b.
 *
 * For an entry with mask == 1 the thread adds w * J0 * J0^T, w * J1 * J1^T,
 * w * J0 * J1^T and its transpose as 24x24 blocks at (pos0, pos0),
 * (pos1, pos1), (pos0, pos1) and (pos1, pos0) of A, and w * c * J0 and
 * w * c * J1 as 24-entry segments at pos0 and pos1 of b. All additions use
 * atomicAdd because different entries can target the same cells.
 *
 * The products are formed directly at the call sites instead of being staged
 * in per-thread 24x24 scratch arrays first; each value is computed with the
 * same multiplication order as before, so the accumulated terms are unchanged.
 *
 * @param src       entries to accumulate
 * @param num       number of entries in src
 * @param st        offset subtracted from fid0 / fid1 to get window-relative indices
 * @param mw_width  window width; A has 6 * (mw_width + 1) columns
 * @param A         output matrix, row-major with 6 * (mw_width + 1) columns
 * @param b         output vector
 */
__global__ void buildMatrixKernel(const CostJacobianCT *src, const int num, const int st, const int mw_width, double *A, double *b)
{
    const int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx >= num)
    {
        return;
    }

    const CostJacobianCT &entry = src[idx];
    if (entry.mask != 1)
    {
        return;
    }

    // matrix width
    const int cols = 6 * (mw_width + 1);

    // insertion offsets of the two 24-wide blocks
    const int pos0 = clampFrameSlot(entry.fid0 - st, mw_width) * 6;
    const int pos1 = clampFrameSlot(entry.fid1 - st, mw_width) * 6;

    for (int i = 0; i < 24; i++)
    {
        const int row0 = (i + pos0) * cols;
        const int row1 = (i + pos1) * cols;
        for (int j = 0; j < 24; j++)
        {
            atomicAdd(&A[row0 + (j + pos0)], entry.w * entry.J0[i] * entry.J0[j]); // 00
            atomicAdd(&A[row1 + (j + pos1)], entry.w * entry.J1[i] * entry.J1[j]); // 11
            atomicAdd(&A[row0 + (j + pos1)], entry.w * entry.J0[i] * entry.J1[j]); // 01
            atomicAdd(&A[row1 + (j + pos0)], entry.w * entry.J0[j] * entry.J1[i]); // 10 (transpose of 01)
        }
        atomicAdd(&b[i + pos0], entry.w * entry.c * entry.J0[i]); // 0
        atomicAdd(&b[i + pos1], entry.w * entry.c * entry.J1[i]); // 1
    }
}
2019.3.6.
I modified the code as shown below and saw a performance improvement:
250 ms -> 95 ms
/**
 * Accumulates one cell of one src entry's 24x24 contribution per thread.
 *
 * The global thread index is split into an entry index (idx / 576) and a cell
 * inside that entry's 24x24 block (idx % 576, decoded as row = cell / 24 and
 * column = cell % 24). num is compared against the thread index, not against
 * the entry index. For entries with mask == 1 the thread adds its cell of the
 * four w-weighted J0/J1 products into A at the entry's pos0 / pos1 offsets;
 * threads in column 0 additionally add the w * c * J0 and w * c * J1 terms
 * for their row into b.
 *
 * @param src       entries to accumulate; pos0 / pos1 are read from each entry
 * @param num       upper bound (exclusive) on the global thread index
 * @param st        not used by this version
 * @param mw_width  window width; A has 6 * (mw_width + 1) columns
 * @param A         output matrix, row-major with 6 * (mw_width + 1) columns
 * @param b         output vector
 */
__global__ void buildMatrixKernel(const CostJacobianCT *src, const int num, const int st, const int mw_width, double *A, double *b)
{
    const int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid >= num)
    {
        return;
    }

    const CostJacobianCT &entry = src[tid / 576];
    if (entry.mask != 1)
    {
        return;
    }

    const int stride = 6 * (mw_width + 1);
    const int base0 = entry.pos0;
    const int base1 = entry.pos1;
    const double weight = entry.w;
    const double cTerm = entry.c;

    // Position of this thread's cell inside the 24x24 block.
    const int cell = tid % 576;
    const int row = cell / 24;
    const int col = cell % 24;

    const double j0Row = entry.J0[row];
    const double j0Col = entry.J0[col];
    const double j1Row = entry.J1[row];
    const double j1Col = entry.J1[col];

    double *const rowAt0 = &A[(row + base0) * stride];
    double *const rowAt1 = &A[(row + base1) * stride];

    atomicAdd(rowAt0 + (col + base0), weight * j0Row * j0Col); // 00
    atomicAdd(rowAt1 + (col + base1), weight * j1Row * j1Col); // 11
    atomicAdd(rowAt0 + (col + base1), weight * j0Row * j1Col); // 01
    atomicAdd(rowAt1 + (col + base0), weight * j1Row * j0Col); // 10

    // Only one thread per row contributes to b.
    if (col == 0)
    {
        atomicAdd(&b[row + base0], weight * cTerm * j0Row); // 0
        atomicAdd(&b[row + base1], weight * cTerm * j1Row); // 1
    }
}

Unity line renderer smooth algorithm

I need an *efficient* algorithm to smooth a line renderer (basically, to smooth the given Vector3 array that holds the renderer's points),
something like this.
Here is my code, but the frame rate with it is very low:
/// <summary>
/// Samples the Bezier curve whose control points are <paramref name="arrayToCurve"/>,
/// evaluating every sample by repeated linear interpolation (de Casteljau).
/// </summary>
/// <param name="arrayToCurve">Control points of the curve; not modified.</param>
/// <param name="smoothness">
/// Samples per control point. Values below 1 are treated as 1 and the value is
/// rounded to an integer with Mathf.RoundToInt.
/// </param>
/// <returns>
/// arrayToCurve.Length * round(smoothness) points, the first at t = 0 and the
/// last at t = 1; an empty list when there are no control points.
/// </returns>
public static List<Vector3> MakeSmoothCurve(Vector3[] arrayToCurve, float smoothness)
{
    if (smoothness < 1.0f) smoothness = 1.0f;

    int pointsLength = arrayToCurve.Length;
    if (pointsLength == 0)
    {
        // Without this guard the capacity below would be negative and the
        // List constructor would throw.
        return new List<Vector3>();
    }

    // Index of the last sample; curvedLength + 1 samples are produced.
    int curvedLength = (pointsLength * Mathf.RoundToInt(smoothness)) - 1;
    List<Vector3> curvedPoints = new List<Vector3>(curvedLength + 1);

    // One scratch buffer reused for every sample instead of allocating a new
    // copy of the control points per sample.
    Vector3[] scratch = new Vector3[pointsLength];

    for (int sample = 0; sample <= curvedLength; sample++)
    {
        float t = Mathf.InverseLerp(0, curvedLength, sample);

        arrayToCurve.CopyTo(scratch, 0);

        // Each pass replaces neighbouring pairs by their interpolation at t,
        // shrinking the active prefix by one until a single point is left.
        for (int j = pointsLength - 1; j > 0; j--)
        {
            for (int i = 0; i < j; i++)
            {
                scratch[i] = (1 - t) * scratch[i] + t * scratch[i + 1];
            }
        }

        curvedPoints.Add(scratch[0]);
    }

    return curvedPoints;
}
You can use a CurveField
https://docs.unity3d.com/ScriptReference/EditorGUILayout.CurveField.html
With that you can easily edit/test your curve and retrieve a point at given time.
https://docs.unity3d.com/ScriptReference/AnimationCurve.Evaluate.html

Why my complexity of bottom is not O(n^3)

From what I learned, the bottom-up approach should have O(n^3) complexity. However, my measurements make it look almost like O(n). I have checked this code many times but still have no clue why it does not show n^3 complexity. Am I missing something here?
/**
 * Fills the cost table {@code m} and the split table {@code s} bottom-up for
 * the matrix chain described by {@code P}.
 * Sub-chains are processed in order of increasing length, so every shorter
 * sub-chain cost an entry reads has already been written.
 *
 * @param P an array storing the matrices chain; the cost formula treats
 *          matrix i as having dimensions P[i-1] x P[i]
 */
public void matrixChainOrder(int[] P){
    final int n = P.length - 1;
    m = new int[n + 1][n + 1];
    s = new int[n][n + 1];

    // A chain of a single matrix costs nothing.
    for (int d = 1; d <= n; d++) {
        m[d][d] = 0;
    }

    for (int len = 2; len <= n; len++) {
        final int lastStart = n - len + 1;
        for (int lo = 1; lo <= lastStart; lo++) {
            final int hi = lo + len - 1;
            m[lo][hi] = Integer.MAX_VALUE;
            // Try every split point and keep the cheapest one.
            for (int split = lo; split < hi; split++) {
                final int cost = m[lo][split] + m[split + 1][hi] + P[lo - 1] * P[split] * P[hi];
                if (cost < m[lo][hi]) {
                    m[lo][hi] = cost;
                    s[lo][hi] = split;
                }
            }
        }
    }
}
/**
 * Prints the parenthesization recorded in {@code s} for matrices i..j to
 * standard output, writing a single matrix as "A" followed by its index.
 *
 * @param s the auxiliary table storing the split point of every sub-chain
 * @param i the index of the start matrix
 * @param j the index of the end matrix
 */
public void printOptimalParens(int[][] s, int i, int j){
    if (i != j) {
        System.out.print("(");
        final int split = s[i][j];
        printOptimalParens(s, i, split);       // left part: i..split
        printOptimalParens(s, split + 1, j);   // right part: split+1..j
        System.out.print(")");
        return;
    }
    System.out.print("A" + i);
}
/**
 * Computes the product of matrices A[i]..A[j], splitting the chain where the
 * table {@code s} says to.
 *
 * @param A the matrices chain, an array of matrices
 * @param s the auxiliary table storing the parenthesization; it is looked up
 *          with both indices shifted by one relative to the indices into A
 * @param i the start matrix
 * @param j the end matrix
 * @return the product matrix of the matrices chain
 */
public long[][] matrixChainMultiply(long[][][] A, int[][] s, int i, int j){
    if (i == j) {
        return A[i];
    }
    if (j == i + 1) {
        return multiply(A[i], A[j]);
    }
    // The stored split is shifted by one as well: the left part ends at
    // split - 1 and the right part starts at split.
    final int split = s[i + 1][j + 1];
    final long[][] left = matrixChainMultiply(A, s, i, split - 1);
    final long[][] right = matrixChainMultiply(A, s, split, j);
    return multiply(left, right);
}
/**
 * Multiplies two matrices with the schoolbook triple loop.
 * The dimensions are taken from X.length, X[0].length and Y[0].length.
 *
 * @param X the first matrix
 * @param Y the second matrix
 * @return the product of these two matrices
 */
public long[][] multiply(long[][] X, long[][] Y){
    final int rows = X.length;
    final int inner = X[0].length;
    final int cols = Y[0].length;
    final long[][] product = new long[rows][cols];

    for (int r = 0; r < rows; r++) {
        final long[] xRow = X[r];
        final long[] outRow = product[r];
        for (int cIdx = 0; cIdx < cols; cIdx++) {
            long acc = 0L;
            for (int k = 0; k < inner; k++) {
                acc += xRow[k] * Y[k][cIdx];
            }
            outRow[cIdx] = acc;
        }
    }
    return product;
}
I can show you that it is definitely not O(N).
The outer loop (over the chain length l, starting at 2) runs N-1 times.
The second loop (over i) runs N-l+1 times for each l.
The innermost loop (over k, from i to j-1) runs j-i = l-1 times.
So the total number of innermost iterations is the sum over l = 2..N of (N-l+1)*(l-1), which equals (N^3 - N)/6.
To simplify, I can call it O(N^3).
Correct me if I'm wrong.
Bottom-up is a style of dynamic programming rather than one specific algorithm.
It can run in O(N), or less, or more; it depends on the problem.

How to calculate the Gaussian Filter kernel

I am working on an image processing project in which I need to implement a Gaussian filter. How do I calculate the 3x3, 5x5 and 7x7 kernels? Please help me.
http://s14.postimg.org/rwpyq8k5d/image.jpg
The code below illustrates how to calculate the Gaussian kernel for any filter size and Gaussian weight parameter.
/// <summary>
/// Builds a normalized length x length Gaussian kernel centred on the middle cell.
/// </summary>
/// <param name="length">Kernel side length; must be a positive odd number so the kernel has a centre cell.</param>
/// <param name="weight">Gaussian sigma; must be a finite value greater than zero.</param>
/// <returns>A kernel whose entries sum to 1.</returns>
/// <exception cref="ArgumentOutOfRangeException">
/// <paramref name="length"/> is not a positive odd number, or
/// <paramref name="weight"/> is not a finite positive number.
/// </exception>
public static double[,] CalculateGaussianKernel(int length, double weight)
{
    // An even length would make the -radius..radius loops visit length + 1
    // cells per axis and index past the end of the array.
    if (length <= 0 || length % 2 == 0)
    {
        throw new ArgumentOutOfRangeException("length", "Kernel length must be a positive odd number.");
    }
    // Written as !(weight > 0) so that NaN is rejected too.
    if (!(weight > 0) || double.IsInfinity(weight))
    {
        throw new ArgumentOutOfRangeException("weight", "Gaussian weight must be a finite positive number.");
    }

    double[,] kernel = new double[length, length];
    int kernelRadius = length / 2;
    double twoSigmaSquared = 2.0 * weight * weight;
    // Leading factor of the Gaussian function; it cancels in the
    // normalization below but keeps the intermediate values unchanged.
    double scale = 1.0 / (Math.PI * twoSigmaSquared);
    double sumTotal = 0;

    for (int filterY = -kernelRadius; filterY <= kernelRadius; filterY++)
    {
        for (int filterX = -kernelRadius; filterX <= kernelRadius; filterX++)
        {
            // Exponent of the Gaussian function: r^2 / (2 * sigma^2).
            double distance = ((filterX * filterX) + (filterY * filterY)) / twoSigmaSquared;
            double value = scale * Math.Exp(-distance);
            kernel[filterY + kernelRadius, filterX + kernelRadius] = value;
            sumTotal += value;
        }
    }

    // Normalize so the kernel sums to 1.
    for (int y = 0; y < length; y++)
    {
        for (int x = 0; x < length; x++)
        {
            kernel[y, x] = kernel[y, x] * (1.0 / sumTotal);
        }
    }

    return kernel;
}

Resources