Is declaring a static array as "private" before a parallel loop exactly equivalent to declaring the array inside the loop? - openacc

I've encountered a situation where my code generates different results depending on whether the arrays are declared inside the loop over index i (case #1) or declared outside the loop over index i and listed in a private clause (case #2).
Case #2 generates the same results as the code running on the CPU only.
Case #1
#pragma acc parallel loop
for (j = jbeg; j <= jend; j++){
#pragma acc loop
for (i = ibeg; i <= iend; i++){
double Rc[NFLX][NFLX];
double eta[NFLX], um[NFLX], dv[NFLX];
double lambda[NFLX], alambda[NFLX];
double fL[NFLX], fR[NFLX];
.
.
.
}
}}
Case #2
#pragma acc parallel loop
for (j = jbeg; j <= jend; j++){
double Rc[NFLX][NFLX];
double eta[NFLX], um[NFLX], dv[NFLX];
double lambda[NFLX], alambda[NFLX];
double fL[NFLX], fR[NFLX];
#pragma acc loop private(Rc[:NFLX][:NFLX], eta[:NFLX], \
um[:NFLX], lambda[:NFLX], alambda[:NFLX], \
dv[:NFLX], fL[:NFLX], fR[:NFLX])
for (i = ibeg; i <= iend; i++){
.
.
.
}
}}
I have the following values:
NFLX = 8;
jbeg = 3, jend = 258;
ibeg = 3, iend = 1026;
In which cases are the two techniques equivalent, and when is it better to choose one over the other?
This is what I see with -Minfo=accel:
case #1:
71, Local memory used for Rc,dv,fR,um,lambda,alambda,fL,eta
case #2:
71, Local memory used for Rc,dv,fR,lambda,alambda,fL,eta
CUDA shared memory used for Rc,eta
Local memory used for um
CUDA shared memory used for um,lambda,alambda,dv,fL,fR
function:
/* ********************************************************************* */
void Roe_Solver (Data *d, timeStep *Dts, Grid *grid, RBox *box)
/*
 * Solve the Riemann problem between L/R states using a
 * Rusanov-Lax Friedrichs flux.
 *
 * The dissipation speed cmax = |u_n| + a is evaluated on the
 * Roe-averaged state built from vL and vR. For every interface i of
 * each (j,k) line the routine writes flux[], SL[i], SR[i] and cmax[i].
 * When DIMENSIONS > 1 the flux is replaced by the HLL flux at
 * interfaces flagged as lying inside a strong shock.
 *
 * d     sweep data: vL, vR, uL, uR and flux are read/written per
 *       interface, SL and SR are written per interface
 * Dts   time-step data; Dts->cmax is written
 * grid  supplies the sweep direction (grid->dir) and the array offsets
 * box   index ranges (normal, tangential, binormal) to be swept
 *
 * d, Dts and grid must already be present on the device, as required
 * by the present() clause on the parallel region.
 *********************************************************************** */
{
  int i, j, k;
  int ibeg = *(box->nbeg)-1, iend = *(box->nend);
  int jbeg = *(box->tbeg),   jend = *(box->tend);
  int kbeg = *(box->bbeg),   kend = *(box->bend);
  int VXn = VX1, VXt = VX2, VXb = VX3;
  int MXn = MX1, MXt = MX2, MXb = MX3;
  int ni, nj;
  double gmm      = GAMMA_EOS;
  double gmm1     = gmm - 1.0;
  double gmm1_inv = 1.0/gmm1;

  ARRAY_OFFSET (grid, ni, nj);
  INDEX_CYCLE (grid->dir, VXn, VXt, VXb);
  INDEX_CYCLE (grid->dir, MXn, MXt, MXb);

  #pragma acc parallel loop collapse(2) present(d, Dts, grid)
  for (k = kbeg; k <= kend; k++){
  for (j = jbeg; j <= jend; j++){
    long int offset = ni*(j + nj*k);
    double * __restrict__ cmax = &Dts->cmax [offset];
    double * __restrict__ SL   = &d->sweep.SL [offset];
    double * __restrict__ SR   = &d->sweep.SR [offset];

    /* Scratch arrays are declared at the (j,k) level and named in the
       private() clause of the i-loop so that every i-iteration works
       on its own copy. */
    double um[NFLX];
    double fL[NFLX], fR[NFLX];

    #pragma acc loop private(um[:NFLX], fL[:NFLX], fR[:NFLX])
    for (i = ibeg; i <= iend; i++){
      int nv;
      double scrh, vel2;
      double a2, a, h;
      double s, c, hl, hr;
      double bmin, bmax, scrh1;

      double * __restrict__ vL   = d->sweep.vL  [offset + i];
      double * __restrict__ vR   = d->sweep.vR  [offset + i];
      double * __restrict__ uL   = d->sweep.uL  [offset + i];
      double * __restrict__ uR   = d->sweep.uR  [offset + i];
      double * __restrict__ flux = d->sweep.flux[offset + i];

      double a2L = SoundSpeed2 (vL);
      double a2R = SoundSpeed2 (vR);

      PrimToCons (vL, uL);
      PrimToCons (vR, uR);

      Flux (vL, uL, fL, grid->dir);
      Flux (vR, uR, fR, grid->dir);

      /* ---- Roe average: weights s and c = 1 - s come from
              sqrt(rho_R/rho_L)                               ---- */

      s       = sqrt(vR[RHO]/vL[RHO]);
      um[RHO] = vL[RHO]*s;
      s       = 1.0/(1.0 + s);
      c       = 1.0 - s;

      um[VX1] = s*vL[VX1] + c*vR[VX1];
      um[VX2] = s*vL[VX2] + c*vR[VX2];
      um[VX3] = s*vL[VX3] + c*vR[VX3];

      vel2 = um[VX1]*um[VX1] + um[VX2]*um[VX2] + um[VX3]*um[VX3];

      /* ---- kinetic term plus a^2/(gamma - 1) on each side,
              then averaged with the same weights            ---- */

      hl  = 0.5*(vL[VX1]*vL[VX1] + vL[VX2]*vL[VX2] + vL[VX3]*vL[VX3]);
      hl += a2L*gmm1_inv;

      hr  = 0.5*(vR[VX1]*vR[VX1] + vR[VX2]*vR[VX2] + vR[VX3]*vR[VX3]);
      hr += a2R*gmm1_inv;

      h = s*hl + c*hr;

      /* ----------------------------------------------------
         1. the following should be equivalent to

            scrh = dv[VX1]*dv[VX1] + dv[VX2]*dv[VX2] + dv[VX3]*dv[VX3];

            a2 = s*a2L + c*a2R + 0.5*gmm1*s*c*scrh;

            and therefore always positive.
         ---------------------------------------------------- */

      a2 = gmm1*(h - 0.5*vel2);
      a  = sqrt(a2);

      /* ----------------------------------------------------------------
         2. signal speeds from the averaged state and the Rusanov flux.
            (The centred flux used to be stored into flux[] before this
            point as well; that store was always overwritten below and
            has been removed.)
         ---------------------------------------------------------------- */

      /* ---- (u - c_s) ---- */
      SL[i] = um[VXn] - a;

      /* ---- (u + c_s) ---- */
      SR[i] = um[VXn] + a;

      /* ---- get max eigenvalue ---- */
      cmax[i] = fabs(um[VXn]) + a;

      #pragma acc loop seq
      NFLX_LOOP(nv) flux[nv] = 0.5*(fL[nv] + fR[nv]) - 0.5*cmax[i]*(uR[nv] - uL[nv]);

      #if DIMENSIONS > 1

      /* ---------------------------------------------
         3. use the HLL flux function if the interface
            lies within a strong shock.
            The effect of this switch is visible
            in the Mach reflection test.
         --------------------------------------------- */

      scrh  = fabs(vL[PRS] - vR[PRS]);
      scrh /= MIN(vL[PRS],vR[PRS]);

      if (scrh > 0.5 && (vR[VXn] < vL[VXn])){   /* -- tunable parameter -- */
        bmin  = MIN(0.0, SL[i]);
        bmax  = MAX(0.0, SR[i]);
        scrh1 = 1.0/(bmax - bmin);
        #pragma acc loop seq
        for (nv = 0; nv < NFLX; nv++){
          flux[nv]  = bmin*bmax*(uR[nv] - uL[nv])
                    + bmax*fL[nv] - bmin*fR[nv];
          flux[nv] *= scrh1;
        }
      }
      #endif /* DIMENSIONS > 1 */
    } /* End loop on i */
  }} /* End loop on j,k */
}

Technically they are equivalent, but in practice they behave differently. What's happening is that the compiler hoists the declaration of these arrays outside of the loops. This is standard practice for the compiler and happens before the OpenACC directives are applied. What should then happen is that these arrays are implicitly privatized within the scoping unit in which they are declared. However, the compiler doesn't currently track this, so the arrays are implicitly copied into the compute region as shared arrays. If you add the flag "-Minfo=accel", you'll see the compiler feedback messages indicating the implicit copies.
I have an open issue report requesting this support, TPR #31360, however it's been a challenge to implement so not in a released compiler as of yet. Hence until/if we can fix the behavior, you'll need to manually hoist the declaration of these arrays and then add them to a "private" clause.

Related

C code is running too slow because of nested for loops

My C program is running too slow (right now it takes around 40 seconds without parallelization). I have tried using OpenMP, which has brought the timing down significantly, but I am looking for simple and natural ways to make my code run faster other than using parallel for loops. The basic structure of the code is that it takes some command line arguments as inputs and then saves those inputs as variables. Then it recursively computes a variable called Rplus1 using the math.h library and the complex.h library. The problem with the code, and where it spends most of its time, is at the bottom, where there are nested for loops. My goal is to get the whole code running in under 5 seconds, but as of now it runs in about 40 seconds without using parallel for loops. Please help!
#include "time.h"
#include "stdio.h"
#include "stdlib.h"
#include "complex.h"
#include "math.h"
#include "string.h"
#include "unistd.h"
#include "omp.h"
#define PI 3.14159265
/*
 * Computes the NPIX x NPIX array Rplus1 for a layered superlattice by
 * applying the recursion once per unit cell of every layer, and prints
 * the time spent in the set-up phase and in the whole run.
 *
 * Command line, in order:
 *   [theta] [number of layers in superlattice]
 *   [material_1] [lat const_1] [number of unit cells_1] ...
 *   [material_N] [lat const_N] [number of unit cells_N]
 *   [Log/Linear] [number of repeating superlattice layers] [yes/no]
 * The trailing "yes" adds the substrate as one extra layer.
 *
 * Returns 0 on success (and when fewer than 8 arguments are given, in
 * which case nothing is computed), 1 on invalid input, I/O failure or
 * allocation failure.
 */
int main (int argc, char *argv[]){
  enum { NPIX = 1001 };   /* rows and columns of the diffraction pattern */

  if(argc < 8){
    return 0;
  }

  double start1 = omp_get_wtime();

  int N;
  if(sscanf(argv[2],"%d",&N) != 1 || N < 1 || N > argc){ // Number of layers in superlattice specified by second input argument
    fprintf(stderr,"invalid number of layers: %s\n",argv[2]);
    return 1;
  }
  if(strcmp(argv[argc-1],"yes") == 0){ // If the substrate is included then add one more layer to the N variable
    N = N+1;
  }
  /* Layer i reads argv[3*i], argv[3*i+1] and argv[3*i+2]; make sure they exist. */
  if(3*N + 2 >= argc){
    fprintf(stderr,"expected material, lattice constant and unit-cell count for %d layers\n",N);
    return 1;
  }
  int total = 0;
  sscanf(argv[argc-2],"%d",&total); // Number of repeating superlattice layers specified by second to last argument
  (void)total;                      // parsed for completeness; not used by the computation below

  double layers[N][6];     // per layer: [1] lattice constant (m), [2] unit cells, [3] h, [4] k
  double complex SF_table[10]; // unit cell structure factors for all of the materials
  double lam, c_light = 299792458, h_pl = 4.135667516e-15, E = 10e3, r_0 = 2.818e-15; // speed of light, Planck's constant, photon energy, classical electron radius
  double mm = 4.0;
  double angle;
  int i,j,m,n; // integers to index through arrays
  int status = 1;

  /* Every heap pointer starts out NULL so that the cleanup code can free
     them unconditionally. */
  double (*table)[10]  = NULL; // coefficients used to calculate the atomic scattering factor
  double (*table2)[10] = NULL;
  double complex (*F_hkl)[NPIX][NPIX] = NULL;
  double complex (*F_0)[NPIX][NPIX]   = NULL;
  double complex (*g)[NPIX][NPIX]     = NULL;
  double complex (*g_0)[NPIX][NPIX]   = NULL;
  double (*phi)[NPIX][NPIX]           = NULL;
  double complex (*Rplus1)[NPIX]      = NULL;

  lam = (h_pl*c_light)/E;
  if(sscanf(argv[1],"%lf",&angle) != 1){ // first argument is the angle of incidence, read it
    fprintf(stderr,"invalid angle of incidence: %s\n",argv[1]);
    goto cleanup;
  }
  angle = angle*(PI/180.0);

  table  = malloc(9 * sizeof *table);
  table2 = malloc(2 * sizeof *table2);
  F_hkl  = malloc((size_t)N * sizeof *F_hkl);
  F_0    = malloc((size_t)N * sizeof *F_0);
  g      = malloc((size_t)N * sizeof *g);
  g_0    = malloc((size_t)N * sizeof *g_0);
  phi    = malloc((size_t)N * sizeof *phi);
  Rplus1 = malloc(NPIX * sizeof *Rplus1);
  if(!table || !table2 || !F_hkl || !F_0 || !g || !g_0 || !phi || !Rplus1){
    fprintf(stderr,"out of memory\n");
    goto cleanup;
  }

  /* Read the binary files containing the coefficient tables. */
  {
    FILE *datfile1 = fopen("/home/vhosts/xraydev.engr.wisc.edu/data/coef_table.bin","rb");
    if(datfile1 == NULL){
      perror("coef_table.bin");
      goto cleanup;
    }
    size_t got = fread(table,sizeof(double),90,datfile1);
    fclose(datfile1);
    if(got != 90){
      fprintf(stderr,"coef_table.bin: short read\n");
      goto cleanup;
    }

    FILE *datfile2 = fopen("/home/vhosts/xraydev.engr.wisc.edu/data/dispersioncs.bin","rb");
    if(datfile2 == NULL){
      perror("dispersioncs.bin");
      goto cleanup;
    }
    got = fread(table2,sizeof(double),20,datfile2);
    fclose(datfile2);
    if(got != 20){
      fprintf(stderr,"dispersioncs.bin: short read\n");
      goto cleanup;
    }
  }

  // Calculate scattering factors for all elements
  double a,b;
  double k_z = (sin(angle)/lam)*1e-10; // incorporate angular dependence of SF but neglect 0.24 degree divergence because of approximation
  for(i = 0;i<10;i++){ // for each element...
    SF_table[i] = 0;
    for(j = 0;j<4;j++){ // summation
      a = table[2*j][i];
      b = table[2*j+1][i];
      SF_table[i] = SF_table[i] + a * exp(-b*k_z*k_z);
    }
    SF_table[i] = SF_table[i] + table[8][i] + table2[0][i] + table2[1][i]*I;
  }
  free(table);
  table = NULL;

  for(i = 1; i < N+1; i++){ // for each layer of material...
    double *layer = layers[i-1];
    const char *material = argv[i*3];

    if(sscanf(argv[i*3+1],"%lf",&layer[1]) != 1 ||   // out of plane lattice constant
       sscanf(argv[i*3+2],"%lf",&layer[2]) != 1){    // number of unit cells in the layer
      fprintf(stderr,"layer %d: invalid lattice constant or unit-cell count\n",i);
      goto cleanup;
    }
    layer[1] = layer[1]*1e-10; // convert lat const input to meters
    // Define reciprocal space positions at the incident angle h, k, l
    layer[3] = 0; // h
    layer[4] = 0; // k

    /* Everything below depends only on the layer, not on the pixel, so it
       is evaluated once here instead of NPIX*NPIX times. */
    const double l = 4;
    const double phi_layer = 2*PI*layer[1]*sin(angle)/lam; // phi for this layer
    double complex F_hkl_layer, F_0_layer;

    if(strcmp(material,"GaAs") == 0){
      F_hkl_layer = (2+2*cexp(I*PI*l))*(SF_table[2]+SF_table[3]*cexp(I*PI*l/2));
      F_0_layer = 0.5*8.0*(31 + table2[0][2] + table2[1][2]*I) + 0.5*8.0*(33 + table2[0][3] + table2[1][3]*I);
    } else if(strcmp(material,"AlGaAs") == 0){
      F_hkl_layer = (2+2*cexp(I*PI*l))*((0.76*SF_table[2]+ 0.24*SF_table[4])+SF_table[3]*cexp(I*PI*l/2));
      F_0_layer = 0.24*4.0*(13 + table2[0][4] + table2[1][4]*I) + 0.76*4.0*(31 + table2[0][2] + table2[1][2]*I) + 4.0*(33 + table2[0][3] + table2[1][3]*I);
    } else {
      /* Any other name would leave this layer's arrays uninitialised. */
      fprintf(stderr,"layer %d: unknown material %s\n",i,material);
      goto cleanup;
    }

    /* `angle` is a scalar; the original text indexed it as angle[m][n],
       which does not compile. */
    const double complex g_layer   = 2*r_0*F_hkl_layer/mm/layer[1]*cos(2*angle);
    const double complex g_0_layer = 2*r_0*F_0_layer/mm/layer[1];

    for (m = 0; m < NPIX; m++){
      for (n = 0; n < NPIX; n++){
        phi[i-1][m][n]   = phi_layer;
        F_hkl[i-1][m][n] = F_hkl_layer;
        F_0[i-1][m][n]   = F_0_layer;
        g[i-1][m][n]     = g_layer;
        g_0[i-1][m][n]   = g_0_layer;
      }
    }
  }

  for (m = 0; m < NPIX; m++){
    for (n = 0; n < NPIX; n++){
      Rplus1[m][n] = 0.0;
    }
  }

  double stop1 = omp_get_wtime();

  /* Recursion. Each pixel evolves independently, so the unit-cell loop is
     the innermost one: the per-pixel terms (including the cos/sin phase)
     are computed once per layer instead of once per unit cell, and every
     pixel still sees the layers and unit cells in the original order.
     NOTE(review): layer index 0 is never visited and the first step
     divides by the initial Rplus1 of 0 - both kept from the original. */
  for(i=1;i<N;i++){ // For each layer of the film
    const double ncells = layers[i][2];
    for (m = 0; m < NPIX; m++){ // For each row of the diffraction pattern
      for (n = 0; n < NPIX; n++){ // For each column of the diffraction pattern
        const double complex minus_ig = -I*g[i][m][n];
        const double complex plus_ig  = I*g[i][m][n];
        const double complex numer    = (1-I*g_0[i][m][n])*(1-I*g_0[i][m][n]);
        const double complex phase    = cos(-2*phi[i][m][n])+I*sin(-2*phi[i][m][n]);
        double complex r = Rplus1[m][n];
        for(j=0;j<ncells;j++){ // For each unit cell
          r = minus_ig + numer/(plus_ig + phase/r);
        }
        Rplus1[m][n] = r;
      }
    }
  }

  double stop2 = omp_get_wtime();
  double elapsed1 = (double)(stop1 - start1); // set-up time: argument parsing, tables and per-layer arrays
  double elapsed2 = (double)(stop2 - start1); // total time, including the recursion
  printf("main() through before diffraction function took %f seconds to run\n\n",elapsed1);
  printf("main() through after diffraction function took %f seconds to run\n\n",elapsed2);
  status = 0;

cleanup:
  free(Rplus1);
  free(phi);
  free(g_0);
  free(g);
  free(F_0);
  free(F_hkl);
  free(table2);
  free(table);
  return status;
}

OpenACC call to cuMemFreeHost returned error

I'm using PGI C Compiler--pgcc v16.10.0 64-bit--to learn how to program with OpenACC
Here is my code to simulate the process of particle transport
/* State carried by one simulated particle. The notes on each field record
   how the set-up and spawning code in this listing fills it in. */
typedef struct {
double position; /* initial particles: rand1() * 100.0; secondaries copy the parent's position */
double direction; /* drawn from rand2() */
double weight; /* always set to 1.0 in the visible code */
int cell; /* initial particles: 2; secondaries copy the parent's cell */
int group; /* 1 or 2, chosen by comparing rand1() with a threshold */
int alive; /* set to 1 on creation */
} Particle;
int size = 100000; // number of particles to be simulated
int tot = (int) (1.3 * size); // this variable limits the maximum of next generation particles
int capacity = 0; // this variable indicates the actual number of next generation particles
/* particles to be simulated */
Particle *par = malloc(size * sizeof *par);
/* next generation particles produced (the element type is `Particle`;
   the lower-case spelling used before names no declared type) */
Particle *next = malloc(tot * sizeof *next);
/* initialization */
/* Fill the first generation: every particle starts alive with weight 1.0
   in cell 2, with position, direction and group drawn from rand1()/rand2(). */
for (int i = 0; i < size; i++){
par[i].position = rand1() * 100.0; // random number between 0.0~1.0
par[i].direction = rand2(); // random number between -1.0~1.0
par[i].weight = 1.0;
par[i].cell = 2;
par[i].group = rand1() > 0.5 ? 1 : 2;
par[i].alive = 1;
}
/* some parameters used in simulation */
double keff = 1.0;
/* NOTE(review): these three tallies are declared without initialisers and
   are used as reduction variables below; no assignment to them is visible
   in this listing - confirm they are zeroed before each generation. */
double tracklength, collision, absorption;
/* start simulating */
int generation;
/* 100 generations: each one consumes par[0:size] and fills next[],
   then the two buffers are rotated. */
for (generation = 1; generation <= 100; generation++){
int CellID, MatID, GroupID;
int k; // k-th particle to be simulated
/* Data clauses: par, size and keff are copied in, next is copied out
   (tot elements), capacity is copied both ways.
   NOTE(review): rand1()/rand2() are called inside this compute region;
   their definitions are not shown - confirm they are callable on the
   device. */
#pragma acc parallel copy(capacity) copyin(par[0:size],size, keff) copyout(next[0:tot])
#pragma acc loop reduction(+:tracklength, collision, absorption)
for (k = 0; k < size; k++){
/* do some calculating with par[k] */
/* secondary particle produced under certain circumstances */
/* NOTE(review): `capacity` is both the write index into next[] and a
   counter incremented by every iteration that spawns a secondary, with no
   atomic or other synchronisation visible. If iterations run concurrently
   they can pick the same slot or race on the increment, and nothing here
   checks capacity < tot. This is presumably related to the reported
   "illegal address" failure - verify; an atomic capture of the increment
   is the usual remedy. */
if (condition){
next[capacity].position = par[k].position;
next[capacity].direction = rand2();
next[capacity].weight = 1.0;
next[capacity].cell = par[k].cell;
next[capacity].group = rand1() < 0.9 ? 1 : 2;
next[capacity].alive = 1;
capacity++;
}
}
/* after simulation of current generation, update the parameters */
keff = ........ // one formula to update keff
/* The secondaries become the next generation: size takes the number that
   was produced, the old buffer is released, and a new next[] sized from
   the new population is allocated. */
size = capacity;
capacity = 0;
tot = (int) (1.3 * size);
free(par);
par = next;
next = (Particle *) malloc(tot * sizeof(Particle));
}
free(par);
free(next);
i compiled the code with
pgcc -acc -Minfo=accel -ta=tesla:cc30,time -O0 main.c -o test
and get information below:
Loop carried dependence of par->alive, par->cell, par->direction, par->position, par->weight, par->group prevents parallelization
Loop carried dependence of par->direction, par->group, par->position prevents vectorization
Loop carried reuse of next->position prevents parallelization
and then run the executable ./test
an error occurred
call to cuMemFreeHost returned error 700: Illegal address during kernel execution
I have no idea how to work this out.
By the way, the code runs well and returns the correct result when compiled with gcc, which ignores the #pragma directives.

How can I read/transform the range images of the stanford bunny .ply-files?

I want to read the not reconstructed data from the Stanford Bunny. The point data is stored as several range images, which have to be transformed to be combined to one big point cloud, like written in the README:
These data files were obtained with a Cyberware 3030MS optical
triangulation scanner. They are stored as range images in the "ply"
format. The ".conf" file contains the transformations required to
bring each range image into a single coordinate system.
This is the .conf-file:
camera -0.0172 -0.0936 -0.734 -0.0461723 0.970603 -0.235889 0.0124573
bmesh bun000.ply 0 0 0 0 0 0 1
bmesh bun045.ply -0.0520211 -0.000383981 -0.0109223 0.00548449 -0.294635 -0.0038555 0.955586
bmesh bun090.ply 2.20761e-05 -3.34606e-05 -7.20881e-05 0.000335889 -0.708202 0.000602459 0.706009
bmesh bun180.ply 0.000116991 2.47732e-05 -4.6283e-05 -0.00215148 0.999996 -0.0015001 0.000892527
bmesh bun270.ply 0.000130273 1.58623e-05 0.000406764 0.000462632 0.707006 -0.00333301 0.7072
bmesh top2.ply -0.0530127 0.138516 0.0990356 0.908911 -0.0569874 0.154429 0.383126
bmesh top3.ply -0.0277373 0.0583887 -0.0796939 0.0598923 0.670467 0.68082 -0.28874
bmesh bun315.ply -0.00646017 -1.36122e-05 -0.0129064 0.00449209 0.38422 -0.00976512 0.923179
bmesh chin.ply 0.00435102 0.0882863 -0.108853 -0.441019 0.213083 0.00705734 0.871807
bmesh ear_back.ply -0.0829384 0.0353082 0.0711536 0.111743 0.925689 -0.215443 -0.290169
For each range image seven values are stored. But I do not know, what information can be obtained from these values.
I guess that three of them contain information about the translation and the others contain information about the rotation. But I didn't find anything about the order of these values or about how to apply them to the points to get one point cloud.
The wiki page doesn't deal with range images, and I found nothing more on the Stanford pages. They only say that the method of Turk94 was used to scan this data set, but that method gives no information about the transformations needed. (Or I was not able to get the information out of this paper.)
Does anybody know how to read these values correctly? Why is there a transformation for the camera position? Is this just a good initial value to view the whole point cloud?
Thanks for your help.
EDIT:
OK. At this point I have already tried to read the data and to transform it correctly, but nothing has worked. I use the Boost library to handle the quaternions.
Here is my code for it:
boost::math::quaternion<double> translation, quaternionRotation;
//Get Transformation
translation = boost::math::quaternion<double>(0.0, lineData[2].toDouble(), lineData[3].toDouble(), lineData[4].toDouble());
quaternionRotation = boost::math::quaternion<double>(lineData[5].toDouble(),lineData[6].toDouble(),lineData[7].toDouble(),lineData[8].toDouble());
//do some file related stuff
//...
//for each line: read the point data and transform it and store the point in a data array
pointData[j].x = stringPointData[0].toDouble();
pointData[j].y = stringPointData[1].toDouble();
pointData[j].z = stringPointData[2].toDouble();
tmpQuat = boost::math::quaternion<double> (0.0,pointData[j].x,pointData[j].y,pointData[j].z);
//first translation
tmpQuat += translation;
//then quaternion rotation
tmpQuat = (quaternionRotation * (tmpQuat) * boost::math::conj(quaternionRotation));
//read the data from quaternion to a usual type
pointData[j].x = tmpQuat.R_component_2();
pointData[j].y = tmpQuat.R_component_3();
pointData[j].z = tmpQuat.R_component_4();
I assume that the first component of the quaternion is the w component and the others refer to x, y and z, as in equation 2 from here. If necessary, I can provide screenshots of the incorrect transformations.
EDIT: It is written in the source code of zipper in the file zipper.c, that the 7 values are saved as followed:
transX transY transZ quatX quatY quatZ quatW
The quaternion is then transformed into a rotation matrix and then the rotation is performed with this new matrix. But even with this information, I am not able to transform it correctly. To test it, I implemented the function quat_to_mat() from zipper in my project:
// Converts a quaternion into a 4x4 double-precision matrix, following the
// quat_to_mat() routine of zipper. The quaternion need not be normalised:
// the factor s = 2 / |q|^2 takes care of the scaling. A zero quaternion
// divides by zero.
//
// R_component_1() is used as w and R_component_2..4() as x, y, z.
//
// All intermediates are double; the earlier float temporaries discarded
// precision that both the input and the result carry.
//
// NOTE(review): glm matrices are indexed mat[column][row], so the
// assignments below, which keep zipper's element positions, place the
// textbook rotation matrix transposed into `mat`. This is left untouched
// because the accompanying discussion reports that this form registers
// the scans correctly - confirm before "fixing" it.
glm::dmat4 cPlyObjectLoader::quat_to_mat(boost::math::quaternion<double> quat) const
{
    const double w = quat.R_component_1();
    const double x = quat.R_component_2();
    const double y = quat.R_component_3();
    const double z = quat.R_component_4();

    const double s = 2.0 / (x*x + y*y + z*z + w*w);

    const double xs = x * s;
    const double ys = y * s;
    const double zs = z * s;

    const double wx = w * xs;
    const double wy = w * ys;
    const double wz = w * zs;

    const double xx = x * xs;
    const double xy = x * ys;
    const double xz = x * zs;

    const double yy = y * ys;
    const double yz = y * zs;
    const double zz = z * zs;

    glm::dmat4 mat(1.0);

    mat[0][0] = 1 - (yy + zz);
    mat[0][1] = xy - wz;
    mat[0][2] = xz + wy;
    mat[0][3] = 0;

    mat[1][0] = xy + wz;
    mat[1][1] = 1 - (xx + zz);
    mat[1][2] = yz - wx;
    mat[1][3] = 0;

    mat[2][0] = xz - wy;
    mat[2][1] = yz + wx;
    mat[2][2] = 1 - (xx + yy);
    mat[2][3] = 0;

    mat[3][0] = 0;
    mat[3][1] = 0;
    mat[3][2] = 0;
    mat[3][3] = 1;

    return mat;
}
Now I am doing the translation and rotation with a vector and this matrix:
quaternionRotation = boost::math::quaternion<double>(lineData[8].toDouble(),lineData[5].toDouble(),lineData[6].toDouble(),lineData[7].toDouble());
rotationMat = this->quat_to_mat(quaternionRotation);
translationVec = glm::dvec4(lineData[2].toDouble(), lineData[3].toDouble(), lineData[4].toDouble(),0.0);
//same stuff as above
//...
glm::dvec4 curPoint = glm::dvec4(pointData[j].x,pointData[j].y,pointData[j].z,1.0);
curPoint += translationVec;
curPoint = rotationMat*curPoint;
The result is different to my quaternion rotation (Why? It should be the same.), but not correct.
Debug information:
the input of all transformations is correct
the input of all points is correct
As i read from stanford 3d scan
For all the Stanford models, alignment was done using a modified ICP
algorithm, as described in this paper. These alignments are stored in
".conf" files, which list each range image in the model along with a
translation and a quaternion rotation.
Here is the link to "this paper"
Edit: The two methods are called zippering and volumetric merging.
As Ello mentioned, it is written at the stanford 3D repo:
For all the Stanford models, alignment was done using a modified ICP algorithm, as described in this paper. These alignments are stored in ".conf" files, which list each range image in the model along with a translation and a quaternion rotation.
But that is not enough to understand everything of this data file.
It is correct, that the first line:
camera -0.0172 -0.0936 -0.734 -0.0461723 0.970603 -0.235889 0.0124573
stores a good initial camera position and every other line starting with bmesh refers to a .ply-file, which stores a ranged image.
The transformation values are stored as followed:
transX transY transZ quatX quatY quatZ quatW
where trans... refers to a translation value and quat... refers to a component of the quaternion. Currently, I do not know why it doesn't work when the quaternion rotation is applied directly, but when the quaternion is converted into a rotation matrix with the code from zipper, the transformation is correct. Be aware that although the translation is stored first, the rotation has to be applied first and the translation afterwards to get a correct transformation.
My code snippet to read the files and transform it, is the following:
boost::math::quaternion<double> translation, quaternionRotation;
//Get Transformation
translationVec = glm::dvec4(lineData[2].toDouble(), lineData[3].toDouble(), lineData[4].toDouble(),0.0);
quaternionRotation = boost::math::quaternion<double>(lineData[8].toDouble(),lineData[5].toDouble(),lineData[6].toDouble(),lineData[7].toDouble());
//calculate the unit quaternion
double magnitude = std::sqrt(
quaternionRotation.R_component_1()*quaternionRotation.R_component_1()+
quaternionRotation.R_component_2()*quaternionRotation.R_component_2()+
quaternionRotation.R_component_3()*quaternionRotation.R_component_3()+
quaternionRotation.R_component_4()*quaternionRotation.R_component_4());
quaternionRotation /= magnitude;
rotationMat = this->quat_to_mat(quaternionRotation);
//do some file related stuff
//...
//for each line: read the point data and transform it and store the point in a data array
pointData[j].x = stringPointData[0].toDouble();
pointData[j].y = stringPointData[1].toDouble();
pointData[j].z = stringPointData[2].toDouble();
//transform the curren point
glm::dvec4 curPoint = glm::dvec4(pointData[j].x,pointData[j].y,pointData[j].z,1.0);
//first rotation
curPoint = rotationMat*curPoint;
//then translation
curPoint += translationVec;
//store the data in a data array
pointData[j].x = curPoint.x;
pointData[j].y = curPoint.y;
pointData[j].z = curPoint.z;
I know, that it's not the best one, but it works. Feel free to optimize it by yourself.
Here is the file converter that I wrote. It will assemble all the scans into a single file, one point per line. It supports different file formats (including Stanford .conf files).
#include <string>
#include <vector>
#include <sstream>
#include <iostream>
#include <stdio.h>
#include <ctype.h>
#include <string.h>
#include <stdlib.h>
#include <math.h>
#ifndef M_PI
#define M_PI 3.14159265
#endif
/**
 * Line-oriented reader for ASCII files made of separator-delimited fields.
 *
 * Typical use: construct with a file name, test OK(), then repeatedly call
 * get_line() followed by get_fields() and query the fields by index.
 *
 * OK() reports whether the file could be opened and whether every
 * field_as_int()/field_as_double() conversion so far has succeeded; once
 * it turns false it stays false.
 */
class LineInput {
public:
    /** Opens `filename` for reading; check OK() afterwards. */
    LineInput(const std::string& filename) {
        F_ = fopen(filename.c_str(), "r" ) ;
        ok_ = (F_ != 0) ;
        line_[0] = '\0' ;
        empty_[0] = '\0' ;
    }
    ~LineInput() {
        if(F_ != 0) {
            fclose(F_); F_ = 0 ;
        }
    }
    bool OK() const { return ok_ ; }

    /** True at end of file, and also when the file was never opened. */
    bool eof() const { return F_ == 0 || feof(F_) ; }

    /**
     * Reads the next line whose first character is printable into the
     * internal buffer, joining continuation lines (a backslash right
     * before the line break) into it.
     * Returns false when no further line is available.
     */
    bool get_line() {
        if(F_ == 0) {
            return false ;
        }
        line_[0] = '\0' ;
        // Skip the empty lines
        // (the cast keeps isprint() defined for bytes >= 0x80)
        while(!isprint(static_cast<unsigned char>(line_[0]))) {
            if(fgets(line_, MAX_LINE_LEN, F_) == 0) {
                return false ;
            }
        }
        // If the line ends with a backslash, append
        // the next line to the current line.
        bool check_multiline = true ;
        int total_length = MAX_LINE_LEN ;
        char* ptr = line_ ;
        while(check_multiline) {
            int L = static_cast<int>(strlen(ptr)) ;
            total_length -= L ;
            // A chunk shorter than two characters cannot hold "\\\n";
            // looking at ptr[L - 2] would read before the chunk.
            if(L >= 2 && ptr[L - 2] == '\\' && total_length > 0) {
                ptr = ptr + L - 2 ;
                *ptr = ' ' ;
                ptr++ ;
                if(fgets(ptr, total_length, F_) == 0) {
                    // Nothing follows the backslash: keep what we have.
                    check_multiline = false ;
                }
            } else {
                check_multiline = false ;
            }
        }
        if(total_length < 0) {
            std::cerr
                << "MultiLine longer than "
                << MAX_LINE_LEN << " bytes" << std::endl ;
        }
        return true ;
    }

    /** Number of fields produced by the last get_fields() call. */
    int nb_fields() const { return static_cast<int>(field_.size()) ; }

    /**
     * Field `i` of the current line. An index outside [0, nb_fields())
     * yields an empty string instead of reading past the field list.
     */
    char* field(int i) {
        if(i < 0 || i >= nb_fields()) {
            empty_[0] = '\0' ;
            return empty_ ;
        }
        return field_[i] ;
    }

    /** Field `i` parsed as int; on failure OK() becomes false and 0 is returned. */
    int field_as_int(int i) {
        int result = 0 ;
        ok_ = ok_ && (sscanf(field(i), "%d", &result) == 1) ;
        return result ;
    }

    /** Field `i` parsed as double; on failure OK() becomes false and 0 is returned. */
    double field_as_double(int i) {
        double result = 0.0 ;
        ok_ = ok_ && (sscanf(field(i), "%lf", &result) == 1) ;
        return result ;
    }

    /** True when field `i` is exactly the string `s`. */
    bool field_matches(int i, const char* s) {
        return !strcmp(field(i), s) ;
    }

    /**
     * Splits the current line in place at any of `separators`. The fields
     * point into the line buffer and are valid until the next get_line().
     */
    void get_fields(const char* separators=" \t\r\n") {
        field_.resize(0) ;
        char* tok = strtok(line_,separators) ;
        while(tok != 0) {
            field_.push_back(tok) ;
            tok = strtok(0,separators) ;
        }
    }

private:
    // Not copyable: a copy would share and double-close F_.
    // Declared but intentionally not defined.
    LineInput(const LineInput&) ;
    LineInput& operator=(const LineInput&) ;

    enum { MAX_LINE_LEN = 65535 } ;
    FILE* F_ ;
    char line_[MAX_LINE_LEN] ;
    char empty_[1] ;
    std::vector<char*> field_ ;
    bool ok_ ;
} ;
/**
 * Formats `x` in decimal, left-padded with zeros to at least `mindigits`
 * characters (a minus sign counts towards the width, as with printf).
 * A negative `mindigits` is treated as 0.
 */
std::string to_string(int x, int mindigits) {
    char buff[100] ;
    if(mindigits < 0) {
        mindigits = 0 ;
    }
    // "%0*d" takes the field width from the argument list, so the
    // requested width is honoured instead of a fixed three digits.
    snprintf(buff, sizeof(buff), "%0*d", mindigits, x) ;
    return std::string(buff) ;
}
/* Current homogeneous transform, shared by the readers below.
   Points are treated as row vectors: rows 0..2 hold the linear part and
   row 3 holds the translation. */
double M[4][4] ;

/* Applies M to the 3D point `xyz` in place: computes (x, y, z, 1) * M and
   divides the first three components by the fourth. */
void transform(double* xyz) {
    double h[4] ;
    for(unsigned int col=0; col<4; ++col) {
        // start from the translation row, then add the three products
        double acc = M[3][col] ;
        for(unsigned int row=0; row<3; ++row) {
            acc += M[row][col] * xyz[row] ;
        }
        h[col] = acc ;
    }
    const double w = h[3] ;
    xyz[0] = h[0] / w ;
    xyz[1] = h[1] / w ;
    xyz[2] = h[2] / w ;
}
bool read_frames_file(int no) {
std::string filename = "scan" + to_string(no,3) + ".frames" ;
std::cerr << "Reading frames from:" << filename << std::endl ;
LineInput in(filename) ;
if(!in.OK()) {
std::cerr << " ... not found" << std::endl ;
return false ;
}
while(!in.eof() && in.get_line()) {
in.get_fields() ;
if(in.nb_fields() == 17) {
int f = 0 ;
for(unsigned int i=0; i<4; i++) {
for(unsigned int j=0; j<4; j++) {
M[i][j] = in.field_as_double(f) ; f++ ;
}
}
}
}
return true ;
}
/* Builds M from "scan<no>.pose": the first line holds the translation
   (x y z) and the second the Euler angles in degrees.
   Returns false when the file cannot be opened or when either line is
   missing, has fewer than three fields, or does not parse as numbers;
   M is left untouched in those cases. */
bool read_pose_file(int no) {
    std::string filename = "scan" + to_string(no,3) + ".pose" ;
    std::cerr << "Reading pose from:" << filename << std::endl ;
    LineInput in(filename) ;
    if(!in.OK()) {
        std::cerr << " ... not found" << std::endl ;
        return false ;
    }
    double xyz[3] ;
    double euler[3] ;

    /* translation line */
    if(!in.get_line()) {
        std::cerr << " ... missing translation line" << std::endl ;
        return false ;
    }
    in.get_fields() ;
    if(in.nb_fields() < 3) {
        std::cerr << " ... malformed translation line" << std::endl ;
        return false ;
    }
    xyz[0] = in.field_as_double(0) ;
    xyz[1] = in.field_as_double(1) ;
    xyz[2] = in.field_as_double(2) ;

    /* rotation line */
    if(!in.get_line()) {
        std::cerr << " ... missing rotation line" << std::endl ;
        return false ;
    }
    in.get_fields() ;
    if(in.nb_fields() < 3) {
        std::cerr << " ... malformed rotation line" << std::endl ;
        return false ;
    }
    euler[0] = in.field_as_double(0) * M_PI / 180.0 ;
    euler[1] = in.field_as_double(1) * M_PI / 180.0 ;
    euler[2] = in.field_as_double(2) * M_PI / 180.0 ;

    /* OK() turns false when any of the six conversions failed */
    if(!in.OK()) {
        std::cerr << " ... non-numeric value" << std::endl ;
        return false ;
    }

    double sx = sin(euler[0]);
    double cx = cos(euler[0]);
    double sy = sin(euler[1]);
    double cy = cos(euler[1]);
    double sz = sin(euler[2]);
    double cz = cos(euler[2]);
    M[0][0] = cy*cz;
    M[0][1] = sx*sy*cz + cx*sz;
    M[0][2] = -cx*sy*cz + sx*sz;
    M[0][3] = 0.0;
    M[1][0] = -cy*sz;
    M[1][1] = -sx*sy*sz + cx*cz;
    M[1][2] = cx*sy*sz + sx*cz;
    M[1][3] = 0.0;
    M[2][0] = sy;
    M[2][1] = -sx*cy;
    M[2][2] = cx*cy;
    M[2][3] = 0.0;
    M[3][0] = xyz[0];
    M[3][1] = xyz[1];
    M[3][2] = xyz[2];
    M[3][3] = 1.0;
    return true ;
}
/* Fills M from a translation (Tx, Ty, Tz) and a quaternion (Qx, Qy, Qz, Qw).
   The quaternion does not have to be unit length: the common factor
   2 / |Q|^2 normalises it. Rows 0..2 receive the rotation part, row 3 the
   translation. */
void setup_transform_from_translation_and_quaternion(
    double Tx, double Ty, double Tz,
    double Qx, double Qy, double Qz, double Qw
) {
    /* for unit q the factor below is simply 2 */
    const double scale = 2.0 / (Qx*Qx + Qy*Qy + Qz*Qz + Qw*Qw);

    const double sx = Qx * scale;
    const double sy = Qy * scale;
    const double sz = Qz * scale;

    /* products with the scalar part */
    const double w_x = Qw * sx;
    const double w_y = Qw * sy;
    const double w_z = Qw * sz;

    /* products among the vector part */
    const double x_x = Qx * sx;
    const double x_y = Qx * sy;
    const double x_z = Qx * sz;
    const double y_y = Qy * sy;
    const double y_z = Qy * sz;
    const double z_z = Qz * sz;

    M[0][0] = 1.0 - (y_y + z_z);  M[0][1] = x_y - w_z;        M[0][2] = x_z + w_y;        M[0][3] = 0.0;
    M[1][0] = x_y + w_z;          M[1][1] = 1 - (x_x + z_z);  M[1][2] = y_z - w_x;        M[1][3] = 0.0;
    M[2][0] = x_z - w_y;          M[2][1] = y_z + w_x;        M[2][2] = 1 - (x_x + y_y);  M[2][3] = 0.0;
    M[3][0] = Tx;                 M[3][1] = Ty;               M[3][2] = Tz;               M[3][3] = 1.0;
}
bool read_points_file(int no) {
std::string filename = "scan" + to_string(no,3) + ".3d" ;
std::cerr << "Reading points from:" << filename << std::endl ;
LineInput in(filename) ;
if(!in.OK()) {
std::cerr << " ... not found" << std::endl ;
return false ;
}
while(!in.eof() && in.get_line()) {
in.get_fields() ;
double xyz[3] ;
if(in.nb_fields() >= 3) {
for(unsigned int c=0; c<3; c++) {
xyz[c] = in.field_as_double(c) ;
}
transform(xyz) ;
printf("%f %f %f\n",xyz[0],xyz[1],xyz[2]) ;
}
}
return true ;
}
/* only works for ASCII PLY files */
/* Reads the vertex section of `filename`, transforms every vertex by the
   current M and prints it to stdout as "x y z".
   The header is scanned for "element vertex <count>" and "end_header";
   exactly <count> vertex lines are then consumed. Header lines that are
   too short to hold the expected keywords are ignored, vertex lines with
   fewer than three fields are skipped, and a file that declares no
   vertices produces no output. */
void read_ply_file(char* filename) {
    std::cerr << "Reading points from:" << filename << std::endl;
    LineInput in(filename) ;
    if(!in.OK()) {
        std::cerr << filename << ": could not open" << std::endl ;
        return;
    }
    bool reading_vertices = false;
    int nb_vertices = 0 ;
    int nb_read_vertices = 0 ;
    while(!in.eof() && in.get_line()) {
        in.get_fields();
        if(in.nb_fields() == 0) {
            // a line holding only separators
            continue;
        }
        if(reading_vertices) {
            if(in.nb_fields() < 3) {
                // not a full x y z triple
                continue;
            }
            double xyz[3] ;
            for(unsigned int c=0; c<3; c++) {
                xyz[c] = in.field_as_double(c) ;
            }
            transform(xyz) ;
            printf("%f %f %f\n",xyz[0],xyz[1],xyz[2]) ;
            ++nb_read_vertices;
            if(nb_read_vertices == nb_vertices) {
                return;
            }
        } else if(
            in.nb_fields() >= 3 &&
            in.field_matches(0,"element") &&
            in.field_matches(1,"vertex")
        ) {
            nb_vertices = in.field_as_int(2);
        } else if(in.field_matches(0,"end_header")) {
            if(nb_vertices <= 0) {
                // no vertex count was declared: the lines that follow
                // are not vertices, so there is nothing to read
                return;
            }
            reading_vertices = true;
        }
    }
}
/* For Stanford scanning repository */
/* Processes a ".conf" file: for every line of the form
       bmesh <file.ply> Tx Ty Tz Qx Qy Qz Qw
   the transform M is set from the translation and quaternion and the
   named PLY file is streamed through read_ply_file(). Other lines
   (e.g. "camera") are ignored. A bmesh line with fewer than nine fields
   is reported and skipped; a non-numeric value stops the processing.
   The mesh name is passed to read_ply_file() exactly as written in the
   file. */
void read_conf_file(char* filename) {
    LineInput in(filename) ;
    if(!in.OK()) {
        std::cerr << filename << ": could not open" << std::endl ;
        return;
    }
    while(!in.eof() && in.get_line()) {
        in.get_fields();
        if(in.nb_fields() == 0) { continue ; }
        if(!in.field_matches(0,"bmesh")) { continue ; }
        if(in.nb_fields() < 9) {
            std::cerr << filename
                      << ": bmesh entry with fewer than 9 fields, skipped"
                      << std::endl ;
            continue ;
        }
        // distinct name: the parameter `filename` is still needed for messages
        char* mesh_filename = in.field(1);
        // Translation vector
        double Tx = in.field_as_double(2);
        double Ty = in.field_as_double(3);
        double Tz = in.field_as_double(4);
        /// Quaternion
        double Qx = in.field_as_double(5);
        double Qy = in.field_as_double(6);
        double Qz = in.field_as_double(7);
        double Qw = in.field_as_double(8);
        if(!in.OK()) {
            // a conversion failed; the reader's status does not recover
            std::cerr << filename << ": non-numeric transform for "
                      << mesh_filename << std::endl ;
            return;
        }
        setup_transform_from_translation_and_quaternion(Tx,Ty,Tz,Qx,Qy,Qz,Qw);
        read_ply_file(mesh_filename);
    }
}
/* Entry point. Takes exactly one argument:
     - a name containing ".conf": treated as a Stanford .conf file;
     - anything else: parsed with atoi() as the index of the last scan,
       and scans 0..index are converted, taking each transform from the
       .frames file or, when that is missing, from the .pose file.
   Returns -1 on a wrong argument count, 0 otherwise. */
int main(int argc, char** argv) {
    if(argc != 2) {
        return -1 ;
    }
    if(strstr(argv[1],".conf") != 0) {
        read_conf_file(argv[1]);
        return 0 ;
    }
    const int last_scan = atoi(argv[1]) ;
    for(int scan=0; scan<=last_scan; ++scan) {
        const bool have_frames = read_frames_file(scan) ;
        if(!have_frames) {
            read_pose_file(scan) ;
        }
        read_points_file(scan) ;
    }
    return 0 ;
}
Okay, so here is my solution, since none of the above worked for me (note that this is in Python, using Blender's bpy). It seems that I need to transpose the rotation part of my 4x4 transformation matrix (note that I am using a standard way to convert a quaternion to a rotation matrix, not the one from zipper). Also note that, since I am using Blender, an imported model only stores its coordinates locally, relative to the object's world transformation; the step point = objWorld * point below is therefore Blender-specific, and you do not have to do it elsewhere.
# For every mesh listed in the .conf data: build its 4x4 transform, import the
# PLY file into Blender and rewrite each vertex in place.
for meshName, transform in zip(plyFile, transformations):
    # transform structure: [x, y, z, qx, qy, qz, qw];
    # the qw entry (index 6) is passed to mathutils.Quaternion first.
    rotation = mathutils.Quaternion((transform[6], transform[3], transform[4], transform[5]))
    Rt = rotation.to_matrix().to_4x4()
    Rt.normalize()
    Rt.transpose()
    # The translation goes into the last column of the first three rows.
    for axis in range(3):
        Rt[axis][3] = transform[axis]

    bpy.ops.object.select_all(action='DESELECT')
    # Import the ply mesh into Blender and fetch the resulting object.
    bpy.ops.import_mesh.ply(filepath=baseDir + meshName)
    obj = bpy.context.object
    # World matrix of the imported object.
    objWorld = obj.matrix_world

    for vertex in obj.data.vertices:
        # Local point in homogeneous coordinates.
        point = mathutils.Vector([vertex.co[0], vertex.co[1], vertex.co[2], 1.])
        # Local -> world, then the ply transformation.
        point = Rt * (objWorld * point)
        # Write the transformed point back into the mesh.
        vertex.co[0] = point[0]
        vertex.co[1] = point[1]
        vertex.co[2] = point[2]
#all vertex positions should be updated correctly
As mentioned in other answers, the Stanford 3D repository gives some information about the data organization in the '.conf' files, but the transformations for the bunny model were not working properly when using the quaternion data provided.
I was also stuck on this registration problem for the bunny model, and based on my tests I have some extra considerations to add. When applying the transformations (rotations, to be more specific), I realized that the quaternion values were not rotating the cloud in the correct direction; however, when using the corresponding Euler notation and changing the sign of one specific axis of rotation, I got the correct registration. So, back to the quaternion notation used in the '.conf' file: after some tests I noticed that by changing the sign of the 'w' component of the quaternion in each 'bmesh' row except the first (bun000.ply), the rotation by quaternion can be used.
Furthermore, for some reason, when registering the dragon (dragon_stand and dragon_side) and armadillo (armadillo_stand) stanford point clouds, in order to get the correct result I had to use a different sequence for reading the quaternion data in the ‘.conf’ file. It seems to be stored as:
tx ty tz qw qx qy qz
where 't' refers to a translation value and 'q' refers to a quaternion value. Just to be clear, I have just tested these three models, therefore, I don’t know what is the default pattern for the quaternion values. Besides, for these last two point cloud models, I did not need to change the '.conf' file.
I hope this could be useful for someone else trying to do the same
Just in case someone is looking for a full Python implementation based on what @DanceIgel found out, here is some code in Python 3.9.1 that also generates a figure with matplotlib:
# Python 3.9.1
import numpy as np
import sys
import math
import glob
import matplotlib
matplotlib.use('TkAgg')
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import open3d as o3d
def get_pointcloud_files(path):
    """Return a list of the paths matching ``path + '/*.ply'``.

    The entries are exactly what ``glob.glob`` yields for that pattern, in the
    same order.
    """
    pattern = path + '/*.ply'
    return [ply_path for ply_path in glob.glob(pattern)]
def get_pointcloud_from_file(path, filename):
    """Read ``path + '/' + filename`` with ``o3d.io.read_point_cloud`` and return the result."""
    cloud_path = path + '/' + filename
    return o3d.io.read_point_cloud(cloud_path)
def get_transformations_from_file(path, filename):
    """Load mesh names, translations and quaternions from a '.conf' file.

    The file ``path + '/' + filename`` is read as a space-delimited table of
    strings. Its first line is skipped and the first column of every
    remaining row is dropped, leaving one row per mesh:
    ``name, 3 translation values, 4 quaternion values``.
    Rows are ordered by mesh name.

    Returns:
        filenames: array of mesh names, sorted.
        translations: list of ``(3, 1)`` float32 arrays, in the same order.
        quaternions: list of ``(4, 1)`` float32 arrays, in the same order,
            with the components kept in file order.
    """
    with open(path + '/' + filename) as f:
        # ndmin=2 keeps a file with a single mesh row as a (1, ncols) table;
        # without it loadtxt returns a 1-D array and the column operations
        # below fail.
        source = np.loadtxt(f, delimiter=' ', skiprows=1, dtype='str', ndmin=2)

    source = np.delete(source, 0, 1)  # drop the leading keyword column
    source = source[source[:, 0].argsort()]  # order every row by mesh name
    filenames = np.sort(source[:, 0])

    translations = [np.reshape(row, [3, 1]).astype(np.float32) for row in source[:, 1:4]]
    quaternions = [np.reshape(row, [4, 1]).astype(np.float32) for row in source[:, 4:]]
    return filenames, translations, quaternions
def quaternion_rotation_matrix(Q):
    """Build a rotation matrix from a quaternion given as ``[x, y, z, w]``.

    ``Q[3]`` is used as the scalar part and ``Q[0..2]`` as the vector part.
    The quaternion is scaled to unit length first, the 3x3 matrix is filled
    row by row from the normalized components, and the *transpose* of that
    matrix is returned.
    """
    qx, qy, qz, qw = Q[0], Q[1], Q[2], Q[3]

    # Scale to a unit quaternion.
    norm = math.sqrt(qw * qw + qx * qx + qy * qy + qz * qz)
    qw = qw / norm
    qx = qx / norm
    qy = qy / norm
    qz = qz / norm

    rows = [
        # first row
        [2 * (qw * qw + qx * qx) - 1, 2 * (qx * qy - qw * qz), 2 * (qx * qz + qw * qy)],
        # second row
        [2 * (qx * qy + qw * qz), 2 * (qw * qw + qy * qy) - 1, 2 * (qy * qz - qw * qx)],
        # third row
        [2 * (qx * qz - qw * qy), 2 * (qy * qz + qw * qx), 2 * (qw * qw + qz * qz) - 1],
    ]
    return np.transpose(np.array(rows))
if __name__=="__main__": # $python visualization_bunny.py bunny/data
    # Registers every scan listed in 'bun.conf' into a common frame and plots
    # all of them in one 3D figure, which is saved as a PNG and then shown.
    path = sys.argv[1]

    # load transformations and filenames from file
    filenames, translations, quaternions = get_transformations_from_file(path, 'bun.conf')

    clouds = list()
    for curr_filename, curr_quaternion, curr_translation in zip(filenames, quaternions, translations): # go through input files
        # point cloud as an (N, 3) numpy array
        curr_points = np.asarray(get_pointcloud_from_file(path, curr_filename).points)

        # rotation matrix from the quaternion, translation as a flat vector
        curr_rotation_matr = np.squeeze(quaternion_rotation_matrix(curr_quaternion))
        curr_translation = np.squeeze(curr_translation)

        # p' = R p + t for all points at once: with points stored as rows,
        # R p for every point is points @ R^T. This replaces the per-point
        # Python loop.
        transformed = np.matmul(curr_points, np.transpose(curr_rotation_matr)) + curr_translation

        # add current cloud to list of clouds
        clouds.append(transformed)

    # plot separate point clouds in same graph
    ax = plt.axes(projection='3d')
    for cloud in clouds:
        ax.plot(cloud[:,0], cloud[:,1], cloud[:,2], 'bo', markersize=0.005)
    #ax.view_init(elev=90, azim=270)
    ax.view_init(elev=100, azim=270)
    plt.axis('off')
    plt.savefig("ZZZ_Stanford_Bunny_PointCloud.png", bbox_inches='tight')
    plt.show()

Mandelbrot optimization in openmp

Well, I have to parallelize the Mandelbrot program in C. I think I have done it well, and I can't get better times. My question is whether someone has an idea to improve the code; I've been thinking perhaps of nested parallel regions for the outer and inner for loops...
Also i have doubts if its more elegant or recommended to put all the pragmas in a single line or to write separate pragmas ( one for omp parallel and shared and private variables and a conditional, and another pragma with omp for and schedule dynamic).
I also wonder whether constants can be used as private variables, because I think it is cleaner to have constants instead of defined variables.
I have also written a conditional (if numcpu > 1): with a single processor it makes no sense to use a parallel region, so a normal sequential execution is done instead.
Finally as i have read the dynamic chunk it depends on hardware and your system configuration... so i have left it as a constant, so it can be easily changed.
Also i adapt the number of threads to the number of processors available..
/*
 * Renders the Mandelbrot set into a heap-allocated pixel buffer, sharing the
 * image rows among OpenMP threads, and reports the elapsed wall-clock time
 * on stderr.
 *
 * One thread per available processor is requested. The parallel region is
 * only activated when more than one processor is available; otherwise the
 * same code runs sequentially.
 */
int main(int argc, char *argv[])
{
    omp_set_dynamic(1);
    int xactual, yactual;
    //each iteration, it calculates: newz = oldz*oldz + p, where p is the current pixel, and oldz starts at the origin
    double pr, pi; //real and imaginary part of the pixel p
    double newRe, newIm, oldRe, oldIm; //real and imaginary parts of new and old z
    double zoom = 1, moveX = -0.5, moveY = 0; //you can change these to zoom and change position
    pixel_t *pixels = malloc(sizeof *pixels * IMAGEHEIGHT * IMAGEWIDTH);
    if (pixels == NULL)
    {
        fprintf(stderr, "Could not allocate the pixel buffer.\n");
        return 1;
    }
    //wall-clock timestamps: clock() would add up the CPU time of all threads
    double begin, end;
    double time_spent;
    begin = omp_get_wtime();
    int numcpu;
    numcpu = omp_get_num_procs();
    printf("El número de procesadores que utilizaremos es: %d", numcpu);
    omp_set_num_threads(numcpu);
    //every per-pixel temporary, including oldRe and oldIm, must be private;
    //otherwise the threads overwrite each other's iteration state
    #pragma omp parallel shared(pixels, moveX, moveY, zoom) private(xactual, yactual, pr, pi, newRe, newIm, oldRe, oldIm) if(numcpu > 1)
    {
        //loop through every pixel; rows are handed out dynamically in chunks of CHUNK
        #pragma omp for schedule(dynamic, CHUNK)
        for(yactual = 0; yactual < IMAGEHEIGHT; yactual++)
            for(xactual = 0; xactual < IMAGEWIDTH; xactual++)
            {
                //calculate the initial real and imaginary part of z, based on the pixel location and zoom and position values
                pr = 1.5 * (xactual - IMAGEWIDTH / 2) / (0.5 * zoom * IMAGEWIDTH) + moveX;
                pi = (yactual - IMAGEHEIGHT / 2) / (0.5 * zoom * IMAGEHEIGHT) + moveY;
                newRe = newIm = oldRe = oldIm = 0; //these should start at 0,0
                //"i" will represent the number of iterations
                int i;
                //start the iteration process
                for(i = 0; i < ITERATIONS; i++)
                {
                    //remember value of previous iteration
                    oldRe = newRe;
                    oldIm = newIm;
                    //the actual iteration, the real and imaginary part are calculated
                    newRe = oldRe * oldRe - oldIm * oldIm + pr;
                    newIm = 2 * oldRe * oldIm + pi;
                    //if the point is outside the circle with radius 2: stop
                    if((newRe * newRe + newIm * newIm) > 4) break;
                }
                if(i == ITERATIONS)
                {
                    //never escaped: black
                    pixels[yactual*IMAGEWIDTH+xactual][0] = 0;
                    pixels[yactual*IMAGEWIDTH+xactual][1] = 0;
                    pixels[yactual*IMAGEWIDTH+xactual][2] = 0;
                }
                else
                {
                    //escaped: brightness derived from the iteration count and the final |z|
                    double z = sqrt(newRe * newRe + newIm * newIm);
                    int brightness = 256 * log2(1.75 + i - log2(log2(z))) / log2((double)ITERATIONS);
                    pixels[yactual*IMAGEWIDTH+xactual][0] = brightness;
                    pixels[yactual*IMAGEWIDTH+xactual][1] = brightness;
                    pixels[yactual*IMAGEWIDTH+xactual][2] = 255;
                }
            }
    } //end of parallel region
    end = omp_get_wtime();
    time_spent = end - begin;
    fprintf(stderr, "Elapsed time: %.2lf seconds.\n", time_spent);
You could extend the implementation to leverage SIMD extensions. As far as I know the latest OpenMP standard includes vector constructs. Check out this article that describes the new capabilities.
This whitepaper explains how SSE3 can be used when calculating the Mandelbrot set.

Is this part of a real IFFT process really optimal?

When calculating (I)FFT it is possible to calculate "N*2 real" data points using a ordinary complex (I)FFT of N data points.
Not sure about my terminology here, but this is how I've read it described.
There are several posts about this on stackoverflow already.
This can speed things up a bit when only dealing with such "real" data which is often the case when dealing with for example sound (re-)synthesis.
This increase in speed is offset by the need for a pre-processing step that somehow... uhh... fidaddles? the data to achieve this. Look I'm not even going to try to convince anyone I fully understand this but thanks to previously mentioned threads, I came up with the following routine, which does the job nicely (thank you!).
However, on my microcontroller this costs a bit more than I'd like even though trigonometric functions are already optimized with LUTs.
But the routine itself just looks like it should be possible to optimize mathematically to minimize processing. To me it seems similar to plain 2d rotation. I just can't quite wrap my head around it, but it just feels like this could be done with fewer both trigonometric calls and arithmetic operations.
I was hoping perhaps someone else might easily see what I don't and provide some insight into how this math may be simplified.
This particular routine is for use with IFFT, before the bit-reversal stage.
pseudo-version:
INPUT
MAG_A/B = 0 TO 1
PHA_A/B = 0 TO 2PI
INDEX = 0 TO PI/2
r = MAG_A * sin(PHA_A)
i = MAG_B * sin(PHA_B)
rsum = r + i
rdif = r - i
r = MAG_A * cos(PHA_A)
i = MAG_B * cos(PHA_B)
isum = r + i
idif = r - i
r = -cos(INDEX)
i = -sin(INDEX)
rtmp = r * isum + i * rdif
itmp = i * isum - r * rdif
OUTPUT rsum + rtmp
OUTPUT itmp + idif
OUTPUT rsum - rtmp
OUTPUT itmp - idif
original working code, if that's your poison:
/*
 * Pre-processing step for a real IFFT: fills the two bins `index` and
 * `size - index` (size = 1 << bits) of `complex` from a pair of
 * magnitude/phase values.
 *
 *   complex  bin array that is written
 *   bits     log2 of the transform size
 *   index    bin receiving the "lo" pair; `size - index` receives the mirror
 *   mag_lo, pha_lo   magnitude and phase for bin `index`
 *   mag_hi, pha_hi   magnitude and phase for bin `size - index`
 *
 * NOTE(review): the fixed-point formats of the arguments and of
 * smmulr/smmlar/smmlsr/sine/cosine/sinetable are defined elsewhere; the
 * per-line formulas below restate the original comments - confirm scaling
 * against those helpers.
 */
void fft_nz_set(fft_complex_t complex[], unsigned bits, unsigned index, int32_t mag_lo, int32_t pha_lo, int32_t mag_hi, int32_t pha_hi) {
    const unsigned size = 1u << bits;
    const unsigned shift = SINE_TABLE_BITS - (bits - 1);
    const unsigned n = index;        // index for mag_lo, pha_lo
    const unsigned z = size - index; // index for mag_hi, pha_hi

    const int32_t sin_lo = smmulr(mag_lo, sine(pha_lo)); // mag_lo * sin(pha_lo)
    const int32_t sin_hi = smmulr(mag_hi, sine(pha_hi)); // mag_hi * sin(pha_hi)
    const int32_t rsum = sin_lo + sin_hi;
    const int32_t rdif = sin_lo - sin_hi;

    const int32_t cos_lo = smmulr(mag_lo, cosine(pha_lo)); // mag_lo * cos(pha_lo)
    const int32_t cos_hi = smmulr(mag_hi, cosine(pha_hi)); // mag_hi * cos(pha_hi)
    const int32_t isum = cos_lo + cos_hi;
    const int32_t idif = cos_lo - cos_hi;

    // Negated twiddle factor for this bin, looked up in the sine table.
    const int32_t r = -sinetable[(1 << SINE_BITS) - (index << shift)]; // cos(pi_c * (index / size) / 2)
    const int32_t i = -sinetable[index << shift];                      // sin(pi_c * (index / size) / 2)

    // The doubling is done on the unsigned representation: `<< 1` applied to
    // a negative signed value is undefined behaviour in C.
    const int32_t rtmp = (int32_t)((uint32_t)smmlar(r, isum, smmulr(i, rdif)) << 1); // r * isum + i * rdif
    const int32_t itmp = (int32_t)((uint32_t)smmlsr(i, isum, smmulr(r, rdif)) << 1); // i * isum - r * rdif

    complex[n].r = rsum + rtmp;
    complex[n].i = itmp + idif;
    complex[z].r = rsum - rtmp;
    complex[z].i = itmp - idif;
}
// For reference, this would be used as follows to generate a sawtooth (after IFFT).
// Sets the dc bin, then walks the lower bins 1 .. fft_size/2 together with
// their mirrored bins fft_size - 1 .. and gives each one an amplitude of
// 0x4000000 divided by its own bin number.
void synth_sawtooth(fft_complex_t *complex, unsigned fft_bits) {
    const unsigned fft_size = 1 << fft_bits;
    const unsigned half = fft_size >> 1;

    fft_sym_dc(complex, 0, 0); // sets dc bin [0]

    for (unsigned lo = 1; lo <= half; ++lo) {
        // mirrored bin paired with `lo`
        const unsigned hi = fft_size - lo;
        // calculation of amplitude/index (sawtooth) for both lo and hi
        fft_sym_magnitude(complex, fft_bits, lo, 0x4000000 / lo, 0x4000000 / hi);
    }
}

Resources