Using ctypes to call a DLL in Python - OpenMP

I wrote a DLL that uses OpenMP and I call it from Python through ctypes, but it runs extremely slowly. When I run the same code from a plain C program, it finishes in a few seconds.
The main part of the DLL is:
double martini(double a[], double b[], double d_title[], double c[], double d[], double A[], int num_element)
{
    double nu_start = 0;
    double mu_start = 0;
    double z_start = 0;
    double step_nu = 2 * 3.1415926 / 100;
    double step_mu = 3.1415926 / 100;
    double step_z = 0;
    double nu = 0;
    double mu = 0;
    double z = 0;
    double integral_first = 0;
    double d_uv = 0;
    int i = 0;
    int j = 0;
    int k = 0;
    int loop = 0;

#pragma omp parallel for default(none) shared(a, d_title, b, c, A, nu_start, mu_start, z_start, step_nu, step_mu, num_element) private(i, j, k, mu, nu, step_z, z, d_uv) reduction(+:integral_first)
    for (loop = 0; loop < num_element; loop++)
    {
        for (i = 0; i < 100; i++)
        {
            mu = mu_start + (i + 1) * step_mu;
            for (j = 0; j < 100; j++)
            {
                nu = nu_start + (j + 1) * step_nu;
                for (k = 0; k < 1500; k++)
                {
                    d_uv = (sin(mu) * sin(mu) * cos(nu) * cos(nu)
                            + sin(mu) * sin(mu) * (a[loop] * sin(nu) - d_title[loop] * cos(nu)) * (a[loop] * sin(nu) - d_title[loop] * cos(nu))
                            + b[loop] * b[loop] * cos(mu) * cos(mu)) / (c[loop] * c[loop]);
                    step_z = 20 / (d_uv * 1500);
                    z = z_start + (k + 1) * step_z;
                    /* note: 1 / 2 is integer division (== 0) in C, so the factor is written as 0.5 */
                    integral_first = integral_first + 0.5 * A[loop] * sin(mu) * (1 - 3 * sin(mu) * sin(mu) * cos(nu) * cos(nu)) * exp(-d_uv * z) * log(1 + z * z) * step_z * step_mu * step_nu / (c[loop] * c[loop]);
                }
            }
        }
    }
    return integral_first;
}
I use CDLL() in Python to load this DLL and then call the function, but it is very slow.
I also wrote a separate C program that fills a, b, d, ... with 500 random numbers; that version produces the result in seconds.
Python code:
from ctypes import CDLL, c_double, c_int

martini = CDLL('C:/Users/38914/source/repos/Dll2/x64/Release/Dll2.dll')
integral = martini.martini
integral.restype = c_double

a_func = (c_double * num_element)()
b_func = (c_double * num_element)()
d_title_func = (c_double * num_element)()
c_func = (c_double * num_element)()
d_func = (c_double * num_element)()
A_func = (c_double * num_element)()
a_func[:] = a
b_func[:] = b
d_title_func[:] = d_title
c_func[:] = c
d_func[:] = d
A_func[:] = A

num_element_func = c_int(num_element)
out = integral(a_func, b_func, d_title_func, c_func, d_func, A_func, num_element_func)
a, b, c, d, d_title and A are arrays that I read from a txt file in Python.
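For reference, here is a minimal sketch of how the same call can be wired up with explicit argtypes, NumPy arrays, and a timer around the native call alone. The DLL path is the one from the question; num_element and the random input arrays are placeholders for the data read from the txt file:

from ctypes import CDLL, POINTER, c_double, c_int
import time
import numpy as np

num_element = 500  # placeholder size

dll = CDLL('C:/Users/38914/source/repos/Dll2/x64/Release/Dll2.dll')
integral = dll.martini
integral.restype = c_double
integral.argtypes = [POINTER(c_double)] * 6 + [c_int]  # six double[] arguments, then the int

# placeholder inputs; contiguous double arrays can be handed to ctypes without element-by-element copying
arrays = [np.ascontiguousarray(np.random.rand(num_element)) for _ in range(6)]
args = [arr.ctypes.data_as(POINTER(c_double)) for arr in arrays]

start = time.perf_counter()
out = integral(*args, num_element)
print(out, 'in', time.perf_counter() - start, 's')

Declaring argtypes also lets ctypes catch mismatched argument counts or types early, and timing only the native call separates the DLL's runtime from the Python-side data preparation.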

Related

Goertzel algorithm for a complex signal

I have a Goertzel algorithm implementation in C, and it works correctly for a real (non-complex) signal.
float goertzel(int numSamples, int TARGET_FREQUENCY, int SAMPLING_RATE, cufftDoubleComplex* modData, bool Im)
{
    double Z = M_PI * 2. * (double(TARGET_FREQUENCY) / double(SAMPLING_RATE));
    double constantaA = 2 * cos(Z);
    double TMPVAR_v0 = 0;
    double TMPVAR_v1 = 0;
    double TMPVAR_v2 = 0;
    double resultat;
    for (int n = 0; n < numSamples; n++) {
        if (!Im)
            TMPVAR_v0 = (modData[n].x + constantaA * TMPVAR_v1 - TMPVAR_v2);
        else
            TMPVAR_v0 = (modData[n].y + constantaA * TMPVAR_v1 - TMPVAR_v2);
        TMPVAR_v2 = TMPVAR_v1;
        TMPVAR_v1 = TMPVAR_v0;
    }
    resultat = (TMPVAR_v1 * TMPVAR_v1 + TMPVAR_v2 * TMPVAR_v2 - constantaA * TMPVAR_v1 * TMPVAR_v2);
    return resultat;
}
But I cannot understand how I can use it for a complex signal.
Please help me.
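For reference, a minimal NumPy sketch (not from the original post) of one way to handle complex input: run the same recurrence with a complex state and take the complex filter output instead of the real-only power formula. The function name and the test signal are illustrative:

import numpy as np

def goertzel_complex(samples, target_frequency, sampling_rate):
    """Goertzel recurrence with complex state; `samples` may be complex.

    Returns the DFT value at target_frequency (the same bin the real-only
    C version above targets)."""
    x = np.asarray(samples, dtype=complex)
    n = len(x)
    w = 2.0 * np.pi * target_frequency / sampling_rate
    coeff = 2.0 * np.cos(w)

    s1 = 0.0 + 0.0j
    s2 = 0.0 + 0.0j
    for sample in x:                   # same recurrence as the C code,
        s0 = sample + coeff * s1 - s2  # but the state is now complex
        s2, s1 = s1, s0

    # filter output y = s1 - e^{-jw} * s2; remove the phase accumulated over
    # n - 1 steps to get the plain DFT bin value
    y = s1 - np.exp(-1j * w) * s2
    return y * np.exp(-1j * w * (n - 1))

# sanity check against a full FFT: bin k corresponds to frequency k * fs / N
fs, n, k = 1000.0, 200, 13
t = np.arange(n) / fs
sig = np.exp(2j * np.pi * (k * fs / n) * t) + 0.1 * (np.random.randn(n) + 1j * np.random.randn(n))
print(goertzel_complex(sig, k * fs / n, fs))
print(np.fft.fft(sig)[k])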

Is there a way to avoid CUDA atomicAdd in my situation?

I'm doing the operation shown in the figure below (the figure is not reproduced here), and my kernel follows.
As shown in the figure, I build a small matrix from each of about one million vectors and accumulate it into a large, pre-allocated matrix.
I need an idea that improves performance without exceeding the 8 GB of GPU global memory.
How can I avoid the atomic operations? I'm using a GTX 1080, and the existing kernel takes about 250 ms.
__global__ void buildMatrixKernel(const CostJacobianCT *src, const int num, const int st, const int mw_width, double *A, double *b)
{
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx < num)
    {
        if (src[idx].mask == 1)
        {
            // matrix width
            int cols = 6 * (mw_width + 1);

            // calc position for insert
            int idx0 = (src[idx].fid0 - st);
            if (idx0 == mw_width - 2)
            {
                idx0 = idx0 - 1;
            }
            else if (idx0 == mw_width - 1)
            {
                idx0 = idx0 - 2;
            }
            int idx1 = (src[idx].fid1 - st);
            if (idx1 == mw_width - 2)
            {
                idx1 = idx1 - 1;
            }
            else if (idx1 == mw_width - 1)
            {
                idx1 = idx1 - 2;
            }
            int pos0 = idx0 * 6;
            int pos1 = idx1 * 6;

            // set temporary matrices
            double _A00[24 * 24];
            double _A11[24 * 24];
            double _A01[24 * 24];
            double _b0[24];
            double _b1[24];
            for (int y = 0; y < 24; y++)
            {
                for (int x = 0; x < 24; x++)
                {
                    _A00[y * 24 + x] = src[idx].w * src[idx].J0[y] * src[idx].J0[x];
                    _A11[y * 24 + x] = src[idx].w * src[idx].J1[y] * src[idx].J1[x];
                    _A01[y * 24 + x] = src[idx].w * src[idx].J0[y] * src[idx].J1[x];
                }
                _b0[y] = src[idx].w * src[idx].c * src[idx].J0[y];
                _b1[y] = src[idx].w * src[idx].c * src[idx].J1[y];
            }

            // set final matrix
            for (int i = 0; i < 24; i++)
            {
                for (int j = 0; j < 24; j++)
                {
                    atomicAdd(&A[(i + pos0) * cols + (j + pos0)], _A00[i * 24 + j]); // 00
                    atomicAdd(&A[(i + pos1) * cols + (j + pos1)], _A11[i * 24 + j]); // 11
                    atomicAdd(&A[(i + pos0) * cols + (j + pos1)], _A01[i * 24 + j]); // 01
                    atomicAdd(&A[(i + pos1) * cols + (j + pos0)], _A01[j * 24 + i]); // 10
                }
                atomicAdd(&b[i + pos0], _b0[i]); // 0
                atomicAdd(&b[i + pos1], _b1[i]); // 1
            }
        }
    }
}
Edit (2019-03-06): I modified the code as shown below and got a noticeable performance improvement: 250 ms -> 95 ms.
__global__ void buildMatrixKernel(const CostJacobianCT *src, const int num, const int st, const int mw_width, double *A, double *b)
{
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx < num)
    {
        int src_idx = idx / 576;
        if (src[src_idx].mask == 1)
        {
            int cols = 6 * (mw_width + 1);
            int pos0 = src[src_idx].pos0;
            int pos1 = src[src_idx].pos1;
            double w = src[src_idx].w;
            double c = src[src_idx].c;

            int sub_idx = idx % 576;
            int i = sub_idx / 24;
            int j = sub_idx % 24;

            double J0_i = src[src_idx].J0[i];
            double J0_j = src[src_idx].J0[j];
            double J1_i = src[src_idx].J1[i];
            double J1_j = src[src_idx].J1[j];

            atomicAdd(&A[(i + pos0) * cols + (j + pos0)], w * J0_i * J0_j); // 00
            atomicAdd(&A[(i + pos1) * cols + (j + pos1)], w * J1_i * J1_j); // 11
            atomicAdd(&A[(i + pos0) * cols + (j + pos1)], w * J0_i * J1_j); // 01
            atomicAdd(&A[(i + pos1) * cols + (j + pos0)], w * J1_i * J0_j); // 10
            if (j == 0)
            {
                atomicAdd(&b[i + pos0], w * c * J0_i); // 0
                atomicAdd(&b[i + pos1], w * c * J1_i); // 1
            }
        }
    }
}
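As a side note, a small host-side NumPy restatement of what both kernels accumulate can help validate the results on a small input. It is not an optimisation, just a reference; the dict field names below are hypothetical stand-ins for the CostJacobianCT members used above:

import numpy as np

def build_matrix_host(items, mw_width):
    """Host-side reference of the accumulation the kernels perform.

    `items` is a list of dicts with the fields the kernel reads:
    mask, w, c, pos0, pos1, and 24-element vectors J0, J1."""
    cols = 6 * (mw_width + 1)
    A = np.zeros((cols, cols))
    b = np.zeros(cols)
    for it in items:
        if it["mask"] != 1:
            continue
        w, c = it["w"], it["c"]
        J0, J1 = np.asarray(it["J0"]), np.asarray(it["J1"])
        p0, p1 = it["pos0"], it["pos1"]
        A[p0:p0 + 24, p0:p0 + 24] += w * np.outer(J0, J0)  # 00 block
        A[p1:p1 + 24, p1:p1 + 24] += w * np.outer(J1, J1)  # 11 block
        A[p0:p0 + 24, p1:p1 + 24] += w * np.outer(J0, J1)  # 01 block
        A[p1:p1 + 24, p0:p0 + 24] += w * np.outer(J1, J0)  # 10 block
        b[p0:p0 + 24] += w * c * J0
        b[p1:p1 + 24] += w * c * J1
    return A, b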

(Homework) Converting a function to a CUDA kernel function

I'm working on an assignment that asks me to optimise a C program using CUDA parallelisation.
This is what I managed to come up with:
//...
__global__ void gpu_score_function(void *gpu_frame_pixels, void *gpu_pattern_pixels, void *gpu_results,
                                   int frame_rowstride, int pattern_rowstride,
                                   int pattern_width, int pattern_height,
                                   int frame_width, int frame_height) {
    if ((blockIdx.y * blockDim.y + threadIdx.y < frame_height - pattern_height) &&
        (blockIdx.x * blockDim.x + threadIdx.x < frame_width - pattern_width)) {
        guchar *frame_pixels = (guchar *) gpu_frame_pixels +
                               (blockIdx.y * blockDim.y + threadIdx.y) * frame_rowstride +
                               (blockIdx.x * blockDim.x + threadIdx.x) * N_CHANNELS;
        guchar *pattern_pixels = (guchar *) gpu_pattern_pixels;
        int *results = (int *) gpu_results;
        int res = 0;
        for (int y = 0; y < pattern_height; ++y) {
            if (blockIdx.y * blockDim.y + threadIdx.y + y < frame_height - pattern_height) {
                for (int x = 0; x < pattern_width; ++x) {
                    if (blockIdx.x * blockDim.x + threadIdx.x + x < frame_width - pattern_width) {
                        const guchar *frame_pixel = frame_pixels + x * N_CHANNELS;
                        const guchar *pattern_pixel = pattern_pixels + x * N_CHANNELS;
                        for (int c = 0; c < N_CHANNELS; ++c) {
                            res += (frame_pixel[c] - 128) * (pattern_pixel[c] - 128);
                        }
                    } else {
                        break;
                    }
                }
                frame_pixels += frame_rowstride;
                pattern_pixels += pattern_rowstride;
            } else {
                break;
            }
        }
        results[(blockIdx.y * blockDim.y + threadIdx.y) * (frame_width - pattern_width) + blockIdx.x * blockDim.x + threadIdx.x] = res;
    }
}
int main(int argc, const char *argv[]) {
    //...

    /**
     * CUDA
     */
    void *gpu_pattern_pixels;
    void *gpu_frame_pixels;
    void *gpu_results;

    cudaMalloc(&gpu_pattern_pixels, pattern_height * pattern_rowstride * sizeof(guchar));
    cudaMalloc(&gpu_frame_pixels, frame_height * frame_rowstride * sizeof(guchar));
    cudaMalloc(&gpu_results, (frame_width - pattern_width) * (frame_height - pattern_height) * sizeof(int));

    cudaMemcpy(gpu_pattern_pixels, (void *) pattern_pixels, pattern_height * pattern_rowstride * sizeof(guchar),
               cudaMemcpyHostToDevice);
    cudaMemcpy(gpu_frame_pixels, (void *) frame_pixels, frame_height * frame_rowstride * sizeof(guchar),
               cudaMemcpyHostToDevice);

    // Kernel configuration: a two-dimensional grid of two-dimensional blocks.
    dim3 dimGrid(ceil((float) (frame_width - pattern_width) / 32), ceil((float) (frame_height - pattern_height) / 32));
    dim3 dimBlock(32, 32);
    gpu_score_function<<<dimGrid, dimBlock>>>(gpu_frame_pixels, gpu_pattern_pixels, gpu_results, frame_rowstride, pattern_rowstride, pattern_width, pattern_height, frame_width, frame_height);
    cudaDeviceSynchronize();

    int *results = (int *) malloc((frame_width - pattern_width) * (frame_height - pattern_height) * sizeof(int));
    cudaMemcpy((void *) results, gpu_results,
               (frame_width - pattern_width) * (frame_height - pattern_height) * sizeof(int), cudaMemcpyDeviceToHost);

    int gpu_x_best, gpu_y_best;
    double gpu_best_score;
    for (int *cur = results; cur != results + (frame_width - pattern_width) * (frame_height - pattern_height); cur++) {
        if (cur == results || *cur > gpu_best_score) {
            gpu_best_score = *cur;
            gpu_x_best = (cur - results) % (frame_width - pattern_width);
            gpu_y_best = (cur - results) / (frame_width - pattern_width);
        }
    }

    cudaFree(gpu_pattern_pixels);
    cudaFree(gpu_frame_pixels);
    cudaFree(gpu_results);
    free(results);

    // cudaDeviceReset causes the driver to clean up all state. While
    // not mandatory in normal operation, it is good practice. It is also
    // needed to ensure correct operation when the application is being
    // profiled: calling cudaDeviceReset causes all profile data to be
    // flushed before the application exits.
    cudaDeviceReset();
    /**
     * END CUDA
     */

    //...
    return 0;
}
The program doesn't segfault, cuda-memcheck reports 0 errors, and the result matrix is filled.
The problem is that the results are wrong.
I'm quite sure it's some off-by-one pointer error, but I have no idea how to spot it.
I'm working on OS X 10.9; what tools could I use to debug this program?
Any help is appreciated.
I found the bug.
The two if statements inside the for loops of gpu_score_function make no sense: the outer guard already restricts threads to positions where the whole pattern fits inside the frame, so those checks only skipped valid pixels. Deleting them solved the problem.
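For checking the kernel's output, a plain NumPy restatement of the score it computes (with the two if statements removed) might look like the sketch below; the (H, W, C) array shapes and the uint8 input type are assumptions:

import numpy as np

def score_map(frame, pattern):
    """Reference of the score the kernel computes: a cross-correlation of
    (pixel - 128) values over every valid pattern placement."""
    fh, fw, _ = frame.shape
    ph, pw, _ = pattern.shape
    f = frame.astype(np.int64) - 128
    p = pattern.astype(np.int64) - 128
    out = np.empty((fh - ph, fw - pw), dtype=np.int64)
    for y in range(fh - ph):
        for x in range(fw - pw):
            out[y, x] = np.sum(f[y:y + ph, x:x + pw] * p)
    return out

Comparing this map against the device `results` array on a small test image is a quick way to catch indexing mistakes like the one described above.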

OpenCL kernel error on Mac OS X

I wrote some OpenCL code that works fine on Linux, but it fails with errors on Mac OS X. Can someone please help me identify why these occur? The kernel code is shown after the error. My kernel uses double, and I have the corresponding pragma at the top, so I don't understand why the error refers to the float data type:
inline float8 __OVERLOAD__ _name(float8 x) { return _default_name(x); } \
^
/System/Library/Frameworks/OpenCL.framework/Versions/A/lib/clang/3.2/include/cl_kernel.h:4606:30: note: candidate function
__CLFN_FD_1FD_FAST_RELAX(__fast_relax_log, native_log, __cl_log);
^
/System/Library/Frameworks/OpenCL.framework/Versions/A/lib/clang/3.2/include/cl_kernel.h:421:29:
note: expanded from macro '__CLFN_FD_1FD_FAST_RELAX'
inline float16 __OVERLOAD__ _name(float16 x){ return _default_name(x); }
^
<program source>:206:19: error: call to '__fast_relax_log' is ambiguous
det_zkinin + log((2.0) * 3.14));
^~~~~~~~~~~~~~~~~
/System/Library/Frameworks/OpenCL.framework/Versions/A/lib/clang/3.2/include/cl_kernel.h:4608:22:
note: expanded from macro 'log'
#define log(__x) __fast_relax_log(__x)
^~~~~~~~~~~~~~~~
/System/Library/Frameworks/OpenCL.framework/Versions/A/lib/clang/3.2/include/cl_kernel.h:4606:30:
note: candidate function
__CLFN_FD_1FD_FAST_RELAX(__fast_relax_log, native_log, __cl_log);
^
/System/Library/Frameworks/OpenCL.framework/Versions/A/lib/clang/3.2/include/cl_kernel.h:416:27:
note: expanded from macro '__CLFN_FD_1FD_FAST_RELAX'
inline float __OVERLOAD__ _name(float x) { return _default_name(x); } \
^
/System/Library/Frameworks/OpenCL.framework/Versions/A/lib/clang/3.2/include/cl_kernel.h:4606:30
note: candidate function
__CLFN_FD_1FD_FAST_RELAX(__fast_relax_log, native_log, __cl_log);
^
This is the kernel code:
#pragma OPENCL EXTENSION cl_khr_fp64: enable

__kernel void ckf_kernel2(int dimx, int aligned_dimx,
                          int numOfCKF, int aligned_ckf,
                          int iter,
                          double epsilon,
                          __global double * yrlists,
                          __global double * zrlists,
                          __global double * rlists,
                          __global double * init_state,
                          __global double * init_var,
                          __global double * sing_j,
                          __global double * covMatrixSum,
                          __global double * cummulative,
                          __global double * temp_var,
                          __global double * x_k_f,
                          __global double * z_k_j,
                          __global double * crossCovMatrixSum,
                          __global double * z_k_f,
                          __global double * innCovMatrixSum,
                          __global double * zk_diff,
                          __global double * reduce_gain_matrix,
                          __global double * llk)
{
    int ckf_id = get_global_id(0);
    if (ckf_id < numOfCKF) {
        for (int i = 0; i < dimx; i++)
        {
            for (int idx = 0; idx < dimx * 2; idx++)
            {
                int column = idx % dimx;
                int mode = (idx >= dimx) ? -1 : 1;
                sing_j[(i * dimx * 2 + idx) * aligned_ckf + ckf_id] =
                    temp_var[(i * dimx + column) * aligned_ckf + ckf_id] * epsilon * mode
                    + init_state[i * aligned_ckf + ckf_id];
            }
        }

        z_k_f[ckf_id] = 0;
        innCovMatrixSum[ckf_id] = 0;
        for (int idx = 0; idx < dimx * 2; idx++)
        {
            z_k_j[idx * aligned_ckf + ckf_id] = 0;
            for (int i = 0; i < dimx; i++)
                z_k_j[idx * aligned_ckf + ckf_id] += sing_j[(i * dimx * 2 + idx) * aligned_ckf + ckf_id] * zrlists[iter * aligned_dimx + i];
            z_k_f[ckf_id] += z_k_j[idx * aligned_ckf + ckf_id];
            innCovMatrixSum[ckf_id] += z_k_j[idx * aligned_ckf + ckf_id] * z_k_j[idx * aligned_ckf + ckf_id];
        }
        z_k_f[ckf_id] = z_k_f[ckf_id] / (dimx * 2);
        innCovMatrixSum[ckf_id] = innCovMatrixSum[ckf_id] / (dimx * 2);
        innCovMatrixSum[ckf_id] = (innCovMatrixSum[ckf_id] - z_k_f[ckf_id] * z_k_f[ckf_id]) + rlists[ckf_id];

        // calculate crossCovMatrixSum
        for (int idx = 0; idx < dimx; idx++)
        {
            crossCovMatrixSum[idx * aligned_ckf + ckf_id] = 0;
            for (int i = 0; i < 2 * dimx; i++)
            {
                crossCovMatrixSum[idx * aligned_ckf + ckf_id] += sing_j[(idx * dimx * 2 + i) * aligned_ckf + ckf_id] * z_k_j[i * aligned_ckf + ckf_id];
            }
            crossCovMatrixSum[idx * aligned_ckf + ckf_id] = crossCovMatrixSum[idx * aligned_ckf + ckf_id] / (dimx * 2);
            crossCovMatrixSum[idx * aligned_ckf + ckf_id] = crossCovMatrixSum[idx * aligned_ckf + ckf_id] - x_k_f[idx * aligned_ckf + ckf_id] * z_k_f[ckf_id];
        }

        // calculate zk_diff
        int z_check = (int)yrlists[iter];
        if (z_check == -1)
            zk_diff[ckf_id] = 0;
        else
            zk_diff[ckf_id] = yrlists[iter] - z_k_f[ckf_id];

        // calculate reduce_gain_matrix and (reduce_state_matrix <==> init_state)
        for (int idx = 0; idx < dimx; idx++)
        {
            reduce_gain_matrix[idx * aligned_ckf + ckf_id] = (crossCovMatrixSum[idx * aligned_ckf + ckf_id] / innCovMatrixSum[ckf_id]);
            init_state[idx * aligned_ckf + ckf_id] = reduce_gain_matrix[idx * aligned_ckf + ckf_id] * zk_diff[ckf_id] + x_k_f[idx * aligned_ckf + ckf_id];
        }
        for (int idx = 0; idx < dimx; idx++)
        {
            init_var[idx * aligned_ckf + ckf_id] = covMatrixSum[(idx * dimx + idx) * aligned_ckf + ckf_id] -
                reduce_gain_matrix[idx * aligned_ckf + ckf_id] * innCovMatrixSum[ckf_id] *
                reduce_gain_matrix[idx * aligned_ckf + ckf_id];
        }

        double det_zkinin = zk_diff[ckf_id] * zk_diff[ckf_id] * (1.0f / innCovMatrixSum[ckf_id]);
        if (innCovMatrixSum[ckf_id] <= 0)
            llk[ckf_id] = 0;
        else
            llk[ckf_id] = 0.5 * ((log(innCovMatrixSum[ckf_id])) +
                                 det_zkinin + log((2.0) * 3.14));
        cummulative[ckf_id] += llk[ckf_id];
    }
}
I suspect you are trying to run this on an integrated Intel GPU, which does not support double precision. I can only reproduce your error on my own MacBook Pro if I compile your kernel code for the Intel HD 4000; it compiles just fine when I target the CPU or the discrete NVIDIA GPU.
You can check if the device supports double precision by querying the CL_DEVICE_DOUBLE_FP_CONFIG device information parameter:
cl_device_fp_config cfg;
clGetDeviceInfo(device, CL_DEVICE_DOUBLE_FP_CONFIG, sizeof(cfg), &cfg, NULL);
printf("Double FP config = %llu\n", cfg);
If the reported value is 0, then double precision is not supported, which would explain why the compiler log only offers the float variants of the log function.
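If pyopencl happens to be installed, the same query can also be scripted from Python. This is just a convenience sketch and assumes pyopencl exposes the standard device_info.DOUBLE_FP_CONFIG constant mirroring the C parameter above:

import pyopencl as cl

for platform in cl.get_platforms():
    for device in platform.get_devices():
        # 0 means the device reports no double-precision support
        cfg = device.get_info(cl.device_info.DOUBLE_FP_CONFIG)
        print(device.name, "double FP config =", cfg)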

How to calculate the Gaussian Filter kernel

I am working on an image processing project in which I need to implement a Gaussian filter. How do I calculate the 3x3, 5x5 and 7x7 kernels? Please help me.
http://s14.postimg.org/rwpyq8k5d/image.jpg
The code below illustrates how to calculate the Gaussian kernel for any odd filter size and Gaussian weight (the standard deviation sigma).
public static double[,] CalculateGaussianKernel(int length, double weight)
{
    // define a two-dimensional array based on the kernel length supplied by the user
    double[,] Kernel = new double[length, length];
    double sumTotal = 0;

    int kernelRadius = length / 2;
    double distance = 0;

    // Gaussian function, first part: 1 / (2 * pi * sigma^2)
    double calculatedEuler = 1.0 / (2.0 * Math.PI * Math.Pow(weight, 2));

    for (int filterY = -kernelRadius; filterY <= kernelRadius; filterY++)
    {
        for (int filterX = -kernelRadius; filterX <= kernelRadius; filterX++)
        {
            // Gaussian function, second part: (x^2 + y^2) / (2 * sigma^2)
            distance = ((filterX * filterX) + (filterY * filterY)) / (2 * (weight * weight));
            Kernel[filterY + kernelRadius, filterX + kernelRadius] = calculatedEuler * Math.Exp(-distance);
            sumTotal += Kernel[filterY + kernelRadius, filterX + kernelRadius];
        }
    }

    // normalise so the kernel weights sum to 1
    for (int y = 0; y < length; y++)
    {
        for (int x = 0; x < length; x++)
        {
            Kernel[y, x] = Kernel[y, x] * (1.0 / sumTotal);
        }
    }

    return Kernel;
}
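For comparison, here is the same computation as a short NumPy sketch for odd kernel sizes (3, 5, 7, ...); the function name is illustrative:

import numpy as np

def gaussian_kernel(length, sigma):
    """Same formula as the C# code above: evaluate the 2D Gaussian on an
    integer grid centred on the kernel and normalise so the weights sum to 1."""
    radius = length // 2
    y, x = np.mgrid[-radius:radius + 1, -radius:radius + 1]
    kernel = np.exp(-(x**2 + y**2) / (2.0 * sigma**2)) / (2.0 * np.pi * sigma**2)
    return kernel / kernel.sum()

print(gaussian_kernel(3, 1.0))
print(gaussian_kernel(5, 1.4))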
