OpenCL (JOCL) - 2D calculus over two arrays in Kernel

OpenCL (JOCL) - 2D calculus over two arrays in Kernel - max

I'm asking this here because I thought I've understood how OpenCL works but... I think there are several things I don't get.
What I want to do is to get the difference between all the values of two arrays, then calculate the hypot and finally get the maximum hypot value, so If I have:
double[] arrA = new double[]{1,2,3}
double[] arrB = new double[]{6,7,8}
Calculate
dx1 = 1 - 1; dx2 = 2 - 1; dx3 = 3 - 1, dx4= 1 - 2;... dxLast = 3 - 3
dy1 = 6 - 6; dy2 = 7 - 6; dy3 = 8 - 6, dy4= 6 - 7;... dyLast = 8 - 8
(Extreme dx and dy will get 0, but i don't care about ignoring those cases by now)
Then calculate each hypot based on hypot(dx(i), dy(i))
And once all these values where obtained, get the maximum hypot value
So, I have the next defined Kernel:
String programSource =
"#ifdef cl_khr_fp64 \n"
+ " #pragma OPENCL EXTENSION cl_khr_fp64 : enable \n"
+ "#elif defined(cl_amd_fp64) \n"
+ " #pragma OPENCL EXTENSION cl_amd_fp64 : enable \n"
+ "#else "
+ " #error Double precision floating point not supported by OpenCL implementation.\n"
+ "#endif \n"
+ "__kernel void "
+ "sampleKernel(__global const double *bufferX,"
+ " __global const double *bufferY,"
+ " __local double* scratch,"
+ " __global double* result,"
+ " __const int lengthX,"
+ " __const int lengthY){"
+ " const int index_a = get_global_id(0);"//Get the global indexes for 2D reference
+ " const int index_b = get_global_id(1);"
+ " const int local_index = get_local_id(0);"//Current thread id -> Should be the same as index_a * index_b + index_b;
+ " if (local_index < (lengthX * lengthY)) {"// Load data into local memory
+ " if(index_a < lengthX && index_b < lengthY)"
+ " {"
+ " double dx = (bufferX[index_b] - bufferX[index_a]);"
+ " double dy = (bufferY[index_b] - bufferY[index_a]);"
+ " scratch[local_index] = hypot(dx, dy);"
+ " }"
+ " } "
+ " else {"
+ " scratch[local_index] = 0;"// Infinity is the identity element for the min operation
+ " }"
//Make a Barrier to make sure all values were set into the local array
+ " barrier(CLK_LOCAL_MEM_FENCE);"
//If someone can explain to me the offset thing I'll really apreciate that...
//I just know there is alway a division by 2
+ " for(int offset = get_local_size(0) / 2; offset > 0; offset >>= 1) {"
+ " if (local_index < offset) {"
+ " float other = scratch[local_index + offset];"
+ " float mine = scratch[local_index];"
+ " scratch[local_index] = (mine > other) ? mine : other;"
+ " }"
+ " barrier(CLK_LOCAL_MEM_FENCE);"
//A barrier to make sure that all values where checked
+ " }"
+ " if (local_index == 0) {"
+ " result[get_group_id(0)] = scratch[0];"
+ " }"
+ "}";
For this case, the defined GWG size is (100, 100, 0) and a LWI size of (10, 10, 0).
So, for this example, both arrays have size 10 and the GWG and LWI are obtained as follows:
//clGetKernelWorkGroupInfo(kernel, device, CL.CL_KERNEL_WORK_GROUP_SIZE, Sizeof.size_t, Pointer.to(buffer), null);
long kernel_work_group_size = OpenClUtil.getKernelWorkGroupSize(kernel, device.getCl_device_id(), 3);
//clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES, Sizeof.size_t * numValues, Pointer.to(buffer), null);
long[] maxSize = device.getMaximumSizes();
maxSize[0] = ( kernel_work_group_size > maxSize[0] ? maxSize[0] : kernel_work_group_size);
maxSize[1] = ( kernel_work_group_size > maxSize[1] ? maxSize[1] : kernel_work_group_size);
maxSize[2] = ( kernel_work_group_size > maxSize[2] ? maxSize[2] : kernel_work_group_size);
// maxSize[2] =
long xMaxSize = (x > maxSize[0] ? maxSize[0] : x);
long yMaxSize = (y > maxSize[1] ? maxSize[1] : y);
long zMaxSize = (z > maxSize[2] ? maxSize[2] : z);
long local_work_size[] = new long[] { xMaxSize, yMaxSize, zMaxSize };
int numWorkGroupsX = 0;
int numWorkGroupsY = 0;
int numWorkGroupsZ = 0;
if(local_work_size[0] != 0)
numWorkGroupsX = (int) ((total + local_work_size[0] - 1) / local_work_size[0]);
if(local_work_size[1] != 0)
numWorkGroupsY = (int) ((total + local_work_size[1] - 1) / local_work_size[1]);
if(local_work_size[2] != 0)
numWorkGroupsZ = (int) ((total + local_work_size[2] - 1) / local_work_size[2]);
long global_work_size[] = new long[] { numWorkGroupsX * local_work_size[0],
numWorkGroupsY * local_work_size[1], numWorkGroupsZ * local_work_size[2]};
The thing is I'm not getting the espected values so I decided to make some tests based on a smaller kernel and changing the [VARIABLE TO TEST VALUES] object returned in a result array:
/**
* The source code of the OpenCL program to execute
*/
private static String programSourceA =
"#ifdef cl_khr_fp64 \n"
+ " #pragma OPENCL EXTENSION cl_khr_fp64 : enable \n"
+ "#elif defined(cl_amd_fp64) \n"
+ " #pragma OPENCL EXTENSION cl_amd_fp64 : enable \n"
+ "#else "
+ " #error Double precision floating point not supported by OpenCL implementation.\n"
+ "#endif \n"
+ "__kernel void "
+ "sampleKernel(__global const double *bufferX,"
+ " __global const double *bufferY,"
+ " __local double* scratch,"
+ " __global double* result,"
+ " __const int lengthX,"
+ " __const int lengthY){"
//Get the global indexes for 2D reference
+ " const int index_a = get_global_id(0);"
+ " const int index_b = get_global_id(1);"
//Current thread id -> Should be the same as index_a * index_b + index_b;
+ " const int local_index = get_local_id(0);"
// Load data into local memory
//Only print values if index_a < ArrayA length
//Only print values if index_b < ArrayB length
//Only print values if local_index < (lengthX * lengthY)
//Only print values if this is the first work group.
+ " if (local_index < (lengthX * lengthY)) {"
+ " if(index_a < lengthX && index_b < lengthY)"
+ " {"
+ " double dx = (bufferX[index_b] - bufferX[index_a]);"
+ " double dy = (bufferY[index_b] - bufferY[index_a]);"
+ " result[local_index] = hypot(dx, dy);"
+ " }"
+ " } "
+ " else {"
// Infinity is the identity element for the min operation
+ " result[local_index] = 0;"
+ " }"
The returned values are far of being the espected but, if the [VARIABLE TO TEST VALUES] is (index_a * index_b) + index_a, almost each value of the returned array has the correct (index_a * index_b) + index_a value, i mean:
result[0] -> 0
result[1] -> 1
result[2] -> 2
....
result[97] -> 97
result[98] -> 98
result[99] -> 99
but some values are: -3.350700319577517E-308....
What I'm not doing correctly???
I hope this is well explained and not that big to make you angry with me....
Thank you so much!!!!!
TomRacer

You have many problems in your code, and some of them are concept related. I think you should read the standard or OpenCL guide completely before starting to code. Because some of the system calls you are using have a different behaviour that what you expect.
Work-groups and work-items are NOT like CUDA. If you want 100x100 work-items, separated into 10x10 work-groups you use as global-size (100x100) and local-size(10x10). Unlike CUDA, where the global work item is multiplied by the local size internally.
1.1. In your test code, if you are using 10x10 with 10x10. Then you are not filling the whole space, the non filled area will still have garbage like -X.xxxxxE-308.
You should not use lengthX and lengthY and put a lot of ifs in your code. OpenCL has a method to call the kernels with offsets and with specific number of items, so you can control this from the host side. BTW doing this is a performance loss and is never a good practice since the code is less readable.
get_local_size(0) gives you the local size of axis 0 (10 in your case). What is what you do not understand in this call? Why do you divide it by 2 always?
I hope this can help you in your debugging process.
Cheers

thank you for your answer, first of all this kernel code is based on the commutative reduction code explained here: http://developer.amd.com/resources/documentation-articles/articles-whitepapers/opencl-optimization-case-study-simple-reductions/.
So I'm using that code but I added some things like the 2D operations.
Regarding to the point you mentioned before:
1.1- Actually the global work group size is (100, 100, 0)... That 100 is a result of multiplying 10 x 10 where 10 is the current array size, so my global work group size is based on this rule... then the local work item size is (10, 10, 0).
Global work group size must be a multiple of local work item size, I have read this in many examples and I think this is ok.
1.2- In my test code I'm using the same arrays, in fact if I change the array size GWG size and LWI size will change dinamically.
2.1- There are not so many "if" there, there are just 3 "if", the first one checks when I must compute the hypot() based on the array objects or fill that object with zero.
The second and third "if"s are just part of the reduction algorithm that seems to be fine.
2.2- Regarding to the lengthX and lengthY yeah you are right but I haven't got that yet, how should I use that??
3.1- Yeah, I know that, but I realized that I'm not using the Y axis id so maybe there is another problem here.
3.2- The reduction algorithm iterates over each pair of elements stored in the scratch variable and checking for the maximum value between them, so for each "for" that it does it is reducing the elements to be computed to the half of the previous quantity.
Also I'm going to post a some changes on the main kernel code and in the test kernel code because there where some errors.
Greetings...!!!

Related

How it is possible to linearize this equation and to implement it in FreeFEM++ code?

I am contacting you to try to get some answers about a problem related to a non linear problem.
In fact, I tried to linearize the equation so for this I took c1 = c10 + dc1 where c1 is my variable, c10 is an initial constant and dc1 is the increment.
The corresponding code is :
//Initial value
dc=0;
//Problem
problem first(c1,c2) = int2d(Sh)(c2*dc1/dt) - int2d(Sh)(c2*dc/dt) + int2d(Sh)(R(dx(c1)dx(c2)+dy(c1)dy(c2))/(c10 + dc1)) + int2d(Sh)(ESN*(dx(c1)dx(c2)+dy(c1)dy(c2))c1) - int2d(Sh)(R(c10 + dc1)F/NDgrad(c2)‘grad(S)) - int2d(Sh)(ESFgrad(c2)’*grad(S)) + on(1,c1=C0) + on(2,c1=C0) + on(3,c1=C0) + on(4,c1=C0)
//Where dc is the incement at t-1
//and c2 is the test function added when the variational form, of the problem, was written.
problem second([Vx, Vy],[Ux, Uy]) = -int2d(Sh)(A*epsilon(Vx,Vy)'*epsilon(Ux,Uy)+C*div(Ux,Uy)*div(Vx,Vy)) + int2d(Sh)((B+A)*c1*X*div(Ux,Uy)) + on(1,Vx=0) + on(2,Vy=0)
// Where Vx, Vy are my variables and Ux, Uy are functions test (in the same way as previously).
To determine the increment dc1, I used a loop:
second;
for (iter=1;iter< niter+1;iter=iter+1)
{
first;
dc1 = c1-c10;
second;
cout << "t = " << tt << "/"<< dt*niter <<" -> residu = " << residu << endl ;
// iteration //
tt=iter*dt;
dc=dc1 ;
};
When I compiled the code, the values of c1 are completely incorrect and increase very quickly.
Could someone help me? Or give me some tips on this code?
I hope I have given enough information to allow you to understand the code and especially the error I had to write.
Thank you in advance.

Loop optimisation

I am trying to understand what cache or other optimizations could be done in the source code to get this loop faster. I think it is quite cache friendly but, are there any experts out there that could squeeze a bit more performance tuning this code?
DO K = 1, NZ
DO J = 1, NY
DO I = 1, NX
SIDEBACK = STEN(I-1,J-1,K-1) + STEN(I-1,J,K-1) + STEN(I-1,J+1,K-1) + &
STEN(I ,J-1,K-1) + STEN(I ,J,K-1) + STEN(I ,J+1,K-1) + &
STEN(I+1,J-1,K-1) + STEN(I+1,J,K-1) + STEN(I+1,J+1,K-1)
SIDEOWN = STEN(I-1,J-1,K) + STEN(I-1,J,K) + STEN(I-1,J+1,K) + &
STEN(I ,J-1,K) + STEN(I ,J,K) + STEN(I ,J+1,K) + &
STEN(I+1,J-1,K) + STEN(I+1,J,K) + STEN(I+1,J+1,K)
SIDEFRONT = STEN(I-1,J-1,K+1) + STEN(I-1,J,K+1) + STEN(I-1,J+1,K+1) + &
STEN(I ,J-1,K+1) + STEN(I ,J,K+1) + STEN(I ,J+1,K+1) + &
STEN(I+1,J-1,K+1) + STEN(I+1,J,K+1) + STEN(I+1,J+1,K+1)
RES(I,J,K) = ( SIDEBACK + SIDEOWN + SIDEFRONT ) / 27.0
END DO
END DO
END DO

Ok, I think I've tried everything I reasonably could, and my conclusion unfortunately is that there is not too much room for optimizations, unless you are willing to go into parallelization. Let's see why, let's see what you can and can't do.
Compiler optimizations
Compilers nowadays are extremely good at optimizing code, much much more than humans are. Relying on the optimizations done by the compilers also have the added benefit that they don't ruin the readability of your source code. Whatever you do, (when optimizing for speed) always try it with every reasonable combination of compiler flags. You can even go as far as to try multiple compilers. Personally I only used gfortran (included in GCC) (OS is 64-bit Windows), which I trust to have efficient and correct optimization techniques.
-O2 almost always improve the speed drastically, but even -O3 is a safe bet (among others, it includes delicious loop unrolling). For this problem, I also tried -ffast-math and -fexpensive-optimizations, they didn't have any measurable effect, but -march-corei7(cpu architecture-specific tuning, specific to Core i7) had, so I did the measurements with -O3 -march-corei7
So how fast it actually is?
I wrote the following code to test your solution and compiled it with -O3 -march-corei7. Usually it ran under 0.78-0.82 seconds.
program benchmark
implicit none
real :: start, finish
integer :: I, J, K
real :: SIDEBACK, SIDEOWN, SIDEFRONT
integer, parameter :: NX = 600
integer, parameter :: NY = 600
integer, parameter :: NZ = 600
real, dimension (0 : NX + 2, 0 : NY + 2, 0 : NZ + 2) :: STEN
real, dimension (0 : NX + 2, 0 : NY + 2, 0 : NZ + 2) :: RES
call random_number(STEN)
call cpu_time(start)
DO K = 1, NZ
DO J = 1, NY
DO I = 1, NX
SIDEBACK = STEN(I-1,J-1,K-1) + STEN(I-1,J,K-1) + STEN(I-1,J+1,K-1) + &
STEN(I ,J-1,K-1) + STEN(I ,J,K-1) + STEN(I ,J+1,K-1) + &
STEN(I+1,J-1,K-1) + STEN(I+1,J,K-1) + STEN(I+1,J+1,K-1)
SIDEOWN = STEN(I-1,J-1,K) + STEN(I-1,J,K) + STEN(I-1,J+1,K) + &
STEN(I ,J-1,K) + STEN(I ,J,K) + STEN(I ,J+1,K) + &
STEN(I+1,J-1,K) + STEN(I+1,J,K) + STEN(I+1,J+1,K)
SIDEFRONT = STEN(I-1,J-1,K+1) + STEN(I-1,J,K+1) + STEN(I-1,J+1,K+1) + &
STEN(I ,J-1,K+1) + STEN(I ,J,K+1) + STEN(I ,J+1,K+1) + &
STEN(I+1,J-1,K+1) + STEN(I+1,J,K+1) + STEN(I+1,J+1,K+1)
RES(I,J,K) = ( SIDEBACK + SIDEOWN + SIDEFRONT ) / 27.0
END DO
END DO
END DO
call cpu_time(finish)
!Use the calculated value, so the compiler doesn't optimize away everything.
!Print the original value as well, because one can never be too paranoid.
print *, STEN(1,1,1), RES(1,1,1)
print '(f6.3," seconds.")',finish-start
end program
Ok, so this is as far as the compiler can take us. What's next?
Store intermediate results?
As you might suspect from the question mark, this one didn't really work. Sorry. But let's not rush that forward.
As mentioned in the comments, your current code calculates every partial sum multiple times, meaning one iteration's STEN(I+1,J-1,K-1) + STEN(I+1,J,K-1) + STEN(I+1,J+1,K-1) will be the next iteration's STEN(I,J-1,K-1) + STEN(I,J,K-1) + STEN(I,J+1,K-1), so no need to fetch and calculate again, you can store those partial results.
The problem is, that we cannot store too many partial results. As you said, your code is already quite cache-friendly, every partial sum you store means one less array element you can store in L1 cache. We could store a few values, from the last few iterations of I (values for index I-2, I-3, etc.), but the compiler almost certainly does that already. I have 2 proofs for this suspicion. First, my manual loop unrolling made the program slower, by about 5%
DO K = 1, NZ
DO J = 1, NY
DO I = 1, NX, 8
SIDEBACK(0) = STEN(I-1,J-1,K-1) + STEN(I-1,J,K-1) + STEN(I-1,J+1,K-1)
SIDEBACK(1) = STEN(I ,J-1,K-1) + STEN(I ,J,K-1) + STEN(I ,J+1,K-1)
SIDEBACK(2) = STEN(I+1,J-1,K-1) + STEN(I+1,J,K-1) + STEN(I+1,J+1,K-1)
SIDEBACK(3) = STEN(I+2,J-1,K-1) + STEN(I+2,J,K-1) + STEN(I+2,J+1,K-1)
SIDEBACK(4) = STEN(I+3,J-1,K-1) + STEN(I+3,J,K-1) + STEN(I+3,J+1,K-1)
SIDEBACK(5) = STEN(I+4,J-1,K-1) + STEN(I+4,J,K-1) + STEN(I+4,J+1,K-1)
SIDEBACK(6) = STEN(I+5,J-1,K-1) + STEN(I+5,J,K-1) + STEN(I+5,J+1,K-1)
SIDEBACK(7) = STEN(I+6,J-1,K-1) + STEN(I+6,J,K-1) + STEN(I+6,J+1,K-1)
SIDEBACK(8) = STEN(I+7,J-1,K-1) + STEN(I+7,J,K-1) + STEN(I+7,J+1,K-1)
SIDEBACK(9) = STEN(I+8,J-1,K-1) + STEN(I+8,J,K-1) + STEN(I+8,J+1,K-1)
SIDEOWN(0) = STEN(I-1,J-1,K) + STEN(I-1,J,K) + STEN(I-1,J+1,K)
SIDEOWN(1) = STEN(I ,J-1,K) + STEN(I ,J,K) + STEN(I ,J+1,K)
SIDEOWN(2) = STEN(I+1,J-1,K) + STEN(I+1,J,K) + STEN(I+1,J+1,K)
SIDEOWN(3) = STEN(I+2,J-1,K) + STEN(I+2,J,K) + STEN(I+2,J+1,K)
SIDEOWN(4) = STEN(I+3,J-1,K) + STEN(I+3,J,K) + STEN(I+3,J+1,K)
SIDEOWN(5) = STEN(I+4,J-1,K) + STEN(I+4,J,K) + STEN(I+4,J+1,K)
SIDEOWN(6) = STEN(I+5,J-1,K) + STEN(I+5,J,K) + STEN(I+5,J+1,K)
SIDEOWN(7) = STEN(I+6,J-1,K) + STEN(I+6,J,K) + STEN(I+6,J+1,K)
SIDEOWN(8) = STEN(I+7,J-1,K) + STEN(I+7,J,K) + STEN(I+7,J+1,K)
SIDEOWN(9) = STEN(I+8,J-1,K) + STEN(I+8,J,K) + STEN(I+8,J+1,K)
SIDEFRONT(0) = STEN(I-1,J-1,K+1) + STEN(I-1,J,K+1) + STEN(I-1,J+1,K+1)
SIDEFRONT(1) = STEN(I ,J-1,K+1) + STEN(I ,J,K+1) + STEN(I ,J+1,K+1)
SIDEFRONT(2) = STEN(I+1,J-1,K+1) + STEN(I+1,J,K+1) + STEN(I+1,J+1,K+1)
SIDEFRONT(3) = STEN(I+2,J-1,K+1) + STEN(I+2,J,K+1) + STEN(I+2,J+1,K+1)
SIDEFRONT(4) = STEN(I+3,J-1,K+1) + STEN(I+3,J,K+1) + STEN(I+3,J+1,K+1)
SIDEFRONT(5) = STEN(I+4,J-1,K+1) + STEN(I+4,J,K+1) + STEN(I+4,J+1,K+1)
SIDEFRONT(6) = STEN(I+5,J-1,K+1) + STEN(I+5,J,K+1) + STEN(I+5,J+1,K+1)
SIDEFRONT(7) = STEN(I+6,J-1,K+1) + STEN(I+6,J,K+1) + STEN(I+6,J+1,K+1)
SIDEFRONT(8) = STEN(I+7,J-1,K+1) + STEN(I+7,J,K+1) + STEN(I+7,J+1,K+1)
SIDEFRONT(9) = STEN(I+8,J-1,K+1) + STEN(I+8,J,K+1) + STEN(I+8,J+1,K+1)
RES(I ,J,K) = ( SIDEBACK(0) + SIDEOWN(0) + SIDEFRONT(0) + &
SIDEBACK(1) + SIDEOWN(1) + SIDEFRONT(1) + &
SIDEBACK(2) + SIDEOWN(2) + SIDEFRONT(2) ) / 27.0
RES(I + 1,J,K) = ( SIDEBACK(1) + SIDEOWN(1) + SIDEFRONT(1) + &
SIDEBACK(2) + SIDEOWN(2) + SIDEFRONT(2) + &
SIDEBACK(3) + SIDEOWN(3) + SIDEFRONT(3) ) / 27.0
RES(I + 2,J,K) = ( SIDEBACK(2) + SIDEOWN(2) + SIDEFRONT(2) + &
SIDEBACK(3) + SIDEOWN(3) + SIDEFRONT(3) + &
SIDEBACK(4) + SIDEOWN(4) + SIDEFRONT(4) ) / 27.0
RES(I + 3,J,K) = ( SIDEBACK(3) + SIDEOWN(3) + SIDEFRONT(3) + &
SIDEBACK(4) + SIDEOWN(4) + SIDEFRONT(4) + &
SIDEBACK(5) + SIDEOWN(5) + SIDEFRONT(5) ) / 27.0
RES(I + 4,J,K) = ( SIDEBACK(4) + SIDEOWN(4) + SIDEFRONT(4) + &
SIDEBACK(5) + SIDEOWN(5) + SIDEFRONT(5) + &
SIDEBACK(6) + SIDEOWN(6) + SIDEFRONT(6) ) / 27.0
RES(I + 5,J,K) = ( SIDEBACK(5) + SIDEOWN(5) + SIDEFRONT(5) + &
SIDEBACK(6) + SIDEOWN(6) + SIDEFRONT(6) + &
SIDEBACK(7) + SIDEOWN(7) + SIDEFRONT(7) ) / 27.0
RES(I + 6,J,K) = ( SIDEBACK(6) + SIDEOWN(6) + SIDEFRONT(6) + &
SIDEBACK(7) + SIDEOWN(7) + SIDEFRONT(7) + &
SIDEBACK(8) + SIDEOWN(8) + SIDEFRONT(8) ) / 27.0
RES(I + 7,J,K) = ( SIDEBACK(7) + SIDEOWN(7) + SIDEFRONT(7) + &
SIDEBACK(8) + SIDEOWN(8) + SIDEFRONT(8) + &
SIDEBACK(9) + SIDEOWN(9) + SIDEFRONT(9) ) / 27.0
END DO
END DO
END DO
And what's worse, it's easy to show we are already pretty close the theoretical minimal possible execution time. In order to calculate all these averages, the absolute minimum we need to do, is access every element at least once, and divide them by 27.0. So you can never get faster than the following code, which executes under 0.48-0.5 seconds on my machine.
program benchmark
implicit none
real :: start, finish
integer :: I, J, K
integer, parameter :: NX = 600
integer, parameter :: NY = 600
integer, parameter :: NZ = 600
real, dimension (0 : NX + 2, 0 : NY + 2, 0 : NZ + 2) :: STEN
real, dimension (0 : NX + 2, 0 : NY + 2, 0 : NZ + 2) :: RES
call random_number(STEN)
call cpu_time(start)
DO K = 1, NZ
DO J = 1, NY
DO I = 1, NX
!This of course does not do what you want to do,
!this is just an example of a speed limit we can never surpass.
RES(I, J, K) = STEN(I, J, K) / 27.0
END DO
END DO
END DO
call cpu_time(finish)
!Use the calculated value, so the compiler doesn't optimize away everything.
print *, STEN(1,1,1), RES(1,1,1)
print '(f6.3," seconds.")',finish-start
end program
But hey, even a negative result is a result. If just accessing every element once (and dividing by 27.0) takes up more than half of the execution time, that just means memory access is the bottle neck. Then maybe you can optimize that.
Less data
If you don't need the full precision of 64-bit doubles, you can declare your array with a type of real(kind=4). But maybe your reals are already 4 bytes. In that case, I believe some Fortran implementations support non-standard 16-bit doubles, or depending on your data you can just use integers (maybe floats multiplied by a number then rounded to integer). The smaller your base type is, the more elements you can fit into the cache. The most ideal would be integer(kind=1), of course, it caused more than a 2x speed up on my machine, compared to real(kind=4). But it depends on the precision you need.
Better locality
Column major arrays are slow when you need data from neighbouring column, and row major ones are slow for neighbouring rows.
Fortunately there is a funky way to store data, called a Z-order curve, which does have applications similar to your use case in computer graphics.
I can't promise it will help, maybe it will be terribly counterproductive, but maybe not. Sorry, I didn't feel like implementing it myself, to be honest.
Parallelization
Speaking of computer graphics, this problem is trivially and extremely well parallelizable, maybe even on a GPU, but if you don't want to go that far, you can just use a normal multicore CPU. The Fortran Wiki seems like a good place to search for Fortran parallelization libraries.

Scala implementation of sobel filter

I'm looking for some help in a IT school project. We need to create a programm which can detect roads in a satelite photograph. Our group decided to use a function for detect edges. We search differents solutions and filters on Internet and we decides to use Sobel filter.
We have tried to implement this filter in Scala but it didn't work. We use differents webpages to help us, some of these are on StackOverflow (here). We use this one to help us and try to translate the code : Sobel filter in Ruby.
Start Code --
codeGrey(); // This function transform the RGB in grey level
var sobel_x: Array[Array[Double]] = Array(
Array(-1, 0, 1),
Array(-2, 0, 2),
Array(-1, 0, 1))
var sobel_y: Array[Array[Double]] = Array(
Array(1, 2, 1),
Array(0, 0, 0),
Array(-1, -2, 1))
for (x <- 1 to wrappedImage.height - 2) {
for (y <- 1 to wrappedImage.width - 2) {
var a = (image2D(x - 1)(y - 1) & 0x00FF0000) >> 16
var b = (image2D(x)(y - 1) & 0x00FF0000) >> 16
var c = (image2D(x + 1)(y - 1) & 0x00FF0000) >> 16
var d = (image2D(x - 1)(y) & 0x00FF0000) >> 16
var e = (image2D(x)(y) & 0x00FF0000) >> 16
var f = (image2D(x + 1)(y) & 0x00FF0000) >> 16
var g = (image2D(x - 1)(y + 1) & 0x00FF0000) >> 16
var h = (image2D(x)(y + 1) & 0x00FF0000) >> 16
var i = (image2D(x + 1)(y + 1) & 0x00FF0000) >> 16
var pixel_x =
(sobel_x(0)(0) * a) + (sobel_x(0)(1) * b) + (sobel_x(0)(2) * c) +
(sobel_x(1)(0) * d) + (sobel_x(1)(1) * e) + (sobel_x(1)(2) * f) +
(sobel_x(2)(0) * g) + (sobel_x(2)(1) * h) + (sobel_x(2)(2) * i);
var pixel_y =
(sobel_y(0)(0) * a) + (sobel_x(0)(1) * b) + (sobel_x(0)(2) * c) +
(sobel_y(1)(0) * d) + (sobel_x(1)(1) * e) + (sobel_x(1)(2) * f) +
(sobel_y(2)(0) * g) + (sobel_x(2)(1) * h) + (sobel_x(2)(2) * i);
var res = (Math.sqrt((pixel_x * pixel_x) + (pixel_y * pixel_y)).ceil).toInt
image2D(x)(y) = 0xFF000000 + (res * 65536 + res * 256 + res);
}
}
End Code --
The image returned by this implementation is just an image with black and white pixels and I don't know why. I've got no experience in image processing and we learned Scala 8 weeks ago so that doesn't help.
I'm sorry, my english is not perfect so please forgive me if I didn't write correctly.

I'm not sure I grasp all the details of your solution, anyway here some observation:
consider using vals instead of vars: Scala prefers
immutables and you are not really changing any of those variables.
In scala you can write nested for cycles as a single one over two
variables (check here for details:
Nested iteration in Scala). I think it makes code cleaner.
I presume image2D is the array of arrays in which you are
holding your image. In the last line of your nested for loop you are
changing the current pixel value. This is not good because you will
access that same pixel later when you calculate your a,b,..,h,i
values. The center pixel during current iteration is the side pixel
during next iteration. I think you should write the result in a
different matrix.

Quaternion product is different from the quaternion extracted from the matrix product

Yesterday I have been trying to solve the following problem: calculate the local rotation quaternion for a bone given its global rotation quaternion and the skeleton state.
I figured that the global quaternion equals to the parent bone global multiplied by the bone's local quaternion:
global = globalParent * local
After a couple of simple manipulations, I got the following:
local = (globalParent)-1 * global
I did some tests on that equation and, to my surprise, it sometimes yields the correct answer and sometimes I get the correct answer multiplied by -1. Not the local quaternion's conjugate, but the whole quaternion multiplied by -1.
So I went back to the original equation (global = globalParent * local) and tested it too. The same thing happened, sometimes the right answer and sometimes the right answer multiplied by -1.
I found that really strange and went further to make the matrix product (globalParent * local) and extract the quaternion for the result. In this situation, I always got the right answer.
Finally, my question is pretty simple. Where is the mistake in my thought process when manipulating the quaternions?
The code I used to check the things I said is the following:
{
var p = new THREE.Vector3();
var s = new THREE.Vector3();
var bone = this.get_model_bone(label);
var gm = bone.matrixWorld;
var g = new THREE.Quaternion();
gm.decompose(p, g, s);
console.log(label + " - GLOBAL: (" + g.x + ", " + g.y + ", " + g.z + ", " + g.w + ")");
var m = bone.matrix;
var q = new THREE.Quaternion();
m.decompose(p, q, s);
console.log(label + " - LOCAL: (" + q.x + ", " + q.y + ", " + q.z + ", " + q.w + ")");
if(bone.parent !== null)
{
var gpm = bone.parent.matrixWorld;
var gp = new THREE.Quaternion();
gpm.decompose(p, gp, s);
console.log(label + " - PARENT GLOBAL: (" + gp.x + ", " + gp.y + ", " + gp.z + ", " + gp.w + ")");
var productMatrix = new THREE.Matrix4().multiplyMatrices(gpm, m);
var qprod = new THREE.Quaternion();
productMatrix.decompose(p, qprod, s);
console.log(label + " - MATRIX PROD: (" + qprod.x + ", " + qprod.y + ", " + qprod.z + ", " + qprod.w + ")");
var gpq = new THREE.Quaternion().multiplyQuaternions(gp, q);
console.log(label + " - QUAT PROD: (" + gpq.x + ", " + gpq.y + ", " + gpq.z + ", " + gpq.w + ")");
}
}
In my logs, sometimes "MATRIX PROD" is different than "QUAT PROD". The model I am using can be found at:
https://raw.githubusercontent.com/poli-libras/virtual-jonah2/master/resources/model/human.js

I did some tests on that equation and, to my surprise, it sometimes yields the correct answer and sometimes I get the correct answer multiplied by -1.
Quaternion and quaternion multiplied by -1 represent exact same transformation (rotation).

What's the best way to handle an "all combinations" project?

I've been assigned a school project in which I need to come up with as many integers as possible using only the integers 2 3 4 and the operators + - * / %. I then have to output the integers with cout along with how I got that answer. For example:
cout << "2 + 3 - 4 = " << 2 + 3 - 4;
I can only use each integer once per cout statement, and there can be no duplicate answers.
Everyone else seems to be doing the "brute force" method (i.e., copying and pasting the same statements and changing the numbers and operators), but that hardly seems efficient. I figured I would try cycling through each number and operator one-by-one and checking to see if the answer has already been found, but I'm unsure of what the easiest way to do this would be.
I suppose I could use nested loops, but there's still the problem of checking to see if the answer has already been found. I tried storing the answers in a vector, but I couldn't pass the vector to a user-defined function that checked to see if a value existed in the vector.

You could use a map or a hash_map from the Standard Template Library (STL). These structures store key-value pairs efficiently. Read up on them before you use them but they might give you a good starting point. Hint: The integers you compute would probably make good keys.

Assuming you can use each of the numbers in the set(2, 3, 4) only once there are 3! ways of arranging these 3 numbers. Then there are 2 places for sign and you have total 5 symbols(+ -
* / %) so there are 5*5 = 25 ways to do that. So you have total 3! * 25 expressions.
Than you can create a hash map where key will be number and value will be the expression. If the hash map contains a key already you skip that expression.

You could try a bit of meta-programming, as follows. It has the advantage of using C itself to calculate the expressions rather than you trying to do your own evaluator (and possibly getting it wrong):
#include <stdlib.h>
#include <iostream>
#include <fstream>
using namespace std;
int main (void) {
int n1, n2, n3;
const char *ops[] = {" + ", " - ", " * ", " / ", " % ", 0};
const char **op1, **op2;
ofstream of;
of.open ("prog2.cpp", ios::out);
of << "#include <iostream>\n";
of << "using namespace std;\n";
of << "#define IXCOUNT 49\n\n";
of << "static int mkIdx (int tot) {\n";
of << " int ix = (IXCOUNT / 2) + tot;\n";
of << " if ((ix >= 0) && (ix < IXCOUNT)) return ix;\n";
of << " cout << \"Need more index space, "
<< "try \" << IXCOUNT + 1 + (ix - IXCOUNT) * 2 << \"\\n\";\n";
of << " return -1;\n";
of << "}\n\n";
of << "int main (void) {\n";
of << " int tot, ix, used[IXCOUNT];\n\n";
of << " for (ix = 0; ix < sizeof(used)/sizeof(*used); ix++)\n";
of << " used[ix] = 0;\n\n";
for (n1 = 2; n1 <= 4; n1++) {
for (n2 = 2; n2 <= 4; n2++) {
if (n2 != n1) {
for (n3 = 2; n3 <= 4; n3++) {
if ((n3 != n1) && (n3 != n2)) {
for (op1 = ops; *op1 != 0; op1++) {
for (op2 = ops; *op2 != 0; op2++) {
of << " tot = " << n1 << *op1 << n2 << *op2 << n3 << ";\n";
of << " if ((ix = mkIdx (tot)) < 0) return ix;\n";
of << " if (!used[ix])\n";
of << " cout << " << n1 << " << \"" << *op1 << "\" << "
<< n2 << " << \"" << *op2 << "\" << " << n3
<< " << \" = \" << tot << \"\\n\";\n";
of << " used[ix] = 1;\n\n";
}
}
}
}
}
}
}
of << " return 0;\n";
of << "}\n";
of.close();
system ("g++ -o prog2 prog2.cpp ; ./prog2");
return 0;
}
This gives you:
2 + 3 + 4 = 9
2 + 3 - 4 = 1
2 + 3 * 4 = 14
2 + 3 / 4 = 2
2 + 3 % 4 = 5
2 - 3 + 4 = 3
2 - 3 - 4 = -5
2 - 3 * 4 = -10
2 - 3 % 4 = -1
2 * 3 + 4 = 10
2 * 3 * 4 = 24
2 / 3 + 4 = 4
2 / 3 - 4 = -4
2 / 3 * 4 = 0
2 % 3 + 4 = 6
2 % 3 - 4 = -2
2 % 3 * 4 = 8
2 * 4 + 3 = 11
2 / 4 - 3 = -3
I'm not entirely certain of the wisdom of handing this in as an assignment however :-)

Develop Reference

ruby bash windows laravel spring algorithm oracle macos go visual-studio