Parallelizing on a 2D domain using MPI

Parallelizing on a 2D domain using MPI - algorithm

I can't seem to get this algorithm to work and I believe that it may be due to 'race condition' but I could be wrong:
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#include <mpi.h>
#define BILLION 1000000000L
double f(double, double);
double g(double, double);
int main(int argc, char** argv){
FILE *myA, *myB;
int rank, size;
MPI_Init(NULL, NULL);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Comm_size(MPI_COMM_WORLD, &size);
if (rank == 0){
myA = fopen("myA.py", "w");
myB = fopen("myB.py", "w");
}
int m = 255; // Max number of x values
int n = 255; // Max number of y values
int Tmax = 5;//10000; // Max number of time steps
double a = 0, b = 2.5; // starting and ending points along x-axis
double c = 0, d = 2.5; // starting and ending points along y-axis
double dx = (b - a)/m; // x partition width
double dy = (d - c)/n; // y partition width
double dt = 1.; // t partition width
double D_u = 0.00002; // diffusion coefficient
double alpha_u = D_u*dt/(dx*dx), gamma_u = D_u*dt/(dy*dy), beta_u = 1 - 2*alpha_u - 2*gamma_u; // coeffs for fwd Euler method
double D_v = 0.00001; // diffusion coefficient
double alpha_v = D_v*dt/(dx*dx), gamma_v = D_v*dt/(dy*dy), beta_v = 1 - 2*alpha_v - 2*gamma_v; // coeffs for fwd Euler method
// Parameters:
double F = 0.040;
double K = 0.063;
// Domain:
double u[m+1][n+1]; // soln to the diffusion equation
double utmp[m+1][n+1]; // temp storage
double v[m+1][n+1]; // soln to the diffusion equation
double vtmp[m+1][n+1]; // temp storage
int i, j, k;
// initialize time variables
struct timespec begin, end;
double time_lapsed;
// seed rand
srand(time(NULL));
double noise;
double lowest = -1./100.;
double highest = 1./100.;
double range = (highest - lowest);
// divide up the domain evenly among groups
int Np = floor((double)m/size); // Number of rows per processor
//int res = m % size/2; // in case extra row in subgroup
//int bigres = n % 2; // in case extra row overall
int istart = rank*Np;
int iend;
if (rank == 0){
istart = 1;
iend = (rank + 1)*Np;
}
else if (rank == size-1){
iend = m;
}
else {
iend = (rank + 1)*Np;
}
if (rank == 0){
fprintf(myA,"from numpy import array\n");
fprintf(myA,"\ndef myAi():\n");
fprintf(myA,"\treturn array([ ");
clock_gettime(CLOCK_MONOTONIC, &begin); // start timing u
}
// Initialization for u:
for (i = 0; i <= m; i += 1){
if (rank == 0){
fprintf(myA,"[ ");
}
for (j = 0; j <= n; j += 1){
// create square
if ((i >= 117 && i <= 137) && (j >= 117 && j <= 137)){
noise = (lowest + range*rand()/(RAND_MAX + 1.0));
if (abs(noise) > 0.01){
printf("noise: %f\n",noise);
}
utmp[i][j] = 1./2 + noise*(1./2.);//f(a + i*dx,c + j*dy);
u[i][j] = utmp[i][j];
}
else{
utmp[i][j] = 1.;//f(a + i*dx,c + j*dy);
u[i][j] = utmp[i][j];
}
if (rank == 0){
// print matrix entries
if (j != n){
fprintf(myA,"%f, ",utmp[i][j]);
}
else{
fprintf(myA,"%f ",utmp[i][j]);
}
}
}
if (rank == 0){
if (i != m){
fprintf(myA,"],\n");
}
else{
fprintf(myA,"]");
}
}
MPI_Bcast(&u[i][0],(n+1),MPI_DOUBLE,0,MPI_COMM_WORLD);
MPI_Bcast(&utmp[i][0],(n+1),MPI_DOUBLE,0,MPI_COMM_WORLD);
}
if (rank == 0){
fprintf(myA,"])\n");
clock_gettime(CLOCK_MONOTONIC, &end);
time_lapsed = (end.tv_sec - begin.tv_sec) + (double)(end.tv_nsec - begin.tv_nsec)/BILLION;
printf("\nprint 'Time to initialize u:',%f,'seconds.'\n",time_lapsed);
clock_gettime(CLOCK_MONOTONIC, &begin); // start timing v
fprintf(myB,"from numpy import array\n");
fprintf(myB,"\ndef myBi():\n");
fprintf(myB,"\treturn array([ ");
}
// Initialization for v:
for (i = 0; i <= m; i += 1){
if (rank == 0){
fprintf(myB,"[ ");
}
for (j = 0; j <= n; j += 1){
// create square
if ((i >= 117 && i <= 137) && (j >= 117 && j <= 137)){
noise = (lowest + range*rand()/(RAND_MAX + 1.0));
vtmp[i][j] = 1./4 + noise*(1./4.);//g(a + i*dx,c + j*dy);
if (abs(noise) > 0.01){
printf("noise: %f\n",noise);
}
v[i][j] = vtmp[i][j];
}
else{
vtmp[i][j] = 0.;//g(a + i*dx,c + j*dy);
v[i][j] = vtmp[i][j];
}
if (rank == 0){
// print matrix entries
if (j != n){
fprintf(myB,"%f, ",vtmp[i][j]);
}
else{
fprintf(myB,"%f ",vtmp[i][j]);
}
}
}
if (rank == 0){
if (i != m){
fprintf(myB,"],\n");
}
else{
fprintf(myB,"]");
}
}
MPI_Bcast(&v[i][0],(n+1),MPI_DOUBLE,0,MPI_COMM_WORLD);
MPI_Bcast(&vtmp[i][0],(n+1),MPI_DOUBLE,0,MPI_COMM_WORLD);
}
if (rank == 0){
fprintf(myB,"])\n");
clock_gettime(CLOCK_MONOTONIC, &end);
time_lapsed = (end.tv_sec - begin.tv_sec) + (double)(end.tv_nsec - begin.tv_nsec)/BILLION;
printf("\nprint 'Time to initialize v:',%f,'seconds.'\n",time_lapsed);
}
MPI_Barrier(MPI_COMM_WORLD);
// All together now...
if (iend > m/2){
if (rank == size-1){
for (k = 1; k <= Tmax; k++){
i = istart;
for (i = istart; i < iend-1; i++){
for (j = 1; j < n-1; j++){
// Do usual computation with u_i,j = alpha * (u_i-1,j + u_i+1,j) +
u[i][j] = alpha_u*(utmp[i-1][j] + utmp[i+1][j]) + beta_u*utmp[i][j] + gamma_u*(utmp[i][j-1] + utmp[i][j+1]) - u[i][j]*v[i][j]*v[i][j] + F*(1. - u[i][j]);
v[i][j] = alpha_v*(vtmp[i-1][j] + vtmp[i+1][j]) + beta_v*vtmp[i][j] + gamma_v*(vtmp[i][j-1] + vtmp[i][j+1]) + u[i][j]*v[i][j]*v[i][j] - (F+K)*v[i][j];
}
// left-right Periodic boundary conditions:
u[i][n-1] = alpha_u*(utmp[i-1][n-1] + utmp[i+1][n-1]) + beta_u*utmp[i][n-1] + gamma_u*(utmp[i][n-2] + utmp[i][0]) - u[i][n-1]*v[i][n-1]*v[i][n-1] + F*(1. - u[i][n-1]);
v[i][n-1] = alpha_v*(vtmp[i-1][n-1] + vtmp[i+1][n-1]) + beta_v*vtmp[i][n-1] + gamma_v*(vtmp[i][n-2] + vtmp[i][0]) + u[i][j]*v[i][n-1]*v[i][n-1] - (F+K)*v[i][n-1];
}
// top-bottom Periodic Boundary conditions:
for (j = 1; j < n-1; j++){
u[iend-1][j] = alpha_u*(utmp[iend-2][j] + utmp[0][j]) + beta_u*utmp[iend-1][j] + gamma_u*(utmp[iend-1][j-1] + utmp[iend-1][j+1]) - u[iend-1][j]*v[iend-1][j]*v[iend-1][j] + F*(1. - u[iend-1][j]);
v[iend-1][j] = alpha_v*(vtmp[iend-2][j] + vtmp[0][j]) + beta_v*vtmp[iend-1][j] + gamma_v*(vtmp[iend-1][j-1] + vtmp[iend-1][j+1]) + u[iend-1][j]*v[iend-1][j]*v[iend-1][j] - (F+K)*v[iend-1][j];
}
// top-bottom & left-right Periodic Boundary Conditions
u[iend-1][n-1] = alpha_u*(utmp[iend-2][n-1] + utmp[0][n-1]) + beta_u*utmp[iend-1][n-1] + gamma_u*(utmp[iend-1][n-2] + utmp[iend-1][0]) - u[iend-1][n-1]*v[iend-1][n-1]*v[iend-1][n-1] + F*(1. - u[iend-1][n-1]);
v[iend-1][n-1] = alpha_v*(vtmp[iend-2][n-1] + vtmp[0][n-1]) + beta_v*vtmp[iend-1][n-1] + gamma_v*(vtmp[iend-1][n-2] + vtmp[iend-1][0]) + u[iend-1][n-1]*v[iend-1][n-1]*v[iend-1][n-1] - (F+K)*v[iend-1][n-1];
i = istart;
for (i = istart; i <= iend; i++){ //istart; i <= iend; i++){
for (j = 0; j <= n; j++){
utmp[i][j] = u[i][j];
vtmp[i][j] = v[i][j];
}
}
}
}
else{
for (k = 1; k <= Tmax; k++){
i = istart;
for (i = istart; i <= iend-1; i++){
for (j = 1; j < n-1; j++){
// Do usual computation with u_i,j = alpha * (u_i-1,j + u_i+1,j) +
u[i][j] = alpha_u*(utmp[i-1][j] + utmp[i+1][j]) + beta_u*utmp[i][j] + gamma_u*(utmp[i][j-1] + utmp[i][j+1]) - u[i][j]*v[i][j]*v[i][j] + F*(1. - u[i][j]);
v[i][j] = alpha_v*(vtmp[i-1][j] + vtmp[i+1][j]) + beta_v*vtmp[i][j] + gamma_v*(vtmp[i][j-1] + vtmp[i][j+1]) + u[i][j]*v[i][j]*v[i][j] - (F+K)*v[i][j];
}
// left-right Periodic boundary conditions:
u[i][n-1] = alpha_u*(utmp[i-1][n-1] + utmp[i+1][n-1]) + beta_u*utmp[i][n-1] + gamma_u*(utmp[i][n-2] + utmp[i][0]) - u[i][n-1]*v[i][n-1]*v[i][n-1] + F*(1. - u[i][n-1]);
v[i][n-1] = alpha_v*(vtmp[i-1][n-1] + vtmp[i+1][n-1]) + beta_v*vtmp[i][n-1] + gamma_v*(vtmp[i][n-2] + vtmp[i][0]) + u[i][j]*v[i][n-1]*v[i][n-1] - (F+K)*v[i][n-1];
}
i = istart;
for (i = istart; i <= iend; i++){
for (j = 0; j <= n; j++){
utmp[i][j] = u[i][j];
vtmp[i][j] = v[i][j];
}
}
}
}
}
else {
int count;
for (k = 1; k <= Tmax; k++){
count = iend-1;
while (count >= istart){
//printf("i = %d\n",i);
for (j = 1; j < n-1; j++){
// Do usual computation with u_i,j = alpha * (u_i-1,j + u_i+1,j) +
u[count][j] = alpha_u*(utmp[count-1][j] + utmp[count+1][j]) + beta_u*utmp[count][j] + gamma_u*(utmp[count][j-1] + utmp[count][j+1]) - u[count][j]*v[count][j]*v[count][j] + F*(1. - u[count][j]);
v[count][j] = alpha_v*(vtmp[count-1][j] + vtmp[count+1][j]) + beta_v*vtmp[count][j] + gamma_v*(vtmp[count][j-1] + vtmp[count][j+1]) + u[count][j]*v[count][j]*v[count][j] - (F+K)*v[count][j];
}
// left-right Periodic boundary conditions:
u[count][n-1] = alpha_u*(utmp[count-1][n-1] + utmp[count+1][n-1]) + beta_u*utmp[count][n-1] + gamma_u*(utmp[count][n-2] + utmp[count][0]) - u[count][n-1]*v[count][n-1]*v[count][n-1] + F*(1. - u[count][n-1]);
v[count][n-1] = alpha_v*(vtmp[count-1][n-1] + vtmp[count+1][n-1]) + beta_v*vtmp[count][n-1] + gamma_v*(vtmp[count][n-2] + vtmp[count][0]) + u[count][j]*v[count][n-1]*v[count][n-1] - (F+K)*v[count][n-1];
count = count-1;
}
i = istart;
for (i = istart; i <= iend; i++){
for (j = 0; j <= n; j++){
utmp[i][j] = u[i][j];
vtmp[i][j] = v[i][j];
}
}
}
}
if (rank == 0){
clock_gettime(CLOCK_MONOTONIC, &end);
time_lapsed = (end.tv_sec - begin.tv_sec) + (double)(end.tv_nsec - begin.tv_nsec)/BILLION;
printf("\nprint 'Time for algorithm to complete:',%f,'seconds.'\n",time_lapsed);
fprintf(myA,"\n");
fprintf(myA,"\ndef myAf():\n");
fprintf(myA,"\treturn array([ ");
for (i = 0; i <= m; i++){
fprintf(myA,"[ ");
for (j = 0; j <= n; j++){
if (j != n){
fprintf(myA,"%f, ",utmp[i][j]);
}
else{
fprintf(myA,"%f ",utmp[i][j]);
}
}
if (i != m){
fprintf(myA,"],\n");
}
else{
fprintf(myA,"]");
}
}
fprintf(myA,"])\n");
fprintf(myB,"\ndef myBf():\n");
fprintf(myB,"\treturn array([");
for (i = 0; i <= m; i++){
fprintf(myB,"[ ");
for (j = 0; j <= n; j++){
if (j != n){
fprintf(myB,"%f, ",vtmp[i][j]);
}
else{
fprintf(myB,"%f ",vtmp[i][j]);
}
}
if (i != m){
fprintf(myB,"],\n");
}
else{
fprintf(myB,"]");
}
}
fprintf(myB,"])\n");
fclose(myA);
fclose(myB);
}
MPI_Finalize();
return 0;
}
// For experimentation with different initial conditions
/*
 * Candidate initial condition: x - x^2 + y - y^2.
 * Alternatives that were tried instead: sin(x*x + y*y) and
 * exp(20*(x-1./2)*(x-1./2) - 20*(y-1./2)*(y-1./2)).
 */
double f(double x, double y){
    const double x_squared = x*x;
    const double y_squared = y*y;
    /* Same left-to-right evaluation order as the one-line formula. */
    return x - x_squared + y - y_squared;
}
double g(double x, double y){
return sin(x*x + y*y); //sin(x*x + y*y);
//exp(20*(x-1./2)*(x-1./2) - 20*(y-1./2)*(y-1./2));//x - x*x + y - y*y;
}
The algorithm is forward Euler method on a periodic 2D domain (the 2D arrays) and for clarity I left out a lot of the parts, unless more is needed. The initial and final results will be output to a file by the master processor (rank 0 as in code) ready to be plotted.
The idea that I have in mind here is to divide the domain among processors into (# of rows)/(# of processors) chunk sizes with the first half of all processors doing the top half of the domain (starting at the center to the top). Then, the other half of the processors doing the bottom half of the domain (starting at the center to the bottom).
However, only the bottom half of the domain is being updated which leads me to believe that some sort of 'race condition' is going on.
--EDIT--
Original code is being used instead.
I think I know what the problem is. Each processor has its own 'local' copy of the domain that it's updating. Hence, when rank 0 prints to file, it prints its own 'local' version of the domain, so on two processors I would see only half of the 'entire picture'.
However, what I want is for each processor to update its piece of the domain and then have processor 0 print the entire updated domain to file. How might I go about doing this, if this is the issue?

OK, I finally had the chance to read the code, and yes, you're correct in your analysis: since process #0 only has its own share of the domain updated, what it prints is only relevant for this part, not for the whole domain. Therefore, you have two options for your code:
To keep its current structure and have process #0 to collect every data from the other processes prior to printing the result into the file; or
To go for a fully parallel approach and have all processes generating their own data, doing the computation, and finally printing their own share in parallel.
Needless to say that the second approach is far better and far more effective too. Moreover, it breaks this useless master-slave approach that you have at the moment (sorry I couldn't resist mentioning that).
So, supposing you'd go for approach number two, what would it translate into?
Well,
You would compute your indexes limits pretty-much the way you did it so far, and allocate only the share of the compute array the current process is responsible of plus an extra layer surrounding it, called the ghost layer. This means that if you need to go for larger domains, by increasing the number of compute nodes, you'll also increase the overall amount of data you will be able to deal with. The code becomes scalable in that regard. And since your code's algorithm looks suspiciously like a 5 points stencil, your ghost layers will be 1 cell wide, which won't represent a lot of data transfers.
You would initialise your data and start computing it. Simply, after each iteration of the k time loop, you'd have to exchange data between the various processes for the ghost layers, to propagate what comes from the adjacent domains. NB that this is an issue you already have in your current code (since you skipped this mandatory stage), which makes it wrong irrespective of the final printing issue.
Finally all processes would write in parallel their own share of the domain, using MPI-IO. This is done by creating a per-process "view" of the output file, where the data will fit to assemble the overall result.
Altogether, this represents a bit of work (more than I can allocate for answering a SO question). However, even if you decided not to go for this option, fixing your code (notably managing the data exchanges at each time step) will be almost as lengthy. So I really encourage you to "think parallel" and make it right from all points of view.
Alternatively, if you don't want to take the long road of full MPI parallelisation, and assuming memory consumption is not an issue for you, you can try an OpenMP parallelisation which should be quite straightforward, starting from the sequential version of the code. However, since your code is very likely memory bound, don't expect too much benefit from the OpenMP parallelisation.

Related

Algorithm time optimization for a very simple algorithm

I have this piece of code I developed in an algorithm.
It finds the first element of an array that exceeds a given element by more than a fixed amount.
It iterates using all the array elements as a comparing item and print the result for any given comparing item
/* For each start index i, walk forward from i + 1 and stop at the first t
 * (below N - 1) whose height exceeds H[i] by more than VSQR. */
for (i = 0; i < N; i++) {
    t = i + 1;
    while (t < N - 1 && !(H[t] - H[i] > VSQR)) {
        t++;
    }
    printf("%d ", t - 1); /* index just before the position where the scan stopped */
}
The algorithm is too slow for large arrays, but I cannot find a way to optimize it.

The idea of the O(nlogn) algorithm:
First we need to realize that only the heights we can see from some position matter (assuming bigger heights hide smaller heights, similar to the sky scraper logic puzzle). That's because if an height is high enough and meets our constraints we would stop there and if it isn't high enough then surely the smaller heights behind it don't matter.
If we only look at the heights that matter, they will be ordered from smallest to highest. So if we only have those heights we can do a binary search on them. We get the heights that matter by going backwards and discarding all heights behind us that are smaller or equal to the current height.
After that we do a binary search on the remaining heights to find the correct one.
Finally we insert the current height into the heights that matter.
We can implement this using arrays. The heights that matter will be stored in decreasing order (highest first, lowest last). This means that to discard all heights smaller or equal to the current one we can do a binary search and just set the size to a different value, so all smaller ones are not in our scope any more. Then we can do a binary search on that array to find the correct height. We use a second array just to store the index of the heights that matter, so we can get the result immediately. Finally to insert the current height we just add it at the end of the heights array, because it is now the smallest height that matters and the array is sorted in decreasing order.
The time complexity is O(nlogn) because the binary search to find the values to discard is O(logn) and the binary search to find the result is also O(logn). We do this n times. O(n) * (O(logn) + O(logn)) = O(2nlogn) = O(nlogn)
So it could be implemented like this:
// Processes H from the last element to the first, keeping in
// heights[0..size) the candidate heights still in scope, with indices[k]
// holding the position in H of heights[k]. result[i] is filled for every i.
int[] heights = new int[N]; // we need to add at most N values
int[] indices = new int[N]; // therefore just make array length N
int[] result = new int[N];
// Seed the candidate list with the last element; it has nothing after it.
heights[0] = H[N - 1];
indices[0] = N - 1;
result[N - 1] = N - 1;
int size = 1;
for (int i = N - 2; i >= 0; i--) {
// Shrinking 'size' drops the tail of the list; this must happen before the
// lookup below, which searches only the shrunken range.
size = binarySearch(heights, H[i], 0, size) + 1; // discards all values <= H[i]
if (size == 0) {
// binarySearch returned -1: no remaining candidate is greater than H[i].
result[i] = N - 1;
} else {
int x = binarySearch(heights, H[i] + VSQR, 0, size); // find result
// x < 0 means no candidate exceeds H[i] + VSQR; otherwise report the
// position just before the matching element.
result[i] = x < 0 ? N - 1 : indices[x] - 1;
}
heights[size] = H[i]; // add height at the end
indices[size] = i;
size++;
}
// binarySearch(array, value, from, to) returns the index of the smallest
// number which is greater than 'value' in the range 'from' <= i < 'to'
// or returns -1 if there is none
// NOTE(review): binarySearch is not defined in this snippet; the contract
// above is the only specification available for it.
Old answer, before realizing AVL tree is not needed:
It is possible to get O(nlogn) instead of O(n^2) by using an AVL tree and going backwards.
The height value is used for the positioning in the tree, but along with this value we also store the index in the node. Also create an array for the results.
Then for each height (going backwards from the last one until the first one) do the following:
Remove all items <= H[i] from the tree. Find and set the result using the tree (that's why we also need the index in the node, so we can get the result easily). The value we are looking for is the smallest value > H[i] + VSQR (if the tree is empty or if there is no value big enough then the result is N - 1). Insert H[i] into the tree.

After you posting more info this is what you can do:
find all local min,max O(n)
so loop through your array and create an list of indexes for local min and local max the edges included. This will create a set of ranges 2 per each hill.
for given i binary search t so it matches your wanted result O(log(n))
binary search is applicable only on sorted array however you got the list of ranges where each range is ordered so binary search can be used. So find your range (also can be binary search of the ranges) and then binary search in the found range.
So this will lead to O(N+log(N)) complexity however The first step is needed to be done just once so if that is considered as part of initialization and you make many queries like this you can consider your resulting (amortized) complexity to be O(log(N))
In case your query can go pass one hill to another you need to search also neighboring ranges which would lead to O(M.log(N)) where M is the number of ranges which is also 2*hills and usually M << N.

I see your code is C++. If so, this code will help you.
#include <iostream>
#include <vector>
using namespace std;
// One array element paired with the position it came from.
struct Item
{
    int index;  // position in the source array
    int value;  // value stored at that position
};

/// Binary search over arr[start .. n-1], which must be sorted by ascending value.
///
/// @param arr   items to search; taken by const reference so a lookup no
///              longer copies the whole vector.
/// @param n     number of leading items of arr to consider.
/// @param K     value to look for.
/// @param start first position to consider (defaults to 0).
/// @return the position of an item whose value equals K when one is found,
///         otherwise the position at which K would have to be inserted
///         (n when every considered value is smaller than K).
int find_index(const std::vector<Item>& arr, int n, int K, int start = 0)
{
    // Lower and upper bounds
    int end = n - 1;
    // Traverse the search space
    while (start <= end)
    {
        // Written this way so that start + end cannot overflow.
        const int mid = start + (end - start) / 2;
        const int probe = arr[mid].value;
        // If K is found
        if (probe == K)
            return mid;
        if (probe < K)
            start = mid + 1;
        else
            end = mid - 1;
    }
    // Return insert position
    return end + 1;
}
// Filled by fillBehindFirstGreaterIndex: entry i is an index taken from a
// greaterValues bucket, or -1 when no bucket produced a match.
int firstGreaterIndex[1000000];
// Filled by fillAnyFirstGreaterIndex, using the same encoding.
int anyFirstGreaterIndex[1000000];
// One bucket of (index, value) items per position, built by
// fillGreaterValuesAtBinaryIndex. An item is appended to an existing bucket
// only when its value exceeds the bucket's last value.
vector<Item>
greaterValues[1000000];
void fillGreaterValuesAtBinaryIndex(int arr[], int N)
{
int i, j, wi, iEndV;
struct Item iItem;
for (i = 0; i < N; i++)
{
greaterValues[i].empty();
wi = i + 1;
iItem.index = i;
iItem.value = arr[i];
greaterValues[i]
.push_back(iItem);
while (wi)
{
iEndV = greaterValues[wi - 1].back().value;
if (iEndV < arr[i])
{
iItem.index = i;
iItem.value = arr[i];
greaterValues[wi - 1].push_back(iItem);
}
wi = (wi - 1) & wi;
}
}
}
void fillBehindFirstGreaterIndex(int arr[], int N, int VSQR)
{
int i, wi;
for (i = 0; i < N; i++)
{
wi = i + 1;
firstGreaterIndex[i] = -1;
while (wi <= N)
{
int firstPosition = -1;
if (wi - 1 == i)
{
firstPosition = find_index(greaterValues[wi - 1], greaterValues[wi - 1].size(), arr[i] + VSQR, 1);
}
else
{
firstPosition = find_index(greaterValues[wi - 1], greaterValues[wi - 1].size(), arr[i] + VSQR);
}
if (firstPosition >= 0 && firstPosition < greaterValues[wi - 1].size())
{
firstGreaterIndex[i] = greaterValues[wi - 1][firstPosition].index;
break;
}
wi = (wi << 1) - ((wi - 1) & wi);
}
}
}
void fillAnyFirstGreaterIndex(int arr[], int N, int VSQR)
{
int i, wi;
for (i = 0; i < N; i++)
{
wi = 1;
anyFirstGreaterIndex[i] = -1;
while (wi <= N)
{
int firstPosition = -1;
if (wi - 1 == i)
{
firstPosition = find_index(greaterValues[wi - 1], greaterValues[wi - 1].size(), arr[i] + VSQR, 1);
}
else
{
firstPosition = find_index(greaterValues[wi - 1], greaterValues[wi - 1].size(), arr[i] + VSQR);
}
if (firstPosition >= 0 && firstPosition < greaterValues[wi - 1].size())
{
anyFirstGreaterIndex[i] = greaterValues[wi - 1][firstPosition].index;
break;
}
wi = (wi << 1) - ((wi - 1) & wi);
}
}
}
// Demo driver: builds the buckets for a fixed ten-element array, runs both
// lookups with VSQR = 2 and prints the two results for every position.
int main()
{
    int H[] = {5, 7, 14, 8, 9, 10, 11, 12, 1, 20};
    int N = 10, VSQR = 2;

    fillGreaterValuesAtBinaryIndex(H, N);
    fillBehindFirstGreaterIndex(H, N, VSQR);
    fillAnyFirstGreaterIndex(H, N, VSQR);

    int pos = 0;
    while (pos < N)
    {
        cout << pos << " behind first greater index: " << firstGreaterIndex[pos] << endl;
        cout << pos << " any first greater index: " << anyFirstGreaterIndex[pos] << endl;
        ++pos;
    }
}
if javascript:
/**
 * Binary search over arr[start .. n-1], expected to be sorted by ascending
 * `.value`.
 *
 * @param {Array<{value: number}>} arr items to search
 * @param {number} n number of leading items to consider
 * @param {number} K value to look for
 * @param {number} [start=0] first position to consider
 * @returns {number} position of an item whose value equals K, otherwise the
 *   position where K would be inserted
 */
function find_index(arr, n, K, start = 0)
{
    let lo = start;
    let hi = n - 1;
    while (lo <= hi)
    {
        const mid = Math.floor((lo + hi) / 2);
        const probe = arr[mid].value;
        if (probe == K)
        {
            return mid;
        }
        if (probe < K)
        {
            lo = mid + 1;
        }
        else
        {
            hi = mid - 1;
        }
    }
    // Not found: hi + 1 is the insert position.
    return hi + 1;
}
/**
 * Builds one bucket of {index, value} items per position of arr.
 *
 * Bucket i starts with element i. The element is then offered to the buckets
 * reached by repeatedly clearing the lowest set bit of i + 1, and is appended
 * wherever it is larger than the bucket's current last value.
 *
 * @param {number[]} arr input values
 * @param {number} N number of elements to process
 * @returns {Array<Array<{index: number, value: number}>>} the buckets
 */
function fillGreaterValuesAtBinaryIndex(arr, N)
{
    const buckets = [];
    for (let i = 0; i < N; i++)
    {
        buckets.push([{index: i, value: arr[i]}]);
        for (let node = i + 1; node; node = (node - 1) & node)
        {
            const bucket = buckets[node - 1];
            const lastValue = bucket[bucket.length - 1].value;
            if (lastValue < arr[i])
            {
                bucket.push({index: i, value: arr[i]});
            }
        }
    }
    return buckets;
}
/**
 * For each i, probes the buckets starting at node i + 1 and advancing with
 * node -> (node << 1) - ((node - 1) & node) while node <= N. The first
 * bucket in which find_index returns an in-range position for arr[i] + VSQR
 * supplies the stored index; otherwise the entry is -1.
 *
 * @param {number[]} arr input values
 * @param {number} N number of elements
 * @param {number} VSQR offset added to arr[i] to form the search key
 * @param {Array<Array<{index: number, value: number}>>} greaterValues buckets
 * @returns {number[]} one result per position
 */
function fillBehindFirstGreaterIndex(arr, N, VSQR, greaterValues)
{
    const firstGreaterIndex = [];
    for (let i = 0; i < N; i++)
    {
        firstGreaterIndex[i] = -1;
        const target = arr[i] + VSQR;
        for (let node = i + 1; node <= N; node = (node << 1) - ((node - 1) & node))
        {
            const bucket = greaterValues[node - 1];
            // Bucket i begins with element i itself, so skip that first slot.
            const from = (node - 1 == i) ? 1 : 0;
            const pos = find_index(bucket, bucket.length, target, from);
            if (pos >= 0 && pos < bucket.length)
            {
                firstGreaterIndex[i] = bucket[pos].index;
                break;
            }
        }
    }
    return firstGreaterIndex;
}
/**
 * Same probing scheme as fillBehindFirstGreaterIndex, except that every
 * search starts at node 1 instead of node i + 1.
 *
 * @param {number[]} arr input values
 * @param {number} N number of elements
 * @param {number} VSQR offset added to arr[i] to form the search key
 * @param {Array<Array<{index: number, value: number}>>} greaterValues buckets
 * @returns {number[]} one result per position, -1 where nothing matched
 */
function fillAnyFirstGreaterIndex(arr, N, VSQR, greaterValues)
{
    const anyFirstGreaterIndex = [];
    for (let i = 0; i < N; i++)
    {
        anyFirstGreaterIndex[i] = -1;
        const target = arr[i] + VSQR;
        for (let node = 1; node <= N; node = (node << 1) - ((node - 1) & node))
        {
            const bucket = greaterValues[node - 1];
            // Bucket i begins with element i itself, so skip that first slot.
            const from = (node - 1 == i) ? 1 : 0;
            const pos = find_index(bucket, bucket.length, target, from);
            if (pos >= 0 && pos < bucket.length)
            {
                anyFirstGreaterIndex[i] = bucket[pos].index;
                break;
            }
        }
    }
    return anyFirstGreaterIndex;
}
/**
 * Demo driver: builds the buckets for a fixed ten-element array, runs both
 * lookups with VSQR = 2 and logs the two result arrays.
 */
function main()
{
    const H = [5, 7, 14, 8, 9, 10, 11, 12, 1, 20];
    const N = 10;
    const VSQR = 2;
    const greaterValues = fillGreaterValuesAtBinaryIndex(H, N);
    const behindFirstGreaterIndex = fillBehindFirstGreaterIndex(H, N, VSQR, greaterValues);
    const anyFirstGreaterIndex = fillAnyFirstGreaterIndex(H, N, VSQR, greaterValues);
    console.log("behindFirstGreaterIndex", behindFirstGreaterIndex);
    console.log("anyFirstGreaterIndex", anyFirstGreaterIndex);
}
main();

C++ Dynamic Programming: error in traversing the grid

Here's a question 8 from 2018 AIME Paper : A frog is positioned at the origin of the coordinate plane. From the point (x, y), the frog can jump to any of the points (x + 1, y), (x + 2, y), (x, y + 1), or (x, y + 2). Find the number of distinct sequences of jumps in which the frog begins at (0, 0) and ends at (x, y).
It felt that it can be solved using dynamic programming but my code seems to have an error which I cannot debug. This is how I approached the problem:
If f[i][j] denotes the number of ways to reach grid-point (i, j) from (0, 0) then
f[i][j] = f[i - 1][j] + f[i - 2][j] + f[j - 1][i] + f[j - 2][i]
and we have to assign values of f[][] for the base cases..
I don't think there's an issue with the logic. But the outputs are terrible.
Here's my code : https://ideone.com/lhhMUL
#include <bits/stdc++.h>
using namespace std;
// Reads n, x and y and prints the number of jump sequences that take the
// frog from (0, 0) to (x, y) using steps (+1, 0), (+2, 0), (0, +1), (0, +2).
//
// Fixes relative to the original attempt:
//  - the recurrence reads f[i][j-1] and f[i][j-2] (i and j were swapped);
//  - every cell is computed from the single base case f[0][0] = 1, so no
//    uninitialised value is ever read;
//  - the loops run through x and y inclusive, so the printed cell is filled;
//  - the table is sized from x and y rather than n, and uses long long.
// n is still read so the input format stays the same; it is otherwise unused.
int main() {
    int n, x, y;
    std::cin >> n >> x >> y;
    if (!std::cin || x < 0 || y < 0) {
        // No valid target point: there is nothing to count.
        std::cout << 0;
        return 0;
    }
    std::vector<std::vector<long long> > f(x + 1, std::vector<long long>(y + 1, 0));
    f[0][0] = 1;  // exactly one way to be at the origin: the empty sequence
    for (int i = 0; i <= x; i++) {
        for (int j = 0; j <= y; j++) {
            if (i == 0 && j == 0) continue;
            long long ways = 0;
            // Each guard leaves out a predecessor that lies outside the grid.
            if (i >= 1) ways += f[i - 1][j];
            if (i >= 2) ways += f[i - 2][j];
            if (j >= 1) ways += f[i][j - 1];
            if (j >= 2) ways += f[i][j - 2];
            f[i][j] = ways;
        }
    }
    std::cout << f[x][y];
    return 0;
}

Two bugs I see are
j and i are reversed in your recursion equation
Initial values of f (for example f[3][1] ) are never calculated. They are just random values of what was in memory when the arrays were allocated.
#include <bits/stdc++.h>
using namespace std;
// Reads n, x and y and prints the number of jump sequences from (0, 0) to
// (x, y) with steps (+1, 0), (+2, 0), (0, +1), (0, +2).
//
// Changes relative to the previous version:
//  - the table was a variable-length array f[n][n] indexed up to x and y
//    with no check that n was large enough, and the seed cells f[1][0],
//    f[0][1], f[1][1] were written even for a 1x1 table; the table is now
//    a vector sized from x and y;
//  - counts are long long, since they outgrow int quickly;
//  - one recurrence with an out-of-grid guard replaces the four hand-written
//    border loops.
// n is still read so the input format stays the same.
int main()
{
    int n, x, y;
    std::cin >> n >> x >> y;
    if (!std::cin || x < 0 || y < 0) {
        std::cout << 0;
        return 0;
    }
    std::vector<std::vector<long long> > f(x + 1, std::vector<long long>(y + 1, 0));
    // Cells outside the grid contribute zero ways.
    auto ways_at = [&f](int i, int j) -> long long {
        return (i < 0 || j < 0) ? 0 : f[i][j];
    };
    for (int i = 0; i <= x; i++) {
        for (int j = 0; j <= y; j++) {
            if (i == 0 && j == 0) {
                f[i][j] = 1;  // the empty sequence
                continue;
            }
            f[i][j] = ways_at(i - 1, j) + ways_at(i - 2, j)
                    + ways_at(i, j - 1) + ways_at(i, j - 2);
        }
    }
    std::cout << f[x][y];
    return 0;
}

Iterative version of Damerau–Levenshtein distance

Levenshtein distance can be computed iteratively using two rows this way:
https://en.wikipedia.org/wiki/Levenshtein_distance#Iterative_with_two_matrix_rows
I came across the Optimal String alignment distance that does take into account the transposition. Wikipedia says that it can be computed using a straightforward extension of the regular Levenshtein algorithm:
if i > 1 and j > 1 and a[i-1] = b[j-2] and a[i-2] = b[j-1] then
d[i, j] := minimum(d[i, j],
d[i-2, j-2] + cost) // transposition
However, I'm not able to port the pseudo-code algorithm's extension on that page to the iterative version's code. Any help is greatly appreciated.

You need three rows to compute this new version, I can't check the code but I am quite confident about it:
// Edit distance between s and t in which an adjacent transposition counts as
// a single edit alongside insertion, deletion and substitution, computed
// with three rolling rows instead of the full matrix.
//
// Fixes relative to the previous version:
//  - the transposition test used '=' (assignment) instead of '==';
//  - the row-rotation loop had no braces, so "v1[j] = v2[j]" sat outside it;
//    the rows are now rotated by swapping references;
//  - v1[0] is 1 (one deletion turns s[0..1) into the empty string), not 0;
//    with 0 the call ("ab", "b") returned 0;
//  - System.Math.Min replaces the undefined Minimum helper.
int DamerauLevenshteinDistance(string s, string t)
{
    // degenerate cases
    if (s == t) return 0;
    if (s.Length == 0) return t.Length;
    if (t.Length == 0) return s.Length;

    // three work rows: v0 = row i-1, v1 = row i, v2 = row i+1 (being built)
    int[] v0 = new int[t.Length + 1];
    int[] v1 = new int[t.Length + 1];
    int[] v2 = new int[t.Length + 1];

    // row 0: distance from the empty prefix of s is the number of
    // characters of t that have to be inserted
    for (int j = 0; j < v0.Length; j++)
        v0[j] = j;

    // row 1: no transposition is possible with a single character of s
    v1[0] = 1;
    for (int j = 0; j < t.Length; j++)
    {
        int cost = (s[0] == t[j]) ? 0 : 1;
        v1[j + 1] = System.Math.Min(System.Math.Min(v1[j] + 1, v0[j + 1] + 1), v0[j] + cost);
    }

    for (int i = 1; i < s.Length; i++)
    {
        // first element of the new row: delete (i+1) chars from s to match empty t
        v2[0] = i + 1;
        for (int j = 0; j < t.Length; j++)
        {
            int cost = (s[i] == t[j]) ? 0 : 1;
            v2[j + 1] = System.Math.Min(System.Math.Min(v2[j] + 1, v1[j + 1] + 1), v1[j] + cost);
            // transposition of the last two characters on both sides
            if (j > 0 && s[i] == t[j - 1] && s[i - 1] == t[j])
                v2[j + 1] = System.Math.Min(v2[j + 1], v0[j - 1] + cost);
        }
        // rotate the rows: v1 -> v0, v2 -> v1, and reuse the old v0 as scratch
        int[] scratch = v0;
        v0 = v1;
        v1 = v2;
        v2 = scratch;
    }
    // after the final rotation the last computed row is v1
    return v1[t.Length];
}
The original code is coming from the wikipedia implementation mentioned above.

Maximum span in two arrays with equal sum

This is programming puzzle. We have two arrays A and B. Both contains 0's and 1's only.
We have to find two indices i, j such that
a[i] + a[i+1] + .... a[j] = b[i] + b[i+1] + ... b[j].
We also have to maximize the difference between i and j. I am looking for an O(n) solution.
I found O(n^2) solution but not getting O(n).

Best solution is O(n)
First let c[i] = a[i] - b[i], then question become find i, j, which sum(c[i], c[i+1], ..., c[j]) = 0, and max j - i.
Second let d[0] = 0, d[i + 1] = d[i] + c[i], i >= 0, then question become find i, j, which d[j + 1] == d[i], and max j - i.
The value of d is in range [-n, n], so we can use following code to find the answer
// Finds the longest span c[i..j] with zero sum using the prefix sums d.
// d values lie in [-n, n], so slot d + n of sumHash stores the first x at
// which that prefix sum occurred (-1 = not seen yet).
answer = 0, answer_i = 0, answer_j = 0
sumHash[2n + 1] set to -1
for (x <- 0 to n) {
if (sumHash[d[x] + n] == -1) {
sumHash[d[x] + n] = x
} else {
y = sumHash[d[x] + n]
// d[y] == d[x], so c[y] + ... + c[x - 1] == 0: candidate span i = y, j = x - 1
if (x - y > answer) {
answer = x - y // number of elements in the span
answer_i = y
answer_j = x - 1
}
}
}

Here is an O(n) solution.
I use the fact that sum[i..j] = sum[j] - sum[i - 1].
I keep the leftmost position of each found sum.
// Shifts a possibly negative difference by N so it can index an array.
int convertToPositiveIndex(int index) {
    const int shifted = N + index;
    return shifted;
}
int mostLeft[2 * N + 1];
memset(mostLeft, -1, sizeof(mostLeft));
int bestLen = 0, bestStart = -1, bestEnd = -1;
int sumA = 0, sumB = 0;
for (int i = 0; i < N; i++) {
sumA += A[i];
sumB += B[i];
int diff = sumA - sumB;
int diffIndex = convertToPositiveIndex(diff);
if (mostLeft[diffIndex] != -1) {
//we have found the sequence mostLeft[diffIndex] + 1 ... i
//now just compare it with the best one found so far
int currentLen = i - mostLeft[diffIndex];
if (currentLen > bestLen) {
bestLen = currentLen;
bestStart = mostLeft[diffIndex] + 1;
bestEnd = i;
}
}
if (mostLeft[diffIndex] == -1) {
mostLeft[diffIndex] = i;
}
}
cout << bestStart << " " << bestEnd << " " << bestLen << endl;
P.S. mostLeft array is 2 * N + 1, because of the negatives.

This is a fairly straightforward O(N) solution:
let sa = [s1, s2, s3.. sn] where si = sum(a[0:i]) and similar for sb
then sum(a[i:j]) = sa[j]-sa[i]
and sum(b[i:j]) = sb[j] - sb[i]
Note that because the sums only increase by 1 each time, we know 0 <= sb[N], sa[N] <=N
difference_array = [d1, d2, .. dn] where di = sb[i] - sa[i] <= N
note if di = dj, then sb[i] - sa[i] = sb[j] - sa[j] which means they have the same sum (rearrange to get sum(b[i:j]) and sum(a[i:j]) from above).
Now for each difference we need its max position occurrence and min position occurrence
Now for each difference di, the difference between max - min, is an i-j section of equal sum. Find the maximum max-min value and you're done.
sample code that should work:
# Longest span over which a and b have the same sum, using prefix sums.
# Fill in a and b with the two 0/1 input lists.
a = []
b = []

# sa[k] / sb[k] hold the sum of the first k elements of a / b.
sa = [0]
sb = [0]
for i in a:
    sa.append(sa[-1] + i)
for i in b:
    sb.append(sb[-1] + i)

# Equal differences at two positions mean equal sums in between.
diff = [sai - sbi for sai, sbi in zip(sa, sb)]

# First and last position at which each difference value occurs.
min_diff_pos = {}
max_diff_pos = {}
for pos, d in enumerate(diff):
    if d in min_diff_pos:
        max_diff_pos[d] = pos
    else:
        min_diff_pos[d] = pos

# Fix: the answer is the widest gap, so take max rather than min, and only
# over differences seen at least twice (the others have no entry in
# max_diff_pos and used to raise KeyError). 0 means no such span exists.
ans = max((max_diff_pos[d] - min_diff_pos[d] for d in max_diff_pos), default=0)

Basically, my solution goes like this.
Take a variable to take care of the difference since the beginning.
int current = 0;
for i from 0 to length - 1
if a[i] == 0 && b[i] == 1
current--;
else if a[i] == 1 && b[i] == 0
current++;
else
// nothing;
Find the positions where the variable has the same value, which indicates that there are equal 1s and 0s in between.
Pseudo Code:
Here is my primary solution:
// start[c] is the first position at which the running difference equals c;
// count[c] is the longest span found between two occurrences of c.
// Fixes: the first occurrence must record the position i (it stored
// 'current'), and "not seen" needs its own marker because -1 is the real
// position of the empty prefix, whose difference is 0.
int length = min (a.length, b.length);
int start[] = {UNSET ... UNSET}; // from -length to length; UNSET is any value below -1
start[0] = -1; // difference 0 is first reached before element 0
int count[] = {0 ... 0}; // from -length to length
int current = 0;
for (int i = 0; i < length; i++) {
if (a[i] == 0 && b[i] == 1)
current--;
else if (a[i] == 1 && b[i] == 0)
current++;
else
; // nothing
if (start[current] == UNSET) // index can go negative here, take care
start[current] = i;
else
count[current] = i - start[current]; // elements start[current]+1 .. i balance out
}
return max_in(count[]);

Getting the submatrix with maximum sum?

Input: A 2-dimensional array NxN - Matrix - with positive and negative elements.Output: A submatrix of any size such that its summation is the maximum among all possible submatrices.
Requirement: Algorithm complexity to be of O(N^3)
History: With the help of the Algorithmist, Larry and a modification of Kadane's Algorithm, i managed to solve the problem partly which is determining the summation only - below in Java.
Thanks to Ernesto who managed to solve the rest of the problem which is determining the boundaries of the matrix i.e. top-left, bottom-right corners - below in Ruby.

Here's an explanation to go with the posted code. There are two key tricks to make this work efficiently: (I) Kadane's algorithm and (II) using prefix sums. You also need to (III) apply the tricks to the matrix.
Part I: Kadane's algorithm
Kadane's algorithm is a way to find a contiguous subsequence with maximum sum. Let's start with a brute force approach for finding the max contiguous subsequence and then consider optimizing it to get Kadane's algorithm.
Suppose you have the sequence:
-1, 2, 3, -2
For the brute force approach, walk along the sequence generating all possible subsequences as shown below. Considering all possibilities, we can start, extend, or end a list with each step.
At index 0, we consider appending the -1
-1, 2, 3, -2
^
Possible subsequences:
-1 [sum -1]
At index 1, we consider appending the 2
-1, 2, 3, -2
^
Possible subsequences:
-1 (end) [sum -1]
-1, 2 [sum 1]
2 [sum 2]
At index 2, we consider appending the 3
-1, 2, 3, -2
^
Possible subsequences:
-1, (end) [sum -1]
-1, 2 (end) [sum 1]
2 (end) [sum 2]
-1, 2, 3 [sum 4]
2, 3 [sum 5]
3 [sum 3]
At index 3, we consider appending the -2
-1, 2, 3, -2
^
Possible subsequences:
-1, (end) [sum -1]
-1, 2 (end) [sum 1]
2 (end) [sum 2]
-1, 2 3 (end) [sum 4]
2, 3 (end) [sum 5]
3, (end) [sum 3]
-1, 2, 3, -2 [sum 2]
2, 3, -2 [sum 3]
3, -2 [sum 1]
-2 [sum -2]
For this brute force approach, we finally pick the list with the best sum, (2, 3), and that's the answer. However, to make this efficient, consider that you really don't need to keep every one of the lists. Out of the lists that have not ended, you only need to keep the best one, the others cannot do any better. Out of the lists that have ended, you only might need to keep the best one, and only if it's better than ones that have not ended.
So, you can keep track of what you need with just a position array and a sum array. The position array is defined like this: position[r] = s keeps track of the list which ends at r and starts at s. And, sum[r] gives a sum for the subsequence ending at index r. This optimized approach is Kadane's algorithm.
Running through the example again keeping track of our progress this way:
At index 0, we consider appending the -1
-1, 2, 3, -2
^
We start a new subsequence for the first element.
position[0] = 0
sum[0] = -1
At index 1, we consider appending the 2
-1, 2, 3, -2
^
We choose to start a new subsequence because that gives a higher sum than extending.
position[0] = 0 sum[0] = -1
position[1] = 1 sum[1] = 2
At index 2, we consider appending the 3
-1, 2, 3, -2
^
We choose to extend a subsequence because that gives a higher sum than starting a new one.
position[0] = 0 sum[0] = -1
position[1] = 1 sum[1] = 2
position[2] = 1 sum[2] = 5
At index 3, we consider appending the -2. Again, we choose to extend because that gives a higher sum than starting a new one.
-1, 2, 3, -2
^
position[0] = 0 sum[0] = -1
position[1] = 1 sum[1] = 2
position[2] = 1 sum[2] = 5
position[3] = 1 sum[3] = 3
Again, the best sum is 5 and the list is from index 1 to index 2, which is (2, 3).
Part II: Prefix sums
We want to have a way to compute the sum along a column, for any start point to any endpoint. I want to compute that sum in O(1) time rather than just adding, which takes O(m) time where m is the number of elements in the sum. With some precomputing, this can be achieved. Here's how. Suppose you have a matrix:
a d g
b e h
c f i
You can precompute this matrix:
a d g
a+b d+e g+h
a+b+c d+e+f g+h+i
Once that is done you can get the sum running along any column from any start to endpoint in the column just by subtracting two values.
Part III: Bringing tricks together to find the max submatrix
Assume that you know the top and bottom row of the max submatrix. You could do this:
Ignore rows above your top row and ignore rows below your bottom
row.
With what matrix remains, consider the using sum of each column to
form a sequence (sort of like a row that represents multiple rows).
(You can compute any element of this sequence rapidly with the prefix
sums approach.)
Use Kadane's approach to figure out best subsequence in this
sequence. The indexes you get will tell you the left and right
positions of the best submatrix.
Now, what about actually figuring out the top and bottom row? Just try all possibilities. Try putting the top anywhere you can and putting the bottom anywhere you can, and run the Kadane-base procedure described previously for every possibility. When you find a max, you keep track of the top and bottom position.
Trying every pair of top and bottom rows takes O(M^2) where M is the number of rows. Finding the best left and right columns for each pair takes O(N) time where N is the number of columns. So total time is O(M^2 * N). And, if M=N, the time required is O(N^3).

About recovering the actual submatrix, and not just the maximum sum, here's what I got. Sorry I do not have time to translate my code to your java version, so I'm posting my Ruby code with some comments in the key parts
# Finds the contiguous submatrix with the largest sum in O(rows^2 * cols) time.
#
# m - rectangular Array of Arrays of numbers.
#
# Returns [max_sum, top, left, bottom, right] (zero-based, inclusive bounds),
# or nil when m has no rows or no columns.
def max_contiguous_submatrix_n3(m)
  rows = m.count
  # 0 is truthy in Ruby, so the emptiness test must compare explicitly.
  cols = rows > 0 ? m.first.count : 0
  return nil if rows == 0 || cols == 0

  # vps[i][j] = m[0][j] + ... + m[i][j] (vertical prefix sums), so the sum of
  # column j over rows i..k is vps[k][j] - vps[i-1][j].
  vps = Array.new(rows) { Array.new(cols, 0) }
  for j in 0...cols
    vps[0][j] = m[0][j]
    for i in 1...rows
      vps[i][j] = vps[i-1][j] + m[i][j]
    end
  end

  max = [m[0][0],0,0,0,0] # this is the result, stores [max,top,left,bottom,right]
  # these arrays are used over Kadane
  sum = Array.new(cols) # sum[j]: best sum of a column run ending at j
  pos = Array.new(cols) # pos[j]: starting column of that run
  for i in 0...rows
    for k in i...rows
      # Kadane over all columns with the i..k rows
      sum.fill(0) # clean both the sum and pos arrays for the upcoming Kadane
      pos.fill(0)
      local_max = 0 # column where the best run of this pass ends
      sum[0] = vps[k][0] - (i==0 ? 0 : vps[i-1][0])
      for j in 1...cols
        value = vps[k][j] - (i==0 ? 0 : vps[i-1][j])
        if sum[j-1] > 0
          # extending the previous run beats starting over
          sum[j] = sum[j-1] + value
          pos[j] = pos[j-1]
        else
          sum[j] = value
          pos[j] = j
        end
        if sum[j] > sum[local_max]
          local_max = j
        end
      end
      # Kadane ends here
      # If this pass beat the best result so far, record the sum together with
      # the bounds: rows i..k and columns pos[local_max]..local_max.
      if sum[local_max] > max[0]
        max = [sum[local_max], i, pos[local_max], k, local_max]
      end
    end
  end
  return max # [max_sum,top,left,bottom,right]
end
Some notes for clarification:
I use an array to store all the values pertaining to the result for convenience. You can just use five standalone variables: max, top, left, bottom, right. It's just easier to assign in one line to the array and then the subroutine returns the array with all the needed information.
If you copy and paste this code in a text-highlight-enabled editor with Ruby support you'll obviously understand it better. Hope this helps!

There are already plenty of answers, but here is another Java implementation I wrote. It compares 3 solutions:
Naïve (brute force) - O(n^6) time
The obvious DP solution - O(n^4) time and O(n^4) space
The more clever DP solution based on Kadane's algorithm - O(n^3) time and O(n^2) space
There are sample runs for n = 10 thru n = 70 in increments of 10 with a nice output comparing run time and space requirements.
Code:
/**
 * Compares three ways of finding the maximum-sum contiguous submatrix of a
 * LENGTH x LENGTH integer matrix: brute force, a 4-D table, and 2-D Kadane.
 * Every solver considers only non-empty submatrices, so an all-negative
 * matrix reports its largest single element with valid corner coordinates.
 */
public class MaxSubarray2D {
    /** Side length of the square matrices handed to the solvers. */
    static int LENGTH;
    final static int MAX_VAL = 10;

    public static void main(String[] args) {
        for (int i = 10; i <= 70; i += 10) {
            LENGTH = i;
            // Random matrix with entries in [-MAX_VAL, MAX_VAL].
            int[][] a = new int[LENGTH][LENGTH];
            for (int row = 0; row < LENGTH; row++) {
                for (int col = 0; col < LENGTH; col++) {
                    a[row][col] = (int) (Math.random() * (MAX_VAL + 1));
                    if (Math.random() > 0.5D) {
                        a[row][col] = -a[row][col];
                    }
                    //System.out.printf("%4d", a[row][col]);
                }
                //System.out.println();
            }
            System.out.println("N = " + LENGTH);
            System.out.println("-------");
            long start, end;
            start = System.currentTimeMillis();
            naiveSolution(a);
            end = System.currentTimeMillis();
            System.out.println(" run time: " + (end - start) + " ms no auxiliary space requirements");
            start = System.currentTimeMillis();
            dynamicProgammingSolution(a);
            end = System.currentTimeMillis();
            System.out.println(" run time: " + (end - start) + " ms requires auxiliary space for "
                    + ((int) Math.pow(LENGTH, 4)) + " integers");
            start = System.currentTimeMillis();
            kadane2D(a);
            end = System.currentTimeMillis();
            System.out.println(" run time: " + (end - start) + " ms requires auxiliary space for "
                    + ((int) Math.pow(LENGTH, 2)) + " integers");
            System.out.println();
            System.out.println();
        }
    }

    // O(N^3) time, O(N^2) auxiliary space.
    public static void kadane2D(int[][] a) {
        // s[r][c] = a[0][c] + ... + a[r-1][c] (rows 1-indexed); row 0 stays
        // all zero because Java zero-initializes new arrays.
        int[][] s = new int[LENGTH + 1][LENGTH];
        for (int r = 1; r < LENGTH + 1; r++) {
            for (int c = 0; c < LENGTH; c++) {
                s[r][c] = s[r - 1][c] + a[r - 1][c];
            }
        }
        int maxSum = Integer.MIN_VALUE;
        int maxRowStart = -1;
        int maxColStart = -1;
        int maxRowEnd = -1;
        int maxColEnd = -1;
        for (int r1 = 1; r1 < LENGTH + 1; r1++) { // rows 1-indexed!
            for (int r2 = r1; r2 < LENGTH + 1; r2++) { // rows 1-indexed!
                int running = 0; // sum of the current column run c1..c
                int c1 = 0;
                for (int c = 0; c < LENGTH; c++) {
                    running += s[r2][c] - s[r1 - 1][c];
                    // Record before resetting, so the candidate is always the
                    // non-empty run c1..c and never an empty run with sum 0.
                    if (running > maxSum) {
                        maxSum = running;
                        maxRowStart = r1 - 1;
                        maxColStart = c1;
                        maxRowEnd = r2 - 1;
                        maxColEnd = c;
                    }
                    if (running <= 0) {
                        running = 0;
                        c1 = c + 1;
                    }
                }
            }
        }
        System.out.print("KADANE SOLUTION | Max sum: " + maxSum);
        System.out.print(" Start: (" + maxRowStart + ", " + maxColStart +
                ") End: (" + maxRowEnd + ", " + maxColEnd + ")");
    }

    // O(N^4) time, O(N^4) auxiliary space.
    public static void dynamicProgammingSolution(int[][] a) {
        // dynTable[r][c][h][w] = sum of the h x w rectangle whose top-left
        // corner is (r, c). Entries with h == 0 stay at the default 0 and act
        // as the base case of the recurrence.
        int[][][][] dynTable = new int[LENGTH][LENGTH][LENGTH + 1][LENGTH + 1];
        int maxSum = Integer.MIN_VALUE;
        int maxRowStart = -1;
        int maxColStart = -1;
        int maxRowEnd = -1;
        int maxColEnd = -1;
        for (int r = 0; r < LENGTH; r++) {
            for (int c = 0; c < LENGTH; c++) {
                // Only sizes that fit inside the matrix are filled and
                // compared; empty or oversized entries are never candidates.
                for (int h = 1; h <= LENGTH - r; h++) {
                    int rowTotal = 0;
                    for (int w = 1; w <= LENGTH - c; w++) {
                        rowTotal += a[r + h - 1][c + w - 1];
                        dynTable[r][c][h][w] = rowTotal + dynTable[r][c][h - 1][w];
                        if (dynTable[r][c][h][w] > maxSum) {
                            maxSum = dynTable[r][c][h][w];
                            maxRowStart = r;
                            maxColStart = c;
                            maxRowEnd = r + h - 1;
                            maxColEnd = c + w - 1;
                        }
                    }
                }
            }
        }
        System.out.print(" DP SOLUTION | Max sum: " + maxSum);
        System.out.print(" Start: (" + maxRowStart + ", " + maxColStart +
                ") End: (" + maxRowEnd + ", " + maxColEnd + ")");
    }

    // O(N^6) time, no auxiliary space.
    public static void naiveSolution(int[][] a) {
        int maxSum = Integer.MIN_VALUE;
        int maxRowStart = -1;
        int maxColStart = -1;
        int maxRowEnd = -1;
        int maxColEnd = -1;
        for (int rowStart = 0; rowStart < LENGTH; rowStart++) {
            for (int colStart = 0; colStart < LENGTH; colStart++) {
                // End corners start at the start corner: an end before the
                // start would describe an empty rectangle with sum 0.
                for (int rowEnd = rowStart; rowEnd < LENGTH; rowEnd++) {
                    for (int colEnd = colStart; colEnd < LENGTH; colEnd++) {
                        int sum = 0;
                        for (int row = rowStart; row <= rowEnd; row++) {
                            for (int col = colStart; col <= colEnd; col++) {
                                sum += a[row][col];
                            }
                        }
                        if (sum > maxSum) {
                            maxSum = sum;
                            maxRowStart = rowStart;
                            maxColStart = colStart;
                            maxRowEnd = rowEnd;
                            maxColEnd = colEnd;
                        }
                    }
                }
            }
        }
        System.out.print(" NAIVE SOLUTION | Max sum: " + maxSum);
        System.out.print(" Start: (" + maxRowStart + ", " + maxColStart +
                ") End: (" + maxRowEnd + ", " + maxColEnd + ")");
    }
}

Here is a Java version of Ernesto implementation with some modifications:
/**
 * Returns a copy of the contiguous submatrix of {@code matrix} whose elements
 * have the largest sum, found in O(rows^2 * cols) time with column prefix
 * sums and Kadane's algorithm. Also prints the winning sum to standard output.
 *
 * @param matrix rectangular matrix (all rows the same length); may be empty
 * @return the maximum-sum submatrix, or an empty array when {@code matrix}
 *         has no rows or no columns
 */
public int[][] findMaximumSubMatrix(int[][] matrix){
    int rows = matrix.length;
    int cols = rows == 0 ? 0 : matrix[0].length;
    if (rows == 0 || cols == 0) {
        return new int[0][0];
    }
    // ps[r][c] = matrix[0][c] + ... + matrix[r][c] (vertical prefix sums)
    int[][] ps = new int[rows][cols];
    for (int c = 0; c < cols; c++) {
        for (int r = 0; r < rows; r++) {
            ps[r][c] = matrix[r][c] + (r == 0 ? 0 : ps[r - 1][c]);
        }
    }
    int maxSum = matrix[0][0];
    int top = 0, left = 0, bottom = 0, right = 0;
    // sum[j]: best sum of a column run ending at j; pos[j]: where it starts.
    // Every entry read below is rewritten in each pass (pos[0] is never
    // written and so stays 0), which is why no clearing is needed.
    int[] sum = new int[cols];
    int[] pos = new int[cols];
    for (int i = 0; i < rows; i++) {
        for (int k = i; k < rows; k++) {
            // Kadane over all columns with the i..k rows
            int localMax = 0; // column where the best run of this pass ends
            sum[0] = ps[k][0] - (i == 0 ? 0 : ps[i - 1][0]);
            for (int j = 1; j < cols; j++) {
                int value = ps[k][j] - (i == 0 ? 0 : ps[i - 1][j]);
                if (sum[j - 1] > 0) {
                    // extending the previous run beats starting over
                    sum[j] = sum[j - 1] + value;
                    pos[j] = pos[j - 1];
                } else {
                    sum[j] = value;
                    pos[j] = j;
                }
                if (sum[j] > sum[localMax]) {
                    localMax = j;
                }
            } // Kadane ends here
            if (sum[localMax] > maxSum) {
                // new best: rows i..k, columns pos[localMax]..localMax
                maxSum = sum[localMax];
                top = i;
                left = pos[localMax];
                bottom = k;
                right = localMax;
            }
        }
    }
    // The reported value is the sum of the elements, not a determinant.
    System.out.println("Max SubMatrix sum = " + maxSum);
    // composing the required matrix
    int[][] output = new int[bottom - top + 1][right - left + 1];
    for (int i = top, k = 0; i <= bottom; i++, k++) {
        for (int j = left, l = 0; j <= right; j++, l++) {
            output[k][l] = matrix[i][j];
        }
    }
    return output;
}
/** Overwrites every element of {@code a} with zero. */
private void reset(int[] a) {
    java.util.Arrays.fill(a, 0);
}

With the help of the Algorithmist and Larry and a modification of Kadane's Algorithm, here is my solution:
int dim = matrix.length;
// Vertical prefix sums: ps[r][c] = matrix[0][c] + ... + matrix[r][c].
int[][] ps = new int[dim][dim];
for (int col = 0; col < dim; col++) {
    for (int row = 0; row < dim; row++) {
        int above = (row == 0) ? 0 : ps[row - 1][col];
        ps[row][col] = matrix[row][col] + above;
    }
}
int maxSoFar = 0;
int min , subMatrix;
// For every band of rows top..bottom, scan the columns left to right.
// subMatrix is the running sum of the band's columns, min is the smallest
// running sum seen so far (starting from 0 for the empty prefix), and
// subMatrix - min is the best column run ending at the current column.
for (int top = 0; top < dim; top++) {
    for (int bottom = top; bottom < dim; bottom++) {
        min = 0;
        subMatrix = 0;
        for (int col = 0; col < dim; col++) {
            int aboveBand = (top == 0) ? 0 : ps[top - 1][col];
            subMatrix += ps[bottom][col] - aboveBand;
            min = Math.min(min, subMatrix);
            maxSoFar = Math.max(maxSoFar, subMatrix - min);
        }
    }
}
The only thing left is to determine the submatrix elements, i.e: the top left and the bottom right corner of the submatrix. Anyone suggestion?

This is my implementation of the 2D Kadane algorithm. I think it is clearer. The concept is based on the plain 1D Kadane algorithm. The first and second loops of the main part (at the bottom of the code) pick every combination of top and bottom rows, and the third loop runs 1D Kadane over the column sums between those rows (each column sum can be computed in constant time because the matrix is preprocessed into column prefix sums, so it is just the difference of the values in the two picked rows). Here is the code:
int [][] m = {
    {1,-5,-5},
    {1,3,-5},
    {1,3,-5}
};
int N = m.length;
// Turn m into column prefix sums in place (row 0 is already its own prefix),
// so the sum of column j over rows i..k is m[k][j] - m[i-1][j].
for (int j=1; j<N; ++j)
    for (int i=0; i<N; ++i)
        m[j][i] = m[j][i] + m[j-1][i];
int total_max = 0, sum;
for (int i=0; i<N; ++i) {
    for (int k=i; k<N; ++k) { //for each combination of rows
        sum = 0;
        for (int j=0; j<N; j++) { //kadane algorithm over the column sums
            sum += i==0 ? m[k][j] : m[k][j] - m[i-1][j]; //the first row has nothing above it
            total_max = Math.max(sum, total_max);
            // Kadane's reset: a negative running sum can only hurt what
            // follows, so start a fresh run at the next column. Without this
            // only rectangles starting at column 0 would be considered.
            if (sum < 0)
                sum = 0;
        }
    }
}
System.out.println(total_max);

I am going to post an answer here and can add actual C++ code if it is requested, because I recently worked through this. There are rumors of a divide-and-conquer approach that can solve this in O(N^2), but I haven't seen any code to support this. In my experience, the following is what I have found.
O(i^3j^3) -- naive brute force method
O(i^2j^2) -- dynamic programming with memoization
O(i^2j) -- using max contiguous sub sequence for an array
if ( i == j )
O(n^6) -- naive
O(n^4) -- dynamic programming
O(n^3) -- max contiguous sub sequence

Have a look at JAMA package; I believe it will make your life easier.

Here is the C# solution. Ref: http://www.algorithmist.com/index.php/UVa_108
/// <summary>
/// Finds the contiguous submatrix of <paramref name="inMtrx"/> with the
/// largest sum by running Kadane's algorithm on the column sums of every
/// span of rows. Works for rectangular as well as square matrices.
/// Also prints the intermediate cumulative-sum matrix via PrintMatrix.
/// </summary>
/// <param name="inMtrx">Input matrix (rows x columns).</param>
/// <returns>The best sum together with its row and column bounds.</returns>
public static MaxSumMatrix FindMaxSumSubmatrix(int[,] inMtrx)
{
    MaxSumMatrix maxSumMtrx = new MaxSumMatrix();
    // Step 1. Create SumMatrix - do the cumulative columnar summation
    // S[i,j] = S[i-1,j]+ inMtrx[i-1,j]; row 0 of S stays all zero.
    int rows = inMtrx.GetUpperBound(0) + 1;
    int m = rows + 1;
    int n = inMtrx.GetUpperBound(1) + 1;
    int[,] sumMatrix = new int[m, n];
    for (int i = 1; i < m; i++)
    {
        for (int j = 0; j < n; j++)
        {
            sumMatrix[i, j] = sumMatrix[i - 1, j] + inMtrx[i - 1, j];
        }
    }
    PrintMatrix(sumMatrix);
    // Step 2. For every span of rows x..y build the 1-D array r_ij of column sums.
    // The span indices range over the ROW count, not the column count.
    for (int x = 0; x < rows; x++)
    {
        for (int y = x; y < rows; y++)
        {
            int[] r_ij = new int[n];
            for (int k = 0; k < n; k++)
            {
                r_ij[k] = sumMatrix[y + 1, k] - sumMatrix[x, k];
            }
            // Step 3. Find MaxSubarray of this r_ij. If the sum is greater than the last recorded sum =>
            // capture Sum, colStartIdx, ColEndIdx.
            // capture current x as rowTopIdx, y as rowBottomIdx.
            MaxSum currMaxSum = KadanesAlgo.FindMaxSumSubarray(r_ij);
            if (currMaxSum.maxSum > maxSumMtrx.sum)
            {
                maxSumMtrx.sum = currMaxSum.maxSum;
                maxSumMtrx.colStart = currMaxSum.maxStartIdx;
                maxSumMtrx.colEnd = currMaxSum.maxEndIdx;
                maxSumMtrx.rowStart = x;
                maxSumMtrx.rowEnd = y;
            }
        }
    }
    return maxSumMtrx;
}
// Prints the whole matrix by delegating to the ranged overload with the
// full row and column index ranges.
public static void PrintMatrix(int[,] matrix)
{
    PrintMatrix(matrix, 0, matrix.GetUpperBound(0), 0, matrix.GetUpperBound(1));
}
// Writes rows startRow..endRow and columns startCol..endCol (inclusive) of
// matrix to the console. Each row starts on a new line and every value is
// followed by a single space.
public static void PrintMatrix(int[,] matrix, int startRow, int endRow, int startCol, int endCol)
{
    StringBuilder text = new StringBuilder();
    int row = startRow;
    while (row <= endRow)
    {
        text.Append(Environment.NewLine);
        int col = startCol;
        while (col <= endCol)
        {
            text.AppendFormat("{0} ", matrix[row, col]);
            col++;
        }
        row++;
    }
    Console.WriteLine(text.ToString());
}
// Kadane's algorithm on a 1-D array: finds the contiguous run of inArr with
// the largest sum and returns that sum with the run's start and end indices.
// For an empty array the result is (int.MinValue, 0, 0).
public static MaxSum FindMaxSumSubarray(int[] inArr)
{
    MaxSum best = new MaxSum(int.MinValue, 0, 0);
    int runningSum = 0; // sum of the run currently being extended
    int runStart = 0;   // index where that run begins
    for (int idx = 0; idx < inArr.Length; idx++)
    {
        runningSum += inArr[idx];
        // Compare before any reset so that an all-negative array still
        // yields its largest single element.
        if (runningSum > best.maxSum)
        {
            best = new MaxSum(runningSum, runStart, idx);
        }
        // A negative run cannot help anything after it: start afresh.
        if (runningSum < 0)
        {
            runningSum = 0;
            runStart = idx + 1;
        }
    }
    return best;
}
// Result of a 1-D maximum-subarray search: the sum and the index bounds of the run.
struct MaxSum
{
// Sum of the best run.
public int maxSum;
// Index of the first element of the best run.
public int maxStartIdx;
// Index of the last element of the best run.
public int maxEndIdx;
// Initializes all three fields from the given values.
public MaxSum(int mxSum, int mxStart, int mxEnd)
{
this.maxSum = mxSum;
this.maxStartIdx = mxStart;
this.maxEndIdx = mxEnd;
}
}
// Result of a 2-D maximum-submatrix search. The defaults (int.MinValue sum,
// -1 bounds) are the values the fields hold until a result is assigned.
class MaxSumMatrix
{
// Sum of the best submatrix found so far.
public int sum = int.MinValue;
// Row bounds of the best submatrix.
public int rowStart = -1;
public int rowEnd = -1;
// Column bounds of the best submatrix.
public int colStart = -1;
public int colEnd = -1;
}

Here is my solution. It's O(n^3) in time and O(n^2) space.
https://gist.github.com/toliuweijing/6097144
// 0th O(n) on all candidate bottoms #B.
// 1th O(n) on candidate tops #T.
// 2th O(n) on finding the maximum #left/#right match.
/// @brief Largest sum over all contiguous sub-rectangles of @p mat.
///
/// Runs in O(rows^2 * cols) time: for every (top, bottom) pair of rows the
/// per-column sums are scanned left to right while tracking the smallest
/// prefix sum seen so far.
///
/// @param mat Rectangular matrix (every row has the length of row 0). It is
///            not modified.
/// @return The best sum, or 0 when @p mat is empty or no rectangle has a
///         positive sum (the empty rectangle counts as a candidate).
int maxRect(std::vector<std::vector<int> >& mat) {
    const std::size_t rows = mat.size();
    if (rows == 0) return 0;
    const std::size_t cols = mat[0].size();

    // colSum[r][c] = mat[0][c] + ... + mat[r][c]. Built on a copy so the
    // caller's matrix is left untouched.
    std::vector<std::vector<int> > colSum(mat);
    for (std::size_t r = 1; r < rows; ++r)
        for (std::size_t c = 0; c < cols; ++c)
            colSum[r][c] += colSum[r - 1][c];

    int optrect = 0;
    for (std::size_t b = 0; b < rows; ++b) {
        for (std::size_t t = 0; t <= b; ++t) {
            int minLeft = 0; // smallest prefix sum so far (0 = empty prefix)
            int prefix = 0;  // sum of columns 0..c restricted to rows t..b
            for (std::size_t c = 0; c < cols; ++c) {
                prefix += (t == 0) ? colSum[b][c]
                                   : colSum[b][c] - colSum[t - 1][c];
                optrect = std::max(optrect, prefix - minLeft);
                minLeft = std::min(minLeft, prefix);
            }
        }
    }
    return optrect;
}

I would just parse the NxN array removing the -ves whatever remains is the highest sum of a sub matrix.
The question doesn't say you have to leave the original matrix intact or that the order matters.

Develop Reference

ruby bash windows laravel spring algorithm oracle macos go visual-studio