Assume I have these two arrays:
float arr[] = {40.4357,40.6135,40.2477,40.2864,39.3449,39.8901,40.103,39.9959,39.7863,39.9102,39.2652,39.2688,39.5147,38.2246,38.5376,38.4512,38.9951,39.0999,39.3057,38.53,38.2761,38.1722,37.8816,37.6521,37.8306,38.0853,37.9644,38.0626,38.0567,38.3518,38.4044,38.3553,38.4978,38.3768,38.2058,38.3175,38.3123,38.262,38.0093,38.3685,38.0111,38.4539,38.8122,39.1413,38.9409,39.2043,39.3538,39.4123,39.3628,39.2825,39.1898,39.0431,39.0634,38.5993,38.252,37.3793,36.6334,36.4009,35.2822,34.4262,34.2119,34.1552,34.3325,33.9626,33.2661,32.3819,35.1959,36.7602,37.9039,37.8103,37.5832,37.9718,38.3111,38.9323,38.6763,39.1163,38.8469,39.805,40.2627,40.3689,40.4064,40.0558,40.815,41.0234,41.0128,41.0296,41.0927,40.7046,40.6775,40.2711,40.1283,39.7518,40.0145,40.0394,39.8461,39.6317,39.5548,39.1996,38.9861,38.8507,38.8603,38.483,38.4711,38.4214,38.4286,38.5766,38.7532,38.7905,38.6029,38.4635,38.1403,36.6844,36.616,36.4053,34.7934,34.0226,33.0505,33.4978,34.6106,35.284,35.7535,35.3541,35.5481,35.4086,35.7096,36.0526,36.1222,35.9408,36.1007,36.7952,36.99,37.1024,37.0993,37.3144,36.6951,37.1213,38.0026,38.1266,39.2538,38.8963,39.0158,38.6235,38.7908,38.6041,38.4489,38.3207,37.7398,38.5304,38.925,38.7249,38.9221,39.1704,39.5113,40.0613,39.3602,39.8689,39.973,40.0524,40.0025,40.7584,40.9714,40.9106,40.9685,40.6554,39.7314,39.0044,38.7183,38.5163,38.6101,38.2004,38.7606,38.7532,37.8903,37.8403,38.5368,39.0462,38.8279,39.0748,39.2907,38.5447,38.423,38.5624,38.476,38.5784,39.0905,39.379,39.4739,39.5774,40.7036,40.3044,39.6162,39.9967,40.0562,39.3426,38.666,38.7561,39.2823,38.8548,37.6214,37.8188,38.1086,38.3619,38.5472,38.1357,38.1422,37.95,37.1837,37.4636,36.8852,37.1617,37.5051,37.7724,38.0879,37.7197,38.0422,37.8551,38.5688,38.8388};
float pattern[] = {38.6434,38.1409,37.3391,37.5457,37.7487,37.7499,37.6121,37.4789,37.5821,37.6541,38.0365,37.7907,37.9932,37.9945,37.7032,37.3556,37.6359,37.5412,37.5296,37.8829,38.3797,38.4452,39.0929,39.1233,39.3014,39.0317,38.903,38.8221,39.045,38.6944,39.0699,39.0978,38.9877,38.8123,38.7491,38.5888,38.7875,38.2086,37.7484,37.3961,36.8663,36.2607,35.8838,35.3297,35.5574,35.7239};
I've uploaded this example graph:
As you can see in the graph, the pattern almost fits in the array at index 17.
What's the best and fastest way to find this index? And is there a way to compute a confidence for the match, because the values are not exactly equal, as you can see?
If the starting index is your only degree of freedom, you can just try each index and calculate the sum of squared errors for each of the data points. In Python this could look like this:
# Full series from the question (220 samples) and the 46-sample pattern.
data = [40.4357,40.6135,40.2477,40.2864,39.3449,39.8901,40.103,39.9959,39.7863,39.9102,
        39.2652,39.2688,39.5147,38.2246,38.5376,38.4512,38.9951,39.0999,39.3057,38.53,
        38.2761,38.1722,37.8816,37.6521,37.8306,38.0853,37.9644,38.0626,38.0567,38.3518,
        38.4044,38.3553,38.4978,38.3768,38.2058,38.3175,38.3123,38.262,38.0093,38.3685,
        38.0111,38.4539,38.8122,39.1413,38.9409,39.2043,39.3538,39.4123,39.3628,39.2825,
        39.1898,39.0431,39.0634,38.5993,38.252,37.3793,36.6334,36.4009,35.2822,34.4262,
        34.2119,34.1552,34.3325,33.9626,33.2661,32.3819,35.1959,36.7602,37.9039,37.8103,
        37.5832,37.9718,38.3111,38.9323,38.6763,39.1163,38.8469,39.805,40.2627,40.3689,
        40.4064,40.0558,40.815,41.0234,41.0128,41.0296,41.0927,40.7046,40.6775,40.2711,
        40.1283,39.7518,40.0145,40.0394,39.8461,39.6317,39.5548,39.1996,38.9861,38.8507,
        38.8603,38.483,38.4711,38.4214,38.4286,38.5766,38.7532,38.7905,38.6029,38.4635,
        38.1403,36.6844,36.616,36.4053,34.7934,34.0226,33.0505,33.4978,34.6106,35.284,
        35.7535,35.3541,35.5481,35.4086,35.7096,36.0526,36.1222,35.9408,36.1007,36.7952,
        36.99,37.1024,37.0993,37.3144,36.6951,37.1213,38.0026,38.1266,39.2538,38.8963,
        39.0158,38.6235,38.7908,38.6041,38.4489,38.3207,37.7398,38.5304,38.925,38.7249,
        38.9221,39.1704,39.5113,40.0613,39.3602,39.8689,39.973,40.0524,40.0025,40.7584,
        40.9714,40.9106,40.9685,40.6554,39.7314,39.0044,38.7183,38.5163,38.6101,38.2004,
        38.7606,38.7532,37.8903,37.8403,38.5368,39.0462,38.8279,39.0748,39.2907,38.5447,
        38.423,38.5624,38.476,38.5784,39.0905,39.379,39.4739,39.5774,40.7036,40.3044,
        39.6162,39.9967,40.0562,39.3426,38.666,38.7561,39.2823,38.8548,37.6214,37.8188,
        38.1086,38.3619,38.5472,38.1357,38.1422,37.95,37.1837,37.4636,36.8852,37.1617,
        37.5051,37.7724,38.0879,37.7197,38.0422,37.8551,38.5688,38.8388]
pattern = [38.6434,38.1409,37.3391,37.5457,37.7487,37.7499,37.6121,37.4789,37.5821,37.6541,
           38.0365,37.7907,37.9932,37.9945,37.7032,37.3556,37.6359,37.5412,37.5296,37.8829,
           38.3797,38.4452,39.0929,39.1233,39.3014,39.0317,38.903,38.8221,39.045,38.6944,
           39.0699,39.0978,38.9877,38.8123,38.7491,38.5888,38.7875,38.2086,37.7484,37.3961,
           36.8663,36.2607,35.8838,35.3297,35.5574,35.7239]

# Slide the pattern over the data and keep the offset with the smallest
# sum of squared errors (SSE).  Lower SSE = better fit, so best_err doubles
# as a (inverse) confidence measure for the match.
best_ind, best_err = 0, float("inf")
# "+ 1" so the final alignment (pattern flush with the end of data) is
# also tested — the original range stopped one offset short.
for i in range(len(data) - len(pattern) + 1):
    window = data[i : i + len(pattern)]
    err = sum((d - p) ** 2 for d, p in zip(window, pattern))
    if err < best_err:
        best_ind, best_err = i, err
Result:
>>> print(best_ind, best_err)
17 21.27929269
The straightforward algorithm is to choose a measure of similarity (this might be the average of the errors, their squared values, or any other function suitable for your purposes) and apply the following steps:
Let i = 0 be an integer index, M be a container of size = length(data) - length(pattern) + 1 to store the measurings
If i < size then shift your pattern by i, otherwise go to step 5
Calculate the measure of similarity and store into M
i = i + 1, go to 2 and repeat
Choose the index of the minimum value in M
It's a one-liner in Python, using the fact that tuples are sorted lexicographically:
In [1]:
import numpy as np

# Data series (220 samples) and pattern (46 samples) from the question.
arr = np.array([40.4357,40.6135,40.2477,40.2864,39.3449,39.8901,40.103,39.9959,39.7863,39.9102,
                39.2652,39.2688,39.5147,38.2246,38.5376,38.4512,38.9951,39.0999,39.3057,38.53,
                38.2761,38.1722,37.8816,37.6521,37.8306,38.0853,37.9644,38.0626,38.0567,38.3518,
                38.4044,38.3553,38.4978,38.3768,38.2058,38.3175,38.3123,38.262,38.0093,38.3685,
                38.0111,38.4539,38.8122,39.1413,38.9409,39.2043,39.3538,39.4123,39.3628,39.2825,
                39.1898,39.0431,39.0634,38.5993,38.252,37.3793,36.6334,36.4009,35.2822,34.4262,
                34.2119,34.1552,34.3325,33.9626,33.2661,32.3819,35.1959,36.7602,37.9039,37.8103,
                37.5832,37.9718,38.3111,38.9323,38.6763,39.1163,38.8469,39.805,40.2627,40.3689,
                40.4064,40.0558,40.815,41.0234,41.0128,41.0296,41.0927,40.7046,40.6775,40.2711,
                40.1283,39.7518,40.0145,40.0394,39.8461,39.6317,39.5548,39.1996,38.9861,38.8507,
                38.8603,38.483,38.4711,38.4214,38.4286,38.5766,38.7532,38.7905,38.6029,38.4635,
                38.1403,36.6844,36.616,36.4053,34.7934,34.0226,33.0505,33.4978,34.6106,35.284,
                35.7535,35.3541,35.5481,35.4086,35.7096,36.0526,36.1222,35.9408,36.1007,36.7952,
                36.99,37.1024,37.0993,37.3144,36.6951,37.1213,38.0026,38.1266,39.2538,38.8963,
                39.0158,38.6235,38.7908,38.6041,38.4489,38.3207,37.7398,38.5304,38.925,38.7249,
                38.9221,39.1704,39.5113,40.0613,39.3602,39.8689,39.973,40.0524,40.0025,40.7584,
                40.9714,40.9106,40.9685,40.6554,39.7314,39.0044,38.7183,38.5163,38.6101,38.2004,
                38.7606,38.7532,37.8903,37.8403,38.5368,39.0462,38.8279,39.0748,39.2907,38.5447,
                38.423,38.5624,38.476,38.5784,39.0905,39.379,39.4739,39.5774,40.7036,40.3044,
                39.6162,39.9967,40.0562,39.3426,38.666,38.7561,39.2823,38.8548,37.6214,37.8188,
                38.1086,38.3619,38.5472,38.1357,38.1422,37.95,37.1837,37.4636,36.8852,37.1617,
                37.5051,37.7724,38.0879,37.7197,38.0422,37.8551,38.5688,38.8388])
pattern = np.array([38.6434,38.1409,37.3391,37.5457,37.7487,37.7499,37.6121,37.4789,37.5821,37.6541,
                    38.0365,37.7907,37.9932,37.9945,37.7032,37.3556,37.6359,37.5412,37.5296,37.8829,
                    38.3797,38.4452,39.0929,39.1233,39.3014,39.0317,38.903,38.8221,39.045,38.6944,
                    39.0699,39.0978,38.9877,38.8123,38.7491,38.5888,38.7875,38.2086,37.7484,37.3961,
                    36.8663,36.2607,35.8838,35.3297,35.5574,35.7239])

# For every alignment compute the mean squared error of the overlapping
# window; tuples compare lexicographically, so min() returns the pair
# (smallest MSE, its index).  Fixes vs. the original: Python 2's xrange
# replaced by range, and "+ 1" added so the last alignment is tested too.
best_err, best_ind = min(
    (((arr[i : i + len(pattern)] - pattern) ** 2).mean(), i)
    for i in range(len(arr) - len(pattern) + 1)
)
Out[5]:
(0.46259331934782588, 17)
where 0.46 is the minimal mean squared error, and 17 is the position of the minimum in arr.
I have two matrices in Matlab A and B, which have equal number of columns but different number of rows. The number of rows in B is also less than the number of rows in A. B is actually a subset of A.
How can I remove those rows efficiently from A, where the values in columns 1 and 2 of A are equal to the values in columns 1 and 2 of matrix B?
At the moment I'm doing this:
% Original (inefficient) approach: one pass over B per row, deleting the
% matching rows of A in place.  A(...) = [] resizes A on (almost) every
% iteration, which is why MATLAB's editor flags this as slow.
for k = 1:size(B, 1)
A(find((A(:,1) == B(k,1) & A(:,2) == B(k,2))), :) = [];
end
and Matlab complains that this is inefficient and that I should try to use any, but I'm not sure how to do it with any. Can someone help me out with this? =)
I tried this, but it doesn't work:
A(any(A(:,1) == B(:,1) & A(:,2) == B(:,2), 2), :) = [];
It complains the following:
Error using ==
Matrix dimensions must agree.
Example of what I want:
A-B in the results means that the rows of B are removed from A. The same goes with A-C.
try using setdiff. for example:
c=setdiff(a,b,'rows')
Note, if order is important use:
c = setdiff(a,b,'rows','stable')
Edit: reading the edited question and the comments to this answer, the specific usage of setdiff you look for is (as noticed by Shai):
[temp c] = setdiff(a(:,1:2),b(:,1:2),'rows','stable')
c = a(c,:)
Alternative solution:
you can just use ismember:
a(~ismember(a(:,1:2),b(:,1:2),'rows'),:)
Use bsxfun:
% Compare columns 1:2 of every row of A against every row of B in one shot.
% compare is numRowsA-by-numRowsB-by-2, true where A(ii,kk) == B(jj,kk).
% (The pasted code read "#eq" — a markdown-mangled "@eq" function handle,
% which is a syntax error in MATLAB; restored here.)
compare = bsxfun( @eq, permute( A(:,1:2), [1 3 2]), permute( B(:,1:2), [3 1 2] ) );
twoEq = all( compare, 3 );    % both columns equal for this (row of A, row of B) pair
toRemove = any( twoEq, 2 );   % row of A matches at least one row of B
A( toRemove, : ) = [];        % delete all matched rows in a single resize
Explaining the code:
First we use bsxfun to compare all pairs of first to column of A and B, resulting with compare of size numRowsA-by-numRowsB-by-2 with true where compare( ii, jj, kk ) = A(ii,kk) == B(jj,kk).
Then we use all to create twoEq of size numRowsA-by-numRowsB where each entry indicates if both corresponding entries of A and B are equal.
Finally, we use any to select rows of A that matches at least one row of B.
What's wrong with original code:
By removing rows of A inside a loop (i.e., A( ... ) = []) you actually resizing A at almost each iteration. See this post on why exactly this is a bad practice.
Using setdiff
In order to use setdiff (as suggested by natan) on only the first two columns you'll need to use its second output argument:
% setdiff's second output gives the indices (into A) of the surviving rows,
% computed on columns 1:2 only; 'stable' preserves A's original row order.
[ignore, ia] = setdiff( A(:,1:2), B(:,1:2), 'rows', 'stable' );
A = A( ia, : ); % keeping only relevant rows, beyond first two columns.
Here's another bsxfun implementation -
% One-liner bsxfun variant: keep only rows of A whose first two columns
% match no row of B.  ("#eq" in the paste was a mangled "@eq".)
A(~any(squeeze(all(bsxfun(@eq,A(:,1:2),permute(B(:,1:2),[3 2 1])),2)),2),:)
One more that is dangerously close to Shai's solution, but still avoids two permute to one permute -
% Same idea with a single permute instead of two.  ("#eq" was a mangled "@eq".)
A(~any(all(bsxfun(@eq,A(:,1:2),permute(B(:,1:2),[3 2 1])),2),3),:)
Given a set of n numbers (1 <= n <= 100) where each number is an integer between 1 and 450,we need to distribute those set of numbers into two sets A and B, such that the following two cases hold true:
The total numbers in each set differ by at most 1.
The sum of all the numbers in A is as nearly equal as possible to the sum of all the numbers in B i.e. the distribution should be fair.
Can someone please suggest an efficient algorithm for solving the above problem ?
Thank You.
Since the numbers are small it is not NP-complete.
To solve it you can use dynamic programming:
Make a three-dimensional table of booleans
where true at t[s, n, i] means that the sum s can be reached with a subset of n elements below index i.
To compute the value for t[s, n, i] check t[s, n, i-1] and t[s - a[i], n-1, i-1].
Then look through the table at second index n/2 to find the best solution.
Edit: You actually don't need the complete table at once. You can make a two dimensional table t_i[s, n] for each index i and compute the table for i from the table for i-1, so you only need two of these two-dimensional tables, which saves a lot of memory. (Thanks to Martin Hock.)
This is a constrained version of the Number Partitioning Problem. Usually the goal is to find any 2 disjoint subsets that minimize the difference of the sums. Your problem is constrained in the sense that you only consider 1 possibility: 2 sets of size N/2 (or 1 set of size N/2 and one of size N/2+1 if the total number is uneven). This dramatically reduces the search space, but I can't think of a good algorithm at the moment; I'll think about it.
If the numbers are sequential then you just alternate assigning them between A and B.
I suspect they are not, in which case...
Assign the largest unassigned number to the group with the lowest sum unless the difference in size of the groups is less than or equal to the count of unassigned numbers (in which case assign all of the remaining numbers to the smaller group).
It won't find the best solution in all cases, but its close and simple.
Never mind, I thought the numbers were sequential. This looks kind of like the Knapsack Problem, which is NP hard.
The numbers are sequential?
Put the largest number in A
Put the next largest number in B
Put the next largest number in B
Put the next largest number in A
Repeat step 1 until all the numbers are assigned.
Proof:
After every multiple of 4 numbers has been assigned, A and B both contain the same number of items, and the sum of the items in each group are the same because
(n) + (n - 3) == (n - 1) + (n - 2)
In the last iteration we are at Step 1 above and we have either 0, 1 [1], 2 [1, 2], or 3 [1, 2, 3] numbers remaining.
In case 0, we are done and the groups are equal in count and weight.
In case 1, we assign the number 1 to group A. Group A has one more item and one more weight. This is as fair as we can get in this situation.
In case 2, we assign the number 2 to group A and the number 1 to group B. Now the groups have the same number of items and group A has one extra weight. Again, this is as fair as we can get.
In case 3, assign the number 3 to group A, and assign numbers 2 and 1 to group B. Now the groups have the same weight (3 == 2 + 1) and group B has one extra item.
First, find a solution to the problem without the first constraint (i.e. - making sums as close as possible). This problem can be solved using DP approach (you can read more about DP here, and the first problem - about coins - is very similar to yours).
Once you can solve it, you can add one more state to your DP - the number of persons selected to the subset already. This gives you a N^3 algorithm.
I have an algorithm for you. It is using a lot of recursive and iterative concepts.
Assuming you have n number Xn with 1 <= n <= 100 and 1 <= Xn <= 450.
If n < 3 then distribute numbers and stop algorithm,
If n > 2 then sort your list of number in ascending order,
Compute the total sum S of all numbers,
Then divide the previous total S by (n - n%2)/2 and obtain the A value,
Now we will create couple of numbers which addition will be as near as possible as A. Get the first number and find a second number in order to obtain a sum S1 as near as possible than A. Put S1 in a new list of number and keep in memory how the sum was computed in order to have the base numbers later.
Execute 5. until numbers in the list is < 2. Then put the remaining numbers to the sum list and restart algorithm to point 1. with new list.
Example:
Assuming: n = 7 and numbers are 10, 75, 30, 45, 25, 15, 20
Pass 1:
Since n > 2 so sort the list : 10, 15, 20, 25, 30, 45, 75
Sum S = 220
A = 220 / ((7-1)/2) = 73
Couples:
10 & 75 => 85
15 & 45 => 60
20 & 30 => 50
Remaining numbers are < 2 so add 25 in the sum list : 85(10,75), 60(15,45), 50(20,30), 25(25)
Pass 2:
n = 4 and numbers are 85, 60, 50, 25
List count is > 2 so sort list : 25(25), 50(20,30), 60(15,45), 85(10,75)
Sum S is still the same (S=220) but A must be recompute : A = 220 / ((4-0)/2) = 110
Couples:
25 & 85 => 110
50 & 60 => 110
The Sum list is : 110(25(25),85(10,75)), 110(50(20,30),60(15,45))
Pass 3:
n = 2 and numbers are 110, 110
n < 3 so distribute numbers:
A = 25, 10, 75
B = 20, 30, 15, 45
This works on each scenario I have tested.
your requirement in #2 needs clarification, because:
"The sum of all the numbers in A is as nearly equal as possible to the sum of all the numbers in B" is clear, but then your statement "the distribution should be fair" makes everything unclear. What does 'fair' exactly mean? Does the process need a random element in it?
@ShreevatsaR notes that the algorithm below is known as the greedy algorithm. It does not do very well with certain inputs (I tried 10 different sets of randomly generated inputs of size 100 and in all cases the sums were very close, which led me to think sorting the input was enough for the success of this algorithm).
See also "The Easiest Hard Problem", American Scientist, March-April 2002, recommended by ShreevatsaR.
#!/usr/bin/perl
# Greedy partitioner: repeatedly take the largest remaining number and add
# it to whichever group currently has the smaller sum.
# NOTE: every "@" sigil in the pasted code had been mangled to "#", which
# turns the array accesses into comments; they are restored here.
use strict;
use warnings;
use List::Util qw( sum );

my @numbers = generate_list();
print "@numbers\n\n";

my (@A, @B);
my $N = @numbers;                   # total count, used for step numbering
while ( @numbers ) {
    my $n = pop @numbers;           # largest remaining (list is ascending)
    printf "Step: %d\n", $N - @numbers;
    {
        # sum() on an empty list returns undef; silence only that warning.
        no warnings 'uninitialized';
        if ( sum(@A) < sum(@B) ) {
            push @A, $n;
        }
        else {
            push @B, $n;
        }
        printf "A: %s\n\tsum: %d\n\tnum elements: %d\n",
            "@A", sum(@A), scalar @A;
        printf "B: %s\n\tsum: %d\n\tnum elements: %d\n\n",
            "@B", sum(@B), scalar @B;
    }
}

# Random subset of 1..450 (each value kept with probability ~0.2),
# returned in ascending order.
sub generate_list { grep { rand > 0.8 } 1 .. 450 }
Note that generate_list returns a list in ascending order.
I assume the numbers are not sequential, and you can't re-balance?
Because of constraint 1, you're going to need to switch buckets every other insertion, always. So every time you're not forced to pick a bucket, pick a logical bucket (where adding the number would make the sum closer to the other bucket). If this bucket isn't the same one as your previous bucket, you get another turn where you're not forced.
Any dual knapsack algorithm will do (regardless of distribution of numbers).
Simulated Annealing can quite quickly find better and better answers. You could keep 1. true while improving the nearness of 2.
If you need the perfect answer then you have to generate and loop through all of the possible sets of answers. If a pretty good answer is all you need then a technique like simulated annealing is the way to go. Heres some C code that uses a very primitive cooling schedule to find an answer.
#include <stdio.h>
#include <stdlib.h>
#define MAXPAR 50
#define MAXTRIES 10000000
// Sample input: 32 values in the problem's 1..450 range.
// NOTE(review): main() passes len = 31 for this array, so the final value
// (10) is never used — confirm whether that is intentional.
int data1[] = {192,130,446,328,40,174,218,31,59,234,26,365,253,11,198,98,
279,6,276,72,219,15,192,289,289,191,244,62,443,431,363,10
} ;
// Small sanity-check input (sums to 45, so a perfect split is impossible).
int data2[] = { 1,2,3,4,5,6,7,8,9 } ;
// Return the sum of the first len entries of data.
int sumSet ( int data[], int len )
{
    int total = 0 ;
    int i ;
    for ( i = len - 1 ; i >= 0 ; --i )
        total += data[i] ;
    return total ;
}
// Print the len elements of data on one line, followed by their total.
void printSet ( int data[], int len )
{
    int i ;
    for ( i = 0 ; i < len ; ++i )
        printf ( "%d ", data[i] ) ;
    printf ( " Sums to %d\n", sumSet ( data,len ) ) ;
}
// Partition the values using simulated annealing.
// Deals data[0..len) alternately into two sets (sizes differ by at most 1),
// then randomly swaps elements between the sets, keeping swaps that shrink
// the difference of the sums — plus occasional bad swaps, with a chance
// that decays as tries run out.  Progress is printed as it improves.
void partition ( int data[], size_t len )
{
    int set1[MAXPAR] = {0} ; // Partition 1
    int set2[MAXPAR] = {0} ; // Partition 2
    int set1Pos, set2Pos ;
    size_t dataPos ;
    int set1Len, set2Len ;   // Element COUNTS.  (The original stored the
                             // last *index* here, then passed it to
                             // sumSet()/rand()% as a count, silently
                             // dropping the last element of each set.)
    int minDiff ;            // The best solution found so far
    int sum1, sum2, diff ;
    int tries = MAXTRIES ;   // Don't loop for ever
    set1Len = set2Len = 0 ;
    dataPos = 0 ;
    // Initialize the two partitions: alternate elements between them.
    while ( dataPos < len )
    {
        set1[set1Len++] = data[dataPos++] ;
        if ( dataPos < len )
            set2[set2Len++] = data[dataPos++] ;
    }
    // Very primitive simulated annealing solution
    sum1 = sumSet ( set1, set1Len ) ;
    sum2 = sumSet ( set2, set2Len ) ;
    diff = sum1 - sum2 ; // The initial difference - we want to minimize this
    minDiff = sum1 + sum2 ;
    printf ( "Initial diff is %d\n", diff ) ;
    if ( set1Len == 0 || set2Len == 0 )
        tries = 0 ;          // 0/1-element input: nothing to swap, and this
                             // also avoids rand() % 0 (undefined behavior)
    // Loop until a solution is found or all tries are exhausted
    while ( diff != 0 && tries > 0 )
    {
        // Look for swaps that improve the difference
        int newDiff, newSum1, newSum2 ;
        set1Pos = rand() % set1Len ;
        set2Pos = rand() % set2Len ;
        newSum1 = sum1 - set1[set1Pos] + set2[set2Pos] ;
        newSum2 = sum2 + set1[set1Pos] - set2[set2Pos] ;
        newDiff = newSum1 - newSum2 ;
        if ( abs ( newDiff ) < abs ( diff ) || // Is this a better solution?
             tries/100 > rand() % MAXTRIES )   // Or swap anyway - chance of swap decreases as tries reduces
        {
            int tmp = set1[set1Pos] ;
            set1[set1Pos] = set2[set2Pos] ;
            set2[set2Pos] = tmp ;
            diff = newDiff ;
            sum1 = newSum1 ;
            sum2 = newSum2 ;
            // Print it out if it's the best we have seen so far
            if ( abs ( diff ) < abs ( minDiff ) )
            {
                minDiff = diff ;
                printSet ( set1, set1Len ) ;
                printSet ( set2, set2Len ) ;
                printf ( "diff of %d\n\n", abs ( diff ) ) ;
            }
        }
        --tries ;
    }
    printf ( "done\n" ) ;
}
int main ( int argc, char **argv )
{
    // Fixed seed so runs are reproducible; seed from the clock instead if
    // you want different results every time.
    srand ( 12345 ) ;
    // Derive the counts from the arrays themselves: the original passed a
    // hard-coded 31 for the 32-element data1, silently dropping the last
    // value (10) from the partitioning.
    partition ( data1, sizeof data1 / sizeof data1[0] ) ;
    partition ( data2, sizeof data2 / sizeof data2[0] ) ;
    return 0;
}
I would give genetic algorithms a try, as this seems a very nice problem to apply them.
The codification is just a binary string of length N, meaning 0 being in the first group and 1 in the second group. Give a negative fitness when the number of elements in each group differs, and a positive fitness when the sums are similar... Something like:
fitness(gen) = (sum(gen)-n/2))^2 + (sum(values[i]*(-1)**gen[i] for i in 0..n))^2
(And minimize the fitness)
Of course this can give you a sub-optimal answer, but for large real world problems it's usually enough.