sequence coverage algorithm - algorithm

I'm looking for an algorithm to solve the following problem:
Given the sequence n containing digits from 0 to 9 and m other sequences, find the smallest (containing the lowest amount) series of sequences that is equal to n.
Example
n = 123456
m1 = 12
m2 = 34
m3 = 56
m4 = 3456
output = m1 + m4 = 123456
Things I've thought of so far
Basic greedy technique using FSM or a trie tree to find the longest sequence fitting in the beginning:
while n not null
longest = the longest sequence fitting in the beginning of n
print longest
n = n - longest
Counterexample
n = 123456
m1 = 12
m2 = 1
m3 = 23456
m4 = 3
m5 = 4
m6 = 5
m7 = 6
algorithm will find m1 + m4 + m5 + m6 + m7 (12 + 3 + 4 + 5 + 6)
algorithm should find m2 + m3 (1 + 23456)
Another greedy method
array = {n} #this represents words to be matched
while array not empty
(word, index) = find longest sequence covering part of any word in array and its index
split array[index] into two words - first before found word, second after it
if any of those split items is null
remove it
Counterexample
n = 12345678
m1 = 3456
m2 = 1
m3 = 2
m4 = 7
m5 = 8
m6 = 123
m7 = 45
m8 = 678
algorithm will find m2 + m3 + m1 + m4 + m5 (1 + 2 + 3456 + 7 + 8)
algorithm should find m6 + m7 + m8 (123 + 45 + 678)

You can use dynamic programming to compute the result step by step.
Lets define s(i) the shortest sequence which generate the first i chars of n.
With the data of the last example, the values of s(i) are :
s(0) = { }
s(1) = { m2 }
s(2) = { m2 + m3 }
s(3) = { m6 }
s(4) = { } (no sequence can generate "1234")
s(5) = { m6 + m7 }
s(6) = { m2 + m3 + m1 }
s(7) = { m2 + m3 + m1 + m4 }
s(8) = { m6 + m7 + m8 }
You have to compute s(1) to s(n) in order. At each step i you look to all the sequences starting from s(0) to s(i-1) and keep the shortest.
For example, for s(8), you find that you have two solutions :
s(8) = s(5) + m8
s(8) = s(7) + m5
And you keep the shortest.
The algorithm :
function computeBestSequence(word, list of subsequences m)
let s(0) := {}
for i from 1 to n // We will compute s(i)
let minSize := +inf.
for j from 0 to i - 1
for all sequences mx from m1 to m9
if s(j) + mx = the first i char of word
if size of s(j) + mx is less than minSize
minSize := size of s(j) + mx
s(i) := s(j) + mx
Edit :
The algorithm can be simplified to use only two loops :
let s(0) := {}
for i from 1 to n // We will compute s(i)
let minSize := +inf.
for all sequences mx from m1 to m9
let j := i - mx.length
if s(j) + mx = the first i char of word
if size of s(j) + mx is less than minSize
minSize := size of s(j) + mx
s(i) := s(j) + mx

Based on obourgains answer java code of his edit version:
import java.util.LinkedList;
import java.util.List;
public class BestSequence {
public static List<String> match(String word, List<String> subsequences) {
int n = word.length();
//let s(0) := {}
List<List<String>> s = new LinkedList<>();
for(int i = 0; i <= n; i++)
s.add(new LinkedList<>());
//for i from 1 to n
for(int i = 1; i <= n; i++) {
//let minSize := +inf.
int minSize = Integer.MAX_VALUE;
//for all sequences mx from m1 to m9
for(String mx : subsequences) {
//let j := i - mx.length
int j = i - mx.length();
if(j < 0)
continue;
//if s(j) + mx = the first i char of word
if(word.substring(0, i).equals(concat(s.get(j)) + mx)) {
//if size of s(j) + mx is less than minSize
if(s.get(j).size() + 1 < minSize) {
//minSize := size of s(j) + mx
minSize = s.get(j).size() + 1;
//s(i) := s(j) + mx
List<String> sj = new LinkedList<>(s.get(j));
sj.add(mx);
s.set(i, sj);
}
}
}
}
return s.get(n);
}
private static String concat(List<String> strs) {
String s = "";
for(String str : strs) {
s += str;
}
return s;
}
}
The test:
#Test
public void bestSequenceTest() {
List<String> l = BestSequence.match("123456", Arrays.asList("12", "34", "56", "3456"));
System.out.println(l);
}
The output:
[12, 3456]

Related

Reverse the isometric projection algorithm

I've got this code:
const a = 2; // always > 0 and known in advance
const b = 3; // always > 0 and known in advance
const c = 4; // always > 0 and known in advance
for (let x = 0; x <= a; x++) {
for (let y = 0; y <= b; y++) {
for (let z = 0; z <= c; z++) {
for (let p = 0; p <= 1; p++) {
for (let q = 0; q <= 2; q++) {
let u = b + x - y + p;
let v = a + b + 2 * c - x - y - 2 * z + q;
let w = c + x + y - z;
}
}
}
}
}
The code generates (a+1)*(b+1)*(c+1)*2*3 triplets of (u,v,w), each of them is unique. And because of that fact, I think it should be possible to write reversed version of this algorithm that will calculate x,y,z,p,q based on u,v,w. I understand that there are only 3 equations and 5 variables to get, but known boundaries for x,y,z,p,q and the fact that all variables are integers should probably help.
for (let u = ?; u <= ?; u++) {
for (let v = ?; v <= ?; v++) {
for (let w = ?; w <= ?; w++) {
x = ?;
y = ?;
z = ?;
p = ?;
q = ?;
}
}
}
I even managed to produce the first line: for (let u = 0; u <= a + b + 1; u++) by taking the equation for u and finding min and max but I'm struggling with moving forward. I understand that min and max values for v are depending on u, but can't figure out the formulas.
Examples are in JS, but I will be thankful for any help in any programming language or even plain math formulas.
If anyone is interested in what this code is actually about - it projects voxel 3d model to triangles on a plain. u,v are resulting 2d coordinates and w is distance from the camera. Reversed algorithm will be actually a kind of raytracing.
UPDATE: Using line equations from 2 points I managed to create minmax conditions for v and code now looks like this:
for (let u = 0; u <= a + b + 1; u++) {
let minv = u <= a ? a - u : -a + u - 1;
let maxv = u <= b ? a + 2 * c + u + 2 : a + 2 * b + 2 * c - u + 3;
for (let v = minv; v <= maxv; v++) {
...
}
}
I think I know what to do with x, y, z, p, q on the last step so the problem left is minw and maxw. As far as I understand those values should depend both on u and v and I must use plane equations?
If the triplets are really unique (didn't check that) and if p and q always go up to 1 and 2 (respectively), then you can "group" triplets together and go up the loop chain.
We'll first find the 3 triplets that where made in the same "q loop" : the triplets make with the same x,y,z,p. As only q change, the only difference will be v, and it will be 3 consecutive numbers.
For that, let's group triplets such that, in a group, all triplets have the same u and same w. Then we sort triplets in groups by their v parameters, and we group them 3 by 3. Inside each group it's easy to assign the correct q variable to each triplet.
Then reduce the groups of 3 into the first triplet (the one with q == 0). We start over to assign the p variable : Group all triplets such that they have same v and w inside a group. Then sort them by the u value, and group them 2 by 2. This let's us find their p value. Remember that each triplet in the group of 3 (before reducing) has that same p value.
Then, for each triplet, we have found p and q. We solve the 3 equation for x,y,z :
z = -1 * ((v + w) - a - b - 3c -q)/3
y = (w - u + z + b - c - p)/2
x = u + y - b - p
After spending some time with articles on geometry and with the huge help from Wolfram Alpha, I managed to write needed equations myself. And yes, I had to use plane equations.
const a = 2; // always > 0 and known in advance
const b = 3; // always > 0 and known in advance
const c = 4; // always > 0 and known in advance
const minu = 0;
const maxu = a + b + 1;
let minv, maxv, minw, maxw;
let x, y, z, p, q;
for (let u = minu; u <= maxu; u++) {
if (u <= a) {
minv = a - u;
} else {
minv = -a + u - 1;
}
if (u <= b) {
maxv = a + 2 * c + u + 2;
} else {
maxv = a + 2 * b + 2 * c - u + 3;
}
for (let v = minv; v <= maxv; v++) {
if (u <= b && v >= a + u + 1) {
minw = (-a + 2 * b - 3 * u + v - 2) / 2;
} else if (u > b && v >= a + 2 * b - u + 2) {
minw = (-a - 4 * b + 3 * u + v - 5) / 2;
} else {
minw = a + b - v;
}
if (u <= a && v <= a + 2 * c - u + 1) {
maxw = (-a + 2 * b + 3 * u + v - 1) / 2;
} else if (u > a && v <= -a + 2 * c + u) {
maxw = (5 * a + 2 * b - 3 * u + v + 2) / 2;
} else {
maxw = a + b + 3 * c - v + 2;
}
minw = Math.round(minw);
maxw = Math.round(maxw);
for (let w = minw; w <= maxw; w++) {
z = (a + b + 3 * c - v - w + 2) / 3;
q = Math.round(2 - (z % 1) * 3);
z = Math.floor(z);
y = (a + 4 * b + q - 3 * u - v + 2 * w + 3) / 6;
p = 1 - (y % 1) * 2;
y = Math.floor(y);
x = (a - 2 * b - 3 * p + q + 3 * u - v + 2 * w) / 6;
x = Math.round(x);
}
}
}
This code passes my tests, but if someone can create better solution, I would be very interested.

which solution is better in terms of space.time complexity?

i have 2 lists of integers. they are both sorted already. I want to find the elements (one from each list) that add up to a given number.
-first idea is to iterate over first list and use binary search to look for the number needed to sum to the given number. i know this will take nlogn time.
the other is to store one of the lists in a hashtable/map (i dont really know the difference) and iterate over other list and look up the needed value. does this take n time? and n memory?
overall which would be better?
You are comparing it right. But both has different aspects. Hashing is not a good choice if you have memory constraints. But if you have plenty of memory then yes, you can afford to do that.
Also you will see many times in Computer Science the notion of space-time tradeoff. It will always be some gain by losing some. Hashing runs in O(n) and space complexity is O(n). But in case of searching only O(nlogn) time complexity but space complexity is O(1)
Long story short, scenario lets you decide which one to select. I have shown just one aspect. There can be many. Know the constraints and tradeoffs of each and you will be able to decide it.
A better solution : (Time complexity: O(n) Space complexity: O(1))
Suppose there are 2 array a and b.
Now WLOG suppose a is sorted in ascending and another in descending (Even if it is not the case we can traverse it accordingly).
index1=0;index2=0; // considered 0 indexing
while(index1 <= N1-1 && index2 <= N2-1)
{
if ((a[index1] + b[index2]) == x)
// success
else if ((a[index1] + b[index2]) > x)
index2++;
else
index1++;
}
//failure no such element.
Sort list A in ascending order, and list B in descending order. Set a = 1 and b = 1.
If A[a] + B[b] = T, record the pair, increment a, and repeat.
Otherwise, A[a] + B[b] < T, increment a, and repeat from 1.
Otherwise, A[a] + B[b] > T, increment b, and repeat from 1.
Naturally, if a or b exceeds the size of A or B, respectively, terminate.
Example:
A = 1, 2, 2, 6, 8, 10, 11
B = 9, 8, 4, 3, 1, 1
T = 10
a = 1, b = 1
A[a] + B[b] = A[1] + B[1] = 10; record; a = a + 1 = 2; repeat.
A[a] + B[b] = A[2] + B[1] = 11; b = b + 1 = 2; repeat.
A[a] + B[b] = A[2] + B[2] = 10; record; a = a + 1 = 3; repeat.
A[a] + B[b] = A[3] + B[2] = 10; record; a = a + 1 = 4; repeat.
A[a] + B[b] = A[4] + B[2] = 14; b = b + 1 = 3; repeat.
A[a] + B[b] = A[4] + B[3] = 10; record; a = a + 1 = 5; repeat.
A[a] + B[b] = A[5] + B[3] = 12; b = b + 1 = 4; repeat.
A[a] + B[b] = A[5] + B[4] = 11; b = b + 1 = 5; repeat.
A[a] + B[b] = A[5] + B[5] = 9; a = a + 1 = 6; repeat.
A[a] + B[b] = A[6] + B[5] = 11; b = b + 1 = 6; repeat.
A[a] + B[b] = A[6] + B[6] = 11; b = b + 1 = 7; repeat.
Terminate.
You can do this without additional space if instead of having B sorted in descending order, you set b = |B| and decrement it instead of incrementing it, effectively reading it backwards.
The above procedure misses out on some duplicate answers where B has a string of duplicate values, for instance:
A = 2, 2, 2
B = 8, 8, 8
The algorithm as described above will yield three pairs, but you might want nine. This can be fixed by detecting this case, keeping separate counters ca and cb for the lengths of the runs of A[a] and B[b] you have seen, and adding ca * cb - ca copies of the last pair you added to the bag. In this example:
A = 2, 2, 2
B = 8, 8, 8
a = 1, b = 1
ca = 0, cb = 0
A[a] + B[b] = 10; record pair, a = a + 1 = 2, ca = ca + 1 = 2, repeat.
A[a] + B[b] = 10; record pair, a = a + 1 = 3, ca = ca + 1 = 2, repeat.
A[a] + B[b] = 10; record pair, a = a + 1 = 4;
a exceeds bounds, value of A[a] changed;
increment b to count run of B's;
b = b + 1 = 2, cb = cb + 1 = 2
b = b + 1 = 3, cb = cb + 1 = 3
b = b + 1 = 4;
b exceeds bounds, value of B[b] changed;
add ca * cb - ca = 3 * 3 - 3 = 6 copies of pair (2, 8).

Understanding Spark correlation algorithm

I was reading Spark correlation algorithm source code and while going through the code, I coulddn't understand this particular peace of code.
This is from the file : org/apache/spark/mllib/linalg/BLAS.scala
def spr(alpha: Double, v: Vector, U: Array[Double]): Unit = {
val n = v.size
v match {
case DenseVector(values) =>
NativeBLAS.dspr("U", n, alpha, values, 1, U)
case SparseVector(size, indices, values) =>
val nnz = indices.length
var colStartIdx = 0
var prevCol = 0
var col = 0
var j = 0
var i = 0
var av = 0.0
while (j < nnz) {
col = indices(j)
// Skip empty columns.
colStartIdx += (col - prevCol) * (col + prevCol + 1) / 2
av = alpha * values(j)
i = 0
while (i <= j) {
U(colStartIdx + indices(i)) += av * values(i)
i += 1
}
j += 1
prevCol = col
}
}
}
I do not know Scala and that could be the reason I could not understand it. Can someone explain what is happening here.
It is being called from Rowmatrix.scala
def computeGramianMatrix(): Matrix = {
val n = numCols().toInt
checkNumColumns(n)
// Computes n*(n+1)/2, avoiding overflow in the multiplication.
// This succeeds when n <= 65535, which is checked above
val nt = if (n % 2 == 0) ((n / 2) * (n + 1)) else (n * ((n + 1) / 2))
// Compute the upper triangular part of the gram matrix.
val GU = rows.treeAggregate(new BDV[Double](nt))(
seqOp = (U, v) => {
BLAS.spr(1.0, v, U.data)
U
}, combOp = (U1, U2) => U1 += U2)
RowMatrix.triuToFull(n, GU.data)
}
The correlation is defined here:
https://en.wikipedia.org/wiki/Pearson_correlation_coefficient
The final goal is to understand the Spark correlation algorithm.
Update 1: Relevent paper https://stanford.edu/~rezab/papers/linalg.pdf

How to find ith item in zigzag ordering?

A question last week defined the zig zag ordering on an n by m matrix and asked how to list the elements in that order.
My question is how to quickly find the ith item in the zigzag ordering? That is, without traversing the matrix (for large n and m that's much too slow).
For example with n=m=8 as in the picture and (x, y) describing (row, column)
f(0) = (0, 0)
f(1) = (0, 1)
f(2) = (1, 0)
f(3) = (2, 0)
f(4) = (1, 1)
...
f(63) = (7, 7)
Specific question: what is the ten billionth (1e10) item in the zigzag ordering of a million by million matrix?
Let's assume that the desired element is located in the upper half of the matrix. The length of the diagonals are 1, 2, 3 ..., n.
Let's find the desired diagonal. It satisfies the following property:
sum(1, 2 ..., k) >= pos but sum(1, 2, ..., k - 1) < pos. The sum of 1, 2, ..., k is k * (k + 1) / 2. So we just need to find the smallest integer k such that k * (k + 1) / 2 >= pos. We can either use a binary search or solve this quadratic inequality explicitly.
When we know the k, we just need to find the pos - (k - 1) * k / 2 element of this diagonal. We know where it starts and where we should move(up or down, depending on the parity of k), so we can find the desired cell using a simple formula.
This solution has an O(1) or an O(log n) time complexity(it depends on whether we use a binary search or solve the inequation explicitly in step 2).
If the desired element is located in the lower half of the matrix, we can solve this problem for a pos' = n * n - pos + 1 and then use symmetry to get the solution to the original problem.
I used 1-based indexing in this solution, using 0-based indexing might require adding +1 or -1 somewhere, but the idea of the solution is the same.
If the matrix is rectangular, not square, we need to consider the fact the length of diagonals look this way: 1, 2, 3, ..., m, m, m, .., m, m - 1, ..., 1(if m <= n) when we search for the k, so the sum becomes something like k * (k + 1) / 2 if k <= m and k * (k + 1) / 2 + m * (k - m) otherwise.
import math, random
def naive(n, m, ord, swap = False):
dx = 1
dy = -1
if swap:
dx, dy = dy, dx
cur = [0, 0]
for i in range(ord):
cur[0] += dy
cur[1] += dx
if cur[0] < 0 or cur[1] < 0 or cur[0] >= n or cur[1] >= m:
dx, dy = dy, dx
if cur[0] >= n:
cur[0] = n - 1
cur[1] += 2
if cur[1] >= m:
cur[1] = m - 1
cur[0] += 2
if cur[0] < 0: cur[0] = 0
if cur[1] < 0: cur[1] = 0
return cur
def fast(n, m, ord, swap = False):
if n < m:
x, y = fast(m, n, ord, not swap)
return [y, x]
alt = n * m - ord - 1
if alt < ord:
x, y = fast(n, m, alt, swap if (n + m) % 2 == 0 else not swap)
return [n - x - 1, m - y - 1]
if ord < (m * (m + 1) / 2):
diag = int((-1 + math.sqrt(1 + 8 * ord)) / 2)
parity = (diag + (0 if swap else 1)) % 2
within = ord - (diag * (diag + 1) / 2)
if parity: return [diag - within, within]
else: return [within, diag - within]
else:
ord -= (m * (m + 1) / 2)
diag = int(ord / m)
within = ord - diag * m
diag += m
parity = (diag + (0 if swap else 1)) % 2
if not parity:
within = m - within - 1
return [diag - within, within]
if __name__ == "__main__":
for i in range(1000):
n = random.randint(3, 100)
m = random.randint(3, 100)
ord = random.randint(0, n * m - 1)
swap = random.randint(0, 99) < 50
na = naive(n, m, ord, swap)
fa = fast(n, m, ord, swap)
assert na == fa, "(%d, %d, %d, %s) ==> (%s), (%s)" % (n, m, ord, swap, na, fa)
print fast(1000000, 1000000, 9999999999, False)
print fast(1000000, 1000000, 10000000000, False)
So the 10-billionth element (the one with ordinal 9999999999), and the 10-billion-first element (the one with ordinal 10^10) are:
[20331, 121089]
[20330, 121090]
An analytical solution
In the general case, your matrix will be divided in 3 areas:
an initial triangle t1
a skewed part mid where diagonals have a constant length
a final triangle t2
Let's call p the index of your diagonal run.
We want to define two functions x(p) and y(p) that give you the column and row of the pth cell.
Initial triangle
Let's look at the initial triangular part t1, where each new diagonal is one unit longer than the preceding.
Now let's call d the index of the diagonal that holds the cell, and
Sp = sum(di) for i in [0..p-1]
We have p = Sp + k, with 0 <=k <= d and
Sp = d(d+1)/2
if we solve for d, it brings
d²+d-2p = 0, a quadratic equation where we retain only the positive root:
d = (-1+sqrt(1+8*p))/2
Now we want the highest integer value closest to d, which is floor(d).
In the end, we have
p = d + k with d = floor((-1+sqrt(1+8*p))/2) and k = p - d(d+1)/2
Let's call
o(d) the function that equals 1 if d is odd and 0 otherwise, and
e(d) the function that equals 1 if d is even and 0 otherwise.
We can compute x(p) and y(p) like so:
d = floor((-1+sqrt(1+8*p))/2)
k = p - d(d+1)/2
o = d % 2
e = 1 - o
x = e*d + (o-e)*k
y = o*d + (e-o)*k
even and odd functions are used to try to salvage some clarity, but you can replace
e(p) with 1 - o(p) and have slightly more efficient but less symetric formulaes for x and y.
Middle part
let's consider the smallest matrix dimension s, i.e. s = min (m,n).
The previous formulaes hold until x or y (whichever comes first) reaches the value s.
The upper bound of p such as x(i) <= s and y(i) <= s for all i in [0..p]
(i.e. the cell indexed by p is inside the initial triangle t1) is given by
pt1 = s(s+1)/2.
For p >= pt1, diagonal length remains equal to s until we reach the second triangle t2.
when inside mid, we have:
p = s(s+1)/2 + ds + k with k in [0..s[.
which yields:
d = floor ((p - s(s+1)/2)/s)
k = p - ds
We can then use the same even/odd trick to compute x(p) and y(p):
p -= s(s+1)/2
d = floor (p / s)
k = p - d*s
o = (d+s) % 2
e = 1 - o
x = o*s + (e-o)*k
y = e*s + (o-e)*k
if (n > m)
x += d+e
y -= e
else
y += d+o
x -= o
Final triangle
Using symetry, we can calculate pt2 = m*n - s(s+1)/2
We now face nearly the same problem as for t1, except that the diagonal may run in the same direction as for t1 or in the reverse direction (if n+m is odd).
Using symetry tricks, we can compute x(p) and y(p) like so:
p = n*m -1 - p
d = floor((-1+sqrt(1+8*p))/2)
k = p - d*(d+1)/2
o = (d+m+n) % 2
e = 1 - $o;
x = n-1 - (o*d + (e-o)*k)
y = m-1 - (e*d + (o-e)*k)
Putting all together
Here is a sample c++ implementation.
I used 64 bits integers out of sheer lazyness. Most could be replaced by 32 bits values.
The computations could be made more effective by precomputing a few more coefficients.
A good part of the code could be factorized, but I doubt it is worth the effort.
Since this is just a quick and dirty proof of concept, I did not optimize it.
#include <cstdio> // printf
#include <algorithm> // min
using namespace std;
typedef long long tCoord;
void panic(const char * msg)
{
printf("PANIC: %s\n", msg);
exit(-1);
}
struct tPoint {
tCoord x, y;
tPoint(tCoord x = 0, tCoord y = 0) : x(x), y(y) {}
tPoint operator+(const tPoint & p) const { return{ x + p.x, y + p.y }; }
bool operator!=(const tPoint & p) const { return x != p.x || y != p.y; }
};
class tMatrix {
tCoord n, m; // dimensions
tCoord s; // smallest dimension
tCoord pt1, pt2; // t1 / mid / t2 limits for p
public:
tMatrix(tCoord n, tCoord m) : n(n), m(m)
{
s = min(n, m);
pt1 = (s*(s + 1)) / 2;
pt2 = n*m - pt1;
}
tPoint diagonal_cell(tCoord p)
{
tCoord x, y;
if (p < pt1) // inside t1
{
tCoord d = (tCoord)floor((-1 + sqrt(1 + 8 * p)) / 2);
tCoord k = p - (d*(d + 1)) / 2;
tCoord o = d % 2;
tCoord e = 1 - o;
x = o*d + (e - o)*k;
y = e*d + (o - e)*k;
}
else if (p < pt2) // inside mid
{
p -= pt1;
tCoord d = (tCoord)floor(p / s);
tCoord k = p - d*s;
tCoord o = (d + s) % 2;
tCoord e = 1 - o;
x = o*s + (e - o)*k;
y = e*s + (o - e)*k;
if (m > n) // vertical matrix
{
x -= o;
y += d + o;
}
else // horizontal matrix
{
x += d + e;
y -= e;
}
}
else // inside t2
{
p = n * m - 1 - p;
tCoord d = (tCoord)floor((-1 + sqrt(1 + 8 * p)) / 2);
tCoord k = p - (d*(d + 1)) / 2;
tCoord o = (d + m + n) % 2;
tCoord e = 1 - o;
x = n - 1 - (o*d + (e - o)*k);
y = m - 1 - (e*d + (o - e)*k);
}
return{ x, y };
}
void check(void)
{
tPoint move[4] = { { 1, 0 }, { -1, 1 }, { 1, -1 }, { 0, 1 } };
tPoint pos;
tCoord dir = 0;
for (tCoord p = 0; p != n * m ; p++)
{
tPoint dc = diagonal_cell(p);
if (pos != dc) panic("zot!");
pos = pos + move[dir];
if (dir == 0)
{
if (pos.y == m - 1) dir = 2;
else dir = 1;
}
else if (dir == 3)
{
if (pos.x == n - 1) dir = 1;
else dir = 2;
}
else if (dir == 1)
{
if (pos.y == m - 1) dir = 0;
else if (pos.x == 0) dir = 3;
}
else
{
if (pos.x == n - 1) dir = 3;
else if (pos.y == 0) dir = 0;
}
}
}
};
void main(void)
{
const tPoint dim[] = { { 10, 10 }, { 11, 11 }, { 10, 30 }, { 30, 10 }, { 10, 31 }, { 31, 10 }, { 11, 31 }, { 31, 11 } };
for (tPoint d : dim)
{
printf("Checking a %lldx%lld matrix...", d.x, d.y);
tMatrix(d.x, d.y).check();
printf("done\n");
}
tCoord p = 10000000000;
tMatrix matrix(1000000, 1000000);
tPoint cell = matrix.diagonal_cell(p);
printf("Coordinates of %lldth cell: (%lld,%lld)\n", p, cell.x, cell.y);
}
Results are checked against "manual" sweep of the matrix.
This "manual" sweep is a ugly hack that won't work for a one-row or one-column matrix, though diagonal_cell() does work on any matrix (the "diagonal" sweep becomes linear in that case).
The coordinates found for the 10.000.000.000th cell of a 1.000.000x1.000.000 matrix seem consistent, since the diagonal d on which the cell stands is about sqrt(2*1e10), approx. 141421, and the sum of cell coordinates is about equal to d (121090+20330 = 141420). Besides, it is also what the two other posters report.
I would say there is a good chance this lump of obfuscated code actually produces an O(1) solution to your problem.

How to calculate the index (lexicographical order) when the combination is given

I know that there is an algorithm that permits, given a combination of number (no repetitions, no order), calculates the index of the lexicographic order.
It would be very useful for my application to speedup things...
For example:
combination(10, 5)
1 - 1 2 3 4 5
2 - 1 2 3 4 6
3 - 1 2 3 4 7
....
251 - 5 7 8 9 10
252 - 6 7 8 9 10
I need that the algorithm returns the index of the given combination.
es: index( 2, 5, 7, 8, 10 ) --> index
EDIT: actually I'm using a java application that generates all combinations C(53, 5) and inserts them into a TreeMap.
My idea is to create an array that contains all combinations (and related data) that I can index with this algorithm.
Everything is to speedup combination searching.
However I tried some (not all) of your solutions and the algorithms that you proposed are slower that a get() from TreeMap.
If it helps: my needs are for a combination of 5 from 53 starting from 0 to 52.
Thank you again to all :-)
Here is a snippet that will do the work.
#include <iostream>
int main()
{
const int n = 10;
const int k = 5;
int combination[k] = {2, 5, 7, 8, 10};
int index = 0;
int j = 0;
for (int i = 0; i != k; ++i)
{
for (++j; j != combination[i]; ++j)
{
index += c(n - j, k - i - 1);
}
}
std::cout << index + 1 << std::endl;
return 0;
}
It assumes you have a function
int c(int n, int k);
that will return the number of combinations of choosing k elements out of n elements.
The loop calculates the number of combinations preceding the given combination.
By adding one at the end we get the actual index.
For the given combination there are
c(9, 4) = 126 combinations containing 1 and hence preceding it in lexicographic order.
Of the combinations containing 2 as the smallest number there are
c(7, 3) = 35 combinations having 3 as the second smallest number
c(6, 3) = 20 combinations having 4 as the second smallest number
All of these are preceding the given combination.
Of the combinations containing 2 and 5 as the two smallest numbers there are
c(4, 2) = 6 combinations having 6 as the third smallest number.
All of these are preceding the given combination.
Etc.
If you put a print statement in the inner loop you will get the numbers
126, 35, 20, 6, 1.
Hope that explains the code.
Convert your number selections to a factorial base number. This number will be the index you want. Technically this calculates the lexicographical index of all permutations, but if you only give it combinations, the indexes will still be well ordered, just with some large gaps for all the permutations that come in between each combination.
Edit: pseudocode removed, it was incorrect, but the method above should work. Too tired to come up with correct pseudocode at the moment.
Edit 2: Here's an example. Say we were choosing a combination of 5 elements from a set of 10 elements, like in your example above. If the combination was 2 3 4 6 8, you would get the related factorial base number like so:
Take the unselected elements and count how many you have to pass by to get to the one you are selecting.
1 2 3 4 5 6 7 8 9 10
2 -> 1
1 3 4 5 6 7 8 9 10
3 -> 1
1 4 5 6 7 8 9 10
4 -> 1
1 5 6 7 8 9 10
6 -> 2
1 5 7 8 9 10
8 -> 3
So the index in factorial base is 1112300000
In decimal base, it's
1*9! + 1*8! + 1*7! + 2*6! + 3*5! = 410040
This is Algorithm 2.7 kSubsetLexRank on page 44 of Combinatorial Algorithms by Kreher and Stinson.
r = 0
t[0] = 0
for i from 1 to k
if t[i - 1] + 1 <= t[i] - 1
for j from t[i - 1] to t[i] - 1
r = r + choose(n - j, k - i)
return r
The array t holds your values, for example [5 7 8 9 10]. The function choose(n, k) calculates the number "n choose k". The result value r will be the index, 251 for the example. Other inputs are n and k, for the example they would be 10 and 5.
zero-base,
# v: array of length k consisting of numbers between 0 and n-1 (ascending)
def index_of_combination(n,k,v):
idx = 0
for p in range(k-1):
if p == 0: arrg = range(1,v[p]+1)
else: arrg = range(v[p-1]+2, v[p]+1)
for a in arrg:
idx += combi[n-a, k-1-p]
idx += v[k-1] - v[k-2] - 1
return idx
Null Set has the right approach. The index corresponds to the factorial-base number of the sequence. You build a factorial-base number just like any other base number, except that the base decreases for each digit.
Now, the value of each digit in the factorial-base number is the number of elements less than it that have not yet been used. So, for combination(10, 5):
(1 2 3 4 5) == 0*9!/5! + 0*8!/5! + 0*7!/5! + 0*6!/5! + 0*5!/5!
== 0*3024 + 0*336 + 0*42 + 0*6 + 0*1
== 0
(10 9 8 7 6) == 9*3024 + 8*336 + 7*42 + 6*6 + 5*1
== 30239
It should be pretty easy to calculate the index incrementally.
If you have a set of positive integers 0<=x_1 < x_2< ... < x_k , then you could use something called the squashed order:
I = sum(j=1..k) Choose(x_j,j)
The beauty of the squashed order is that it works independent of the largest value in the parent set.
The squashed order is not the order you are looking for, but it is related.
To use the squashed order to get the lexicographic order in the set of k-subsets of {1,...,n) is by taking
1 <= x1 < ... < x_k <=n
compute
0 <= n-x_k < n-x_(k-1) ... < n-x_1
Then compute the squashed order index of (n-x_k,...,n-k_1)
Then subtract the squashed order index from Choose(n,k) to get your result, which is the lexicographic index.
If you have relatively small values of n and k, you can cache all the values Choose(a,b) with a
See Anderson, Combinatorics on Finite Sets, pp 112-119
I needed also the same for a project of mine and the fastest solution I found was (Python):
import math
def nCr(n,r):
f = math.factorial
return f(n) / f(r) / f(n-r)
def index(comb,n,k):
r=nCr(n,k)
for i in range(k):
if n-comb[i]<k-i:continue
r=r-nCr(n-comb[i],k-i)
return r
My input "comb" contained elements in increasing order You can test the code with for example:
import itertools
k=3
t=[1,2,3,4,5]
for x in itertools.combinations(t, k):
print x,index(x,len(t),k)
It is not hard to prove that if comb=(a1,a2,a3...,ak) (in increasing order) then:
index=[nCk-(n-a1+1)Ck] + [(n-a1)C(k-1)-(n-a2+1)C(k-1)] + ... =
nCk -(n-a1)Ck -(n-a2)C(k-1) - .... -(n-ak)C1
There's another way to do all this. You could generate all possible combinations and write them into a binary file where each comb is represented by it's index starting from zero. Then, when you need to find an index, and the combination is given, you apply a binary search on the file. Here's the function. It's written in VB.NET 2010 for my lotto program, it works with Israel lottery system so there's a bonus (7th) number; just ignore it.
Public Function Comb2Index( _
ByVal gAr() As Byte) As UInt32
Dim mxPntr As UInt32 = WHL.AMT.WHL_SYS_00 '(16.273.488)
Dim mdPntr As UInt32 = mxPntr \ 2
Dim eqCntr As Byte
Dim rdAr() As Byte
modBinary.OpenFile(WHL.WHL_SYS_00, _
FileMode.Open, FileAccess.Read)
Do
modBinary.ReadBlock(mdPntr, rdAr)
RP: If eqCntr = 7 Then GoTo EX
If gAr(eqCntr) = rdAr(eqCntr) Then
eqCntr += 1
GoTo RP
ElseIf gAr(eqCntr) < rdAr(eqCntr) Then
If eqCntr > 0 Then eqCntr = 0
mxPntr = mdPntr
mdPntr \= 2
ElseIf gAr(eqCntr) > rdAr(eqCntr) Then
If eqCntr > 0 Then eqCntr = 0
mdPntr += (mxPntr - mdPntr) \ 2
End If
Loop Until eqCntr = 7
EX: modBinary.CloseFile()
Return mdPntr
End Function
P.S. It takes 5 to 10 mins to generate 16 million combs on a Core 2 Duo. To find the index using binary search on file takes 397 milliseconds on a SATA drive.
Assuming the maximum setSize is not too large, you can simply generate a lookup table, where the inputs are encoded this way:
int index(a,b,c,...)
{
int key = 0;
key |= 1<<a;
key |= 1<<b;
key |= 1<<c;
//repeat for all arguments
return Lookup[key];
}
To generate the lookup table, look at this "banker's order" algorithm. Generate all the combinations, and also store the base index for each nItems. (For the example on p6, this would be [0,1,5,11,15]). Note that by you storing the answers in the opposite order from the example (LSBs set first) you will only need one table, sized for the largest possible set.
Populate the lookup table by walking through the combinations doing Lookup[combination[i]]=i-baseIdx[nItems]
EDIT: Never mind. This is completely wrong.
Let your combination be (a1, a2, ..., ak-1, ak) where a1 < a2 < ... < ak. Let choose(a,b) = a!/(b!*(a-b)!) if a >= b and 0 otherwise. Then, the index you are looking for is
choose(ak-1, k) + choose(ak-1-1, k-1) + choose(ak-2-1, k-2) + ... + choose (a2-1, 2) + choose (a1-1, 1) + 1
The first term counts the number of k-element combinations such that the largest element is less than ak. The second term counts the number of (k-1)-element combinations such that the largest element is less than ak-1. And, so on.
Notice that the size of the universe of elements to be chosen from (10 in your example) does not play a role in the computation of the index. Can you see why?
Sample solution:
class Program
{
static void Main(string[] args)
{
// The input
var n = 5;
var t = new[] { 2, 4, 5 };
// Helping transformations
ComputeDistances(t);
CorrectDistances(t);
// The algorithm
var r = CalculateRank(t, n);
Console.WriteLine("n = 5");
Console.WriteLine("t = {2, 4, 5}");
Console.WriteLine("r = {0}", r);
Console.ReadKey();
}
static void ComputeDistances(int[] t)
{
var k = t.Length;
while (--k >= 0)
t[k] -= (k + 1);
}
static void CorrectDistances(int[] t)
{
var k = t.Length;
while (--k > 0)
t[k] -= t[k - 1];
}
static int CalculateRank(int[] t, int n)
{
int k = t.Length - 1, r = 0;
for (var i = 0; i < t.Length; i++)
{
if (t[i] == 0)
{
n--;
k--;
continue;
}
for (var j = 0; j < t[i]; j++)
{
n--;
r += CalculateBinomialCoefficient(n, k);
}
n--;
k--;
}
return r;
}
static int CalculateBinomialCoefficient(int n, int k)
{
int i, l = 1, m, x, y;
if (n - k < k)
{
x = k;
y = n - k;
}
else
{
x = n - k;
y = k;
}
for (i = x + 1; i <= n; i++)
l *= i;
m = CalculateFactorial(y);
return l/m;
}
static int CalculateFactorial(int n)
{
int i, w = 1;
for (i = 1; i <= n; i++)
w *= i;
return w;
}
}
The idea behind the scenes is to associate a k-subset with an operation of drawing k-elements from the n-size set. It is a combination, so the overall count of possible items will be (n k). It is a clue that we could seek the solution in Pascal Triangle. After a while of comparing manually written examples with the appropriate numbers from the Pascal Triangle, we will find the pattern and hence the algorithm.
I used user515430's answer and converted to python3. Also this supports non-continuous values so you could pass in [1,3,5,7,9] as your pool instead of range(1,11)
from itertools import combinations
from scipy.special import comb
from pandas import Index
debugcombinations = False
class IndexedCombination:
def __init__(self, _setsize, _poolvalues):
self.setsize = _setsize
self.poolvals = Index(_poolvalues)
self.poolsize = len(self.poolvals)
self.totalcombinations = 1
fast_k = min(self.setsize, self.poolsize - self.setsize)
for i in range(1, fast_k + 1):
self.totalcombinations = self.totalcombinations * (self.poolsize - fast_k + i) // i
#fill the nCr cache
self.choose_cache = {}
n = self.poolsize
k = self.setsize
for i in range(k + 1):
for j in range(n + 1):
if n - j >= k - i:
self.choose_cache[n - j,k - i] = comb(n - j,k - i, exact=True)
if debugcombinations:
print('testnth = ' + str(self.testnth()))
def get_nth_combination(self,index):
n = self.poolsize
r = self.setsize
c = self.totalcombinations
#if index < 0 or index >= c:
# raise IndexError
result = []
while r:
c, n, r = c*r//n, n-1, r-1
while index >= c:
index -= c
c, n = c*(n-r)//n, n-1
result.append(self.poolvals[-1 - n])
return tuple(result)
def get_n_from_combination(self,someset):
n = self.poolsize
k = self.setsize
index = 0
j = 0
for i in range(k):
setidx = self.poolvals.get_loc(someset[i])
for j in range(j + 1, setidx + 1):
index += self.choose_cache[n - j, k - i - 1]
j += 1
return index
#just used to test whether nth_combination from the internet actually works
def testnth(self):
n = 0
_setsize = self.setsize
mainset = self.poolvals
for someset in combinations(mainset, _setsize):
nthset = self.get_nth_combination(n)
n2 = self.get_n_from_combination(nthset)
if debugcombinations:
print(str(n) + ': ' + str(someset) + ' vs ' + str(n2) + ': ' + str(nthset))
if n != n2:
return False
for x in range(_setsize):
if someset[x] != nthset[x]:
return False
n += 1
return True
setcombination = IndexedCombination(5, list(range(1,10+1)))
print( str(setcombination.get_n_from_combination([2,5,7,8,10])))
returns 188

Resources