Algorithm to divide text into 3 evenly-sized groups - algorithm

I'm would like to create an algorithm that will divide text into 3-evenly sized groups (based on text length). Since this will be put to use for line-breaks, the order of the text needs to be maintained.
For instance this string:
Just testing to see how this works.
would sort to:
Just testing // 12 characters
to see how // 10 characters
this works. // 11 characters
Any ideas?

The "minimum raggedness" dynamic program, also from the Wikipedia article on word wrap, can be adapted to your needs. Set LineWidth = len(text)/n - 1 and ignore the comment about infinite penalties for exceeding the line width; use the definition of c(i, j) as is with P = 2.
Code. I took the liberty of modifying the DP always to return exactly n lines, at the cost of increasing the running time from O(#words ** 2) to O(#words ** 2 * n).
def minragged(text, n=3):
"""
>>> minragged('Just testing to see how this works.')
['Just testing', 'to see how', 'this works.']
>>> minragged('Just testing to see how this works.', 10)
['', '', 'Just', 'testing', 'to', 'see', 'how', 'this', 'works.', '']
"""
words = text.split()
cumwordwidth = [0]
# cumwordwidth[-1] is the last element
for word in words:
cumwordwidth.append(cumwordwidth[-1] + len(word))
totalwidth = cumwordwidth[-1] + len(words) - 1 # len(words) - 1 spaces
linewidth = float(totalwidth - (n - 1)) / float(n) # n - 1 line breaks
def cost(i, j):
"""
cost of a line words[i], ..., words[j - 1] (words[i:j])
"""
actuallinewidth = max(j - i - 1, 0) + (cumwordwidth[j] - cumwordwidth[i])
return (linewidth - float(actuallinewidth)) ** 2
# best[l][k][0] is the min total cost for words 0, ..., k - 1 on l lines
# best[l][k][1] is a minimizing index for the start of the last line
best = [[(0.0, None)] + [(float('inf'), None)] * len(words)]
# xrange(upper) is the interval 0, 1, ..., upper - 1
for l in xrange(1, n + 1):
best.append([])
for j in xrange(len(words) + 1):
best[l].append(min((best[l - 1][k][0] + cost(k, j), k) for k in xrange(j + 1)))
lines = []
b = len(words)
# xrange(upper, 0, -1) is the interval upper, upper - 1, ..., 1
for l in xrange(n, 0, -1):
a = best[l][b][1]
lines.append(' '.join(words[a:b]))
b = a
lines.reverse()
return lines
if __name__ == '__main__':
import doctest
doctest.testmod()

You can try the next simple heuristic for starters: Place to iterators in n/3 and 2n/3 and search for the closest space near each of them.

From http://en.wikipedia.org/wiki/Word_wrap:
SpaceLeft := LineWidth
for each Word in Text
if Width(Word) > SpaceLeft
insert line break before Word in Text
SpaceLeft := LineWidth - Width(Word)
else
SpaceLeft := SpaceLeft - (Width(Word) + SpaceWidth)
This method is used by many modern word processors, such as OpenOffice.org Writer and Microsoft Word. This algorithm is optimal in that it always puts the text on the minimum number of lines.

The answer from "someone" works fine. However, I had problems translating this into SWIFT code. Here is my translation for all those that are interested.
import Foundation
class SplitText{
typealias MinRag = (Float, Int) // meaning (cost for line (so far), word index)
// from http://stackoverflow.com/questions/6426017/word-wrap-to-x-lines-instead-of-maximum-width-least-raggedness?lq=1
class func splitText(text:String, numberOfLines:Int)-> [String]{
//preparations
var words = split(text, maxSplit:100, allowEmptySlices: false, isSeparator:{(s:Character)-> Bool in return s == " " || s == "\n"})
var cumwordwidth = [Int](); //cummulative word widths
cumwordwidth.append(0);
for word in words{
cumwordwidth.append(cumwordwidth[cumwordwidth.count - 1] + count(word));
}
var totalwidth = cumwordwidth[cumwordwidth.count - 1] + count(words) - 1;
var linewidth:Float = Float(totalwidth - (numberOfLines - 1)) / Float(numberOfLines)
// cost function for one line for words i .. j
var cost = { (i:Int,j:Int)-> Float in
var actuallinewidth = max(j - i - 1, 0) + (cumwordwidth[j] - cumwordwidth[i]);
var remainingWidth: Float = linewidth - Float(actuallinewidth)
return remainingWidth * remainingWidth
}
var best = [[MinRag]]()
var tmp = [MinRag]();
//ensure that data structure is initialised in a way that we start with adding the first word
tmp.append((0, -1));
for word in words {
tmp.append((Float.infinity , -1));
}
best.append(tmp);
//now we can start. We simply calculate the cost for all possible lines
for l in 1...numberOfLines {
tmp = [MinRag]()
for j in 0...words.count {
var min:MinRag = (best[l - 1][0].0 + cost(0, j), 0);
var k: Int
for k = 0; k < j + 1 ; ++k {
var loc:Float = best[l - 1][k].0 + cost(k, j);
if (loc < min.0 || (loc == min.0 && k < min.1)) {
min=(loc, k);
}
println("l=\(l), j=\(j), k=\(k), min=\(min)")
}
tmp.append(min);
}
best.append(tmp);
}
//now build the answer based on above calculations
var lines = [String]();
var b = words.count;
var o:Int
for o = numberOfLines; o > 0 ; --o {
var a = best[o][b].1;
lines.append(" ".join(words[a...b-1]));
b = a;
}
return reverse(lines);
}
}

Related

Trapping Rain Water(algorithm)

That is a question at leetcode website.
https://leetcode-cn.com/problems/trapping-rain-water/
Given n non-negative integers representing an elevation map where the width of each bar is 1, compute how much water it can trap after raining.
I wrote the solution below.
int solution(vector<int>& height) {
int total = 0;
for (auto pos = height.begin(); pos != height.end(); pos++) {
if (*pos <= *(pos + 1))
continue;
for (auto lmaxpos = pos; *pos >= *(pos + 1); pos++) {
total = total + *lmaxpos - *(pos + 1);
for (; *(pos + 1) <= *lmaxpos; pos++) {
total = total + *lmaxpos - *(pos + 1);
if (*pos >= *(pos + 1)) {
total = total - (lmaxpos - pos) * (*lmaxpos - *pos);
break;
}
break;
}
break;
}
}
return total;
}
But after testing, i find that i have made some logical mistakes and i cann't find it out.
kindly ask you for help.
Traverse the array from both directions, storing the max_height seen so far. Now for each index, the water at that index is max(0, min(left_max, right_max) - cur_height).
E.g.,
input: [0,1,0,2,1,0,1,3,2,1,2,1]
max_from_left: [0,1,1,2,2,2,2,3,3,3,3,3]
max_from_right: [3,3,3,3,3,3,3,3,2,2,2,1]
water: [0-0, 1-0, 2-2, 2-1, 2-0, 2-1, 3-3, 2-2, 2-1, 2-2, 1-1]
water: [0, 1, 0, 1, 2, 1, 0, 0, 1, 0, 0]
sum(water) = 6
Here is a commented/explained solution.
Find maximum height of bar from the left end upto an index i in the array \text{left_max}left_max.
Find maximum height of bar from the right end upto an index i in the array \text{right_max}right_max.
Iterate over the \text{height}height array and update ans
from typing import List
def trapping_water_container(list: List) -> int:
l = 0
r = len(list) - 1
total, maxL, maxR = 0, 0, 0
while l < r:
# Identify the pointer with the lesser value
if list[l] <= list[r]:
# Check if the value for the left
# pointer is smaller than MaxLeft
if list[l] < maxL:
# Calculate the volume for this pointer
total += maxL - list[l]
else:
# Update the MaxLeft
maxL = list[l]
l += 1
else:
# Check if the value for the right
# pointer is smaller than MaxRight
if list[r] < maxR:
# Calculate the volume for this pointer
total += maxR - list[r]
else:
# Update the MaxRight
maxR = list[r]
r -= 1
return total

Dynamic Programming for shortest subsequence that is not a subsequence of two strings

Problem: Given two sequences s1 and s2 of '0' and '1'return the shortest sequence that is a subsequence of neither of the two sequences.
E.g. s1 = '011' s2 = '1101' Return s_out = '00' as one possible result.
Note that substring and subsequence are different where substring the characters are contiguous but in a subsequence that needs not be the case.
My question: How is dynamic programming applied in the "Solution Provided" below and what is its time complexity?
My attempt involves computing all the subsequences for each string giving sub1 and sub2. Append a '1' or a '0' to each sub1 and determine if that new subsequence is not present in sub2.Find the minimum length one. Here is my code:
My Solution
def get_subsequences(seq, index, subs, result):
if index == len(seq):
if subs:
result.add(''.join(subs))
else:
get_subsequences(seq, index + 1, subs, result)
get_subsequences(seq, index + 1, subs + [seq[index]], result)
def get_bad_subseq(subseq):
min_sub = ''
length = float('inf')
for sub in subseq:
for char in ['0', '1']:
if len(sub) + 1 < length and sub + char not in subseq:
length = len(sub) + 1
min_sub = sub + char
return min_sub
Solution Provided (not mine)
How does it work and its time complexity?
It looks that the below solution looks similar to: http://kyopro.hateblo.jp/entry/2018/12/11/100507
def set_nxt(s, nxt):
n = len(s)
idx_0 = n + 1
idx_1 = n + 1
for i in range(n, 0, -1):
nxt[i][0] = idx_0
nxt[i][1] = idx_1
if s[i-1] == '0':
idx_0 = i
else:
idx_1 = i
nxt[0][0] = idx_0
nxt[0][1] = idx_1
def get_shortest(seq1, seq2):
len_seq1 = len(seq1)
len_seq2 = len(seq2)
nxt_seq1 = [[len_seq1 + 1 for _ in range(2)] for _ in range(len_seq1 + 2)]
nxt_seq2 = [[len_seq2 + 1 for _ in range(2)] for _ in range(len_seq2 + 2)]
set_nxt(seq1, nxt_seq1)
set_nxt(seq2, nxt_seq2)
INF = 2 * max(len_seq1, len_seq2)
dp = [[INF for _ in range(len_seq2 + 2)] for _ in range(len_seq1 + 2)]
dp[len_seq1 + 1][len_seq2 + 1] = 0
for i in range( len_seq1 + 1, -1, -1):
for j in range(len_seq2 + 1, -1, -1):
for k in range(2):
if dp[nxt_seq1[i][k]][nxt_seq2[j][k]] < INF:
dp[i][j] = min(dp[i][j], dp[nxt_seq1[i][k]][nxt_seq2[j][k]] + 1);
res = ""
i = 0
j = 0
while i <= len_seq1 or j <= len_seq2:
for k in range(2):
if (dp[i][j] == dp[nxt_seq1[i][k]][nxt_seq2[j][k]] + 1):
i = nxt_seq1[i][k]
j = nxt_seq2[j][k]
res += str(k)
break;
return res
I am not going to work it through in detail, but the idea of this solution is to create a 2-D array of every combinations of positions in the one array and the other. It then populates this array with information about the shortest sequences that it finds that force you that far.
Just constructing that array takes space (and therefore time) O(len(seq1) * len(seq2)). Filling it in takes a similar time.
This is done with lots of bit twiddling that I don't want to track.
I have another approach that is clearer to me that usually takes less space and less time, but in the worst case could be as bad. But I have not coded it up.
UPDATE:
Here is is all coded up. With poor choices of variable names. Sorry about that.
# A trivial data class to hold a linked list for the candidate subsequences
# along with information about they match in the two sequences.
import collections
SubSeqLinkedList = collections.namedtuple('SubSeqLinkedList', 'value pos1 pos2 tail')
# This finds the position after the first match. No match is treated as off the end of seq.
def find_position_after_first_match (seq, start, value):
while start < len(seq) and seq[start] != value:
start += 1
return start+1
def make_longer_subsequence (subseq, value, seq1, seq2):
pos1 = find_position_after_first_match(seq1, subseq.pos1, value)
pos2 = find_position_after_first_match(seq2, subseq.pos2, value)
gotcha = SubSeqLinkedList(value=value, pos1=pos1, pos2=pos2, tail=subseq)
return gotcha
def minimal_nonsubseq (seq1, seq2):
# We start with one candidate for how to start the subsequence
# Namely an empty subsequence. Length 0, matches before the first character.
candidates = [SubSeqLinkedList(value=None, pos1=0, pos2=0, tail=None)]
# Now we try to replace candidates with longer maximal ones - nothing of
# the same length is better at going farther in both sequences.
# We keep this list ordered by descending how far it goes in sequence1.
while candidates[0].pos1 <= len(seq1) or candidates[0].pos2 <= len(seq2):
new_candidates = []
for candidate in candidates:
candidate1 = make_longer_subsequence(candidate, '0', seq1, seq2)
candidate2 = make_longer_subsequence(candidate, '1', seq1, seq2)
if candidate1.pos1 < candidate2.pos1:
# swap them.
candidate1, candidate2 = candidate2, candidate1
for c in (candidate1, candidate2):
if 0 == len(new_candidates):
new_candidates.append(c)
elif new_candidates[-1].pos1 <= c.pos1 and new_candidates[-1].pos2 <= c.pos2:
# We have found strictly better.
new_candidates[-1] = c
elif new_candidates[-1].pos2 < c.pos2:
# Note, by construction we cannot be shorter in pos1.
new_candidates.append(c)
# And now we throw away the ones we don't want.
# Those that are on their way to a solution will be captured in the linked list.
candidates = new_candidates
answer = candidates[0]
r_seq = [] # This winds up reversed.
while answer.value is not None:
r_seq.append(answer.value)
answer = answer.tail
return ''.join(reversed(r_seq))
print(minimal_nonsubseq('011', '1101'))

Compact way to produce a large sequence of strings in lexical order

I want to generate a sequence of strings with the following properties:
Lexically ordered
Theoretically infinite
Compact over a realistic range
Generated by a simple process of incrementation
Matches the regexp /\w+/
The obvious way to generate a lexically-ordered sequence is to choose a string length and pad the strings with a base value like this: 000000, 000001, etc. This approach poses a trade-off between the number of permutations and compactness: a string long enough to yield many permutations will be filled many zeros along the way. Plus, the length I choose sets an upper bound on the total number of permutations unless I have some mechanism for expanding the string when it maxes out.
So I came up with a sequence that works like this:
Each string consists of a "head", which is a base-36 number, followed by an underscore, and then the "tail", which is also a base-36 number padded by an increasing number of zeros
The first cycle goes from 0_0 to 0_z
The second cycle goes from 1_00 to 1_zz
The third cycle goes from 2_000 to 2_zzz, and so on
Once the head has reached z and the tail consists of 36 zs, the first "supercycle" has ended. Now the whole sequence starts over, except the z remains at the beginning, so the new cycle starts with z0_0, then continues to z1_00, and so on
The second supercycle goes zz0_0, zz1_00, and so on
Although the string of zs in the head could become unwieldy over the long run, a single supercycle contains over 10^56 permutations, which is far more than I ever expect to use. The sequence is theoretically infinite but very compact within a realistic range. For instance, the trillionth permutation is a succinct 7_bqd55h8s.
I can generate the sequence relatively simply with this javascript function:
function genStr (n) {
n = BigInt(n);
let prefix = "",
cycle = 0n,
max = 36n ** (cycle + 1n);
while (n >= max) {
n -= max;
if (cycle === 35n) {
prefix += "z";
cycle = 0n;
} else {
cycle++;
}
max = 36n ** (cycle + 1n);
}
return prefix
+ cycle.toString(36)
+ "_"
+ n.toString(36).padStart(Number(cycle) + 1, 0);
}
The n parameter is a number that I increment and pass to the function to get the next member of the sequence. All I need to keep track of is a simple integer, making the sequence very easy to use.
So obviously I spent a lot of time on this and I think it's pretty good, but I'm wondering if there is a better way. Is there a good algorithm for generating a sequence along the lines of the one I'm looking for?
A close idea to yours. (more rafined than my first edit...).
Let our alphabet be A = {0,1,2,3}.
Let |2| mean we iterate from 0 to 2 and |2|^2 mean we generate the cartesian product in a lexically sorted manner (00,01,10,11).
We start with
0 |3|
So we have a string of length 2. We "unshift" the digit 1 which "factorizes" since any 0|3|... is less than 1|3|^2.
1 |3|^2
Same idea: unshift 2, and make words of length 4.
2 |3|^3
Now we can continue and generate
3 |2| |3|^3
Notice |2| and not |3|. Now our maximum number becomes 32333. And as you did, we can now add the carry and start a new supercycle:
33 0|3|
This is a slight improvement, since _ can now be part of our alphabet: we don't need to reserve it as a token separator.
In our case we can represent in a supercycle:
n + n^2 + ... + n^(n-1) + (n-1) * n^(n-1)
\-----------------------/\--------------/
geometric special
In your case, the special part would be n^n (with the nuance that you have theorically one char less so replace n with n-1 everywhere)
The proposed supercycle is of length :
P = (n \sum_{k = 0}^{n-2} n^k) + (n-1) * n^(n-1)
P = (n \sum_{k = 0}^{n-3} n^k) + n^n
P = n(n^{n-2} - 1)/(n-1) + n^n
Here is an example diff with alphabet A={0,1,2}
my genStr(grandinero)
,00 0_0
,01 0_1
,02 0_2
,100 1_00
,101 1_01
,102 1_02
,110 1_10
,111 1_11
,112 1_12
,120 1_20
,121 1_21
,122 1_22
,2000 2_000
,2001 2_001
,2002 2_002
,2010 2_010
,2011 2_011
,2012 2_012
,2020 2_020
,2021 2_021
,2022 2_022
,2100 2_100
,2101 2_101
,2102 2_102
,2110 2_110
,2111 2_111
,2112 2_112
,2120 2_120
,2121 2_121
,2122 2_122
22,00 2_200 <-- end of my supercycle if no '_' allowed
22,01 2_201
22,02 2_202
22,100 2_210
22,101 2_211
22,102 2_212
22,110 2_220
22,111 2_221
22,112 2_222 <-- end of yours
22,120 z0_0
That said, for a given number x, we can can count how many supercycles (E(x / P)) there are, each supercycle making two leading e (e being the last char of A).
e.g: A = {0,1,2} and x = 43
e = 2
P = n(n^{n-2} - 1)/(n-1) + n^n = 3(3^1 -1)/2 + 27 = 30
// our supercycle is of length 30
E(43/30) = 1 // 43 makes one supercycle and a few more "strings"
r = x % P = 13 // this is also x - (E(43/30) * 30) (the rest of the euclidean division by P)
Then for the left over (r = x % P) two cases to consider:
either we fall in the geometric sequence
either we fall in the (n-1) * n^(n-1) part.
1. Adressing the geometric sequence with cumulative sums (x < S_w)
Let S_i be the cumsum of n, n^2,..
S_i = n\sum_{k = 0}^{i-1} n^k
S_i = n/(n-1)*(n^i - 1)
which gives S_0 = 0, S_1 = n, S_2 = n + n^2...
So basically, if x < S_1, we get 0(x), elif x < S_2, we get 1(x-S_1)
Let S_w = S_{n-1} the count of all the numbers we can represent.
If x <= S_w then we want the i such that
S_i < x <= S_{i+1} <=> n^i < (n-1)/n * x + 1 <= n^{i+1}
We can then apply some log flooring (base(n)) to get that i.
We can then associate the string: A[i] + base_n(x - S_i).
Illustration:
This time with A = {0,1,2,3}.
Let x be 17.
Our consecutive S_i are:
S_0 = 0
S_1 = 4
S_2 = S_1 + 4^2 = 20
S_3 = S_2 + 4^3 = 84
S_w = S_{4-1} = S_3 = 84
x=17 is indeed less than 84, we will be able to affect it to one of the S_i ranges.
In particular S_1==4 < x==17 <= S_2==20.
We remove the strings encoded by the leading 0(there are a number S_1 of those strings).
The position to encode with the leading 1 is
x - 4 = 13.
And we conclude the thirteen's string generated with a leading 1 is base_4(13) = '31' (idem string -> '131')
Should we have had x = 21, we would have removed the count of S_2 so 21-20 = 1, which in turn gives with a leading 2 the string '2001'.
2. Adressing x in the special part (x >= S_w)
Let's consider study case below:
with A = {0,1,2}
The special part is
2 |1| |2|^2
that is:
2 0 00
2 0 01
2 0 02
2 0 10
2 0 11
2 0 12
2 0 20
2 0 21
2 0 22
2 1 20
2 1 21
2 1 22
2 1 10
2 1 11
2 1 12
2 1 20
2 1 21
2 1 22
Each incremented number of the second column (here 0 to 1 (specified from |1|)) gives 3^2 combination.
This is similar to the geometric series except that here each range is constant. We want to find the range which means we know which string to prefix.
We can represent it as the matrix
20 (00,01,02,10,11,12,20,21,22)
21 (00,01,02,10,11,12,20,21,22)
The portion in parenthesis is our matrix.
Every item in a row is simply its position base_3 (left-padded with 0).
e.g: n=7 has base_3 value '21'. (7=2*3+1).
'21' does occur in position 7 in the row.
Assuming we get some x (relative to that special part).
E(x / 3^2) gives us the row number (here E(7/9) = 0 so prefix is '20')
x % 3^2 give us the position in the row (here base_3(7%9)='21' giving us the final string '2021')
If we want to observe it remember that we substracted S_w=12 before to get x = 7, so we would call myGen(7+12)
Some code
Notice the same output as long as we stand in the "geometric" range, without supercycle.
Obviously, when carry starts to appear, it depends on whether I can use '_' or not. If yes, my words get shorter otherwise longer.
// https://www.cs.sfu.ca/~ggbaker/zju/math/int-alg.html
// \w insensitive could give base64
// but also éè and other accents...
function base_n(x, n, A) {
const a = []
while (x !== 0n) {
a.push(A[Number(x % n)])
x = x / n // auto floor with bigInt
}
return a.reverse().join('')
}
function mygen (A) {
const n = A.length
const bn = BigInt(n)
const A_last = A[A.length-1]
const S = Array(n).fill(0).map((x, i) => bn * (bn ** BigInt(i) - 1n) / (bn - 1n))
const S_w = S[n-1]
const w = S_w + (bn - 1n) * bn ** (bn - 1n)
const w2 = bn ** (bn - 1n)
const flog_bn = x => {
// https://math.stackexchange.com/questions/1627914/smart-way-to-calculate-floorlogx
let L = 0
while (x >= bn) {
L++
x /= bn
}
return L
}
return function (x) {
x = BigInt(x)
let r = x % w
const q = (x - r) / w
let s
if (r < S_w) {
const i = flog_bn(r * (bn - 1n) / bn + 1n)
const r2 = r - S[i]
s = A[i] + base_n(r2, bn, A).padStart(i+1, '0')
} else {
const n2 = r - S_w
const r2 = n2 % w2
const q2 = (n2 - r2 ) / w2
s = A_last + A[q2] + base_n(r2, bn, A).padStart(n-1, '0')
}
// comma below __not__ necessary, just to ease seeing cycles
return A_last.repeat(2*Number(q)) +','+ s
}
}
function genStr (A) {
A = A.filter(x => x !== '_')
const bn_noUnderscore = BigInt(A.length)
return function (x) {
x = BigInt(x);
let prefix = "",
cycle = 0n,
max = bn_noUnderscore ** (cycle + 1n);
while (x >= max) {
x -= max;
if (cycle === bn_noUnderscore - 1n) {
prefix += "z";
cycle = 0n;
} else {
cycle++;
}
max = bn_noUnderscore ** (cycle + 1n);
}
return prefix
+ base_n(cycle, bn_noUnderscore, A)
+ "_"
+ base_n(x, bn_noUnderscore, A).padStart(Number(cycle) + 1, 0);
}
}
function test(a, b, x){
console.log(a(x), b(x))
}
{
console.log('---my supercycle is shorter if underscore not used. Plenty of room for grandinero')
const A = '0123456789abcdefghijklmnopqrstuvwxyz'.split('').sort((a,b)=>a.localeCompare(b))
let my = mygen(A)
const grandinero = genStr(A)
test(my, grandinero, 1e4)
test(my, grandinero, 1e12)
test(my, grandinero, 106471793335560744271846581685593263893929893610517909620n) // cycle ended for me (w variable value)
}
{
console.log('---\n my supercycle is greater if underscore is used in my alphabet (not grandinero since "forbidden')
// underscore used
const A = '0123456789abcdefghijklmnopqrstuvwxyz_'.split('').sort((a,b)=>a.localeCompare(b))
let my = mygen(A)
const grandinero = genStr(A)
test(my, grandinero, 1e12)
test(my, grandinero, 106471793335560744271846581685593263893929893610517909620n) // cycle ended for me (w variable value)
test(my, grandinero, 1e57) // still got some place in the supercycle
}
After considering the advice provided by #kaya3 and #grodzi and reviewing my original code, I have made some improvements. I realized a few things:
There was a bug in my original code. If one cycle ends at z_z (actually 36 z's after the underscore, but you get the idea) and the next one begins at z0_0, then lexical ordering is broken because _ comes after 0. The separator (or "neck") needs to be lower in lexical order than the lowest possible value of the head.
Though I was initially resistant to the idea of rolling a custom baseN generator so that more characters can be included, I have now come around to the idea.
I can squeeze more permutations out of a given string length by also incrementing the neck. For example, I can go from A00...A0z to A10...A1z, and so on, thus increasing the number of unique strings I can generate with A as the head before I move on to B.
With that in mind, I have revised my code:
// this is the alphabet used in standard baseN conversions:
let baseAlpha = "0123456789abcdefghijklmnopqrstuvwxyz";
// this is a factory for creating a new string generator:
function sequenceGenerator (config) {
let
// alphabets for the head, neck and body:
headAlpha = config.headAlpha,
neckAlpha = config.neckAlpha,
bodyAlpha = config.bodyAlpha,
// length of the body alphabet corresponds to the
// base of the numbering system:
base = BigInt(bodyAlpha.length),
// if bodyAlpha is identical to an alphabet that
// would be used for a standard baseN conversion,
// then use the built-in method, which should be
// much faster:
convertBody = baseAlpha.startsWith(bodyAlpha)
? (n) => n.toString(bodyAlpha.length)
// otherwise, roll a custom baseN generator:
: function (n) {
let s = "";
while (n > 0n) {
let i = n % base;
s = bodyAlpha[i] + s;
n = n / base;
}
return s;
},
// n is used to cache the last iteration and is
// incremented each time you call `getNext`
// it can optionally be initialized to a value other
// than 0:
n = BigInt(config.start || 0),
// see below:
headCycles = [0n],
cycleLength = 0n;
// the length of the body increases by 1 each time the
// head increments, meaning that the total number of
// permutations increases geometrically for each
// character in headAlpha
// here we cache the maximum number of permutations for
// each length of the body
// since we know these values ahead of time, calculating
// them in advance saves time when we generate a new
// string
// more importantly, it saves us from having to do a
// reverse calculation involving Math.log, which requires
// converting BigInts to Numbers, which breaks the
// program on larger numbers:
for (let i = 0; i < headAlpha.length; i++) {
// the maximum number of permutations depends on both
// the string length (i + 1) and the number of
// characters in neckAlpha, since the string length
// remains the same while the neck increments
cycleLength += BigInt(neckAlpha.length) * base ** BigInt(i + 1);
headCycles.push(cycleLength);
}
// given a number n, this function searches through
// headCycles to find where the total number of
// permutations exceeds n
// this is how we avoid the reverse calculation with
// Math.log to determine which head cycle we are on for
// a given permutation:
function getHeadCycle (n) {
for (let i = 0; i < headCycles.length; i++) {
if (headCycles[i] > n) return i;
}
}
return {
cycleLength: cycleLength,
getString: function (n) {
let cyclesDone = Number(n / cycleLength),
headLast = headAlpha[headAlpha.length - 1],
prefix = headLast.repeat(cyclesDone),
nn = n % cycleLength,
headCycle = getHeadCycle(nn),
head = headAlpha[headCycle - 1],
nnn = nn - headCycles[headCycle - 1],
neckCycleLength = BigInt(bodyAlpha.length) ** BigInt(headCycle),
neckCycle = nnn / neckCycleLength,
neck = neckAlpha[Number(neckCycle)],
body = convertBody(nnn % neckCycleLength);
body = body.padStart(headCycle , bodyAlpha[0]);
return prefix + head + neck + body;
},
getNext: function () { return this.getString(n++); }
};
}
let bodyAlpha = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz",
getStr = sequenceGenerator({
// achieve more permutations within a supercycle
// with a larger headAlpha:
headAlpha: "123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz",
// the highest value of neckAlpha must be lower than
// the lowest value of headAlpha:
neckAlpha: "0",
bodyAlpha: bodyAlpha
});
console.log("---supercycle length:");
console.log(Number(getStr.cycleLength));
console.log("---first two values:")
console.log(getStr.getNext());
console.log(getStr.getNext());
console.log("---arbitrary large value (1e57):");
console.log(getStr.getString(BigInt(1e57)));
console.log("");
// here we use a shorter headAlpha and longer neckAlpha
// to shorten the maximum length of the body, but this also
// decreases the number of permutations in the supercycle:
getStr = sequenceGenerator({
headAlpha: "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz",
neckAlpha: "0123456789",
bodyAlpha: bodyAlpha
});
console.log("---supercycle length:");
console.log(Number(getStr.cycleLength));
console.log("---first two values:");
console.log(getStr.getNext());
console.log(getStr.getNext());
console.log("---arbitrary large value (1e57):");
console.log(getStr.getString(BigInt(1e57)));
EDIT
After further discussion with #grodzi, I have made some more improvements:
I realized that the "neck" or separator wasn't providing much value, so I have gotten rid of it. Later edit: actually, the separator is necessary. I am not sure why I thought it wasn't. Without the separator, the beginning of each new supercycle will lexically precede the end of the previous supercycle. I haven't changed my code below, but anyone using this code should include a separator. I have also realized that I was wrong to use an underscore as the separator. The separator must be a character, such as the hyphen, which lexically precedes the lowest digit used in the sequence (0).
I have taken #grodzi's suggestion to allow the length of the tail to continue growing indefinitely.
Here is the new code:
let baseAlpha = "0123456789abcdefghijklmnopqrstuvwxyz";
function sequenceGenerator (config) {
let headAlpha = config.headAlpha,
tailAlpha = config.tailAlpha,
base = BigInt(tailAlpha.length),
convertTail = baseAlpha.startsWith(tailAlpha)
? (n) => n.toString(tailAlpha.length)
: function (n) {
if (n === 0n) return "0";
let s = "";
while (n > 0n) {
let i = n % base;
s = tailAlpha[i] + s;
n = n / base;
}
return s;
},
n = BigInt(config.start || 0);
return {
getString: function (n) {
let cyclesDone = 0n,
headCycle = 0n,
initLength = 0n,
accum = 0n;
for (;; headCycle++) {
let _accum = accum + base ** (headCycle + 1n + initLength);
if (_accum > n) {
n -= accum;
break;
} else if (Number(headCycle) === headAlpha.length - 1) {
cyclesDone++;
initLength += BigInt(headAlpha.length);
headCycle = -1n;
}
accum = _accum;
}
let headLast = headAlpha[headAlpha.length - 1],
prefix = headLast.repeat(Number(cyclesDone)),
head = headAlpha[Number(headCycle)],
tail = convertTail(n),
tailLength = Number(headCycle + initLength);
tail = tail.padStart(tailLength, tailAlpha[0]);
return prefix + head + tail;
},
getNext: function () { return this.getString(n++); }
};
}
let alpha = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz",
genStr = sequenceGenerator({headAlpha: alpha, tailAlpha: alpha});
console.log("--- first string:");
console.log(genStr.getString(0n));
console.log("--- 1e+57");
console.log(genStr.getString(BigInt(1e+57)));
console.log("--- end of first supercycle:");
console.log(genStr.getString(63n*(1n-(63n**63n))/(1n-63n)-1n));
console.log("--- start of second supercycle:");
console.log(genStr.getString(63n*(1n-(63n**63n))/(1n-63n)));

Cutting algorithm of two dimensional board

I have problem with my homework.
Given a board of dimensions m x n is given, cut this board into rectangular pieces with the best total price. A matrix gives the price for each possible board size up through the original, uncut board.
Consider a 2 x 2 board with the price matrix:
3 4
3 6
We have a constant cost for each cutting for example 1.
Piece of length 1 x 1 is worth 3.
Horizontal piece of length 1 x 2 is worth 4.
Vertical piece of length 1 x 2 is worth 3.
Whole board is worth 6.
For this example, the optimal profit is 9, because we cut board into 1 x 1 pieces. Each piece is worth 3 and we done a 3 cut, so 4 x 3 - 3 x 1 = 9.
Second example:
1 2
3 4
Now I have to consider all the solutions:
4 1x1 pieces is worth 4x1 - (cost of cutting) 3x1 = 1
2 horizontal 1x2 is worth 2x2 - (cost of cutting) 1x1 = 3
2 vertical 1x2 is worth 3x2 - (cost of cutting) 1x1 = 5 -> best optimal profit
1 horizontal 1x2 + 2 x (1x1) pieces is worth 2 + 2 - (cost of cutting) 2 = 2
1 vertical 1x2 + 2 x (1x1) pieces is worth 3 + 2 - (cost of cutting) 2 = 3
I've read a lot about rod cutting algorithm but I don't have any idea how to bite this problem.
Do you have any ideas?
I did this in Python. The algorithm is
best_val = value of current board
check each horizontal and vertical cut for better value
for cut point <= half the current dimension (if none, return initial value)
recur on the two boards formed
if sum of values > best_val
... best_val = that sum
... record cut point and direction
return result: best_val, cut point, and direction
I'm not sure what you'll want for return values; I gave back the best value and tree of boards. For your second example, this is
(5, [[2, 1], [2, 1]])
Code, with debugging traces (indent and the labeled prints):
indent = ""
indent_len = 3
value = [[1, 2],
[3, 4]]
def best_cut(high, wide):
global indent
print indent, "ENTER", high, wide
indent += " " * indent_len
best_val = value[high-1][wide-1]
print indent, "Default", best_val
cut_vert = None
cut_val = best_val
cut_list = []
# Check horizontal cuts
for h_cut in range(1, 1 + high // 2):
print indent, "H_CUT", h_cut
cut_val1, cut_list1 = best_cut(h_cut, wide)
cut_val2, cut_list2 = best_cut(high - h_cut, wide)
cut_val = cut_val1 + cut_val2
if cut_val > best_val:
cut_list = [cut_list1, cut_list2]
print indent, "NEW H", h_cut, cut_val, cut_list
best_val = cut_val
cut_vert = False
best_h = h_cut
# Check vertical cuts
for v_cut in range(1, 1 + wide // 2):
print indent, "V_CUT", v_cut
cut_val1, cut_list1 = best_cut(high, v_cut)
cut_val2, cut_list2 = best_cut(high, wide - v_cut)
cut_val = cut_val1 + cut_val2
if cut_val > best_val:
cut_list = [cut_list1, cut_list2]
print indent, "NEW V", v_cut, cut_val, cut_list
best_val = cut_val
cut_vert = True
best_v = v_cut
# Return result of best cut
# Remember to subtract the cut cost
if cut_vert is None:
result = best_val , [high, wide]
elif cut_vert:
result = best_val-1, cut_list
else:
result = best_val-1, cut_list
indent = indent[indent_len:]
print indent, "LEAVE", cut_vert, result
return result
print best_cut(2, 2)
Output (profit and cut sizes) for each of the two tests:
(9, [[[1, 1], [1, 1]], [[1, 1], [1, 1]]])
(5, [[2, 1], [2, 1]])
Let f(h,w) represent the best total price achievable for a board with height h and width w with cutting price c. Then
f(h,w) = max(
price_matrix(h, w),
f(i, w) + f(h - i, w) - c,
f(h, j) + f(h, w - j) - c
)
for i = 1 to floor(h / 2)
for j = 1 to floor(w / 2)
Here's a bottom-up example in JavaScript that returns the filled table given the price matrix. The answer would be in the bottom right corner.
function f(prices, cost){
var m = new Array(prices.length);
for (let i=0; i<prices.length; i++)
m[i] = [];
for (let h=0; h<prices.length; h++){
for (let w=0; w<prices[0].length; w++){
m[h][w] = prices[h][w];
if (h == 0 && w == 0)
continue;
for (let i=1; i<(h+1>>1)+1; i++)
m[h][w] = Math.max(
m[h][w],
m[i-1][w] + m[h-i][w] - cost
);
for (let i=1; i<(w+1>>1)+1; i++)
m[h][w] = Math.max(
m[h][w],
m[h][i-1] + m[h][w-i] - cost
);
}
}
return m;
}
$('#submit').click(function(){
let prices = JSON.parse($('#input').val());
let result = f(prices, 1);
let str = result.map(line => JSON.stringify(line)).join('<br>');
$('#output').html(str);
});
<script src="https://ajax.googleapis.com/ajax/libs/jquery/2.1.1/jquery.min.js"></script>
<textarea id="input">[[3, 4],
[3, 6]]</textarea>
<p><button type="button" id="submit">Submit</button></p>
<div id="output"><div>
Some thoughts on the problem rather than an answer:
It was a long time ago i studied dynamic programming, but i wrote up the following pseudo code which is think is O(n^2):
// 'Board'-class not included
val valueOfBoards: HashMap<Board, int>
fun cutBoard(b: Board, value: int) : int {
if (b.isEmpty()) return 0
if (valueOfBoards[b] > value) {
return 0;
} else {
valueOfBoards[b] = value
}
int maxValue = Integer.MIN_VALUE
for (Board piece : b.getPossiblePieces()) {
val (cuttingCost, smallerBoard) = b.cutOffPiece(piece)
val valueGained: int = piece.getPrice() - cuttingCost
maxValue = Max(maxValue, valueGained + cutBoard(smallerBoard, value + valueGained))
}
return maxValue;
}
The board class is not trivially implemented, here is some elaboration:
// returns all boards which fits in the current board
// for the initial board this will be width*height subboards
board.getPossiblePieces()
// returns a smaller board and the cutting cost of the cut
// I can see this becoming complex, depends on how one chooses to represent the board.
board.cutOffPiece(piece: Board)
It is not clear to me at the moment if cutOffPiece() breaks the algorithm in that you do not know how to optimally cut. I think since the algorithm will proceed from larger pieces to smaller pieces at some point it will be fine.
I tried to solve the re computation of sub problems (identical boards) by storing results in something like HashMap<Board, price> and comparing the new board with the stored best price before proceeding.
According to your answers I've prepared bottom-up and top-down implementation.
Bottom-up:
function bottomUp($high, $wide, $matrix){
$m = [];
for($h = 0; $h < $high; $h++){
for($w = 0; $w < $wide; $w++){
$m[$h][$w] = $matrix[$h][$w];
if($h == 0 && $w == 0){
continue;
}
for($i = 1; $i < ($h + 1 >> 1) + 1; $i++){
$m[$h][$w] = max(
$m[$h][$w],
$m[$i - 1][$w] + $m[$h - $i][$w] - CUT_COST
);
}
for($i = 1; $i < ($w + 1 >> 1) + 1; $i++){
$m[$h][$w] = max(
$m[$h][$w],
$m[$h][$i - 1] + $m[$h][$w - $i] - CUT_COST
);
}
}
}
return $m[$high-1][$wide-1];
}
Top-down:
function getBestCut($high, $wide, $matrix){
global $checked;
if(isset($checked[$high][$wide])){
return $checked[$high][$wide];
}
$bestVal = $matrix[$high-1][$wide-1];
$cutVert = CUT_VERT_NONE;
$cutVal = $bestVal;
$cutList = [];
for($hCut = 1; $hCut < 1 + floor($high/2); $hCut++){
$result1 = getBestCut($hCut, $wide, $matrix);
$cutVal1 = $result1[0];
$cutList1 = $result1[1];
$result2 = getBestCut($high - $hCut, $wide, $matrix);
$cutVal2 = $result2[0];
$cutList2 = $result2[1];
$cutVal = $cutVal1 + $cutVal2;
if($cutVal > $bestVal){
$cutList = [$cutList1, $cutList2];
$bestVal = $cutVal;
$cutVert = CUT_VERT_FALSE;
$bestH = $hCut;
}
$checked[$hCut][$wide] = $result1;
$checked[$high - $hCut][$wide] = $result2;
}
for($vCut = 1; $vCut < 1 + floor($wide/2); $vCut++){
$result1 = getBestCut($hCut, $vCut, $matrix);
$cutVal1 = $result1[0];
$cutList1 = $result1[1];
$result2 = getBestCut($high, $wide - $vCut, $matrix);
$cutVal2 = $result2[0];
$cutList2 = $result2[1];
$cutVal = $cutVal1 + $cutVal2;
if($cutVal > $bestVal){
$cutList = [$cutList1, $cutList2];
$bestVal = $cutVal;
$cutVert = CUT_VERT_TRUE;
$bestH = $vCut;
}
$checked[$hCut][$vCut] = $result1;
$checked[$high][$wide - $vCut] = $result2;
}
if($cutVert == CUT_VERT_NONE){
$result = [$bestVal, [$high, $wide]];
}else if($cutVert == CUT_VERT_TRUE){
$result = [$bestVal - CUT_COST, $cutList];
}else{
$result = [$bestVal - CUT_COST, $cutList];
}
return $result;
}
Please tell me are they correct implementation of this method?
I wonder if time complexity is O(m^2*n^2) in top-down method?

How to calculate the index (lexicographical order) when the combination is given

I know that there is an algorithm that permits, given a combination of number (no repetitions, no order), calculates the index of the lexicographic order.
It would be very useful for my application to speedup things...
For example:
combination(10, 5)
1 - 1 2 3 4 5
2 - 1 2 3 4 6
3 - 1 2 3 4 7
....
251 - 5 7 8 9 10
252 - 6 7 8 9 10
I need that the algorithm returns the index of the given combination.
es: index( 2, 5, 7, 8, 10 ) --> index
EDIT: actually I'm using a java application that generates all combinations C(53, 5) and inserts them into a TreeMap.
My idea is to create an array that contains all combinations (and related data) that I can index with this algorithm.
Everything is to speedup combination searching.
However I tried some (not all) of your solutions and the algorithms that you proposed are slower that a get() from TreeMap.
If it helps: my needs are for a combination of 5 from 53 starting from 0 to 52.
Thank you again to all :-)
Here is a snippet that will do the work.
#include <iostream>
int main()
{
const int n = 10;
const int k = 5;
int combination[k] = {2, 5, 7, 8, 10};
int index = 0;
int j = 0;
for (int i = 0; i != k; ++i)
{
for (++j; j != combination[i]; ++j)
{
index += c(n - j, k - i - 1);
}
}
std::cout << index + 1 << std::endl;
return 0;
}
It assumes you have a function
int c(int n, int k);
that will return the number of combinations of choosing k elements out of n elements.
The loop calculates the number of combinations preceding the given combination.
By adding one at the end we get the actual index.
For the given combination there are
c(9, 4) = 126 combinations containing 1 and hence preceding it in lexicographic order.
Of the combinations containing 2 as the smallest number there are
c(7, 3) = 35 combinations having 3 as the second smallest number
c(6, 3) = 20 combinations having 4 as the second smallest number
All of these are preceding the given combination.
Of the combinations containing 2 and 5 as the two smallest numbers there are
c(4, 2) = 6 combinations having 6 as the third smallest number.
All of these are preceding the given combination.
Etc.
If you put a print statement in the inner loop you will get the numbers
126, 35, 20, 6, 1.
Hope that explains the code.
Convert your number selections to a factorial base number. This number will be the index you want. Technically this calculates the lexicographical index of all permutations, but if you only give it combinations, the indexes will still be well ordered, just with some large gaps for all the permutations that come in between each combination.
Edit: pseudocode removed, it was incorrect, but the method above should work. Too tired to come up with correct pseudocode at the moment.
Edit 2: Here's an example. Say we were choosing a combination of 5 elements from a set of 10 elements, like in your example above. If the combination was 2 3 4 6 8, you would get the related factorial base number like so:
Take the unselected elements and count how many you have to pass by to get to the one you are selecting.
1 2 3 4 5 6 7 8 9 10
2 -> 1
1 3 4 5 6 7 8 9 10
3 -> 1
1 4 5 6 7 8 9 10
4 -> 1
1 5 6 7 8 9 10
6 -> 2
1 5 7 8 9 10
8 -> 3
So the index in factorial base is 1112300000
In decimal base, it's
1*9! + 1*8! + 1*7! + 2*6! + 3*5! = 410040
This is Algorithm 2.7 kSubsetLexRank on page 44 of Combinatorial Algorithms by Kreher and Stinson.
r = 0
t[0] = 0
for i from 1 to k
if t[i - 1] + 1 <= t[i] - 1
for j from t[i - 1] to t[i] - 1
r = r + choose(n - j, k - i)
return r
The array t holds your values, for example [5 7 8 9 10]. The function choose(n, k) calculates the number "n choose k". The result value r will be the index, 251 for the example. Other inputs are n and k, for the example they would be 10 and 5.
zero-base,
# v: array of length k consisting of numbers between 0 and n-1 (ascending)
def index_of_combination(n,k,v):
idx = 0
for p in range(k-1):
if p == 0: arrg = range(1,v[p]+1)
else: arrg = range(v[p-1]+2, v[p]+1)
for a in arrg:
idx += combi[n-a, k-1-p]
idx += v[k-1] - v[k-2] - 1
return idx
Null Set has the right approach. The index corresponds to the factorial-base number of the sequence. You build a factorial-base number just like any other base number, except that the base decreases for each digit.
Now, the value of each digit in the factorial-base number is the number of elements less than it that have not yet been used. So, for combination(10, 5):
(1 2 3 4 5) == 0*9!/5! + 0*8!/5! + 0*7!/5! + 0*6!/5! + 0*5!/5!
== 0*3024 + 0*336 + 0*42 + 0*6 + 0*1
== 0
(10 9 8 7 6) == 9*3024 + 8*336 + 7*42 + 6*6 + 5*1
== 30239
It should be pretty easy to calculate the index incrementally.
If you have a set of positive integers 0<=x_1 < x_2< ... < x_k , then you could use something called the squashed order:
I = sum(j=1..k) Choose(x_j,j)
The beauty of the squashed order is that it works independent of the largest value in the parent set.
The squashed order is not the order you are looking for, but it is related.
To use the squashed order to get the lexicographic order in the set of k-subsets of {1,...,n) is by taking
1 <= x1 < ... < x_k <=n
compute
0 <= n-x_k < n-x_(k-1) ... < n-x_1
Then compute the squashed order index of (n-x_k,...,n-k_1)
Then subtract the squashed order index from Choose(n,k) to get your result, which is the lexicographic index.
If you have relatively small values of n and k, you can cache all the values Choose(a,b) with a
See Anderson, Combinatorics on Finite Sets, pp 112-119
I needed also the same for a project of mine and the fastest solution I found was (Python):
import math
def nCr(n,r):
f = math.factorial
return f(n) / f(r) / f(n-r)
def index(comb,n,k):
r=nCr(n,k)
for i in range(k):
if n-comb[i]<k-i:continue
r=r-nCr(n-comb[i],k-i)
return r
My input "comb" contained elements in increasing order You can test the code with for example:
import itertools
k=3
t=[1,2,3,4,5]
for x in itertools.combinations(t, k):
print x,index(x,len(t),k)
It is not hard to prove that if comb=(a1,a2,a3...,ak) (in increasing order) then:
index=[nCk-(n-a1+1)Ck] + [(n-a1)C(k-1)-(n-a2+1)C(k-1)] + ... =
nCk -(n-a1)Ck -(n-a2)C(k-1) - .... -(n-ak)C1
There's another way to do all this. You could generate all possible combinations and write them into a binary file where each comb is represented by it's index starting from zero. Then, when you need to find an index, and the combination is given, you apply a binary search on the file. Here's the function. It's written in VB.NET 2010 for my lotto program, it works with Israel lottery system so there's a bonus (7th) number; just ignore it.
Public Function Comb2Index( _
ByVal gAr() As Byte) As UInt32
Dim mxPntr As UInt32 = WHL.AMT.WHL_SYS_00 '(16.273.488)
Dim mdPntr As UInt32 = mxPntr \ 2
Dim eqCntr As Byte
Dim rdAr() As Byte
modBinary.OpenFile(WHL.WHL_SYS_00, _
FileMode.Open, FileAccess.Read)
Do
modBinary.ReadBlock(mdPntr, rdAr)
RP: If eqCntr = 7 Then GoTo EX
If gAr(eqCntr) = rdAr(eqCntr) Then
eqCntr += 1
GoTo RP
ElseIf gAr(eqCntr) < rdAr(eqCntr) Then
If eqCntr > 0 Then eqCntr = 0
mxPntr = mdPntr
mdPntr \= 2
ElseIf gAr(eqCntr) > rdAr(eqCntr) Then
If eqCntr > 0 Then eqCntr = 0
mdPntr += (mxPntr - mdPntr) \ 2
End If
Loop Until eqCntr = 7
EX: modBinary.CloseFile()
Return mdPntr
End Function
P.S. It takes 5 to 10 mins to generate 16 million combs on a Core 2 Duo. To find the index using binary search on file takes 397 milliseconds on a SATA drive.
Assuming the maximum setSize is not too large, you can simply generate a lookup table, where the inputs are encoded this way:
int index(a,b,c,...)
{
int key = 0;
key |= 1<<a;
key |= 1<<b;
key |= 1<<c;
//repeat for all arguments
return Lookup[key];
}
To generate the lookup table, look at this "banker's order" algorithm. Generate all the combinations, and also store the base index for each nItems. (For the example on p6, this would be [0,1,5,11,15]). Note that by you storing the answers in the opposite order from the example (LSBs set first) you will only need one table, sized for the largest possible set.
Populate the lookup table by walking through the combinations doing Lookup[combination[i]]=i-baseIdx[nItems]
EDIT: Never mind. This is completely wrong.
Let your combination be (a1, a2, ..., ak-1, ak) where a1 < a2 < ... < ak. Let choose(a,b) = a!/(b!*(a-b)!) if a >= b and 0 otherwise. Then, the index you are looking for is
choose(ak-1, k) + choose(ak-1-1, k-1) + choose(ak-2-1, k-2) + ... + choose (a2-1, 2) + choose (a1-1, 1) + 1
The first term counts the number of k-element combinations such that the largest element is less than ak. The second term counts the number of (k-1)-element combinations such that the largest element is less than ak-1. And, so on.
Notice that the size of the universe of elements to be chosen from (10 in your example) does not play a role in the computation of the index. Can you see why?
Sample solution:
class Program
{
static void Main(string[] args)
{
// The input
var n = 5;
var t = new[] { 2, 4, 5 };
// Helping transformations
ComputeDistances(t);
CorrectDistances(t);
// The algorithm
var r = CalculateRank(t, n);
Console.WriteLine("n = 5");
Console.WriteLine("t = {2, 4, 5}");
Console.WriteLine("r = {0}", r);
Console.ReadKey();
}
static void ComputeDistances(int[] t)
{
var k = t.Length;
while (--k >= 0)
t[k] -= (k + 1);
}
static void CorrectDistances(int[] t)
{
var k = t.Length;
while (--k > 0)
t[k] -= t[k - 1];
}
static int CalculateRank(int[] t, int n)
{
int k = t.Length - 1, r = 0;
for (var i = 0; i < t.Length; i++)
{
if (t[i] == 0)
{
n--;
k--;
continue;
}
for (var j = 0; j < t[i]; j++)
{
n--;
r += CalculateBinomialCoefficient(n, k);
}
n--;
k--;
}
return r;
}
static int CalculateBinomialCoefficient(int n, int k)
{
int i, l = 1, m, x, y;
if (n - k < k)
{
x = k;
y = n - k;
}
else
{
x = n - k;
y = k;
}
for (i = x + 1; i <= n; i++)
l *= i;
m = CalculateFactorial(y);
return l/m;
}
static int CalculateFactorial(int n)
{
int i, w = 1;
for (i = 1; i <= n; i++)
w *= i;
return w;
}
}
The idea behind the scenes is to associate a k-subset with an operation of drawing k-elements from the n-size set. It is a combination, so the overall count of possible items will be (n k). It is a clue that we could seek the solution in Pascal Triangle. After a while of comparing manually written examples with the appropriate numbers from the Pascal Triangle, we will find the pattern and hence the algorithm.
I used user515430's answer and converted to python3. Also this supports non-continuous values so you could pass in [1,3,5,7,9] as your pool instead of range(1,11)
from itertools import combinations
from scipy.special import comb
from pandas import Index
debugcombinations = False
class IndexedCombination:
def __init__(self, _setsize, _poolvalues):
self.setsize = _setsize
self.poolvals = Index(_poolvalues)
self.poolsize = len(self.poolvals)
self.totalcombinations = 1
fast_k = min(self.setsize, self.poolsize - self.setsize)
for i in range(1, fast_k + 1):
self.totalcombinations = self.totalcombinations * (self.poolsize - fast_k + i) // i
#fill the nCr cache
self.choose_cache = {}
n = self.poolsize
k = self.setsize
for i in range(k + 1):
for j in range(n + 1):
if n - j >= k - i:
self.choose_cache[n - j,k - i] = comb(n - j,k - i, exact=True)
if debugcombinations:
print('testnth = ' + str(self.testnth()))
def get_nth_combination(self,index):
n = self.poolsize
r = self.setsize
c = self.totalcombinations
#if index < 0 or index >= c:
# raise IndexError
result = []
while r:
c, n, r = c*r//n, n-1, r-1
while index >= c:
index -= c
c, n = c*(n-r)//n, n-1
result.append(self.poolvals[-1 - n])
return tuple(result)
def get_n_from_combination(self,someset):
n = self.poolsize
k = self.setsize
index = 0
j = 0
for i in range(k):
setidx = self.poolvals.get_loc(someset[i])
for j in range(j + 1, setidx + 1):
index += self.choose_cache[n - j, k - i - 1]
j += 1
return index
#just used to test whether nth_combination from the internet actually works
def testnth(self):
n = 0
_setsize = self.setsize
mainset = self.poolvals
for someset in combinations(mainset, _setsize):
nthset = self.get_nth_combination(n)
n2 = self.get_n_from_combination(nthset)
if debugcombinations:
print(str(n) + ': ' + str(someset) + ' vs ' + str(n2) + ': ' + str(nthset))
if n != n2:
return False
for x in range(_setsize):
if someset[x] != nthset[x]:
return False
n += 1
return True
setcombination = IndexedCombination(5, list(range(1,10+1)))
print( str(setcombination.get_n_from_combination([2,5,7,8,10])))
returns 188

Resources