Generate all valid combinations of N pairs of parentheses - algorithm

UPDATE (task detailed Explanation):
We have a string consisting of the digits 0 and 1, separated by the operators |, ^ or &. The task is to generate all fully parenthesized expressions, so each final expression should be divided into two parts (a left operand and a right operand).
For example
0^1 -> (0)^(1) but not extraneously: 0^1 -> (((0))^(1))
Example for expression 1|0&1:
(1)|((0)&(1))
((1)|(0))&(1)
As you can see, both expressions above have a left and a right part:
left: (1); right: ((0)&(1))
left: ((1)|(0)); right: (1)
I tried the following code, but it does not work correctly (see output):
// expression has type string
// result has type Array (ArrayList in Java)
/**
 * Asker's attempt at listing parenthesized variants of `expression`.
 * Kept exactly as posted: it is the code that produces the (incorrect)
 * output quoted right after this snippet.
 *
 * @param {string} expression operands separated by the operators |, ^ or &
 * @param {string[]} result   array that receives one string per operator
 *                            encountered at every recursion level
 * @returns {string} "(x)" for a one-character expression; otherwise the
 *                   expression itself, unchanged
 */
function setParens(expression, result) {
  // Base case: a single operand is wrapped in one pair of parentheses.
  if (expression.length === 1) return "(" + expression + ")";
  for (var i = 0; i < expression.length; i++) {
    var c = expression[i];
    if (c === "|" || c === "^" || c === "&") {
      // Split around the operator found at position i.
      var left = expression.substring(0, i);
      var right = expression.substring(i + 1);
      // NOTE: leftParen / rightParen are assigned without var/let, so they
      // are shared by every recursion level; the recursive call for `right`
      // overwrites the leftParen computed just above it.
      leftParen = setParens(left, result);
      rightParen = setParens(right, result);
      var newExp = leftParen + c + rightParen;
      // Every split at every depth is pushed, so sub-expression fragments
      // end up in the same list as top-level results.
      result.push(newExp);
    }
  }
  // NOTE: a multi-character expression is returned without parentheses,
  // which is why strings such as "(0)|0&1" appear in the output.
  return expression;
}
// Driver: runs setParens on the sample expression and prints whatever
// strings it collected.
function test() {
  const collected = [];
  setParens('1|0&1', collected);
  console.log(collected);
}

test();
code output: ["(0)&(1)", "(0)|0&1", "(1)|(0)", "1|0&(1)"]

Assuming the input expression is not already partially parenthesized and you want only fully parenthesized results:
FullyParenthesize(expression[1...n])
result = {}
// looking for operators
for p = 1 to n do
// binary operator; parenthesize LHS and RHS
// parenthesize the binary operation
if expression[p] is a binary operator then
lps = FullyParenthesize(expression[1 ... p - 1])
rps = FullyParenthesize(expression[p + 1 ... n])
for each lp in lps do
for each rp in rps do
result = result U {"(" + lp + expression[p] + rp + ")"}
// no binary operations <=> single variable
if result == {} then
result = {"(" + expression + ")"}
return result
Example: 1|2&3
FullyParenthesize("1|2&3")
result = {}
binary operator | at p = 2;
lps = FullyParenthesize("1")
no operators
result = {"(" + "1" + ")"}
return result = {"(1)"}
rps = FullyParenthesize("2&3")
result = {}
binary operator & at p = 2
lps = FullyParenthesize("2")
no operators
result = {"(" + "2" + ")"}
return result = {"(2)"}
rps = FullyParenthesize("3")
no operators
result = {"(" + "3" + ")"}
return result = {"(3)"}
lp = "(2)"
rp = "(3)"
result = result U {"(" + "(2)" + "&" + "(3)" + ")"}
return result = {"((2)&(3))"}
lp = "(1)"
rp = "((2)&(3))"
result = result U {"(" + "(1)" + "|" + "((2)&(3))" + ")"}
binary operator & at p = 4
...
result = result U {"(" + "((1)|(2))" + "&" + "(3)" + ")"}
return result {"((1)|((2)&(3)))", "(((1)|(2))&(3))"}
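You will have C_k unique fully parenthesized expressions (without repeated parentheses) given an input expression with k binary operators, where C_k = (2k)! / (k! (k+1)!) is the k-th Catalan number (1, 2, 5, 14, ... for k = 1, 2, 3, 4, ...). The example above has k = 2 operators and therefore exactly 2 results.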

Related

Compact way to produce a large sequence of strings in lexical order

I want to generate a sequence of strings with the following properties:
Lexically ordered
Theoretically infinite
Compact over a realistic range
Generated by a simple process of incrementation
Matches the regexp /\w+/
The obvious way to generate a lexically-ordered sequence is to choose a string length and pad the strings with a base value like this: 000000, 000001, etc. This approach poses a trade-off between the number of permutations and compactness: a string long enough to yield many permutations will be filled with many zeros along the way. Plus, the length I choose sets an upper bound on the total number of permutations unless I have some mechanism for expanding the string when it maxes out.
So I came up with a sequence that works like this:
Each string consists of a "head", which is a base-36 number, followed by an underscore, and then the "tail", which is also a base-36 number padded by an increasing number of zeros
The first cycle goes from 0_0 to 0_z
The second cycle goes from 1_00 to 1_zz
The third cycle goes from 2_000 to 2_zzz, and so on
Once the head has reached z and the tail consists of 36 zs, the first "supercycle" has ended. Now the whole sequence starts over, except the z remains at the beginning, so the new cycle starts with z0_0, then continues to z1_00, and so on
The second supercycle goes zz0_0, zz1_00, and so on
Although the string of zs in the head could become unwieldy over the long run, a single supercycle contains over 10^56 permutations, which is far more than I ever expect to use. The sequence is theoretically infinite but very compact within a realistic range. For instance, the trillionth permutation is a succinct 7_bqd55h8s.
I can generate the sequence relatively simply with this javascript function:
/**
 * Returns the n-th member of the "head_tail" sequence.
 *
 * The head is a base-36 digit naming the current cycle; the tail is a
 * base-36 number left-padded with zeros to (cycle + 1) digits. After the
 * cycle with head "z" is exhausted, a "z" is appended to the prefix and the
 * head restarts at 0.
 *
 * @param {number|bigint|string} n zero-based position (anything BigInt() accepts)
 * @returns {string} prefix + head + "_" + tail
 */
function genStr (n) {
  let remaining = BigInt(n);
  let supercyclePrefix = "";
  let head = 0n;

  // Peel off whole cycles until the remainder fits into the current one.
  for (;;) {
    const cycleSize = 36n ** (head + 1n);
    if (remaining < cycleSize) break;
    remaining -= cycleSize;
    if (head === 35n) {
      // Head "z" exhausted: start a new supercycle.
      supercyclePrefix += "z";
      head = 0n;
    } else {
      head += 1n;
    }
  }

  const tail = remaining.toString(36).padStart(Number(head) + 1, "0");
  return `${supercyclePrefix}${head.toString(36)}_${tail}`;
}
The n parameter is a number that I increment and pass to the function to get the next member of the sequence. All I need to keep track of is a simple integer, making the sequence very easy to use.
So obviously I spent a lot of time on this and I think it's pretty good, but I'm wondering if there is a better way. Is there a good algorithm for generating a sequence along the lines of the one I'm looking for?
An idea close to yours (more refined than my first edit...).
Let our alphabet be A = {0,1,2,3}.
Let |2| mean we iterate from 0 to 2 and |2|^2 mean we generate the cartesian product in a lexically sorted manner (00,01,10,11).
We start with
0 |3|
So we have a string of length 2. We "unshift" the digit 1 which "factorizes" since any 0|3|... is less than 1|3|^2.
1 |3|^2
Same idea: unshift 2, and make words of length 4.
2 |3|^3
Now we can continue and generate
3 |2| |3|^3
Notice |2| and not |3|. Now our maximum number becomes 32333. And as you did, we can now add the carry and start a new supercycle:
33 0|3|
This is a slight improvement, since _ can now be part of our alphabet: we don't need to reserve it as a token separator.
In our case we can represent in a supercycle:
n + n^2 + ... + n^(n-1) + (n-1) * n^(n-1)
\-----------------------/\--------------/
geometric special
In your case, the special part would be n^n (with the nuance that you theoretically have one character less, so replace n with n-1 everywhere).
The proposed supercycle is of length :
P = (n \sum_{k = 0}^{n-2} n^k) + (n-1) * n^(n-1)
P = (n \sum_{k = 0}^{n-3} n^k) + n^n
P = n(n^{n-2} - 1)/(n-1) + n^n
Here is an example diff with alphabet A={0,1,2}
my genStr(grandinero)
,00 0_0
,01 0_1
,02 0_2
,100 1_00
,101 1_01
,102 1_02
,110 1_10
,111 1_11
,112 1_12
,120 1_20
,121 1_21
,122 1_22
,2000 2_000
,2001 2_001
,2002 2_002
,2010 2_010
,2011 2_011
,2012 2_012
,2020 2_020
,2021 2_021
,2022 2_022
,2100 2_100
,2101 2_101
,2102 2_102
,2110 2_110
,2111 2_111
,2112 2_112
,2120 2_120
,2121 2_121
,2122 2_122
22,00 2_200 <-- end of my supercycle if no '_' allowed
22,01 2_201
22,02 2_202
22,100 2_210
22,101 2_211
22,102 2_212
22,110 2_220
22,111 2_221
22,112 2_222 <-- end of yours
22,120 z0_0
That said, for a given number x, we can count how many supercycles (E(x / P)) there are, each supercycle contributing two leading e (e being the last char of A).
e.g: A = {0,1,2} and x = 43
e = 2
P = n(n^{n-2} - 1)/(n-1) + n^n = 3(3^1 -1)/2 + 27 = 30
// our supercycle is of length 30
E(43/30) = 1 // 43 makes one supercycle and a few more "strings"
r = x % P = 13 // this is also x - (E(43/30) * 30) (the rest of the euclidean division by P)
Then for the left over (r = x % P) two cases to consider:
either we fall in the geometric sequence
either we fall in the (n-1) * n^(n-1) part.
1. Adressing the geometric sequence with cumulative sums (x < S_w)
Let S_i be the cumsum of n, n^2,..
S_i = n\sum_{k = 0}^{i-1} n^k
S_i = n/(n-1)*(n^i - 1)
which gives S_0 = 0, S_1 = n, S_2 = n + n^2...
So basically, if x < S_1, we get 0(x), elif x < S_2, we get 1(x-S_1)
Let S_w = S_{n-1} the count of all the numbers we can represent.
If x < S_w then we want the i such that
S_i <= x < S_{i+1} <=> n^i <= (n-1)/n * x + 1 < n^{i+1}
We can then apply some log flooring (base(n)) to get that i.
We can then associate the string: A[i] + base_n(x - S_i).
Illustration:
This time with A = {0,1,2,3}.
Let x be 17.
Our consecutive S_i are:
S_0 = 0
S_1 = 4
S_2 = S_1 + 4^2 = 20
S_3 = S_2 + 4^3 = 84
S_w = S_{4-1} = S_3 = 84
x=17 is indeed less than 84, we will be able to affect it to one of the S_i ranges.
In particular S_1==4 < x==17 <= S_2==20.
We remove the strings encoded by the leading 0(there are a number S_1 of those strings).
The position to encode with the leading 1 is
x - 4 = 13.
And we conclude the thirteen's string generated with a leading 1 is base_4(13) = '31' (idem string -> '131')
Should we have had x = 21, we would have removed the count of S_2 so 21-20 = 1, which in turn gives with a leading 2 the string '2001'.
2. Adressing x in the special part (x >= S_w)
Let's consider study case below:
with A = {0,1,2}
The special part is
2 |1| |2|^2
that is:
2 0 00
2 0 01
2 0 02
2 0 10
2 0 11
2 0 12
2 0 20
2 0 21
2 0 22
2 1 00
2 1 01
2 1 02
2 1 10
2 1 11
2 1 12
2 1 20
2 1 21
2 1 22
Each incremented number of the second column (here 0 to 1 (specified from |1|)) gives 3^2 combination.
This is similar to the geometric series except that here each range is constant. We want to find the range which means we know which string to prefix.
We can represent it as the matrix
20 (00,01,02,10,11,12,20,21,22)
21 (00,01,02,10,11,12,20,21,22)
The portion in parenthesis is our matrix.
Every item in a row is simply its position base_3 (left-padded with 0).
e.g: n=7 has base_3 value '21'. (7=2*3+1).
'21' does occur in position 7 in the row.
Assuming we get some x (relative to that special part).
E(x / 3^2) gives us the row number (here E(7/9) = 0 so prefix is '20')
x % 3^2 give us the position in the row (here base_3(7%9)='21' giving us the final string '2021')
If we want to observe it remember that we substracted S_w=12 before to get x = 7, so we would call myGen(7+12)
Some code
Notice the same output as long as we stand in the "geometric" range, without supercycle.
Obviously, when carry starts to appear, it depends on whether I can use '_' or not. If yes, my words get shorter otherwise longer.
// https://www.cs.sfu.ca/~ggbaker/zju/math/int-alg.html
// \w insensitive could give base64
// but also éè and other accents...
/**
 * Writes a non-negative BigInt in base `n` using the digit symbols in `A`.
 *
 * @param {bigint} x value to convert (expected to be >= 0n)
 * @param {bigint} n radix; normally BigInt(A.length)
 * @param {string[]} A digit symbols, A[0] being the zero digit
 * @returns {string} most-significant-digit-first representation; zero is
 *                   rendered as A[0] rather than as an empty string
 */
function base_n(x, n, A) {
  // Without this guard the loop below never runs for 0 and the result is '',
  // which silently drops the digit when the caller does not pad.
  if (x === 0n) return A[0]
  const digits = []
  while (x !== 0n) {
    digits.push(A[Number(x % n)])
    x = x / n // BigInt division truncates
  }
  return digits.reverse().join('')
}
/**
 * Builds a generator of lexically ordered strings over the alphabet `A`.
 *
 * Within one supercycle of length w:
 *  - "geometric" part (r < S_w): the string is A[i] followed by r - S[i]
 *    written in base n on i + 1 digits, where S[i] = n + n^2 + ... + n^i;
 *  - "special" part (r >= S_w): the string is the last symbol, then A[q2],
 *    then n - 1 base-n digits.
 * Every completed supercycle adds two copies of the last symbol in front.
 * A ',' is inserted between that prefix and the rest purely to make the
 * cycles visible; it is not required by the scheme.
 *
 * @param {string[]} A alphabet in ascending lexical order, at least 2 symbols
 * @returns {function(number|bigint|string): string} maps a position to its string
 * @throws {RangeError} if the alphabet has fewer than two symbols
 */
function mygen (A) {
  const n = A.length
  // The closed form for S divides by (n - 1); fail with a clear message
  // instead of an opaque "Division by zero" / negative-exponent RangeError.
  if (n < 2) throw new RangeError('mygen requires an alphabet of at least two symbols')
  const bn = BigInt(n)
  const zero = A[0]
  const A_last = A[n - 1]
  // S[i] = n * (n^i - 1) / (n - 1): how many strings precede head A[i].
  const S = Array.from({ length: n }, (_, i) => bn * (bn ** BigInt(i) - 1n) / (bn - 1n))
  const S_w = S[n - 1]
  // Size of one row of the special part, and of the whole supercycle.
  const w2 = bn ** (bn - 1n)
  const w = S_w + (bn - 1n) * w2
  // floor(log_n(x)) by repeated division; stays in BigInt so large x is safe.
  const flog_bn = x => {
    let L = 0
    while (x >= bn) {
      L++
      x /= bn
    }
    return L
  }
  return function (x) {
    x = BigInt(x)
    const r = x % w
    const q = (x - r) / w
    let s
    if (r < S_w) {
      // Largest i with S[i] <= r  <=>  n^i <= r * (n - 1) / n + 1.
      const i = flog_bn(r * (bn - 1n) / bn + 1n)
      // Pad with the alphabet's own zero digit: padding with a literal '0'
      // collides with real digits whenever A[0] is not '0'.
      s = A[i] + base_n(r - S[i], bn, A).padStart(i + 1, zero)
    } else {
      const n2 = r - S_w
      const r2 = n2 % w2
      const q2 = (n2 - r2) / w2
      s = A_last + A[Number(q2)] + base_n(r2, bn, A).padStart(n - 1, zero)
    }
    // comma below __not__ necessary, just to ease seeing cycles
    return A_last.repeat(2 * Number(q)) + ',' + s
  }
}
/**
 * Alphabet-parameterised version of the asker's head + "_" + tail sequence,
 * used for side-by-side comparison with mygen.
 *
 * '_' is removed from the alphabet because it is reserved as the separator.
 * The supercycle marker is the literal "z", as in the asker's original code,
 * regardless of the alphabet supplied.
 *
 * @param {string[]} A alphabet symbols in ascending order
 * @returns {function(number|bigint|string): string} maps a position to its string
 * @throws {RangeError} if no symbol remains after removing '_'
 */
function genStr (A) {
  A = A.filter(x => x !== '_')
  // With an empty alphabet every cycle has size 0 and the loop below would
  // never terminate.
  if (A.length === 0) throw new RangeError('genStr requires at least one symbol other than "_"')
  const bn_noUnderscore = BigInt(A.length)
  const zero = A[0]
  return function (x) {
    x = BigInt(x)
    let prefix = ''
    let cycle = 0n
    let max = bn_noUnderscore ** (cycle + 1n)
    // Consume whole cycles until x indexes into the current one.
    while (x >= max) {
      x -= max
      if (cycle === bn_noUnderscore - 1n) {
        prefix += "z"
        cycle = 0n
      } else {
        cycle++
      }
      max = bn_noUnderscore ** (cycle + 1n)
    }
    // Render cycle 0 explicitly so the head is never empty, whatever
    // base_n returns for zero.
    const head = cycle === 0n ? zero : base_n(cycle, bn_noUnderscore, A)
    const tail = base_n(x, bn_noUnderscore, A).padStart(Number(cycle) + 1, zero)
    return prefix + head + "_" + tail
  }
}
// Prints the strings two generators produce for the same position x,
// side by side on one line.
function test(a, b, x){
  const first = a(x)
  const second = b(x)
  console.log(first, second)
}
// Comparison run 1: 36-symbol alphabet without '_'.
{
  console.log('---my supercycle is shorter if underscore not used. Plenty of room for grandinero')
  const alphabet = [...'0123456789abcdefghijklmnopqrstuvwxyz'].sort((p, q) => p.localeCompare(q))
  const mine = mygen(alphabet)
  const grandinero = genStr(alphabet)
  const positions = [
    1e4,
    1e12,
    106471793335560744271846581685593263893929893610517909620n // cycle ended for me (w variable value)
  ]
  for (const position of positions) test(mine, grandinero, position)
}
// Comparison run 2: same alphabet plus '_' (genStr filters it out again).
{
  console.log('---\n my supercycle is greater if underscore is used in my alphabet (not grandinero since "forbidden')
  // underscore used
  const alphabet = [...'0123456789abcdefghijklmnopqrstuvwxyz_'].sort((p, q) => p.localeCompare(q))
  const mine = mygen(alphabet)
  const grandinero = genStr(alphabet)
  const positions = [
    1e12,
    106471793335560744271846581685593263893929893610517909620n, // cycle ended for me (w variable value)
    1e57 // still got some place in the supercycle
  ]
  for (const position of positions) test(mine, grandinero, position)
}
After considering the advice provided by @kaya3 and @grodzi and reviewing my original code, I have made some improvements. I realized a few things:
There was a bug in my original code. If one cycle ends at z_z (actually 36 z's after the underscore, but you get the idea) and the next one begins at z0_0, then lexical ordering is broken because _ comes after 0. The separator (or "neck") needs to be lower in lexical order than the lowest possible value of the head.
Though I was initially resistant to the idea of rolling a custom baseN generator so that more characters can be included, I have now come around to the idea.
I can squeeze more permutations out of a given string length by also incrementing the neck. For example, I can go from A00...A0z to A10...A1z, and so on, thus increasing the number of unique strings I can generate with A as the head before I move on to B.
With that in mind, I have revised my code:
// Digit alphabet of the standard baseN conversions; used below to decide
// whether the built-in toString(radix) can stand in for a custom converter.
let baseAlpha = "0123456789abcdefghijklmnopqrstuvwxyz";

/**
 * Factory for a head + neck + body string generator.
 *
 * For the k-th head character (k = 1, 2, ...) the body is k digits long and
 * each neck character is used for base^k consecutive strings, so one head
 * covers neckAlpha.length * base^k positions. One full pass over headAlpha
 * is a supercycle of `cycleLength` positions; every completed supercycle
 * puts one more copy of the last head character in front.
 *
 * @param {{headAlpha: string, neckAlpha: string, bodyAlpha: string,
 *          start?: number|bigint|string}} config
 * @returns {{cycleLength: bigint,
 *            getString: function(bigint): string,
 *            getNext: function(): string}}
 */
function sequenceGenerator (config) {
  const { headAlpha, neckAlpha, bodyAlpha } = config;

  // The body alphabet's size is the radix of the numbering system.
  const base = BigInt(bodyAlpha.length);

  // Built-in conversion when bodyAlpha is a prefix of the standard digits,
  // otherwise a hand-rolled one. The hand-rolled version yields "" for 0;
  // the padding in getString supplies the zero digits.
  const convertBody = baseAlpha.startsWith(bodyAlpha)
    ? (value) => value.toString(bodyAlpha.length)
    : (value) => {
        let digits = "";
        for (let rest = value; rest > 0n; rest = rest / base) {
          digits = bodyAlpha[Number(rest % base)] + digits;
        }
        return digits;
      };

  // Position handed out by the next getNext() call.
  let counter = BigInt(config.start || 0);

  // headCycles[k] = number of positions covered by the first k head
  // characters. Precomputed so that locating a head is a table scan and
  // never needs a logarithm on a BigInt.
  const headCycles = [0n];
  let cycleLength = 0n;
  for (let bodyLength = 1; bodyLength <= headAlpha.length; bodyLength++) {
    cycleLength += BigInt(neckAlpha.length) * base ** BigInt(bodyLength);
    headCycles.push(cycleLength);
  }

  // Index of the first cumulative total that exceeds `position`
  // (undefined when there is none).
  function getHeadCycle (position) {
    for (const [index, total] of headCycles.entries()) {
      if (total > position) return index;
    }
  }

  return {
    cycleLength: cycleLength,

    // Maps a BigInt position to its string.
    getString: function (n) {
      const supercyclesDone = Number(n / cycleLength);
      const prefix = headAlpha[headAlpha.length - 1].repeat(supercyclesDone);

      const withinSupercycle = n % cycleLength;
      const headCycle = getHeadCycle(withinSupercycle);
      const head = headAlpha[headCycle - 1];

      const withinHead = withinSupercycle - headCycles[headCycle - 1];
      const neckCycleLength = base ** BigInt(headCycle);
      const neck = neckAlpha[Number(withinHead / neckCycleLength)];

      const body = convertBody(withinHead % neckCycleLength)
        .padStart(headCycle, bodyAlpha[0]);

      return prefix + head + neck + body;
    },

    // Returns the string at the current position, then advances it.
    getNext: function () { return this.getString(counter++); }
  };
}
// Demo 1: long head alphabet and a single-character neck.
let bodyAlpha = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz";
let getStr = sequenceGenerator({
  // a larger headAlpha gives more permutations within a supercycle
  headAlpha: "123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz",
  // the highest value of neckAlpha must be lower than
  // the lowest value of headAlpha
  neckAlpha: "0",
  bodyAlpha
});

console.log("---supercycle length:");
console.log(Number(getStr.cycleLength));
console.log("---first two values:")
console.log(getStr.getNext());
console.log(getStr.getNext());
console.log("---arbitrary large value (1e57):");
console.log(getStr.getString(BigInt(1e57)));
console.log("");

// Demo 2: a shorter headAlpha and a longer neckAlpha shorten the maximum
// body length, at the price of fewer permutations per supercycle.
getStr = sequenceGenerator({
  headAlpha: "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz",
  neckAlpha: "0123456789",
  bodyAlpha
});

console.log("---supercycle length:");
console.log(Number(getStr.cycleLength));
console.log("---first two values:");
console.log(getStr.getNext());
console.log(getStr.getNext());
console.log("---arbitrary large value (1e57):");
console.log(getStr.getString(BigInt(1e57)));
EDIT
After further discussion with #grodzi, I have made some more improvements:
I realized that the "neck" or separator wasn't providing much value, so I have gotten rid of it. Later edit: actually, the separator is necessary. I am not sure why I thought it wasn't. Without the separator, the beginning of each new supercycle will lexically precede the end of the previous supercycle. I haven't changed my code below, but anyone using this code should include a separator. I have also realized that I was wrong to use an underscore as the separator. The separator must be a character, such as the hyphen, which lexically precedes the lowest digit used in the sequence (0).
I have taken #grodzi's suggestion to allow the length of the tail to continue growing indefinitely.
Here is the new code:
// Digit alphabet of the standard baseN conversions; used below to decide
// whether the built-in toString(radix) can stand in for a custom converter.
let baseAlpha = "0123456789abcdefghijklmnopqrstuvwxyz";

/**
 * Factory for a head + tail string generator whose tail keeps growing.
 *
 * Head character number h (0-based) of supercycle c covers
 * base ** (h + 1 + c * headAlpha.length) positions, so its tail is written
 * on exactly h + 1 + c * headAlpha.length digits. Every completed
 * supercycle puts one more copy of the last head character in front.
 *
 * No separator is emitted between prefix, head and tail.
 *
 * @param {{headAlpha: string, tailAlpha: string,
 *          start?: number|bigint|string}} config
 * @returns {{getString: function(bigint): string,
 *            getNext: function(): string}}
 */
function sequenceGenerator (config) {
  let headAlpha = config.headAlpha,
    tailAlpha = config.tailAlpha,
    base = BigInt(tailAlpha.length),
    // Built-in conversion when tailAlpha is a prefix of the standard digits,
    // otherwise a hand-rolled one.
    convertTail = baseAlpha.startsWith(tailAlpha)
      ? (n) => n.toString(tailAlpha.length)
      : function (n) {
          // Zero is the alphabet's own first symbol, not the literal "0".
          if (n === 0n) return tailAlpha[0];
          let s = "";
          while (n > 0n) {
            s = tailAlpha[Number(n % base)] + s;
            n = n / base;
          }
          return s;
        },
    // Position handed out by the next getNext() call.
    n = BigInt(config.start || 0);
  return {
    // Maps a BigInt position to its string.
    getString: function (n) {
      let cyclesDone = 0n,
        headCycle = 0n,
        initLength = 0n,
        accum = 0n;
      // Walk the cycles in order, accumulating their sizes, until the one
      // containing n is found; n then becomes the offset inside that cycle.
      for (;; headCycle++) {
        let _accum = accum + base ** (headCycle + 1n + initLength);
        if (_accum > n) {
          n -= accum;
          break;
        } else if (Number(headCycle) === headAlpha.length - 1) {
          // Last head character exhausted: start the next supercycle with
          // tails that are headAlpha.length digits longer.
          cyclesDone++;
          initLength += BigInt(headAlpha.length);
          headCycle = -1n; // the loop increment brings it back to 0
        }
        accum = _accum;
      }
      let headLast = headAlpha[headAlpha.length - 1],
        prefix = headLast.repeat(Number(cyclesDone)),
        head = headAlpha[Number(headCycle)],
        tail = convertTail(n),
        // The cycle holds base ** (headCycle + 1 + initLength) values, so
        // the tail needs that many digits. One digit fewer lets the tail
        // width vary inside a cycle and breaks lexical order.
        tailLength = Number(headCycle + initLength) + 1;
      tail = tail.padStart(tailLength, tailAlpha[0]);
      return prefix + head + tail;
    },
    // Returns the string at the current position, then advances it.
    getNext: function () { return this.getString(n++); }
  };
}
// Demo: the same 63-symbol alphabet serves as head and tail alphabet.
let alpha = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz";
let genStr = sequenceGenerator({headAlpha: alpha, tailAlpha: alpha});

// Positions in the first supercycle: 63 + 63^2 + ... + 63^63.
const firstSupercycleLength = 63n*(1n-(63n**63n))/(1n-63n);

console.log("--- first string:");
console.log(genStr.getString(0n));
console.log("--- 1e+57");
console.log(genStr.getString(BigInt(1e+57)));
console.log("--- end of first supercycle:");
console.log(genStr.getString(firstSupercycleLength-1n));
console.log("--- start of second supercycle:");
console.log(genStr.getString(firstSupercycleLength));

Remove consecutive duplicates in a string to make the smallest string

Given a string and the constraint of matching on >= 3 characters, how can you ensure that the result string will be as small as possible?
edit with gassa's explicitness:
E.G.
'AAAABBBAC'
If I remove the B's first,
AAAA[BBB]AC -- > AAAAAC, then I can remove all of the A's from the resultant string and be left with:
[AAAAA]C --> C
'C'
If I just remove what is available first (the sequence of A's), I get:
[AAAA]BBBAC -- > [BBB]AC --> AC
'AC'
A tree would definitely get you the shortest string(s).
The tree solution:
Define a State (node) for each current string Input and all its removable sub-strings' int[] Indexes.
Create the tree: For each int index create another State and add it to the parent state State[] Children.
A State with no possible removable sub-strings has no children Children = null.
Get all Descendants State[] of your root State. Order them by their shortest string Input. And that is/are your answer(s).
Test cases:
string result = FindShortest("AAAABBBAC"); // C
string result2 = FindShortest("AABBAAAC"); // AABBC
string result3 = FindShortest("BAABCCCBBA"); // B
The Code:
Note: Of-course everyone is welcome to enhance the following code in terms of performance and/or fixing any bug.
/// <summary>
/// Brute-force search for the shortest string reachable by repeatedly
/// deleting a run of three or more identical consecutive characters.
/// Every string reachable from the input is a node of a tree; the shortest
/// leaf is the answer.
/// </summary>
class Program
{
    static void Main(string[] args)
    {
        string result = FindShortest("AAAABBBAC"); // C
        string result2 = FindShortest("AABBAAAC"); // AABBC
        string result3 = FindShortest("BAABCCCBBA"); // B
    }

    /// <summary>
    /// Finds the FIRST shortest string for a given input (several strings of
    /// the same minimal length may exist).
    /// </summary>
    /// <param name="input">String to reduce.</param>
    /// <returns>
    /// A shortest fully reduced string, possibly empty. A null or empty
    /// input is returned unchanged.
    /// </returns>
    private static string FindShortest(string input)
    {
        // Nothing to reduce; also keeps RemovableIndexes away from input[0].
        if (string.IsNullOrEmpty(input))
            return input;

        // each input string and its possible removables are a state
        var state = new State { Input = input, Indexes = RemovableIndexes(input) };

        // create the tree
        GetChildren(state);

        // Leaves are the states where nothing more can be removed. OrderBy
        // is stable, so the first leaf of minimal length in traversal order
        // wins. At least one leaf always exists.
        return Descendants(state)
            .Where(d => d.Children == null || d.Children.Length == 0)
            .OrderBy(d => d.Input.Length)
            .First().Input;
    }

    /// <summary>Enumerates a state and all of its descendants, depth first.</summary>
    private static IEnumerable<State> Descendants(State root)
    {
        var states = new Stack<State>(new[] { root });
        while (states.Any())
        {
            State node = states.Pop();
            yield return node;
            if (node.Children != null)
                foreach (var n in node.Children) states.Push(n);
        }
    }

    /// <summary>
    /// Builds the subtree below <paramref name="state"/>: one child per
    /// removable run. A child may hold the empty string (the whole input was
    /// removed); such a child is a leaf.
    /// </summary>
    private static void GetChildren(State state)
    {
        // RemoveAllAt always deletes at least one character, so every child
        // is strictly shorter than its parent and the recursion terminates.
        state.Children = state.Indexes
            .Select(i => RemoveAllAt(state.Input, i))
            .Select(reduced => new State
            {
                Input = reduced,
                Indexes = RemovableIndexes(reduced)
            })
            .ToArray();

        foreach (var c in state.Children)
            GetChildren(c);
    }

    /// <summary>
    /// Returns the start index of every run of three or more identical
    /// consecutive characters in <paramref name="input"/>.
    /// </summary>
    private static int[] RemovableIndexes(string input)
    {
        var indexes = new List<int>();
        if (string.IsNullOrEmpty(input))
            return indexes.ToArray();

        char d = input[0];
        int count = 1;
        for (int i = 1; i < input.Length; i++)
        {
            if (d == input[i])
            {
                count++;
            }
            else
            {
                if (count >= 3)
                    indexes.Add(i - count);
                // reset
                d = input[i];
                count = 1;
            }
        }
        // the run that reaches the end of the string
        if (count >= 3)
            indexes.Add(input.Length - count);
        return indexes.ToArray();
    }

    /// <summary>
    /// Removes the run of identical characters that starts at
    /// <paramref name="startIndex"/> and returns the remaining text.
    /// </summary>
    private static string RemoveAllAt(string input, int startIndex)
    {
        int endIndex = startIndex + 1;
        while (endIndex < input.Length && input[endIndex] == input[startIndex])
            endIndex++;
        return input.Substring(0, startIndex) + input.Substring(endIndex);
    }

    /// <summary>
    /// Tree node: a string, the start indexes of its removable runs, and the
    /// states reached by removing each of those runs.
    /// </summary>
    public class State
    {
        public string Input;
        public int[] Indexes;
        public State[] Children;
    }
}
I propose O(n^2) solution with dynamic programming.
Let's introduce notation. Prefix and suffix of length l of string A denoted by P[l] and S[l]. And we call our procedure Rcd.
Rcd(A) = Rcd(Rcd(P[n-1])+S[1])
Rcd(A) = Rcd(P[1]+Rcd(S[n-1]))
Note that outer Rcd in the RHS is trivial. So, that's our optimal substructure. Based on this i came up with the following implementation:
#include <iostream>
#include <string>
#include <vector>
#include <cassert>
using namespace std;
// Strips the trailing run of identical characters from s when that run is
// at least three characters long. A string made of a single repeated
// character (length >= 3) becomes empty only when allowEmpty is set;
// otherwise it is returned unchanged.
string remdupright(string s, bool allowEmpty) {
    if (s.size() < 3) return s;

    const auto lastOther = s.find_last_not_of(s.back());
    if (lastOther == string::npos) {
        // every character equals the last one
        if (allowEmpty) s.clear();
        return s;
    }

    const auto runLength = s.size() - lastOther - 1;
    if (runLength >= 3) s.erase(lastOther + 1);
    return s;
}
// Strips the leading run of identical characters from s when that run is
// at least three characters long. A string made of a single repeated
// character (length >= 3) becomes empty only when allowEmpty is set;
// otherwise it is returned unchanged.
string remdupleft(string s, bool allowEmpty) {
    if (s.size() < 3) return s;

    const auto firstOther = s.find_first_not_of(s.front());
    if (firstOther == string::npos) {
        // every character equals the first one
        if (allowEmpty) s.clear();
        return s;
    }

    // firstOther is also the length of the leading run
    if (firstOther >= 3) s.erase(0, firstOther);
    return s;
}
// Applies remdupright and then remdupleft to s, passing allowEmpty to both.
string remdup(string s, bool allowEmpty) {
    const string rightTrimmed = remdupright(s, allowEmpty);
    return remdupleft(rightTrimmed, allowEmpty);
}
// Bottom-up table over all substrings of `in`.
// table[len - 1][pos] stores the candidate kept for the substring of length
// `len` starting at `pos`. It is built either by appending the next
// character to the entry for the prefix one shorter (opt1) or by prepending
// the first character to the entry for the suffix one shorter (opt2).
// Returns remdup(..., true) of the entry covering the whole input; an empty
// input yields an empty string.
string run(const string in) {
    // table[0] does not exist for an empty input; return before touching it.
    if (in.empty()) return in;

    vector<vector<string>> table(in.size());
    for (int i = 0; i < (int)table.size(); ++i) {
        table[i].resize(in.size() - i);
    }
    // length-1 substrings are the characters themselves
    for (int i = 0; i < (int)table[0].size(); ++i) {
        table[0][i] = in.substr(i,1);
    }
    for (int len = 2; len <= (int)table.size(); ++len) {
        for (int pos = 0; pos < (int)in.size() - len + 1; ++pos) {
            // Option 1: entry for in[pos, pos+len-1) plus the last character.
            string base(table[len - 2][pos]);
            const char suffix = in[pos + len - 1];
            // The new character does not extend the trailing run, so that
            // run is final and can be stripped now if it is long enough.
            if (base.size() && suffix != base.back()) {
                base = remdupright(base, false);
            }
            const string opt1 = base + suffix;

            // Option 2: first character plus the entry for in[pos+1, pos+len).
            base = table[len - 2][pos+1];
            const char prefix = in[pos];
            if (base.size() && prefix != base.front()) {
                base = remdupleft(base, false);
            }
            const string opt2 = prefix + base;

            // Compare the options after stripping both ends, but store the
            // unstripped winner so a later character can still extend a run.
            const string nodupopt1 = remdup(opt1, true);
            const string nodupopt2 = remdup(opt2, true);
            table[len - 1][pos] = nodupopt1.size() > nodupopt2.size() ? opt2 : opt1;
            // Author's sanity check: equally long reductions must be identical.
            assert(nodupopt1.size() != nodupopt2.size() || nodupopt1 == nodupopt2);
        }
    }
    string& res = table[in.size() - 1][0];
    return remdup(res, true);
}
// Prints the input, the result of run() and the expected value on one line.
// Nothing is compared; the check is done by eye.
void testRcd(string s, string expected) {
    const string actual = run(s);
    cout << s << " : " << actual << ", expected: " << expected << endl;
}
// Runs the sample inputs through testRcd, in the order listed.
int main()
{
    const char* const cases[][2] = {
        {"BAABCCCBBA", "B"},
        {"AABBAAAC", "AABBC"},
        {"AAAA", ""},
        {"AAAABBBAC", "C"},
    };
    for (const auto& sample : cases) {
        testRcd(sample[0], sample[1]);
    }
}
You can check default and run your tests here.
Clearly we are not concerned about any block of repeated characters longer than 2 characters. And there is only one way two blocks of the same character where at least one of the blocks is less than 3 in length can be combined - namely, if the sequence between them can be removed.
So (1) look at pairs of blocks of the same character where at least one is less than 3 in length, and (2) determine if the sequence between them can be removed.
We want to decide which pairs to join so as to minimize the total length of blocks less than 3 characters long. (Note that the number of pairs is bound by the size (and distribution) of the alphabet.)
Let f(b) represent the minimal total length of same-character blocks remaining up to the block b that are less than 3 characters in length. Then:
f(b):
p1 <- previous block of the same character
if b and p1 can combine:
if b.length + p1.length > 2:
f(b) = min(
// don't combine
(0 if b.length > 2 else b.length) +
f(block before b),
// combine
f(block before p1)
)
// b.length + p1.length < 3
else:
p2 <- block previous to p1 of the same character
if p1 and p2 can combine:
f(b) = min(
// don't combine
b.length + f(block before b),
// combine
f(block before p2)
)
else:
f(b) = b.length + f(block before b)
// b and p1 cannot combine
else:
f(b) = b.length + f(block before b)
for all p1 before b
The question is how can we efficiently determine if a block can be combined with the previous block of the same character (aside from the obvious recursion into the sub-block-list between the two blocks).
Python code:
import random
import time
def parse(length):
    """Return the leftover cost of a block of `length` identical characters.

    A block shorter than 3 cannot be removed on its own and costs its
    length; a block of 3 or more is removable and costs 0.
    """
    return length if length < 3 else 0
def f(string):
    """Minimal total length left after removing runs of 3+ equal characters.

    The string is first compressed into blocks ``[char, length, index]`` of
    consecutive equal characters. ``g(l, r)`` then computes, for the block
    range l..r, the smallest achievable sum of the lengths of blocks that
    stay shorter than 3, allowing a block to merge with an earlier block of
    the same character when everything between them can be removed.

    Returns an int when the string has several blocks. For a single-block
    string the original debugging shape is kept: a tuple
    ``([cost], blocks)``.

    Raises IndexError on an empty string (``string[0]``).
    """
    chars = {}
    blocks = [[string[0], 1, 0]]
    # chars[c]['indexes'] lists the block indexes holding character c;
    # chars[c][k]['prev'] is the previous block index with the same
    # character, or -1 when there is none.
    chars[string[0]] = {'indexes': [0]}
    chars[string[0]][0] = {'prev': -1}
    p = 0  # index of the block currently being extended

    for i in xrange(1, len(string)):
        if blocks[-1][0] == string[i]:
            blocks[-1][1] += 1
        else:
            p += 1
            # block layout: [char, length, index]
            blocks.append([string[i], 1, p])
            if string[i] in chars:
                chars[string[i]][p] = {'prev': chars[string[i]]['indexes'][-1]}
                chars[string[i]]['indexes'].append(p)
            else:
                chars[string[i]] = {'indexes': [p]}
                chars[string[i]][p] = {'prev': -1}

    memo = [[None for j in xrange(len(blocks))] for i in xrange(len(blocks))]

    def g(l, r, top_level=False):
        if l == r:
            return parse(blocks[l][1])

        # Compare against None: 0 is a legitimate (and the most frequently
        # requested) cached answer, and a truthiness test would recompute it
        # on every call.
        if memo[l][r] is not None:
            return memo[l][r]

        # result[k] = best cost for blocks l .. l + k; start from "no merges".
        result = [parse(blocks[l][1])] + [None for k in xrange(r - l)]
        if l < r:
            for i in xrange(l + 1, r + 1):
                result[i - l] = parse(blocks[i][1]) + result[i - l - 1]

        for i in xrange(l, r + 1):
            [char, length, index] = blocks[i]

            # p1 <- previous block of the same character
            p1_idx = chars[char][index]['prev']

            # No same-character block inside the range: cannot combine.
            if p1_idx < l and index > l:
                result[index - l] = parse(length) + result[index - l - 1]

            # Try every earlier same-character block inside the range.
            while p1_idx >= l:
                p1 = blocks[p1_idx]

                between = g(p1[2] + 1, index - 1)

                # if b and p1 can combine:
                if between == 0:
                    if length + p1[1] > 2:
                        result[index - l] = min(
                            result[index - l],
                            # don't combine
                            parse(length) + (result[index - l - 1] if index - l > 0 else 0),
                            # combine: f(block before p1)
                            result[p1[2] - l - 1] if p1[2] > l else 0
                        )
                    # b.length + p1.length < 3
                    else:
                        # p2 <- block previous to p1 of the same character
                        p2_idx = chars[char][p1[2]]['prev']

                        if p2_idx < l:
                            p1_idx = chars[char][p1_idx]['prev']
                            continue

                        between2 = g(p2_idx + 1, p1[2] - 1)

                        # if p1 and p2 can combine:
                        if between2 == 0:
                            result[index - l] = min(
                                result[index - l],
                                # don't combine
                                parse(length) + (result[index - l - 1] if index - l > 0 else 0),
                                # combine the block, p1 and p2
                                result[p2_idx - l - 1] if p2_idx - l > 0 else 0
                            )
                        else:
                            # f(b) = b.length + f(block before b)
                            result[index - l] = min(
                                result[index - l],
                                parse(length) + (result[index - l - 1] if index - l > 0 else 0)
                            )
                # b and p1 cannot combine
                else:
                    # f(b) = b.length + f(block before b)
                    result[index - l] = min(
                        result[index - l],
                        parse(length) + (result[index - l - 1] if index - l > 0 else 0)
                    )

                p1_idx = chars[char][p1_idx]['prev']

        memo[l][r] = result[r - l]
        return result[r - l]

    if len(blocks) == 1:
        return ([parse(blocks[0][1])], blocks)
    else:
        return g(0, len(blocks) - 1, True)
# Disabled generator for a random 300-character test string over A/B/C.
# It is kept as a bare string literal, so it is never executed.
"""s = ""
for i in xrange(300):
s = s + ['A','B','C'][random.randint(0,2)]"""
# Small samples (Python 2 print statements). The trailing comment on each
# line is the expected shortest string; f prints a length (or, for a
# single-block input such as "aaaa", a tuple).
print f("abcccbcccbacccab") # b
print
print f("AAAABBBAC"); # C
print
print f("CAAAABBBA"); # C
print
print f("AABBAAAC"); # AABBC
print
print f("BAABCCCBBA"); # B
print
print f("aaaa")
print
The string answers for these longer examples were computed using jdehesa's answer:
# Timed runs on longer random inputs (Python 2 print statements). The
# comment under each call is the reduced string reported for that input;
# after each run the elapsed wall-clock time in seconds is printed.
t0 = time.time()
print f("BCBCCBCCBCABBACCBABAABBBABBBACCBBBAABBACBCCCACABBCAABACBBBBCCCBBAACBAABACCBBCBBAABCCCCCAABBBBACBBAAACACCBCCBBBCCCCCCCACBABACCABBCBBBBBCBABABBACCAACBCBBAACBBBBBCCBABACBBABABAAABCCBBBAACBCACBAABAAAABABB")
# BCBCCBCCBCABBACCBABCCAABBACBACABBCAABACAACBAABACCBBCBBCACCBACBABACCABBCCBABABBACCAACBCBBAABABACBBABABBCCAACBCACBAABBABB
t1 = time.time()
total = t1-t0
print total
# Second timed run: 300 characters over A/B/C.
t0 = time.time()
print f("CBBACAAAAABBBBCAABBCBAABBBCBCBCACACBAABCBACBBABCABACCCCBACBCBBCBACBBACCCBAAAACACCABAACCACCBCBCABAACAABACBABACBCBAACACCBCBCCCABACABBCABBAAAAABBBBAABAABBCACACABBCBCBCACCCBABCAACBCAAAABCBCABACBABCABCBBBBABCBACABABABCCCBBCCBBCCBAAABCABBAAABBCAAABCCBAABAABCAACCCABBCAABCBCBCBBAACCBBBACBBBCABAABCABABABABCA")
# CBBACCAABBCBAACBCBCACACBAABCBACBBABCABABACBCBBCBACBBABCACCABAACCACCBCBCABAACAABACBABACBCBAACACCBCBABACABBCBBCACACABBCBCBCABABCAACBCBCBCABACBABCABCABCBACABABACCBBCCBBCACBCCBAABAABCBBCAABCBCBCBBAACCACCABAABCABABABABCA
t1 = time.time()
total = t1-t0
print total
# Third timed run: 300 characters over A..E.
t0 = time.time()
print f("AADBDBEBBBBCABCEBCDBBBBABABDCCBCEBABADDCABEEECCECCCADDACCEEAAACCABBECBAEDCEEBDDDBAAAECCBBCEECBAEBEEEECBEEBDACDDABEEABEEEECBABEDDABCDECDAABDAEADEECECEBCBDDAEEECCEEACCBBEACDDDDBDBCCAAECBEDAAAADBEADBAAECBDEACDEABABEBCABDCEEAABABABECDECADCEDAEEEBBBCEDECBCABDEDEBBBABABEEBDAEADBEDABCAEABCCBCCEDCBBEBCECCCA")
# AADBDBECABCEBCDABABDCCBCEBABADDCABCCEADDACCEECCABBECBAEDCEEBBECCBBCEECBAEBCBEEBDACDDABEEABCBABEDDABCDECDAABDAEADEECECEBCBDDACCEEACCBBEACBDBCCAAECBEDDBEADBAAECBDEACDEABABEBCABDCEEAABABABECDECADCEDACEDECBCABDEDEABABEEBDAEADBEDABCAEABCCBCCEDCBBEBCEA
t1 = time.time()
total = t1-t0
print total
Another scala answer, using memoization and tailcall optimization (partly) (updated).
import scala.collection.mutable.HashSet
import scala.annotation._
object StringCondense extends App {
/** Splits `s` into its maximal runs of identical characters, left to right.
  *
  * Every run is capped at three characters, because a run of more than three
  * equal characters behaves exactly like a run of three for elimination.
  *
  * @param s     remaining input to scan
  * @param sofar accumulator holding the runs found so far, most recent first;
  *              external callers pass `Nil`
  * @return the runs in the order they occur in the input
  */
@tailrec
def groupConsecutive (s: String, sofar: List[String]): List[String] = s.toList match {
  // Runs are prepended while scanning, so the accumulator is right-to-left;
  // reverse it once so callers can rebuild the string with mkString.
  case Nil => sofar.reverse
  case c :: str => {
    val (prefix, rest) = (c :: str).span (_ == c)
    // Strings of equal characters, longer than 3, don't make a difference to just 3
    groupConsecutive (rest.mkString(""), (prefix.take (3)).mkString ("") :: sofar)
  }
}
// to count the effect of memoization
var count = 0
// recursively try to eliminate every group of 3 or more, brute forcing
// but for "aabbaabbaaabbbaabb", many reductions will lead sooner or
// later to the same result, so we try to detect these and avoid duplicate
// work
/** Brute-force search for the shortest string obtainable from `s` by
  * repeatedly deleting a run of three or more equal characters.
  *
  * `seenbefore` memoizes inputs that were already expanded: a string met a
  * second time is returned unchanged instead of being explored again.
  * Every expansion of a new string increments the shared `count`.
  */
def moreThan2consecutive (s: String, seenbefore: HashSet [String]): String =
  if (seenbefore (s)) s
  else {
    count += 1
    seenbefore.add (s)
    val groups = groupConsecutive (s, Nil)
    val removable = groups.count (_.length > 2)
    if (removable == 0) s
    else if (removable == 1)
      // Only one choice: drop the single long run and continue.
      moreThan2consecutive (groups.filter (_.length < 3).mkString, seenbefore)
    else {
      // Several choices: try removing each long run in turn, in index order.
      val candidates = groups.indices
        .filter (i => groups (i).length >= 3)
        .map { i =>
          val (before, fromI) = groups.splitAt (i)
          moreThan2consecutive (before.mkString + fromI.tail.mkString, seenbefore)
        }
      // Keep the shortest candidate; the earliest one wins ties.
      candidates.reduceLeft ((best, other) => if (best.length <= other.length) best else other)
    }
  }
// don't know what Rcd means, adopted from other solution but modified
// kind of a unit test **update**: forgot to reset count
/** Minimal test helper: reduces `s`, prints one report line and returns
  * whether the result equals `expected`.
  *
  * The shared `count` is reset first so the printed number reflects only
  * this run, and a fresh memo set is used for every call.
  */
def testRcd (s: String, expected: String) : Boolean = {
  count = 0
  val seenbefore = HashSet [String] ()
  val result = moreThan2consecutive (s, seenbefore)
  val hit = result.equals (expected)
  println (s"Input: $s\t result: ${result}\t expected ${expected}\t $hit\t count: $count");
  hit
}
// some test values from other users with expected result
// **upd:** more testcases
/** Runs the fixed regression inputs (collected from other answers) through
  * testRcd, in order; each call prints its own report line.
  */
def testgroup () : Unit = {
  val cases = List (
    "baabcccbba" -> "b",
    "aabbaaac" -> "aabbc",
    "aaaa" -> "",
    "aaaabbbac" -> "c",
    "abcccbcccbacccab" -> "b",
    "AAAABBBAC" -> "C",
    "CAAAABBBA" -> "C",
    "AABBAAAC" -> "AABBC",
    "BAABCCCBBA" -> "B",
    "AAABBBAAABBBAAABBBC" -> "C",      // 377 subcalls reported by Yola,
    "AAABBBAAABBBAAABBBAAABBBC" -> "C" // 4913 when preceded with AAABBB
  )
  cases.foreach { case (input, expected) => testRcd (input, expected) }
}
testgroup
// Runs testRcd on a long three-letter input together with its expected
// reduction.  Two further cases (another three-letter input and a
// five-letter A-E input) are kept but commented out.
def testBigs () : Unit = {
/*
testRcd ("BCBCCBCCBCABBACCBABAABBBABBBACCBBBAABBACBCCCACABBCAABACBBBBCCCBBAACBAABACCBBCBBAABCCCCCAABBBBACBBAAACACCBCCBBBCCCCCCCACBABACCABBCBBBBBCBABABBACCAACBCBBAACBBBBBCCBABACBBABABAAABCCBBBAACBCACBAABAAAABABB",
"BCBCCBCCBCABBACCBABCCAABBACBACABBCAABACAACBAABACCBBCBBCACCBACBABACCABBCCBABABBACCAACBCBBAABABACBBABABBCCAACBCACBAABBABB")
*/
// The only case that is currently active.
testRcd ("CBBACAAAAABBBBCAABBCBAABBBCBCBCACACBAABCBACBBABCABACCCCBACBCBBCBACBBACCCBAAAACACCABAACCACCBCBCABAACAABACBABACBCBAACACCBCBCCCABACABBCABBAAAAABBBBAABAABBCACACABBCBCBCACCCBABCAACBCAAAABCBCABACBABCABCBBBBABCBACABABABCCCBBCCBBCCBAAABCABBAAABBCAAABCCBAABAABCAACCCABBCAABCBCBCBBAACCBBBACBBBCABAABCABABABABCA",
"CBBACCAABBCBAACBCBCACACBAABCBACBBABCABABACBCBBCBACBBABCACCABAACCACCBCBCABAACAABACBABACBCBAACACCBCBABACABBCBBCACACABBCBCBCABABCAACBCBCBCABACBABCABCABCBACABABACCBBCCBBCACBCCBAABAABCBBCAABCBCBCBBAACCACCABAABCABABABABCA")
/*testRcd ("AADBDBEBBBBCABCEBCDBBBBABABDCCBCEBABADDCABEEECCECCCADDACCEEAAACCABBECBAEDCEEBDDDBAAAECCBBCEECBAEBEEEECBEEBDACDDABEEABEEEECBABEDDABCDECDAABDAEADEECECEBCBDDAEEECCEEACCBBEACDDDDBDBCCAAECBEDAAAADBEADBAAECBDEACDEABABEBCABDCEEAABABABECDECADCEDAEEEBBBCEDECBCABDEDEBBBABABEEBDAEADBEDABCAEABCCBCCEDCBBEBCECCCA",
"AADBDBECABCEBCDABABDCCBCEBABADDCABCCEADDACCEECCABBECBAEDCEEBBECCBBCEECBAEBCBEEBDACDDABEEABCBABEDDABCDECDAABDAEADEECECEBCBDDACCEEACCBBEACBDBCCAAECBEDDBEADBAAECBDEACDEABABEBCABDCEEAABABABECDECADCEDACEDECBCABDEDEABABEEBDAEADBEDABCAEABCCBCCEDCBBEBCEA")
*/
}
// for generated input, but with fixed seed, to compare the count with
// and without memoization
import util.Random
val r = new Random (31415)
// generate Strings but with high chances to produce some triples and
// longer sequences of char clones
/** Builds a test string from 20 random pieces drawn from the shared
  * generator `r`.  Each piece is one or two copies of 't', 'r' or '-', so
  * adjacent pieces frequently merge into runs of three or more.
  */
def genRandomString () : String = {
  // Index = value of r.nextInt (6); same mapping as a six-way match.
  val pieces = Vector ("t", "r", "-", "tt", "rr", "--")
  val chosen = for (_ <- 1 to 20) yield pieces (r.nextInt (6))
  chosen.mkString ("")
}
/** Reduces ten freshly generated random strings.  No real expected value
  * exists for them, so the reported hit flag is meaningless in this mode.
  */
def testRandom () : Unit = {
  for (_ <- 1 to 10)
    testRcd (genRandomString (), "random mode - false might be true")
}
testRandom
testgroup
testRandom
// testBigs
}
Comparing the effect of memoization led to interesting results:
Updated measurements. In the old values, I forgot to reset the counter, which led to much higher results. Now the spread of the results
is much more impressive and, in total, the values are smaller.
No seenbefore:
Input: baabcccbba result: b expected b true count: 4
Input: aabbaaac result: aabbc expected aabbc true count: 2
Input: aaaa result: expected true count: 2
Input: aaaabbbac result: c expected c true count: 5
Input: abcccbcccbacccab result: b expected b true count: 34
Input: AAAABBBAC result: C expected C true count: 5
Input: CAAAABBBA result: C expected C true count: 5
Input: AABBAAAC result: AABBC expected AABBC true count: 2
Input: BAABCCCBBA result: B expected B true count: 4
Input: AAABBBAAABBBAAABBBC res: C expected C true count: 377
Input: AAABBBAAABBBAAABBBAAABBBC r: C expected C true count: 4913
Input: r--t----ttrrrrrr--tttrtttt--rr----result: rr--rr expected ? unknown ? false count: 1959
Input: ttrtt----tr---rrrtttttttrtr--rr result: r--rr expected ? unknown ? false count: 213
Input: tt----r-----ttrr----ttrr-rr--rr-- result: ttrttrrttrr-rr--rr-- ex ? unknown ? false count: 16
Input: --rr---rrrrrrr-r--rr-r--tt--rrrrr result: rr-r--tt-- expected ? unknown ? false count: 32
Input: tt-rrrrr--r--tt--rrtrrr------- result: ttr--tt--rrt expected ? unknown ? false count: 35
Input: --t-ttt-ttt--rrrrrt-rrtrttrr result: --tt-rrtrttrr expected ? unknown ? false count: 35
Input: rrt--rrrr----trrr-rttttrrtttrr result: rrtt- expected ? unknown ? false count: 1310
Input: ---tttrrrrrttrrttrr---tt-----tt result: rrttrr expected ? unknown ? false count: 1011
Input: -rrtt--rrtt---t-r--r---rttr-- result: -rrtt--rr-r--rrttr-- ex ? unknown ? false count: 9
Input: rtttt--rrrrrrrt-rrttt--tt--t result: r--t-rr--tt--t expectd ? unknown ? false count: 16
real 0m0.607s (without testBigs)
user 0m1.276s
sys 0m0.056s
With seenbefore:
Input: baabcccbba result: b expected b true count: 4
Input: aabbaaac result: aabbc expected aabbc true count: 2
Input: aaaa result: expected true count: 2
Input: aaaabbbac result: c expected c true count: 5
Input: abcccbcccbacccab result: b expected b true count: 11
Input: AAAABBBAC result: C expected C true count: 5
Input: CAAAABBBA result: C expected C true count: 5
Input: AABBAAAC result: AABBC expected AABBC true count: 2
Input: BAABCCCBBA result: B expected B true count: 4
Input: AAABBBAAABBBAAABBBC rest: C expected C true count: 28
Input: AAABBBAAABBBAAABBBAAABBBC C expected C true count: 52
Input: r--t----ttrrrrrr--tttrtttt--rr----result: rr--rr expected ? unknown ? false count: 63
Input: ttrtt----tr---rrrtttttttrtr--rr result: r--rr expected ? unknown ? false count: 48
Input: tt----r-----ttrr----ttrr-rr--rr-- result: ttrttrrttrr-rr--rr-- xpe? unknown ? false count: 8
Input: --rr---rrrrrrr-r--rr-r--tt--rrrrr result: rr-r--tt-- expected ? unknown ? false count: 19
Input: tt-rrrrr--r--tt--rrtrrr------- result: ttr--tt--rrt expected ? unknown ? false count: 12
Input: --t-ttt-ttt--rrrrrt-rrtrttrr result: --tt-rrtrttrr expected ? unknown ? false count: 16
Input: rrt--rrrr----trrr-rttttrrtttrr result: rrtt- expected ? unknown ? false count: 133
Input: ---tttrrrrrttrrttrr---tt-----tt result: rrttrr expected ? unknown ? false count: 89
Input: -rrtt--rrtt---t-r--r---rttr-- result: -rrtt--rr-r--rrttr-- ex ? unknown ? false count: 6
Input: rtttt--rrrrrrrt-rrttt--tt--t result: r--t-rr--tt--t expected ? unknown ? false count: 8
real 0m0.474s (without testBigs)
user 0m0.852s
sys 0m0.060s
With tailcall:
real 0m0.478s (without testBigs)
user 0m0.860s
sys 0m0.060s
For some random strings, the difference is more than tenfold.
For long Strings with many groups one could, as an improvement, eliminate all groups which are the only group of that character, for instance:
aa bbb aa ccc xx ddd aa eee aa fff xx
The groups bbb, ccc, ddd, eee and fff are unique in the string, so they can't combine with anything else and could all be eliminated, and the order of removal will not matter. This would lead to the intermediate result
aaaa xx aaaa xx
and a fast solution. Maybe I will try to implement it too. However, I guess it will be possible to produce random strings where this has a big impact and, with a different way of generating random strings, distributions where the impact is low.
Here is a Python solution (function reduce_min), not particularly smart but I think fairly easy to understand (excessive amount of comments added for answer clarity):
def reductions(s, min_len):
    """
    Yields every possible reduction of s by eliminating one contiguous
    block of min_len or more repeated characters.

    Reductions are produced left to right, one per qualifying block.
    For example, reductions('AAABBCCCCBAAC', 3) yields
    'BBCCCCBAAC' and 'AAABBBAAC'.

    min_len is expected to be at least 1.
    """
    length = len(s)
    start = 0
    while start < length:
        # Extend the block of characters equal to s[start]
        end = start + 1
        while end < length and s[end] == s[start]:
            end += 1
        if end - start >= min_len:
            # Block is long enough: yield the string without it
            yield s[:start] + s[end:]
        start = end


def reduce_min(s, min_len):
    """
    Finds the smallest possible reduction of s by successive
    elimination of contiguous blocks of min_len or more repeated
    characters.

    Returns s itself when it cannot be reduced.  When several
    irreducible strings share the minimal length, one of them is
    returned.

    Raises ValueError if min_len is smaller than 1 (a block length of
    zero would never make progress).
    """
    if min_len < 1:
        raise ValueError("min_len must be at least 1")
    # Strings still to be expanded (used as a stack, so the search order
    # is deterministic)
    pending = [s]
    # Every string ever queued; different removal orders often lead to
    # the same intermediate string, which must be expanded only once
    seen = set([s])
    # Current best solution
    result = s
    while pending:
        r = pending.pop()
        irreducible = True
        for reduced in reductions(r, min_len):
            irreducible = False
            if reduced not in seen:
                seen.add(reduced)
                pending.append(reduced)
        if irreducible and len(r) < len(result):
            # Replace if shorter than current best
            result = r
            if not result:
                # Nothing can beat the empty string
                break
    return result
# Sanity checks for reduce_min with a minimum block length of 3.
assert reduce_min("BAABCCCBBA", 3) == "B"
assert reduce_min("AABBAAAC", 3) == "AABBC"
assert reduce_min("AAAA", 3) == ""
assert reduce_min("AAAABBBAC", 3) == "C"
EDIT: Since people seem to be posting C++ solutions, here is mine in C++ (again, function reduce_min):
#include <string>
#include <vector>
#include <unordered_set>
#include <iterator>
#include <utility>
#include <cassert>
using namespace std;
// Appends to rs every string obtained from s by deleting exactly one maximal
// run of min_len or more identical characters, scanning left to right.
// rs is only appended to; existing elements are left untouched.
void reductions(const std::string &s, unsigned int min_len, std::vector<std::string> &rs)
{
    const std::string::size_type len = s.size();
    std::string::size_type start = 0;
    while (start < len)
    {
        // Find the end of the run of characters equal to s[start].
        std::string::size_type end = start + 1;
        while (end < len && s[end] == s[start])
        {
            ++end;
        }
        if (end - start >= min_len)
        {
            rs.push_back(s.substr(0, start) + s.substr(end));
        }
        start = end;
    }
}

// Returns the shortest string reachable from s by repeatedly deleting runs of
// min_len or more identical characters (s itself if nothing can be deleted).
// When several irreducible strings share the minimal length, one of them is
// returned.
std::string reduce_min(const std::string &s, unsigned int min_len)
{
    // Every string ever queued: different deletion orders frequently reach
    // the same intermediate string, which must be expanded only once.
    std::unordered_set<std::string> seen { s };
    // Strings still waiting to be expanded (used as a stack).
    std::vector<std::string> pending { s };
    std::string result = s;
    std::vector<std::string> found;
    // An empty result cannot be improved, so stop as soon as one is found.
    while (!pending.empty() && !result.empty())
    {
        std::string r = std::move(pending.back());
        pending.pop_back();
        found.clear();
        reductions(r, min_len, found);
        if (found.empty())
        {
            // r is irreducible: keep it if it beats the current best.
            if (r.size() < result.size())
            {
                result = std::move(r);
            }
            continue;
        }
        for (auto &candidate : found)
        {
            if (seen.insert(candidate).second)
            {
                pending.push_back(std::move(candidate));
            }
        }
    }
    return result;
}
// Self-check: runs reduce_min on the known examples with a minimum run
// length of 3 and asserts each expected result.
int main(int argc, char **argv)
{
    (void)argc;
    (void)argv;
    const std::pair<const char *, const char *> cases[] = {
        {"BAABCCCBBA", "B"},
        {"AABBAAAC", "AABBC"},
        {"AAAA", ""},
        {"AAAABBBAC", "C"},
    };
    for (const auto &c : cases)
    {
        assert(reduce_min(c.first, 3) == c.second);
    }
    return 0;
}
If you can use C++17 you can save memory by using string views.
EDIT 2: About the complexity of the algorithm. It is not straightforward to figure out, and as I said the algorithm is meant to be simple more than anything, but let's see. In the end, it is more or less the same as a breadth-first search. Let's say the string length is n, and, for generality, let's say the minimum block length (value 3 in the question) is m. In the first level, we can generate up to n / m reductions in the worst case. For each of these, we can generate up to (n - m) / m reductions, and so on. So basically, at "level" i (loop iteration i) we create up to (n - i * m) / m reductions per string we had, and each of these will take O(n - i * m) time to process. The maximum number of levels we can have is, again, n / m. So the complexity of the algorithm (if I'm not making mistakes) should have the form:
O( sum {i = 0 .. n / m} ( O(n - i * m) * prod {j = 0 .. i} ((n - j * m) / m) ))
|-Outer iters--| |---Cost---| |-Prev lvl-| |---Branching---|
Whew. So this should be something like:
O( sum {i = 0 .. n / m} (n - i * m) * O(n^i / m^i) )
Which in turn would collapse to:
O((n / m)^(n / m))
So yeah, the algorithm is more or less simple, but it can run into exponential cost cases (the bad cases would be strings made entirely of exactly m-long blocks, like AAABBBCCCAAACCC... for m = 3).

Remove redundant parentheses from an arithmetic expression

This is an interview question, for which I did not find any satisfactory answers on stackoverflow or outside. Problem statement:
Given an arithmetic expression, remove redundant parentheses. E.g.
((a*b)+c) should become a*b+c
I can think of an obvious way of converting the infix expression to post fix and converting it back to infix - but is there a better way to do this?
A pair of parentheses is necessary if and only if they enclose an unparenthesized expression of the form X % X % ... % X where X are either parenthesized expressions or atoms, and % are binary operators, and if at least one of the operators % has lower precedence than an operator attached directly to the parenthesized expression on either side of it; or if it is the whole expression. So e.g. in
q * (a * b * c * d) + c
the surrounding operators are {+, *} and the lowest precedence operator inside the parentheses is *, so the parentheses are unnecessary. On the other hand, in
q * (a * b + c * d) + c
there is a lower precedence operator + inside the parentheses than the surrounding operator *, so they are necessary. However, in
z * q + (a * b + c * d) + c
the parentheses are not necessary because the outer * is not attached to the parenthesized expression.
Why this is true is that if all the operators inside an expression (X % X % ... % X) have higher priority than a surrounding operator, then the inner operators are anyway calculated out first even if the parentheses are removed.
So, you can check any pair of matching parentheses directly for redundancy by this algorithm:
Let L be operator immediately left of the left parenthesis, or nil
Let R be operator immediately right of the right parenthesis, or nil
If L is nil and R is nil:
Redundant
Else:
Scan the unparenthesized operators between the parentheses
Let X be the lowest priority operator
If X has lower priority than L or R:
Not redundant
Else:
Redundant
You can iterate this, removing redundant pairs until all remaining pairs are non-redundant.
Example:
((a * b) + c * (e + f))
(Processing pairs from left to right):
((a * b) + c * (e + f)) L = nil R = nil --> Redundant
^ ^
(a * b) + c * (e + f) L = nil R = nil --> Redundant
^ ^ L = nil R = + X = * --> Redundant
a * b + c * (e + f) L = * R = nil X = + --> Not redundant
^ ^
Final result:
a * b + c * (e + f)
I just figured out an answer:
the premises are:
1. the expression has been tokenized
2. no syntax error
3. there are only binary operators
input:
list of the tokens, for example:
(, (, a, *, b, ), +, c, )
output:
set of the redundant parentheses pairs (the orders of the pairs are not important),
for example,
0, 8
1, 5
Please be aware that the set is not unique: for instance, in ((a+b))*c we can remove either the outer or the inner pair of parentheses, but the final expression is unique.
the data structure:
a stack, each item records information in each parenthese pair
the struct is:
left_pa: records the position of the left parenthese
min_op: records the operator in the parentheses with minimum priority
left_op: records current operator
the algorithm
1.push one empty item in the stack
2.scan the token list
2.1 if the token is operand, ignore
2.2 if the token is operator, records the operator in the left_op,
if min_op is nil, set the min_op = this operator, if the min_op
is not nil, compare the min_op with this operator, set min_op as
one of the two operators with less priority
2.3 if the token is left parenthese, push one item in the stack,
with left_pa = position of the parenthese
2.4 if the token is right parenthese,
2.4.1 we have the pair of the parentheses(left_pa and the
right parenthese)
2.4.2 pop the item
2.4.3 pre-read next token, if it is an operator, set it
as right operator
2.4.4 compare min_op of the item with left_op and right operator
(if any of them exists), we can easily get to know if the pair
of the parentheses is redundant, and output it(if the min_op
< any of left_op and right operator, the parentheses are necessary,
if min_op = left_op, the parentheses are necessary, otherwise
redundant)
2.4.5 if there is no left_op and no right operator(which also means
min_op = nil) and the stack is not empty, set the min_op of top
item as the min_op of the popped-up item
examples
example one
((a*b)+c)
after scanning to b, we have stack:
index left_pa min_op left_op
0
1 0
2 1 * * <-stack top
now we meet the first ')'(at pos 5), we pop the item
left_pa = 1
min_op = *
left_op = *
and pre-read operator '+', since min_op priority '*' > '+', so the pair(1,5) is redundant, so output it.
then scan till we meet last ')', at the moment, we have stack
index left_pa min_op left_op
0
1 0 + +
we pop this item(since we meet ')' at pos 8), and pre-read next operator, since there is no operator and at index 0, there is no left_op, so output the pair(0, 8)
example two
a*(b+c)
when we meet the ')', the stack is like:
index left_pa min_op left_op
0 * *
1 2 + +
now, we pop the item at index = 1, compare the min_op '+' with the left_op '*' at index 0, we can find out the '(',')' are necessary
This solution works if the expression is valid. We need a mapping from operators to priority values.
a. Traverse from two ends of the array to figure out matching parenthesis from both ends.
Let the indexes be i and j respectively.
b. Now traverse from i to j and find out the lowest precedence operator which is not contained inside any parentheses.
c. Compare the priority of this operator with the operators to left of open parenthesis and right of closing parenthesis. If no such operator exists, treat its priority as -1. If the priority of the operator is higher than these two, remove the parenthesis at i and j.
d. Continue the steps a to c until i<=j.
Push one empty item in the stack
Scan the token list
2.1 if the token is operand, ignore.
2.2 if the token is operator, records the operator in the left_op,
if min_op is nil, set the min_op = this operator, if the min_op
is not nil, compare the min_op with this operator, set min_op as
one of the two operators with less priority.
2.3 if the token is left parenthese, push one item in the stack,
with left_pa = position of the parenthesis.
2.4 if the token is right parenthesis:
2.4.1 we have the pair of the parentheses(left_pa and the
right parenthesis)
2.4.2 pop the item
2.4.3 pre-read next token, if it is an operator, set it
as right operator
2.4.4 compare min_op of the item with left_op and right operator
(if any of them exists), we can easily get to know if the pair
of the parentheses is redundant, and output it(if the min_op
< any of left_op and right operator, the parentheses are necessary,
if min_op = left_op, the parentheses are necessary, otherwise
redundant)
2.4.5 if there is no left_op and no right operator(which also means
min_op = nil) and the stack is not empty, set the min_op of top
item as the min_op of the popped-up item
examples
The code below implements a straightforward solution. It is limited to +, -, *, and /, but it can be extended to handle other operators if needed.
#include <iostream>
#include <set>
#include <stack>
// Position at which the most recent parser() call stopped scanning: the index
// of the ')' that ended the scan, or input.size() when the end was reached.
int loc;

// True for the four binary operators this simplifier understands.
static bool is_supported_op(char c) {
  return c == '+' || c == '-' || c == '*' || c == '/';
}

// Returns the operator next to position pos in direction step (-1 or +1),
// skipping spaces, or '\0' when the neighbour is not a supported operator.
static char adjacent_operator(const std::string &input, int pos, int step) {
  const int n = static_cast<int>(input.size());
  int i = pos + step;
  while (i >= 0 && i < n && input[i] == ' ') {
    i += step;
  }
  return (i >= 0 && i < n && is_supported_op(input[i])) ? input[i] : '\0';
}

// Decides whether a group whose top-level operators are inner_ops must keep
// its parentheses, given the operators directly to its left and right.
static bool parens_required(const std::set<char> &inner_ops, char left,
                            char right) {
  const bool additive = inner_ops.count('+') || inner_ops.count('-');
  const bool multiplicative = inner_ops.count('*') || inner_ops.count('/');
  if (additive) {
    // x-(a+b), x*(a+b), x/(a+b), (a+b)*x and (a+b)/x all change meaning
    // when the parentheses are dropped.
    return left == '-' || left == '*' || left == '/' || right == '*' ||
           right == '/';
  }
  if (multiplicative) {
    // Only a preceding division binds differently: x/(a*b) != x/a*b.
    return left == '/';
  }
  return false;  // a single operand never needs parentheses
}

// Scans input from pos up to (not including) the first unmatched ')' or the
// end of the string and returns the simplified text. ops receives the
// operators that end up at the top level of that text. pos is left on the
// stopping position.
static std::string parse_group(const std::string &input, int &pos,
                               std::set<char> &ops) {
  const int n = static_cast<int>(input.size());
  std::string out;
  while (pos < n && input[pos] != ')') {
    const char c = input[pos];
    if (c == '(') {
      const int open = pos;
      std::set<char> inner_ops;
      ++pos;
      const std::string inner = parse_group(input, pos, inner_ops);
      // pos now sits on the matching ')' (or at the end if unbalanced).
      if (parens_required(inner_ops, adjacent_operator(input, open, -1),
                          adjacent_operator(input, pos, +1))) {
        out += '(' + inner + ')';
      } else {
        out += inner;
        // The group was dissolved, so its operators now belong to this level.
        ops.insert(inner_ops.begin(), inner_ops.end());
      }
      if (pos >= n) {
        break;
      }
    } else {
      out += c;
      if (is_supported_op(c)) {
        ops.insert(c);
      }
    }
    ++pos;
  }
  return out;
}

// Removes redundant parentheses from the arithmetic expression in input,
// starting at index _loc (callers normally pass 0). Only the binary operators
// + - * / are understood; every other character is copied through unchanged.
// Parentheses are kept exactly when dropping them would change the value:
// an additive group next to '*' or '/' or after '-', and a multiplicative
// group after '/'. The global loc is updated to the stopping position.
std::string parser(std::string input, int _loc) {
  int pos = _loc < 0 ? 0 : _loc;
  std::set<char> ops;
  const std::string body = parse_group(input, pos, ops);
  loc = pos;
  if (pos < static_cast<int>(input.size())) {
    // Scanning stopped on a ')': the caller started inside a group, so decide
    // here whether that group keeps its parentheses.
    const bool opened_here = _loc >= 1 && input[_loc - 1] == '(';
    const char left = opened_here ? adjacent_operator(input, _loc - 1, -1) : '\0';
    if (parens_required(ops, left, adjacent_operator(input, pos, +1))) {
      return '(' + body + ')';
    }
  }
  return body;
}
// Demo: simplifies one heavily parenthesized expression and prints the result.
int main() {
  const std::string input = "(((a)+((b*c)))+(d*(f*g)))";
  const std::string simplified = parser(input, 0);
  std::cout << simplified;
  return 0;
}
I coded it previously in https://calculation-test.211368e.repl.co/trim.html. This doesn't have some errors in other answers.
(6 / (-2454) ** (((234)))) + (-5435) --> 6 / (-2454) ** 234 + (-5435)
// Normalizes the spacing of an arithmetic expression string and returns it.
// Steps: drop all spaces and write "**" as "^"; handle every "-" (see the
// loop); finally put spaces around * / + and turn "^" back into " ** ".
const format = expression => {
// change: indices of "-" characters that act as binary minus.
var change = [], result = expression.replace(/ /g, "").replace(/\*\*/g, "^"), _count;
// replace(): swaps the single character at index for string.
function replace(index, string){result = result.slice(0, index) + string + result.slice(index + 1)}
// add(): inserts string before index.
function add(index, string){result = result.slice(0, index) + string + result.slice(index)}
for (var count = 0; count < result.length; count++){
if (result[count] == "-"){
// Preceded by a letter, a digit or ")": remember it as a binary minus.
if ("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890)".includes(result[count - 1])){
change.push(count);
}else if (result[count - 1] != "("){
// Otherwise, unless it already follows "(", wrap the minus and the
// digits/dots after it in parentheses.
add(count, "(");
// Skip the inserted "(" so count points at the "-" again.
count++;
_count = count + 1;
while ("1234567890.".includes(result[_count])) _count++;
if (_count < result.length - 1){
add(_count, ")");
}else{
// NOTE(review): slice() clamps indices past the end, so this appears to
// simply append ")" -- confirm the + 2 is intentional.
add(_count + 2, ")");
}
}
}
}
change = change.sort(function(a, b){return a - b});
const len = change.length;
// Each replacement turns one character into three (" - "), shifting every
// later recorded index by 2; hence the count * 2 correction.
for (var count = 0; count < len; count++){replace(change[0] + count * 2, " - "); change.shift()}
return result.replace(/\*/g, " * ").replace(/\^/g, " ** ").replace(/\//g, " / ").replace(/\+/g, " + ");
}
// Removes parentheses judged redundant from an arithmetic expression string
// and returns the result re-spaced by format().
// Works on a compact form (no spaces, "**" written as "^"): collects all
// bracket pairs, marks the indices of removable brackets in `deleting`,
// deletes them in one pass and formats the remainder.
const trim = expression => {
var result = format(expression).replace(/ /g, "").replace(/\*\*/g, "^"), deleting = [];
// The function declaration below is hoisted, so it can be called here.
const brackets = bracket_pairs(result);
// Returns [openIndex, closeIndex] for every "(" in `result`.  It declares no
// parameter and reads the enclosing `result`; the argument passed is ignored.
function bracket_pairs(){
// Index of the ")" matching the "(" at pos, found by depth counting.
function findcbracket(str, pos){
const rExp = /\(|\)/g;
rExp.lastIndex = pos + 1;
var depth = 1;
// pos is reused to hold the regex match object from here on.
while ((pos = rExp.exec(str))) if (!(depth += str[pos.index] == "(" ? 1 : -1 )) {return pos.index}
}
// All start indices of searchStr inside str, in ascending order.
function occurences(searchStr, str){
var startIndex = 0, index, indices = [];
while ((index = str.indexOf(searchStr, startIndex)) > -1){
indices.push(index);
startIndex = index + 1;
}
return indices;
}
const obrackets = occurences("(", result);
var cbrackets = [];
for (var count = 0; count < obrackets.length; count++) cbrackets.push(findcbracket(result, obrackets[count]));
return obrackets.map((e, i) => [e, cbrackets[i]]);
}
// Deletes the characters of `result` at the given indices.  Indices are
// sorted ascending and each is corrected by the number already removed.
function remove(deleting){
function _remove(index){result = result.slice(0, index) + result.slice(index + 1)}
const len = deleting.length;
var deleting = deleting.sort(function(a, b){return a - b});
for (var count = 0; count < len; count++){
_remove(deleting[0] - count);
deleting.shift()
}
}
// Returns a string of operator characters for `operator` at `position`
// ("l" = left of a bracket pair, "r" = right of it, "w" = within it).
// For "l"/"r" the main loop removes a pair only when both strings contain
// the `within` operator.  Anything that is not one of ^ / * - + yields the
// full list.
function precedence(operator, position){
if (!"^/*-+".includes(operator)) return "^/*-+";
if (position == "l" || position == "w") return {"^": "^", "/": "^", "*": "^/*", "-": "^/*", "+": "^/*-+"}[operator];
if (position == "r") return {"^": "^", "/": "^/*", "*": "^/*", "-": "^/*-+", "+": "^/*-+"}[operator];
}
// Returns string with every parenthesized part (brackets included) removed,
// i.e. only the characters at nesting level 0.
function strip_bracket(string){
var result = "", level = 0;
for (var count = 0; count < string.length; count++){
if (string.charAt(count) == "(") level++;
if (level == 0) result += string.charAt(count);
if (string.charAt(count) == ")") level--;
}
return result.replace(/\s{2,}/g, " ");
}
for (var count = 0; count < brackets.length; count++){
const pair = brackets[count];
// Directly nested duplicate such as "((...))": drop this inner pair.
if (result[pair[0] - 1] == "(" && result[pair[1] + 1] == ")"){
deleting.push(...pair);
}else{
const left = precedence(result[pair[0] - 1], "l"), right = precedence(result[pair[1] + 1], "r");
var contents = strip_bracket(result.slice(pair[0] + 1, pair[1])), within = "+";
// NOTE(review): "<" keeps the character whose "w" string is shortest, so
// for contents like a+b*c this selects "*" rather than "+" -- confirm that
// this is the intended representative operator.
for (var _count = 0; _count < contents.length; _count++) if (precedence(contents[_count], "w").length < precedence(within, "w").length) within = contents[_count];
// Only digits, or nothing, left at level 0: the pair is dropped.
if (/^[0-9]+$/g.test(contents) || contents == ""){
deleting.push(...pair);
continue;
}
if (left.includes(within) && right.includes(within)){
// A pair directly preceded by a letter or digit is never removed here.
// Purely numeric contents are additionally required to be >= 0.
if (!isNaN(result.slice(pair[0] + 1, pair[1]))){
if (Number(result.slice(pair[0] + 1, pair[1])) >= 0 && !"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890".includes(result[pair[0] - 1])) deleting.push(...pair);
}else if (!"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890".includes(result[pair[0] - 1])) deleting.push(...pair);
}
}
}
remove(deleting);
result = format(result);
return result;
}
<input id="input">
<button onclick="document.getElementById('result').innerHTML = trim(document.getElementById('input').value)">Remove and format</button>
<div id="result"></div>
I think that you are looking for kind of algorithm as seen in the following photo.
This algorithm is "almost" ready, since a lot of bugs arise once the more complex it becomes, the more complicated it gets. The way I work on this thing, is 'build-and-write-code-on-the-fly', which means that for up to 4 parentheses, things are easy. But after the expression goes more complex, there are things that I cannot predict while writing down thoughts on paper. And there comes the compiler to tell me what to correct. It would not be a lie if I state that it is not me to have written the algorithm, but the (C#) compiler instead! So far, it took me 1400 lines. It is not that the commands were difficult to write. It was their arrangement that was a real puzzle. This program you are looking for, is characterized by a really high grade of complexity. Well, if you need any primary ideas, please let me know and I will reply. Thanx!
Algorithm

Algorithm to generate a sequence proportional to specified percentage

Given a Map of objects and designated proportions (let's say they add up to 100 to make it easy):
val ss : Map[String,Double] = Map("A"->42, "B"->32, "C"->26)
How can I generate a sequence such that for a subset of size n there are ~42% "A"s, ~32% "B"s and ~26% "C"s? (Obviously, small n will have larger errors).
(Work language is Scala, but I'm just asking for the algorithm.)
UPDATE: I resisted a random approach since, for instance, there's a ~16% chance that the sequence would start with AA and a ~11% chance that it would start with BB, and there would be very low odds that for n precisely == (sum of proportions) the distribution would be perfect. So, following @MvG's answer, I implemented as follows:
/**
Returns the key whose achieved proportions are most below desired proportions
*/
def next[T](proportions : Map[T, Double], achievedToDate : Map[T,Double]) : T = {
  // Picks the key whose achieved share lags furthest behind its desired share.
  val weightTotal = proportions.values.sum
  val achievedSum = achievedToDate.values.sum
  // Nothing achieved yet: divide by 1 instead of 0.
  val divisor = if (achievedSum == 0.0) 1.0 else achievedSum
  // gap = desired fraction - achieved fraction, for every key achieved so far
  val gaps = achievedToDate.map { case (key, achieved) =>
    key -> (proportions(key) / weightTotal - achieved / divisor)
  }
  val maxUnder = gaps.values.toList.sortWith(_ > _).head
  //println("Max gap is " + maxUnder)
  // Among the keys sitting at the maximal gap, the last one in the map's
  // iteration order is returned.
  gaps.collect { case (key, gap) if Math.abs(gap - maxUnder) < Double.Epsilon => key }.last
}
/**
Stream of most-fair next element
*/
def proportionalStream[T](proportions : Map[T, Double], toDate : Map[T, Double]) : Stream[T] = {
  // Head: the key currently furthest below its desired share.
  val chosen = next(proportions, toDate)
  // Tail: built lazily from the counts with that key incremented by one.
  val updated = toDate.updated(chosen, toDate(chosen) + 1.0)
  chosen #:: proportionalStream(proportions, updated)
}
That when used, e.g., :
// Desired proportions per key.
val ss : Map[String,Double] = Map("A"->42, "B"->32, "C"->26)
// Starting state: the same keys, each with an achieved count of 0.0.
val none : Map[String,Double] = ss.mapValues(_ => 0.0)
// Materialize the first 100 elements of the lazy stream.
val mySequence = (proportionalStream(ss, none) take 100).toList
println("Desired : " + ss)
// Count how often each key occurs in the generated sequence.
println("Achieved : " + mySequence.groupBy(identity).mapValues(_.size))
mySequence.map(s => print(s))
println
produces :
Desired : Map(A -> 42.0, B -> 32.0, C -> 26.0)
Achieved : Map(C -> 26, A -> 42, B -> 32)
ABCABCABACBACABACBABACABCABACBACABABCABACABCABACBA
CABABCABACBACABACBABACABCABACBACABABCABACABCABACBA
For a deterministic approach, the most obvious solution would probably be this:
Keep track of the number of occurrences of each item in the sequence so far.
For the next item, choose that item for which the difference between intended and actual count (or proportion, if you prefer that) is maximal, but only if the intended count (resp. proportion) is greater than the actual one.
If there is a tie, break it in an arbitrary but deterministic way, e.g. choosing the alphabetically lowest item.
This approach would ensure an optimal adherence to the prescribed ratio for every prefix of the infinite sequence generated in this way.
Quick & dirty python proof of concept (don't expect any of the variable “names” to make any sense):
import sys

# p: target share of each symbol, c: how many of each were emitted so far,
# a: the symbols themselves (parallel lists).
p = [0.42, 0.32, 0.26]
c = [0, 0, 0]
a = ['A', 'B', 'C']
n = 0
# Emit 5 lines of 70 symbols; n is the 1-based position of the next symbol.
for n in range(1, 70 * 5 + 1):
    # Pick the symbol whose ideal count n*p[i] exceeds its actual count the
    # most; max() keeps the first maximum, so the lowest index wins ties.
    x = max(range(len(p)), key=lambda i: n * p[i] - c[i])
    sys.stdout.write(a[x])
    if n % 70 == 0:
        sys.stdout.write('\n')
    c[x] += 1
Generates
ABCABCABACABACBABCAABCABACBACABACBABCABACABACBACBAABCABCABACABACBABCAB
ACABACBACABACBABCABACABACBACBAABCABCABACABACBABCAABCABACBACABACBABCABA
CABACBACBAABCABCABACABACBABCABACABACBACBAACBABCABACABACBACBAABCABCABAC
ABACBABCABACABACBACBAACBABCABACABACBACBAABCABCABACABACBABCABACABACBACB
AACBABCABACABACBACBAABCABCABACABACBABCAABCABACBACBAACBABCABACABACBACBA
For every item of the sequence, compute a (pseudo-)random number r equidistributed between 0 (inclusive) and 100 (exclusive).
If 0 ≤ r < 42, take A
If 42 ≤ r < (42+32), take B
If (42+32) ≤ r < (42+32+26)=100, take C
The number of each entry in your subset is going to be the same as in your map, but with a scaling factor applied.
The scaling factor is n/100.
So if n was 50, you would have { Ax21, Bx16, Cx13 }.
Randomize the order to your liking.
The simplest "deterministic" [in terms of #elements of each category] solution [IMO] will be: add elements in predefined order, and then shuffle the resulting list.
First, add map(x)/100 * n elements for each element x [choose how you handle integer arithmetic to avoid being off by one element], and then shuffle the resulting list.
Shuffling a list is simple with fisher-yates shuffle, which is implemented in most languages: for example java has Collections.shuffle(), and C++ has random_shuffle()
In java, it will be as simple as:
// Fragment: build a list holding each key roughly (percentage / 100 * N)
// times, then shuffle it.
int N = 107;
List<String> res = new ArrayList<String>();
for (Entry<String,Integer> e : map.entrySet()) { //map is predefined Map<String,Integer> for frequencies
// The loop bound is rounded per key.
// NOTE(review): per-key rounding means the counts need not add up to exactly N -- confirm that is acceptable.
for (int i = 0; i < Math.round(e.getValue()/100.0 * N); i++) {
res.add(e.getKey());
}
}
// Randomize the order of the collected keys in place.
Collections.shuffle(res);
This is nondeterministic, but gives a distribution of values close to MvG's. It suffers from the problem that it could give AAA right at the start. I post it here for completeness' sake given how it proves my dissent with MvG was misplaced (and I don't expect any upvotes).
Now, if someone has an idea for an expand function that is deterministic and won't just duplicate MvG's method (rendering the calc function useless), I'm all ears!
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"
"http://www.w3.org/TR/html4/strict.dtd">
<html>
<head>
<title>ErikE's answer</title>
</head>
<body>
<div id="output"></div>
<script type="text/javascript">
// Polyfill: Array.prototype.each(callback) invokes callback(index, value)
// for every index from 0 to length - 1, in ascending order.
// The guard checks Array.prototype.each (the property installed below);
// checking the static Array.each would always be true and would overwrite
// any implementation that already exists on the prototype.
if (!Array.prototype.each) {
    Array.prototype.each = function(callback) {
        // Length is captured once, so elements appended during iteration are not visited.
        var i, l = this.length;
        for (i = 0; i < l; i += 1) {
            callback(i, this[i]);
        }
    };
}
// Polyfill: Array.prototype.sum() folds all elements together with `+`,
// starting from 0, and returns the result. It walks the array through
// Array.prototype.each, so that helper must be installed first.
if (!Array.prototype.sum) {
    Array.prototype.sum = function () {
        var total = 0;
        // Callback for each(): receives (index, value) and accumulates the value.
        function accumulate(position, item) {
            total += item;
        }
        this.each(accumulate);
        return total;
    };
}
// Turns an array of counts into a randomly ordered string of letters.
//
// counts[i] is the number of times the letter with char code 65 + i
// ('A' for index 0, 'B' for index 1, ...) appears in the result.
// Entries that are not greater than 0 contribute nothing.
// Returns the letters in a uniformly random order ("" when nothing is requested).
// Relies on Array.prototype.each being installed.
function expand(counts) {
    var charlist = [],
        remaining,
        index,
        swap;
    counts.each(function(i, val) {
        // Declared locally: the original assigned to an undeclared `char`,
        // which leaked a global (and throws in strict mode).
        var letter = String.fromCharCode(i + 65);
        for ( ; val > 0; val -= 1) {
            charlist.push(letter);
        }
    });
    // In-place Fisher-Yates shuffle: move a random element of the not yet
    // fixed prefix into the last free slot. Linear, unlike repeated splice().
    for (remaining = charlist.length; remaining > 1; remaining -= 1) {
        index = Math.floor(Math.random() * remaining);
        swap = charlist[remaining - 1];
        charlist[remaining - 1] = charlist[index];
        charlist[index] = swap;
    }
    return charlist.join("");
}
// Distributes n items over categories according to `proportions` and returns
// them as a shuffled string of letters (via expand()).
//
// n           - total number of items; must be an integer.
// proportions - array of weights; they are normalised by their sum, so they
//               do not have to add up to 100.
//
// Each category first gets round(share * n) items. If those rounded counts do
// not add up to n, the category whose count deviates most in the offending
// direction is corrected by one, repeatedly, until the total is exactly n.
//
// Throws RangeError when the rounding error is not a whole number (non-integer
// n, or a zero / non-finite sum of proportions); the correction loop could
// never reach zero in that case and would not terminate.
// Relies on Array.prototype.each / Array.prototype.sum and on expand().
function calc(n, proportions) {
    var percents = [],
        counts = [],
        errors = [],
        total = proportions.sum(), // loop-invariant, computed once
        errorSum,
        adjust,
        pick,
        worstIndex;
    proportions.each(function(i, val) {
        percents[i] = val / total * n;          // exact (fractional) share
        counts[i] = Math.round(percents[i]);    // integer share
        errors[i] = counts[i] - percents[i];    // > 0: rounded up, < 0: rounded down
    });
    errorSum = counts.sum() - n;
    // NaN, +/-Infinity and fractional values all fail this check.
    if (errorSum % 1 !== 0) {
        throw new RangeError("calc: n must be an integer and proportions must have a finite, non-zero sum");
    }
    while (errorSum != 0) {
        // Too few items: add one where we rounded down the most (smallest error).
        // Too many items: remove one where we rounded up the most (largest error).
        adjust = errorSum < 0 ? 1 : -1;
        pick = adjust > 0 ? Math.min : Math.max;
        worstIndex = errors.indexOf(pick.apply(null, errors));
        counts[worstIndex] += adjust;
        errors[worstIndex] = counts[worstIndex] - percents[worstIndex];
        errorSum += adjust;
    }
    return expand(counts);
}
// When the page has finished loading, generate one 99-item sequence for four
// categories and show it inside the element with id "output".
document.body.onload = function () {
    var target = document.getElementById('output');
    target.innerHTML = calc(99, [25.1, 24.9, 25.9, 24.1]);
};
</script>
</body>
</html>

Algorithm to divide text into 3 evenly-sized groups

I would like to create an algorithm that will divide text into 3 evenly-sized groups (based on text length). Since this will be put to use for line-breaks, the order of the text needs to be maintained.
For instance this string:
Just testing to see how this works.
would be split into:
Just testing // 12 characters
to see how // 10 characters
this works. // 11 characters
Any ideas?
The "minimum raggedness" dynamic program, also from the Wikipedia article on word wrap, can be adapted to your needs. Set LineWidth = len(text)/n - 1 and ignore the comment about infinite penalties for exceeding the line width; use the definition of c(i, j) as is with P = 2.
Code. I took the liberty of modifying the DP always to return exactly n lines, at the cost of increasing the running time from O(#words ** 2) to O(#words ** 2 * n).
def minragged(text, n=3):
    """
    Break ``text`` into exactly ``n`` lines of roughly equal width.

    The text is split on whitespace and the words are distributed, in order,
    over ``n`` lines so that the sum of squared differences between each
    line's width and the ideal line width is minimal. Lines may be empty
    when there are fewer words than lines.

    Returns a list of ``n`` strings (words on a line are joined by one space).

    >>> minragged('Just testing to see how this works.')
    ['Just testing', 'to see how', 'this works.']
    >>> minragged('Just testing to see how this works.', 10)
    ['', '', 'Just', 'testing', 'to', 'see', 'how', 'this', 'works.', '']
    """
    words = text.split()
    # cumwordwidth[i] is the total length of words[0:i] (spaces not counted)
    cumwordwidth = [0]
    for word in words:
        cumwordwidth.append(cumwordwidth[-1] + len(word))
    totalwidth = cumwordwidth[-1] + len(words) - 1  # len(words) - 1 spaces
    # n - 1 of those spaces become line breaks; the rest is shared by n lines
    linewidth = float(totalwidth - (n - 1)) / float(n)

    def cost(i, j):
        """
        cost of a line words[i], ..., words[j - 1] (words[i:j])
        """
        actuallinewidth = max(j - i - 1, 0) + (cumwordwidth[j] - cumwordwidth[i])
        return (linewidth - float(actuallinewidth)) ** 2

    # best[l][k][0] is the min total cost for words 0, ..., k - 1 on l lines
    # best[l][k][1] is a minimizing index for the start of the last line
    # Row 0: zero lines can only hold zero words, everything else is impossible.
    best = [[(0.0, None)] + [(float('inf'), None)] * len(words)]
    # range (not the Python-2-only xrange) keeps this runnable on Python 2 and 3
    for l in range(1, n + 1):
        best.append([])
        for j in range(len(words) + 1):
            # tuples compare by cost first, then by k, so ties pick the smallest k
            best[l].append(min((best[l - 1][k][0] + cost(k, j), k) for k in range(j + 1)))
    # Walk the table backwards from the last line to recover the line breaks.
    lines = []
    b = len(words)
    for l in range(n, 0, -1):
        a = best[l][b][1]
        lines.append(' '.join(words[a:b]))
        b = a
    lines.reverse()
    return lines
# When executed as a script, run the examples embedded in the docstrings.
if __name__ == '__main__':
    import doctest
    doctest.testmod()
You can try the following simple heuristic for starters: place two iterators at n/3 and 2n/3 and search for the closest space near each of them.
From http://en.wikipedia.org/wiki/Word_wrap:
// Greedy word wrap: walk the words in order and start a new line whenever
// the next word is wider than the space left on the current line.
// SpaceLeft is the width still available on the current line.
SpaceLeft := LineWidth
for each Word in Text
// Word is wider than what is left: break, then Word opens the new line
if Width(Word) > SpaceLeft
insert line break before Word in Text
SpaceLeft := LineWidth - Width(Word)
else
// Word stays on the current line: it uses its own width plus one space
SpaceLeft := SpaceLeft - (Width(Word) + SpaceWidth)
This method is used by many modern word processors, such as OpenOffice.org Writer and Microsoft Word. This algorithm is optimal in that it always puts the text on the minimum number of lines.
The answer from "someone" works fine. However, I had problems translating this into SWIFT code. Here is my translation for all those that are interested.
import Foundation
/// Splits a text into a fixed number of lines of roughly equal width
/// ("minimum raggedness" dynamic program).
class SplitText {
    /// (minimum total cost so far, index of the first word on the last line)
    typealias MinRag = (Float, Int)

    // from http://stackoverflow.com/questions/6426017/word-wrap-to-x-lines-instead-of-maximum-width-least-raggedness?lq=1
    /// Distributes the words of `text`, in order, over exactly `numberOfLines`
    /// lines so that the sum of squared deviations from the ideal line width
    /// is minimal.
    ///
    /// - Parameters:
    ///   - text: The text to split. Words are separated by spaces or "\n";
    ///     empty pieces are dropped.
    ///   - numberOfLines: The number of lines to produce.
    /// - Returns: `numberOfLines` strings (some may be empty when there are
    ///   fewer words than lines); an empty array when `numberOfLines < 1`.
    class func splitText(_ text: String, numberOfLines: Int) -> [String] {
        // 1...numberOfLines below would trap for values smaller than 1.
        guard numberOfLines > 0 else { return [] }

        // Preparations. No split limit: every word becomes its own element.
        let words = text
            .split(whereSeparator: { $0 == " " || $0 == "\n" })
            .map { String($0) }

        // cumwordwidth[i] is the total length of the first i words (no spaces).
        var cumwordwidth = [0]
        for word in words {
            cumwordwidth.append(cumwordwidth[cumwordwidth.count - 1] + word.count)
        }
        let totalwidth = cumwordwidth[cumwordwidth.count - 1] + words.count - 1
        let linewidth = Float(totalwidth - (numberOfLines - 1)) / Float(numberOfLines)

        // Cost of one line holding words i ..< j: squared distance from the ideal width.
        let cost = { (i: Int, j: Int) -> Float in
            let actuallinewidth = max(j - i - 1, 0) + (cumwordwidth[j] - cumwordwidth[i])
            let remainingWidth = linewidth - Float(actuallinewidth)
            return remainingWidth * remainingWidth
        }

        // Row 0: zero lines can hold zero words at cost 0; anything else is impossible.
        var firstRow = [MinRag](repeating: (Float.infinity, -1), count: words.count + 1)
        firstRow[0] = (0, -1)
        var best: [[MinRag]] = [firstRow]

        // best[l][j] = cheapest way to put the first j words on l lines,
        // together with the index k at which the last of those lines starts.
        for l in 1...numberOfLines {
            var row = [MinRag]()
            row.reserveCapacity(words.count + 1)
            for j in 0...words.count {
                var minimum: MinRag = (best[l - 1][0].0 + cost(0, j), 0)
                for k in 0...j {
                    let candidate = best[l - 1][k].0 + cost(k, j)
                    // Strict comparison keeps the smallest k on ties.
                    if candidate < minimum.0 {
                        minimum = (candidate, k)
                    }
                }
                row.append(minimum)
            }
            best.append(row)
        }

        // Build the answer by walking the table backwards from the last line.
        var lines = [String]()
        var b = words.count
        for o in stride(from: numberOfLines, to: 0, by: -1) {
            let a = best[o][b].1
            // Half-open range: valid even when the line is empty (a == b).
            lines.append(words[a..<b].joined(separator: " "))
            b = a
        }
        return lines.reversed()
    }
}

Resources