Remove consecutive duplicates in a string to make the smallest string - algorithm

Given a string and the constraint of matching on >= 3 characters, how can you ensure that the result string will be as small as possible?
edit with gassa's explicitness:
E.G.
'AAAABBBAC'
If I remove the B's first,
AAAA[BBB]AC -- > AAAAAC, then I can remove all of the A's from the resultant string and be left with:
[AAAAA]C --> C
'C'
If I just remove what is available first (the sequence of A's), I get:
[AAAA]BBBAC -- > [BBB]AC --> AC
'AC'

A tree would definitely get you the shortest string(s).
The tree solution:
Define a State (node) for each current string Input and all its removable sub-strings' int[] Indexes.
Create the tree: For each int index create another State and add it to the parent state State[] Children.
A State with no possible removable sub-strings has no children Children = null.
Get all Descendants State[] of your root State. Order them by their shortest string Input. And that is/are your answer(s).
Test cases:
string result = FindShortest("AAAABBBAC"); // AC
string result2 = FindShortest("AABBAAAC"); // AABBC
string result3 = FindShortest("BAABCCCBBA"); // B
The Code:
Note: Of-course everyone is welcome to enhance the following code in terms of performance and/or fixing any bug.
class Program
{
static void Main(string[] args)
{
string result = FindShortest("AAAABBBAC"); // AC
string result2 = FindShortest("AABBAAAC"); // AABBC
string result3 = FindShortest("BAABCCCBBA"); // B
}
// finds the FIRST shortest string for a given input
private static string FindShortest(string input)
{
// all possible removable strings' indexes
// for this given input
int[] indexes = RemovableIndexes(input);
// each input string and its possible removables are a state
var state = new State { Input = input, Indexes = indexes };
// create the tree
GetChildren(state);
// get the FIRST shortest
// i.e. there would be more than one answer sometimes
// this could be easily changed to get all possible results
var result =
Descendants(state)
.Where(d => d.Children == null || d.Children.Length == 0)
.OrderBy(d => d.Input.Length)
.FirstOrDefault().Input;
return result;
}
// simple get all descendants of a node/state in a tree
private static IEnumerable<State> Descendants(State root)
{
var states = new Stack<State>(new[] { root });
while (states.Any())
{
State node = states.Pop();
yield return node;
if (node.Children != null)
foreach (var n in node.Children) states.Push(n);
}
}
// creates the tree
private static void GetChildren(State state)
{
// for each an index there is a child
state.Children = state.Indexes.Select(
i =>
{
var input = RemoveAllAt(state.Input, i);
return input.Length < state.Input.Length && input.Length > 0
? new State
{
Input = input,
Indexes = RemovableIndexes(input)
}
: null;
}).ToArray();
foreach (var c in state.Children)
GetChildren(c);
}
// find all possible removable strings' indexes
private static int[] RemovableIndexes(string input)
{
var indexes = new List<int>();
char d = input[0];
int count = 1;
for (int i = 1; i < input.Length; i++)
{
if (d == input[i])
count++;
else
{
if (count >= 3)
indexes.Add(i - count);
// reset
d = input[i];
count = 1;
}
}
if (count >= 3)
indexes.Add(input.Length - count);
return indexes.ToArray();
}
// remove all duplicate chars starting from an index
private static string RemoveAllAt(string input, int startIndex)
{
string part1, part2;
int endIndex = startIndex + 1;
int i = endIndex;
for (; i < input.Length; i++)
if (input[i] != input[startIndex])
{
endIndex = i;
break;
}
if (i == input.Length && input[i - 1] == input[startIndex])
endIndex = input.Length;
part1 = startIndex > 0 ? input.Substring(0, startIndex) : string.Empty;
part2 = endIndex <= (input.Length - 1) ? input.Substring(endIndex) : string.Empty;
return part1 + part2;
}
// our node, which is
// an input string &
// all possible removable strings' indexes
// & its children
public class State
{
public string Input;
public int[] Indexes;
public State[] Children;
}
}

I propose O(n^2) solution with dynamic programming.
Let's introduce notation. Prefix and suffix of length l of string A denoted by P[l] and S[l]. And we call our procedure Rcd.
Rcd(A) = Rcd(Rcd(P[n-1])+S[1])
Rcd(A) = Rcd(P[1]+Rcd(S[n-1]))
Note that outer Rcd in the RHS is trivial. So, that's our optimal substructure. Based on this i came up with the following implementation:
#include <iostream>
#include <string>
#include <vector>
#include <cassert>
using namespace std;
string remdupright(string s, bool allowEmpty) {
if (s.size() >= 3) {
auto pos = s.find_last_not_of(s.back());
if (pos == string::npos && allowEmpty) s = "";
else if (pos != string::npos && s.size() - pos > 3) s = s.substr(0, pos + 1);
}
return s;
}
string remdupleft(string s, bool allowEmpty) {
if (s.size() >= 3) {
auto pos = s.find_first_not_of(s.front());
if (pos == string::npos && allowEmpty) s = "";
else if (pos != string::npos && pos >= 3) s = s.substr(pos);
}
return s;
}
string remdup(string s, bool allowEmpty) {
return remdupleft(remdupright(s, allowEmpty), allowEmpty);
}
string run(const string in) {
vector<vector<string>> table(in.size());
for (int i = 0; i < (int)table.size(); ++i) {
table[i].resize(in.size() - i);
}
for (int i = 0; i < (int)table[0].size(); ++i) {
table[0][i] = in.substr(i,1);
}
for (int len = 2; len <= (int)table.size(); ++len) {
for (int pos = 0; pos < (int)in.size() - len + 1; ++pos) {
string base(table[len - 2][pos]);
const char suffix = in[pos + len - 1];
if (base.size() && suffix != base.back()) {
base = remdupright(base, false);
}
const string opt1 = base + suffix;
base = table[len - 2][pos+1];
const char prefix = in[pos];
if (base.size() && prefix != base.front()) {
base = remdupleft(base, false);
}
const string opt2 = prefix + base;
const string nodupopt1 = remdup(opt1, true);
const string nodupopt2 = remdup(opt2, true);
table[len - 1][pos] = nodupopt1.size() > nodupopt2.size() ? opt2 : opt1;
assert(nodupopt1.size() != nodupopt2.size() || nodupopt1 == nodupopt2);
}
}
string& res = table[in.size() - 1][0];
return remdup(res, true);
}
void testRcd(string s, string expected) {
cout << s << " : " << run(s) << ", expected: " << expected << endl;
}
int main()
{
testRcd("BAABCCCBBA", "B");
testRcd("AABBAAAC", "AABBC");
testRcd("AAAA", "");
testRcd("AAAABBBAC", "C");
}
You can check default and run your tests here.

Clearly we are not concerned about any block of repeated characters longer than 2 characters. And there is only one way two blocks of the same character where at least one of the blocks is less than 3 in length can be combined - namely, if the sequence between them can be removed.
So (1) look at pairs of blocks of the same character where at least one is less than 3 in length, and (2) determine if the sequence between them can be removed.
We want to decide which pairs to join so as to minimize the total length of blocks less than 3 characters long. (Note that the number of pairs is bound by the size (and distribution) of the alphabet.)
Let f(b) represent the minimal total length of same-character blocks remaining up to the block b that are less than 3 characters in length. Then:
f(b):
p1 <- previous block of the same character
if b and p1 can combine:
if b.length + p1.length > 2:
f(b) = min(
// don't combine
(0 if b.length > 2 else b.length) +
f(block before b),
// combine
f(block before p1)
)
// b.length + p1.length < 3
else:
p2 <- block previous to p1 of the same character
if p1 and p2 can combine:
f(b) = min(
// don't combine
b.length + f(block before b),
// combine
f(block before p2)
)
else:
f(b) = b.length + f(block before b)
// b and p1 cannot combine
else:
f(b) = b.length + f(block before b)
for all p1 before b
The question is how can we efficiently determine if a block can be combined with the previous block of the same character (aside from the obvious recursion into the sub-block-list between the two blocks).
Python code:
import random
import time
def parse(length):
return length if length < 3 else 0
def f(string):
chars = {}
blocks = [[string[0], 1, 0]]
chars[string[0]] = {'indexes': [0]}
chars[string[0]][0] = {'prev': -1}
p = 0 # pointer to current block
for i in xrange(1, len(string)):
if blocks[len(blocks) - 1][0] == string[i]:
blocks[len(blocks) - 1][1] += 1
else:
p += 1
# [char, length, index, f(i), temp]
blocks.append([string[i], 1, p])
if string[i] in chars:
chars[string[i]][p] = {'prev': chars[string[i]]['indexes'][ len(chars[string[i]]['indexes']) - 1 ]}
chars[string[i]]['indexes'].append(p)
else:
chars[string[i]] = {'indexes': [p]}
chars[string[i]][p] = {'prev': -1}
#print blocks
#print
#print chars
#print
memo = [[None for j in xrange(len(blocks))] for i in xrange(len(blocks))]
def g(l, r, top_level=False):
####
####
#print "(l, r): (%s, %s)" % (l,r)
if l == r:
return parse(blocks[l][1])
if memo[l][r]:
return memo[l][r]
result = [parse(blocks[l][1])] + [None for k in xrange(r - l)]
if l < r:
for i in xrange(l + 1, r + 1):
result[i - l] = parse(blocks[i][1]) + result[i - l - 1]
for i in xrange(l, r + 1):
####
####
#print "\ni: %s" % i
[char, length, index] = blocks[i]
#p1 <- previous block of the same character
p1_idx = chars[char][index]['prev']
####
####
#print "(p1_idx, l, p1_idx >= l): (%s, %s, %s)" % (p1_idx, l, p1_idx >= l)
if p1_idx < l and index > l:
result[index - l] = parse(length) + result[index - l - 1]
while p1_idx >= l:
p1 = blocks[p1_idx]
####
####
#print "(b, p1, p1_idx, l): (%s, %s, %s, %s)\n" % (blocks[i], p1, p1_idx, l)
between = g(p1[2] + 1, index - 1)
####
####
#print "between: %s" % between
#if b and p1 can combine:
if between == 0:
if length + p1[1] > 2:
result[index - l] = min(
result[index - l],
# don't combine
parse(length) + (result[index - l - 1] if index - l > 0 else 0),
# combine: f(block before p1)
result[p1[2] - l - 1] if p1[2] > l else 0
)
# b.length + p1.length < 3
else:
#p2 <- block previous to p1 of the same character
p2_idx = chars[char][p1[2]]['prev']
if p2_idx < l:
p1_idx = chars[char][p1_idx]['prev']
continue
between2 = g(p2_idx + 1, p1[2] - 1)
#if p1 and p2 can combine:
if between2 == 0:
result[index - l] = min(
result[index - l],
# don't combine
parse(length) + (result[index - l - 1] if index - l > 0 else 0),
# combine the block, p1 and p2
result[p2_idx - l - 1] if p2_idx - l > 0 else 0
)
else:
#f(b) = b.length + f(block before b)
result[index - l] = min(
result[index - l],
parse(length) + (result[index - l - 1] if index - l > 0 else 0)
)
# b and p1 cannot combine
else:
#f(b) = b.length + f(block before b)
result[index - l] = min(
result[index - l],
parse(length) + (result[index - l - 1] if index - l > 0 else 0)
)
p1_idx = chars[char][p1_idx]['prev']
#print l,r,result
memo[l][r] = result[r - l]
"""if top_level:
return (result, blocks)
else:"""
return result[r - l]
if len(blocks) == 1:
return ([parse(blocks[0][1])], blocks)
else:
return g(0, len(blocks) - 1, True)
"""s = ""
for i in xrange(300):
s = s + ['A','B','C'][random.randint(0,2)]"""
print f("abcccbcccbacccab") # b
print
print f("AAAABBBAC"); # C
print
print f("CAAAABBBA"); # C
print
print f("AABBAAAC"); # AABBC
print
print f("BAABCCCBBA"); # B
print
print f("aaaa")
print
The string answers for these longer examples were computed using jdehesa's answer:
t0 = time.time()
print f("BCBCCBCCBCABBACCBABAABBBABBBACCBBBAABBACBCCCACABBCAABACBBBBCCCBBAACBAABACCBBCBBAABCCCCCAABBBBACBBAAACACCBCCBBBCCCCCCCACBABACCABBCBBBBBCBABABBACCAACBCBBAACBBBBBCCBABACBBABABAAABCCBBBAACBCACBAABAAAABABB")
# BCBCCBCCBCABBACCBABCCAABBACBACABBCAABACAACBAABACCBBCBBCACCBACBABACCABBCCBABABBACCAACBCBBAABABACBBABABBCCAACBCACBAABBABB
t1 = time.time()
total = t1-t0
print total
t0 = time.time()
print f("CBBACAAAAABBBBCAABBCBAABBBCBCBCACACBAABCBACBBABCABACCCCBACBCBBCBACBBACCCBAAAACACCABAACCACCBCBCABAACAABACBABACBCBAACACCBCBCCCABACABBCABBAAAAABBBBAABAABBCACACABBCBCBCACCCBABCAACBCAAAABCBCABACBABCABCBBBBABCBACABABABCCCBBCCBBCCBAAABCABBAAABBCAAABCCBAABAABCAACCCABBCAABCBCBCBBAACCBBBACBBBCABAABCABABABABCA")
# CBBACCAABBCBAACBCBCACACBAABCBACBBABCABABACBCBBCBACBBABCACCABAACCACCBCBCABAACAABACBABACBCBAACACCBCBABACABBCBBCACACABBCBCBCABABCAACBCBCBCABACBABCABCABCBACABABACCBBCCBBCACBCCBAABAABCBBCAABCBCBCBBAACCACCABAABCABABABABCA
t1 = time.time()
total = t1-t0
print total
t0 = time.time()
print f("AADBDBEBBBBCABCEBCDBBBBABABDCCBCEBABADDCABEEECCECCCADDACCEEAAACCABBECBAEDCEEBDDDBAAAECCBBCEECBAEBEEEECBEEBDACDDABEEABEEEECBABEDDABCDECDAABDAEADEECECEBCBDDAEEECCEEACCBBEACDDDDBDBCCAAECBEDAAAADBEADBAAECBDEACDEABABEBCABDCEEAABABABECDECADCEDAEEEBBBCEDECBCABDEDEBBBABABEEBDAEADBEDABCAEABCCBCCEDCBBEBCECCCA")
# AADBDBECABCEBCDABABDCCBCEBABADDCABCCEADDACCEECCABBECBAEDCEEBBECCBBCEECBAEBCBEEBDACDDABEEABCBABEDDABCDECDAABDAEADEECECEBCBDDACCEEACCBBEACBDBCCAAECBEDDBEADBAAECBDEACDEABABEBCABDCEEAABABABECDECADCEDACEDECBCABDEDEABABEEBDAEADBEDABCAEABCCBCCEDCBBEBCEA
t1 = time.time()
total = t1-t0
print total

Another scala answer, using memoization and tailcall optimization (partly) (updated).
import scala.collection.mutable.HashSet
import scala.annotation._
object StringCondense extends App {
#tailrec
def groupConsecutive (s: String, sofar: List[String]): List[String] = s.toList match {
// def groupConsecutive (s: String): List[String] = s.toList match {
case Nil => sofar
// case Nil => Nil
case c :: str => {
val (prefix, rest) = (c :: str).span (_ == c)
// Strings of equal characters, longer than 3, don't make a difference to just 3
groupConsecutive (rest.mkString(""), (prefix.take (3)).mkString ("") :: sofar)
// (prefix.take (3)).mkString ("") :: groupConsecutive (rest.mkString(""))
}
}
// to count the effect of memoization
var count = 0
// recursively try to eliminate every group of 3 or more, brute forcing
// but for "aabbaabbaaabbbaabb", many reductions will lead sooner or
// later to the same result, so we try to detect these and avoid duplicate
// work
def moreThan2consecutive (s: String, seenbefore: HashSet [String]): String = {
if (seenbefore.contains (s)) s
else
{
count += 1
seenbefore += s
val sublists = groupConsecutive (s, Nil)
// val sublists = groupConsecutive (s)
val atLeast3 = sublists.filter (_.size > 2)
atLeast3.length match {
case 0 => s
case 1 => {
val res = sublists.filter (_.size < 3)
moreThan2consecutive (res.mkString (""), seenbefore)
}
case _ => {
val shrinked = (
for {idx <- (0 until sublists.size)
if (sublists (idx).length >= 3)
pre = (sublists.take (idx)).mkString ("")
post= (sublists.drop (idx+1)).mkString ("")
} yield {
moreThan2consecutive (pre + post, seenbefore)
}
)
(shrinked.head /: shrinked.tail) ((a, b) => if (a.length <= b.length) a else b)
}
}
}
}
// don't know what Rcd means, adopted from other solution but modified
// kind of a unit test **update**: forgot to reset count
testRcd (s: String, expected: String) : Boolean = {
count = 0
val seenbefore = HashSet [String] ()
val result = moreThan2consecutive (s, seenbefore)
val hit = result.equals (expected)
println (s"Input: $s\t result: ${result}\t expected ${expected}\t $hit\t count: $count");
hit
}
// some test values from other users with expected result
// **upd:** more testcases
def testgroup () : Unit = {
testRcd ("baabcccbba", "b")
testRcd ("aabbaaac", "aabbc")
testRcd ("aaaa", "")
testRcd ("aaaabbbac", "c")
testRcd ("abcccbcccbacccab", "b")
testRcd ("AAAABBBAC", "C")
testRcd ("CAAAABBBA", "C")
testRcd ("AABBAAAC", "AABBC")
testRcd ("BAABCCCBBA", "B")
testRcd ("AAABBBAAABBBAAABBBC", "C") // 377 subcalls reported by Yola,
testRcd ("AAABBBAAABBBAAABBBAAABBBC", "C") // 4913 when preceeded with AAABBB
}
testgroup
def testBigs () : Unit = {
/*
testRcd ("BCBCCBCCBCABBACCBABAABBBABBBACCBBBAABBACBCCCACABBCAABACBBBBCCCBBAACBAABACCBBCBBAABCCCCCAABBBBACBBAAACACCBCCBBBCCCCCCCACBABACCABBCBBBBBCBABABBACCAACBCBBAACBBBBBCCBABACBBABABAAABCCBBBAACBCACBAABAAAABABB",
"BCBCCBCCBCABBACCBABCCAABBACBACABBCAABACAACBAABACCBBCBBCACCBACBABACCABBCCBABABBACCAACBCBBAABABACBBABABBCCAACBCACBAABBABB")
*/
testRcd ("CBBACAAAAABBBBCAABBCBAABBBCBCBCACACBAABCBACBBABCABACCCCBACBCBBCBACBBACCCBAAAACACCABAACCACCBCBCABAACAABACBABACBCBAACACCBCBCCCABACABBCABBAAAAABBBBAABAABBCACACABBCBCBCACCCBABCAACBCAAAABCBCABACBABCABCBBBBABCBACABABABCCCBBCCBBCCBAAABCABBAAABBCAAABCCBAABAABCAACCCABBCAABCBCBCBBAACCBBBACBBBCABAABCABABABABCA",
"CBBACCAABBCBAACBCBCACACBAABCBACBBABCABABACBCBBCBACBBABCACCABAACCACCBCBCABAACAABACBABACBCBAACACCBCBABACABBCBBCACACABBCBCBCABABCAACBCBCBCABACBABCABCABCBACABABACCBBCCBBCACBCCBAABAABCBBCAABCBCBCBBAACCACCABAABCABABABABCA")
/*testRcd ("AADBDBEBBBBCABCEBCDBBBBABABDCCBCEBABADDCABEEECCECCCADDACCEEAAACCABBECBAEDCEEBDDDBAAAECCBBCEECBAEBEEEECBEEBDACDDABEEABEEEECBABEDDABCDECDAABDAEADEECECEBCBDDAEEECCEEACCBBEACDDDDBDBCCAAECBEDAAAADBEADBAAECBDEACDEABABEBCABDCEEAABABABECDECADCEDAEEEBBBCEDECBCABDEDEBBBABABEEBDAEADBEDABCAEABCCBCCEDCBBEBCECCCA",
"AADBDBECABCEBCDABABDCCBCEBABADDCABCCEADDACCEECCABBECBAEDCEEBBECCBBCEECBAEBCBEEBDACDDABEEABCBABEDDABCDECDAABDAEADEECECEBCBDDACCEEACCBBEACBDBCCAAECBEDDBEADBAAECBDEACDEABABEBCABDCEEAABABABECDECADCEDACEDECBCABDEDEABABEEBDAEADBEDABCAEABCCBCCEDCBBEBCEA")
*/
}
// for generated input, but with fixed seed, to compare the count with
// and without memoization
import util.Random
val r = new Random (31415)
// generate Strings but with high chances to produce some triples and
// longer sequences of char clones
def genRandomString () : String = {
(1 to 20).map (_ => r.nextInt (6) match {
case 0 => "t"
case 1 => "r"
case 2 => "-"
case 3 => "tt"
case 4 => "rr"
case 5 => "--"
}).mkString ("")
}
def testRandom () : Unit = {
(1 to 10).map (i=> testRcd (genRandomString, "random mode - false might be true"))
}
testRandom
testgroup
testRandom
// testBigs
}
Comparing the effect of memoization lead to interesting results:
Updated measurements. In the old values, I forgot to reset the counter, which leaded to much higher results. Now the spreading of results
is much more impressive and in total, the values are smaller.
No seenbefore:
Input: baabcccbba result: b expected b true count: 4
Input: aabbaaac result: aabbc expected aabbc true count: 2
Input: aaaa result: expected true count: 2
Input: aaaabbbac result: c expected c true count: 5
Input: abcccbcccbacccab result: b expected b true count: 34
Input: AAAABBBAC result: C expected C true count: 5
Input: CAAAABBBA result: C expected C true count: 5
Input: AABBAAAC result: AABBC expected AABBC true count: 2
Input: BAABCCCBBA result: B expected B true count: 4
Input: AAABBBAAABBBAAABBBC res: C expected C true count: 377
Input: AAABBBAAABBBAAABBBAAABBBC r: C expected C true count: 4913
Input: r--t----ttrrrrrr--tttrtttt--rr----result: rr--rr expected ? unknown ? false count: 1959
Input: ttrtt----tr---rrrtttttttrtr--rr result: r--rr expected ? unknown ? false count: 213
Input: tt----r-----ttrr----ttrr-rr--rr-- result: ttrttrrttrr-rr--rr-- ex ? unknown ? false count: 16
Input: --rr---rrrrrrr-r--rr-r--tt--rrrrr result: rr-r--tt-- expected ? unknown ? false count: 32
Input: tt-rrrrr--r--tt--rrtrrr------- result: ttr--tt--rrt expected ? unknown ? false count: 35
Input: --t-ttt-ttt--rrrrrt-rrtrttrr result: --tt-rrtrttrr expected ? unknown ? false count: 35
Input: rrt--rrrr----trrr-rttttrrtttrr result: rrtt- expected ? unknown ? false count: 1310
Input: ---tttrrrrrttrrttrr---tt-----tt result: rrttrr expected ? unknown ? false count: 1011
Input: -rrtt--rrtt---t-r--r---rttr-- result: -rrtt--rr-r--rrttr-- ex ? unknown ? false count: 9
Input: rtttt--rrrrrrrt-rrttt--tt--t result: r--t-rr--tt--t expectd ? unknown ? false count: 16
real 0m0.607s (without testBigs)
user 0m1.276s
sys 0m0.056s
With seenbefore:
Input: baabcccbba result: b expected b true count: 4
Input: aabbaaac result: aabbc expected aabbc true count: 2
Input: aaaa result: expected true count: 2
Input: aaaabbbac result: c expected c true count: 5
Input: abcccbcccbacccab result: b expected b true count: 11
Input: AAAABBBAC result: C expected C true count: 5
Input: CAAAABBBA result: C expected C true count: 5
Input: AABBAAAC result: AABBC expected AABBC true count: 2
Input: BAABCCCBBA result: B expected B true count: 4
Input: AAABBBAAABBBAAABBBC rest: C expected C true count: 28
Input: AAABBBAAABBBAAABBBAAABBBC C expected C true count: 52
Input: r--t----ttrrrrrr--tttrtttt--rr----result: rr--rr expected ? unknown ? false count: 63
Input: ttrtt----tr---rrrtttttttrtr--rr result: r--rr expected ? unknown ? false count: 48
Input: tt----r-----ttrr----ttrr-rr--rr-- result: ttrttrrttrr-rr--rr-- xpe? unknown ? false count: 8
Input: --rr---rrrrrrr-r--rr-r--tt--rrrrr result: rr-r--tt-- expected ? unknown ? false count: 19
Input: tt-rrrrr--r--tt--rrtrrr------- result: ttr--tt--rrt expected ? unknown ? false count: 12
Input: --t-ttt-ttt--rrrrrt-rrtrttrr result: --tt-rrtrttrr expected ? unknown ? false count: 16
Input: rrt--rrrr----trrr-rttttrrtttrr result: rrtt- expected ? unknown ? false count: 133
Input: ---tttrrrrrttrrttrr---tt-----tt result: rrttrr expected ? unknown ? false count: 89
Input: -rrtt--rrtt---t-r--r---rttr-- result: -rrtt--rr-r--rrttr-- ex ? unknown ? false count: 6
Input: rtttt--rrrrrrrt-rrttt--tt--t result: r--t-rr--tt--t expected ? unknown ? false count: 8
real 0m0.474s (without testBigs)
user 0m0.852s
sys 0m0.060s
With tailcall:
real 0m0.478s (without testBigs)
user 0m0.860s
sys 0m0.060s
For some random strings, the difference is bigger than a 10fold.
For long Strings with many groups one could, as an improvement, eliminate all groups which are the only group of that character, for instance:
aa bbb aa ccc xx ddd aa eee aa fff xx
The groups bbb, ccc, ddd, eee and fff are unique in the string, so they can't fit to something else and could all be eliminated, and the order of removal is will not matter. This would lead to the intermediate result
aaaa xx aaaa xx
and a fast solution. Maybe I try to implement it too. However, I guess, it will be possible to produce random Strings, where this will have a big impact and by a different form of random generated strings, to distributions, where the impact is low.

Here is a Python solution (function reduce_min), not particularly smart but I think fairly easy to understand (excessive amount of comments added for answer clarity):
def reductions(s, min_len):
"""
Yields every possible reduction of s by eliminating contiguous blocks
of l or more repeated characters.
For example, reductions('AAABBCCCCBAAC', 3) yields
'BBCCCCBAAC' and 'AAABBBAAC'.
"""
# Current character
curr = ''
# Length of current block
n = 0
# Start position of current block
idx = 0
# For each character
for i, c in enumerate(s):
if c != curr:
# New block begins
if n >= min_len:
# If previous block was long enough
# yield reduced string without it
yield s[:idx] + s[i:]
# Start new block
curr = c
n = 1
idx = i
else:
# Still in the same block
n += 1
# Yield reduction without last block if it was long enough
if n >= min_len:
yield s[:idx]
def reduce_min(s, min_len):
"""
Finds the smallest possible reduction of s by successive
elimination of contiguous blocks of min_len or more repeated
characters.
"""
# Current set of possible reductions
rs = set([s])
# Current best solution
result = s
# While there are strings to reduce
while rs:
# Get one element
r = rs.pop()
# Find reductions
r_red = list(reductions(r, min_len))
# If no reductions are found it is irreducible
if len(r_red) == 0 and len(r) < len(result):
# Replace if shorter than current best
result = r
else:
# Save reductions for next iterations
rs.update(r_red)
return result
assert reduce_min("BAABCCCBBA", 3) == "B"
assert reduce_min("AABBAAAC", 3) == "AABBC"
assert reduce_min("AAAA", 3) == ""
assert reduce_min("AAAABBBAC", 3) == "C"
EDIT: Since people seem to be posting C++ solutions, here is mine in C++ (again, function reduce_min):
#include <string>
#include <vector>
#include <unordered_set>
#include <iterator>
#include <utility>
#include <cassert>
using namespace std;
void reductions(const string &s, unsigned int min_len, vector<string> &rs)
{
char curr = '\0';
unsigned int n = 0;
unsigned int idx = 0;
for (auto it = s.begin(); it != s.end(); ++it)
{
if (curr != *it)
{
auto i = distance(s.begin(), it);
if (n >= min_len)
{
rs.push_back(s.substr(0, idx) + s.substr(i));
}
curr = *it;
n = 1;
idx = i;
}
else
{
n += 1;
}
}
if (n >= min_len)
{
rs.push_back(s.substr(0, idx));
}
}
string reduce_min(const string &s, unsigned int min_len)
{
unordered_set<string> rs { s };
string result = s;
vector<string> rs_new;
while (!rs.empty())
{
auto it = rs.begin();
auto r = *it;
rs.erase(it);
rs_new.clear();
reductions(r, min_len, rs_new);
if (rs_new.empty() && r.size() < result.size())
{
result = move(r);
}
else
{
rs.insert(rs_new.begin(), rs_new.end());
}
}
return result;
}
int main(int argc, char **argv)
{
assert(reduce_min("BAABCCCBBA", 3) == "B");
assert(reduce_min("AABBAAAC", 3) == "AABBC");
assert(reduce_min("AAAA", 3) == "");
assert(reduce_min("AAAABBBAC", 3) == "C");
return 0;
}
If you can use C++17 you can save memory by using string views.
EDIT 2: About the complexity of the algorithm. It is not straightforward to figure out, and as I said the algorithm is meant to be simple more than anything, but let's see. In the end, it is more or less the same as a breadth-first search. Let's say the string length is n, and, for generality, let's say the minimum block length (value 3 in the question) is m. In the first level, we can generate up to n / m reductions in the worst case. For each of these, we can generate up to (n - m) / m reductions, and so on. So basically, at "level" i (loop iteration i) we create up to (n - i * m) / m reductions per string we had, and each of these will take O(n - i * m) time to process. The maximum number of levels we can have is, again, n / m. So the complexity of the algorithm (if I'm not making mistakes) should have the form:
O( sum {i = 0 .. n / m} ( O(n - i * m) * prod {j = 0 .. i} ((n - i * m) / m) ))
|-Outer iters--| |---Cost---| |-Prev lvl-| |---Branching---|
Whew. So this should be something like:
O( sum {i = 0 .. n / m} (n - i * m) * O(n^i / m^i) )
Which in turn would collapse to:
O((n / m)^(n / m))
So yeah, the algorithm is more or less simple, but it can run into exponential cost cases (the bad cases would be strings made entirely of exactly m-long blocks, like AAABBBCCCAAACCC... for m = 3).

Related

Tail recursive solution in Scala for Linked-List chaining

I wanted to write a tail-recursive solution for the following problem on Leetcode -
You are given two non-empty linked lists representing two non-negative integers. The digits are stored in reverse order and each of their nodes contains a single digit. Add the two numbers and return it as a linked list.
You may assume the two numbers do not contain any leading zero, except the number 0 itself.
Example:
*Input: (2 -> 4 -> 3) + (5 -> 6 -> 4)*
*Output: 7 -> 0 -> 8*
*Explanation: 342 + 465 = 807.*
Link to the problem on Leetcode
I was not able to figure out a way to call the recursive function in the last line.
What I am trying to achieve here is the recursive calling of the add function that adds the heads of the two lists with a carry and returns a node. The returned node is chained with the node in the calling stack.
I am pretty new to scala, I am guessing I may have missed some useful constructs.
/**
* Definition for singly-linked list.
* class ListNode(_x: Int = 0, _next: ListNode = null) {
* var next: ListNode = _next
* var x: Int = _x
* }
*/
import scala.annotation.tailrec
object Solution {
def addTwoNumbers(l1: ListNode, l2: ListNode): ListNode = {
add(l1, l2, 0)
}
//#tailrec
def add(l1: ListNode, l2: ListNode, carry: Int): ListNode = {
var sum = 0;
sum = (if(l1!=null) l1.x else 0) + (if(l2!=null) l2.x else 0) + carry;
if(l1 != null || l2 != null || sum > 0)
ListNode(sum%10,add(if(l1!=null) l1.next else null, if(l2!=null) l2.next else null,sum/10))
else null;
}
}
You have a couple of problems, which can mostly be reduced as being not idiomatic.
Things like var and null are not common in Scala and usually, you would use a tail-recursive algorithm to avoid that kind of things.
Finally, remember that a tail-recursive algorithm requires that the last expression is either a plain value or a recursive call. For doing that, you usually keep track of the remaining job as well as an accumulator.
Here is a possible solution:
type Digit = Int // Refined [0..9]
type Number = List[Digit] // Refined NonEmpty.
def sum(n1: Number, n2: Number): Number = {
def aux(d1: Digit, d2: Digit, carry: Digit): (Digit, Digit) = {
val tmp = d1 + d2 + carry
val d = tmp % 10
val c = tmp / 10
d -> c
}
#annotation.tailrec
def loop(r1: Number, r2: Number, acc: Number, carry: Digit): Number =
(r1, r2) match {
case (d1 :: tail1, d2 :: tail2) =>
val (d, c) = aux(d1, d2, carry)
loop(r1 = tail1, r2 = tail2, d :: acc, carry = c)
case (Nil, d2 :: tail2) =>
val (d, c) = aux(d1 = 0, d2, carry)
loop(r1 = Nil, r2 = tail2, d :: acc, carry = c)
case (d1 :: tail1, Nil) =>
val (d, c) = aux(d1, d2 = 0, carry)
loop(r1 = tail1, r2 = Nil, d :: acc, carry = c)
case (Nil, Nil) =>
acc
}
loop(r1 = n1, r2 = n2, acc = List.empty, carry = 0).reverse
}
Now, this kind of recursions tends to be very verbose.
Usually, the stdlib provide ways to make this same algorithm more concise:
// This is a solution that do not require the numbers to be already reversed and the output is also in the correct order.
def sum(n1: Number, n2: Number): Number = {
val (result, carry) = n1.reverseIterator.zipAll(n2.reverseIterator, 0, 0).foldLeft(List.empty[Digit] -> 0) {
case ((acc, carry), (d1, d2)) =>
val tmp = d1 + d2 + carry
val d = tmp % 10
val c = tmp / 10
(d :: acc) -> c
}
if (carry > 0) carry :: result else result
}
Scala is less popular on LeetCode, but this Solution (which is not the best) would get accepted by LeetCode's online judge:
import scala.collection.mutable._
object Solution {
def addTwoNumbers(listA: ListNode, listB: ListNode): ListNode = {
var tempBufferA: ListBuffer[Int] = ListBuffer.empty
var tempBufferB: ListBuffer[Int] = ListBuffer.empty
tempBufferA.clear()
tempBufferB.clear()
def listTraversalA(listA: ListNode): ListBuffer[Int] = {
if (listA == null) {
return tempBufferA
} else {
tempBufferA += listA.x
listTraversalA(listA.next)
}
}
def listTraversalB(listB: ListNode): ListBuffer[Int] = {
if (listB == null) {
return tempBufferB
} else {
tempBufferB += listB.x
listTraversalB(listB.next)
}
}
val resultA: ListBuffer[Int] = listTraversalA(listA)
val resultB: ListBuffer[Int] = listTraversalB(listB)
val resultSum: BigInt = BigInt(resultA.reverse.mkString) + BigInt(resultB.reverse.mkString)
var listNodeResult: ListBuffer[ListNode] = ListBuffer.empty
val resultList = resultSum.toString.toList
var lastListNode: ListNode = null
for (i <-0 until resultList.size) {
if (i == 0) {
lastListNode = new ListNode(resultList(i).toString.toInt)
listNodeResult += lastListNode
} else {
lastListNode = new ListNode(resultList(i).toString.toInt, lastListNode)
listNodeResult += lastListNode
}
}
return listNodeResult.reverse(0)
}
}
References
For additional details, you can see the Discussion Board. There are plenty of accepted solutions, explanations, efficient algorithms with a variety of languages, and time/space complexity analysis in there.

how to match dna sequence pattern

I am getting a trouble finding an approach to solve this problem.
Input-output sequences are as follows :
**input1 :** aaagctgctagag
**output1 :** a3gct2ag2
**input2 :** aaaaaaagctaagctaag
**output2 :** a6agcta2ag
Input nsequence can be of 10^6 characters and largest continuous patterns will be considered.
For example for input2 "agctaagcta" output will not be "agcta2gcta" but it will be "agcta2".
Any help appreciated.
Explanation of the algorithm:
Having a sequence S with symbols s(1), s(2),…, s(N).
Let B(i) be the best compressed sequence with elements s(1), s(2),…,s(i).
So, for example, B(3) will be the best compressed sequence for s(1), s(2), s(3).
What we want to know is B(N).
To find it, we will proceed by induction. We want to calculate B(i+1), knowing B(i), B(i-1), B(i-2), …, B(1), B(0), where B(0) is empty sequence, and and B(1) = s(1). At the same time, this constitutes a proof that the solution is optimal. ;)
To calculate B(i+1), we will pick the best sequence among the candidates:
Candidate sequences where the last block has one element:
B(i )s(i+1)1
B(i-1)s(i+1)2 ; only if s(i) = s(i+1)
B(i-2)s(i+1)3 ; only if s(i-1) = s(i) and s(i) = s(i+1)
…
B(1)s(i+1)[i-1] ; only if s(2)=s(3) and s(3)=s(4) and … and s(i) = s(i+1)
B(0)s(i+1)i = s(i+1)i ; only if s(1)=s(2) and s(2)=s(3) and … and s(i) = s(i+1)
Candidate sequences where the last block has 2 elements:
B(i-1)s(i)s(i+1)1
B(i-3)s(i)s(i+1)2 ; only if s(i-2)s(i-1)=s(i)s(i+1)
B(i-5)s(i)s(i+1)3 ; only if s(i-4)s(i-3)=s(i-2)s(i-1) and s(i-2)s(i-1)=s(i)s(i+1)
…
Candidate sequences where the last block has 3 elements:
…
Candidate sequences where the last block has 4 elements:
…
…
Candidate sequences where last block has n+1 elements:
s(1)s(2)s(3)………s(i+1)
For each possibility, the algorithm stops when the sequence block is no longer repeated. And that’s it.
The algorithm will be some thing like this in psude-c code:
B(0) = “”
for (i=1; i<=N; i++) {
// Calculate all the candidates for B(i)
BestCandidate=null
for (j=1; j<=i; j++) {
Calculate all the candidates of length (i)
r=1;
do {
Candidadte = B([i-j]*r-1) s(i-j+1)…s(i-1)s(i) r
If ( (BestCandidate==null)
|| (Candidate is shorter that BestCandidate))
{
BestCandidate=Candidate.
}
r++;
} while ( ([i-j]*r <= i)
&&(s(i-j*r+1) s(i-j*r+2)…s(i-j*r+j) == s(i-j+1) s(i-j+2)…s(i-j+j))
}
B(i)=BestCandidate
}
Hope that this can help a little more.
The full C program performing the required task is given below. It runs in O(n^2). The central part is only 30 lines of code.
EDIT I have restructured a little bit the code, changed the names of the variables and added some comment in order to be more readable.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <limits.h>
// This struct represents a compressed segment like atg4, g3, agc1
struct Segment {
char *elements;
int nElements;
int count;
};
// As an example, for the segment agagt3 elements would be:
// {
// elements: "agagt",
// nElements: 5,
// count: 3
// }
struct Sequence {
struct Segment lastSegment;
struct Sequence *prev; // Points to a sequence without the last segment or NULL if it is the first segment
int totalLen; // Total length of the compressed sequence.
};
// as an example, for the sequence agt32ta5, the representation will be:
// {
// lastSegment:{"ta" , 2 , 5},
// prev: #A,
// totalLen: 8
// }
// and A will be
// {
// lastSegment{ "agt", 3, 32},
// prev: NULL,
// totalLen: 5
// }
// This function converts a sequence to a string.
// You have to free the string after using it.
// The strategy is to construct the string from right to left.
char *sequence2string(struct Sequence *S) {
char *Res=malloc(S->totalLen + 1);
char *digits="0123456789";
int p= S->totalLen;
Res[p]=0;
while (S!=NULL) {
// first we insert the count of the last element.
// We do digit by digit starting with the units.
int C = S->lastSegment.count;
while (C) {
p--;
Res[p] = digits[ C % 10 ];
C /= 10;
}
p -= S->lastSegment.nElements;
strncpy(Res + p , S->lastSegment.elements, S->lastSegment.nElements);
S = S ->prev;
}
return Res;
}
// Compresses a dna sequence.
// Returns a string with the in sequence compressed.
// The returned string must be freed after using it.
char *dnaCompress(char *in) {
int i,j;
int N = strlen(in);; // Number of elements of a in sequence.
// B is an array of N+1 sequences where B(i) is the best compressed sequence sequence of the first i characters.
// What we want to return is B[N];
struct Sequence *B;
B = malloc((N+1) * sizeof (struct Sequence));
// We first do an initialization for i=0
B[0].lastSegment.elements="";
B[0].lastSegment.nElements=0;
B[0].lastSegment.count=0;
B[0].prev = NULL;
B[0].totalLen=0;
// and set totalLen of all the sequences to a very HIGH VALUE in this case N*2 will be enougth, We will try different sequences and keep the minimum one.
for (i=1; i<=N; i++) B[i].totalLen = INT_MAX; // A very high value
for (i=1; i<=N; i++) {
// at this point we want to calculate B[i] and we know B[i-1], B[i-2], .... ,B[0]
for (j=1; j<=i; j++) {
// Here we will check all the candidates where the last segment has j elements
int r=1; // number of times the last segment is repeated
int rNDigits=1; // Number of digits of r
int rNDigitsBound=10; // We will increment r, so this value is when r will have an extra digit.
// when r = 0,1,...,9 => rNDigitsBound = 10
// when r = 10,11,...,99 => rNDigitsBound = 100
// when r = 100,101,.,999 => rNDigitsBound = 1000 and so on.
do {
// Here we analitze a candidate B(i).
// where the las segment has j elements repeated r times.
int CandidateLen = B[i-j*r].totalLen + j + rNDigits;
if (CandidateLen < B[i].totalLen) {
B[i].lastSegment.elements = in + i - j*r;
B[i].lastSegment.nElements = j;
B[i].lastSegment.count = r;
B[i].prev = &(B[i-j*r]);
B[i].totalLen = CandidateLen;
}
r++;
if (r == rNDigitsBound ) {
rNDigits++;
rNDigitsBound *= 10;
}
} while ( (i - j*r >= 0)
&& (strncmp(in + i -j, in + i - j*r, j)==0));
}
}
char *Res=sequence2string(&(B[N]));
free(B);
return Res;
}
int main(int argc, char** argv) {
char *compressedDNA=dnaCompress(argv[1]);
puts(compressedDNA);
free(compressedDNA);
return 0;
}
Forget Ukonnen. Dynamic programming it is. With 3-dimensional table:
sequence position
subsequence size
number of segments
TERMINOLOGY: For example, having a = "aaagctgctagag", sequence position coordinate would run from 1 to 13. At sequence position 3 (letter 'g'), having subsequence size 4, the subsequence would be "gctg". Understood? And as for the number of segments, then expressing a as "aaagctgctagag1" consists of 1 segment (the sequence itself). Expressing it as "a3gct2ag2" consists of 3 segments. "aaagctgct1ag2" consists of 2 segments. "a2a1ctg2ag2" would consist of 4 segments. Understood? Now, with this, you start filling a 3-dimensional array 13 x 13 x 13, so your time and memory complexity seems to be around n ** 3 for this. Are you sure you can handle it for million-bp sequences? I think that greedy approach would be better, because large DNA sequences are unlikely to repeat exactly. And, I would suggest that you widen your assignment to approximate matches, and you can publish it straight in a journal.
Anyway, you will start filling the table of compressing a subsequence starting at some position (dimension 1) with length equal to dimension 2 coordinate, having at most dimension 3 segments. So you first fill the first row, representing compressions of subsequences of length 1 consisting of at most 1 segment:
a a a g c t g c t a g a g
1(a1) 1(a1) 1(a1) 1(g1) 1(c1) 1(t1) 1(g1) 1(c1) 1(t1) 1(a1) 1(g1) 1(a1) 1(g1)
The number is the character cost (always 1 for these trivial 1-char sequences; number 1 does not count into the character cost), and in the parenthesis, you have the compression (also trivial for this simple case). The second row will be still simple:
2(a2) 2(a2) 2(ag1) 2(gc1) 2(ct1) 2(tg1) 2(gc1) 2(ct1) 2(ta1) 2(ag1) 2(ga1) 2(ag1)
There is only 1 way to decompose a 2-character sequence into 2 subsequences -- 1 character + 1 character. If they are identical, the result is like a + a = a2. If they are different, such as a + g, then, because only 1-segment sequences are admissible, the result cannot be a1g1, but must be ag1. The third row will be finally more interesting:
2(a3) 2(aag1) 3(agc1) 3(gct1) 3(ctg1) 3(tgc1) 3(gct1) 3(cta1) 3(tag1) 3(aga1) 3(gag1)
Here, you can always choose between 2 ways of composing the compressed string. For example, aag can be composed either as aa + g or a + ag. But again, we cannot have 2 segments, as in aa1g1 or a1ag1, so we must be satisfied with aag1, unless both components consist of the same character, as in aa + a => a3, with character cost 2. We can continue onto 4 th line:
4(aaag1) 4(aagc1) 4(agct1) 4(gctg1) 4(ctgc1) 4(tgct1) 4(gcta1) 4(ctag1) 4(taga1) 3(ag2)
Here, on the first position, we cannot use a3g1, because only 1 segment is allowed at this layer. But at the last position, compression to character cost 3 is agchieved by ag1 + ag1 = ag2. This way, one can fill the whole first-level table all the way up to the single subsequence of 13 characters, and each subsequence will have its optimal character cost and its compression under the first-level constraint of at most 1 segment associated with it.
Then you go to the 2nd level, where 2 segments are allowed... And again, from the bottom up, you identify the optimum cost and compression of each table coordinate under the given level's segment count constraint, by comparing all the possible ways to compose the subsequence using already computed positions, until you fill the table completely and thus compute the global optimum. There are some details to solve, but sorry, I'm not gonna code this for you.
After trying my own way for a while, my kudos to jbaylina for his beautiful algorithm and C implementation. Here's my attempted version of jbaylina's algorithm in Haskell, and below it further development of my attempt at a linear-time algorithm that attempts to compress segments that include repeated patterns in a one-by-one fashion:
import Data.Map (fromList, insert, size, (!))
compress s = (foldl f (fromList [(0,([],0)),(1,([s!!0],1))]) [1..n - 1]) ! n
where
n = length s
f b i = insert (size b) bestCandidate b where
add (sequence, sLength) (sequence', sLength') =
(sequence ++ sequence', sLength + sLength')
j' = [1..min 100 i]
bestCandidate = foldr combCandidates (b!i `add` ([s!!i,'1'],2)) j'
combCandidates j candidate' =
let nextCandidate' = comb 2 (b!(i - j + 1)
`add` ((take j . drop (i - j + 1) $ s) ++ "1", j + 1))
in if snd nextCandidate' <= snd candidate'
then nextCandidate'
else candidate' where
comb r candidate
| r > uBound = candidate
| not (strcmp r True) = candidate
| snd nextCandidate <= snd candidate = comb (r + 1) nextCandidate
| otherwise = comb (r + 1) candidate
where
uBound = div (i + 1) j
prev = b!(i - r * j + 1)
nextCandidate = prev `add`
((take j . drop (i - j + 1) $ s) ++ show r, j + length (show r))
strcmp 1 _ = True
strcmp num bool
| (take j . drop (i - num * j + 1) $ s)
== (take j . drop (i - (num - 1) * j + 1) $ s) =
strcmp (num - 1) True
| otherwise = False
Output:
*Main> compress "aaagctgctagag"
("a3gct2ag2",9)
*Main> compress "aaabbbaaabbbaaabbbaaabbb"
("aaabbb4",7)
Linear-time attempt:
import Data.List (sortBy)
group' xxs sAccum (chr, count)
| null xxs = if null chr
then singles
else if count <= 2
then reverse sAccum ++ multiples ++ "1"
else singles ++ if null chr then [] else chr ++ show count
| [x] == chr = group' xs sAccum (chr,count + 1)
| otherwise = if null chr
then group' xs (sAccum) ([x],1)
else if count <= 2
then group' xs (multiples ++ sAccum) ([x],1)
else singles
++ chr ++ show count ++ group' xs [] ([x],1)
where x:xs = xxs
singles = reverse sAccum ++ (if null sAccum then [] else "1")
multiples = concat (replicate count chr)
sequences ws strIndex maxSeqLen = repeated' where
half = if null . drop (2 * maxSeqLen - 1) $ ws
then div (length ws) 2 else maxSeqLen
repeated' = let (sequence,(sequenceStart, sequenceEnd'),notSinglesFlag) = repeated
in (sequence,(sequenceStart, sequenceEnd'))
repeated = foldr divide ([],(strIndex,strIndex),False) [1..half]
equalChunksOf t a = takeWhile(==t) . map (take a) . iterate (drop a)
divide chunkSize b#(sequence,(sequenceStart, sequenceEnd'),notSinglesFlag) =
let t = take (2*chunkSize) ws
t' = take chunkSize t
in if t' == drop chunkSize t
then let ts = equalChunksOf t' chunkSize ws
lenTs = length ts
sequenceEnd = strIndex + lenTs * chunkSize
newEnd = if sequenceEnd > sequenceEnd'
then sequenceEnd else sequenceEnd'
in if chunkSize > 1
then if length (group' (concat (replicate lenTs t')) [] ([],0)) > length (t' ++ show lenTs)
then (((strIndex,sequenceEnd,chunkSize,lenTs),t'):sequence, (sequenceStart,newEnd),True)
else b
else if notSinglesFlag
then b
else (((strIndex,sequenceEnd,chunkSize,lenTs),t'):sequence, (sequenceStart,newEnd),False)
else b
addOne a b
| null (fst b) = a
| null (fst a) = b
| otherwise =
let (((start,end,patLen,lenS),sequence):rest,(sStart,sEnd)) = a
(((start',end',patLen',lenS'),sequence'):rest',(sStart',sEnd')) = b
in if sStart' < sEnd && sEnd < sEnd'
then let c = ((start,end,patLen,lenS),sequence):rest
d = ((start',end',patLen',lenS'),sequence'):rest'
in (c ++ d, (sStart, sEnd'))
else a
segment xs baseIndex maxSeqLen = segment' xs baseIndex baseIndex where
segment' zzs#(z:zs) strIndex farthest
| null zs = initial
| strIndex >= farthest && strIndex > 0 = ([],(0,0))
| otherwise = addOne initial next
where
next#(s',(start',end')) = segment' zs (strIndex + 1) farthest'
farthest' | null s = farthest
| otherwise = if start /= end && end > farthest then end else farthest
initial#(s,(start,end)) = sequences zzs strIndex maxSeqLen
areExclusive ((a,b,_,_),_) ((a',b',_,_),_) = (a' >= b) || (b' <= a)
combs [] r = [r]
combs (x:xs) r
| null r = combs xs (x:r) ++ if null xs then [] else combs xs r
| otherwise = if areExclusive (head r) x
then combs xs (x:r) ++ combs xs r
else if l' > lowerBound
then combs xs (x: reduced : drop 1 r) ++ combs xs r
else combs xs r
where lowerBound = l + 2 * patLen
((l,u,patLen,lenS),s) = head r
((l',u',patLen',lenS'),s') = x
reduce = takeWhile (>=l') . iterate (\x -> x - patLen) $ u
lenReduced = length reduce
reduced = ((l,u - lenReduced * patLen,patLen,lenS - lenReduced),s)
buildString origStr sequences = buildString' origStr sequences 0 (0,"",0)
where
buildString' origStr sequences index accum#(lenC,cStr,lenOrig)
| null sequences = accum
| l /= index =
buildString' (drop l' origStr) sequences l (lenC + l' + 1, cStr ++ take l' origStr ++ "1", lenOrig + l')
| otherwise =
buildString' (drop u' origStr) rest u (lenC + length s', cStr ++ s', lenOrig + u')
where
l' = l - index
u' = u - l
s' = s ++ show lenS
(((l,u,patLen,lenS),s):rest) = sequences
compress [] _ accum = reverse accum ++ (if null accum then [] else "1")
compress zzs#(z:zs) maxSeqLen accum
| null (fst segment') = compress zs maxSeqLen (z:accum)
| (start,end) == (0,2) && not (null accum) = compress zs maxSeqLen (z:accum)
| otherwise =
reverse accum ++ (if null accum || takeWhile' compressedStr 0 /= 0 then [] else "1")
++ compressedStr
++ compress (drop lengthOriginal zzs) maxSeqLen []
where segment'#(s,(start,end)) = segment zzs 0 maxSeqLen
combinations = combs (fst $ segment') []
takeWhile' xxs count
| null xxs = 0
| x == '1' && null (reads (take 1 xs)::[(Int,String)]) = count
| not (null (reads [x]::[(Int,String)])) = 0
| otherwise = takeWhile' xs (count + 1)
where x:xs = xxs
f (lenC,cStr,lenOrig) (lenC',cStr',lenOrig') =
let g = compare ((fromIntegral lenC + if not (null accum) && takeWhile' cStr 0 == 0 then 1 else 0) / fromIntegral lenOrig)
((fromIntegral lenC' + if not (null accum) && takeWhile' cStr' 0 == 0 then 1 else 0) / fromIntegral lenOrig')
in if g == EQ
then compare (takeWhile' cStr' 0) (takeWhile' cStr 0)
else g
(lenCompressed,compressedStr,lengthOriginal) =
head $ sortBy f (map (buildString (take end zzs)) (map reverse combinations))
Output:
*Main> compress "aaaaaaaaabbbbbbbbbaaaaaaaaabbbbbbbbb" 100 []
"a9b9a9b9"
*Main> compress "aaabbbaaabbbaaabbbaaabbb" 100 []
"aaabbb4"

Algorithm to generate a sequence proportional to specified percentage

Given a Map of objects and designated proportions (let's say they add up to 100 to make it easy):
val ss : Map[String,Double] = Map("A"->42, "B"->32, "C"->26)
How can I generate a sequence such that for a subset of size n there are ~42% "A"s, ~32% "B"s and ~26% "C"s? (Obviously, small n will have larger errors).
(Work language is Scala, but I'm just asking for the algorithm.)
UPDATE: I resisted a random approach since, for instance, there's ~16% chance that the sequence would start with AA and ~11% chance it would start with BB and there would be very low odds that for n precisely == (sum of proportions) the distribution would be perfect. So, following #MvG's answer, I implemented as follows:
/**
Returns the key whose achieved proportions are most below desired proportions
*/
def next[T](proportions : Map[T, Double], achievedToDate : Map[T,Double]) : T = {
val proportionsSum = proportions.values.sum
val desiredPercentages = proportions.mapValues(v => v / proportionsSum)
//Initially no achieved percentages, so avoid / 0
val toDateTotal = if(achievedToDate.values.sum == 0.0){
1
}else{
achievedToDate.values.sum
}
val achievedPercentages = achievedToDate.mapValues(v => v / toDateTotal)
val gaps = achievedPercentages.map{ case (k, v) =>
val gap = desiredPercentages(k) - v
(k -> gap)
}
val maxUnder = gaps.values.toList.sortWith(_ > _).head
//println("Max gap is " + maxUnder)
val gapsForMaxUnder = gaps.mapValues{v => Math.abs(v - maxUnder) < Double.Epsilon }
val keysByHasMaxUnder = gapsForMaxUnder.map(_.swap)
keysByHasMaxUnder(true)
}
/**
Stream of most-fair next element
*/
def proportionalStream[T](proportions : Map[T, Double], toDate : Map[T, Double]) : Stream[T] = {
val nextS = next(proportions, toDate)
val tailToDate = toDate + (nextS -> (toDate(nextS) + 1.0))
Stream.cons(
nextS,
proportionalStream(proportions, tailToDate)
)
}
That when used, e.g., :
val ss : Map[String,Double] = Map("A"->42, "B"->32, "C"->26)
val none : Map[String,Double] = ss.mapValues(_ => 0.0)
val mySequence = (proportionalStream(ss, none) take 100).toList
println("Desired : " + ss)
println("Achieved : " + mySequence.groupBy(identity).mapValues(_.size))
mySequence.map(s => print(s))
println
produces :
Desired : Map(A -> 42.0, B -> 32.0, C -> 26.0)
Achieved : Map(C -> 26, A -> 42, B -> 32)
ABCABCABACBACABACBABACABCABACBACABABCABACABCABACBA
CABABCABACBACABACBABACABCABACBACABABCABACABCABACBA
For a deterministic approach, the most obvious solution would probably be this:
Keep track of the number of occurrences of each item in the sequence so far.
For the next item, choose that item for which the difference between intended and actual count (or proportion, if you prefer that) is maximal, but only if the intended count (resp. proportion) is greater than the actual one.
If there is a tie, break it in an arbitrary but deterministic way, e.g. choosing the alphabetically lowest item.
This approach would ensure an optimal adherence to the prescribed ratio for every prefix of the infinite sequence generated in this way.
Quick & dirty python proof of concept (don't expect any of the variable “names” to make any sense):
import sys
p = [0.42, 0.32, 0.26]
c = [0, 0, 0]
a = ['A', 'B', 'C']
n = 0
while n < 70*5:
n += 1
x = 0
s = n*p[0] - c[0]
for i in [1, 2]:
si = n*p[i] - c[i]
if si > s:
x = i
s = si
sys.stdout.write(a[x])
if n % 70 == 0:
sys.stdout.write('\n')
c[x] += 1
Generates
ABCABCABACABACBABCAABCABACBACABACBABCABACABACBACBAABCABCABACABACBABCAB
ACABACBACABACBABCABACABACBACBAABCABCABACABACBABCAABCABACBACABACBABCABA
CABACBACBAABCABCABACABACBABCABACABACBACBAACBABCABACABACBACBAABCABCABAC
ABACBABCABACABACBACBAACBABCABACABACBACBAABCABCABACABACBABCABACABACBACB
AACBABCABACABACBACBAABCABCABACABACBABCAABCABACBACBAACBABCABACABACBACBA
For every item of the sequence, compute a (pseudo-)random number r equidistributed between 0 (inclusive) and 100 (exclusive).
If 0 ≤ r < 42, take A
If 42 ≤ r < (42+32), take B
If (42+32) ≤ r < (42+32+26)=100, take C
The number of each entry in your subset is going to be the same as in your map, but with a scaling factor applied.
The scaling factor is n/100.
So if n was 50, you would have { Ax21, Bx16, Cx13 }.
Randomize the order to your liking.
The simplest "deterministic" [in terms of #elements of each category] solution [IMO] will be: add elements in predefined order, and then shuffle the resulting list.
First, add map(x)/100 * n elements from each element x chose how you handle integer arithmetics to avoid off by one element], and then shuffle the resulting list.
Shuffling a list is simple with fisher-yates shuffle, which is implemented in most languages: for example java has Collections.shuffle(), and C++ has random_shuffle()
In java, it will be as simple as:
int N = 107;
List<String> res = new ArrayList<String>();
for (Entry<String,Integer> e : map.entrySet()) { //map is predefined Map<String,Integer> for frequencies
for (int i = 0; i < Math.round(e.getValue()/100.0 * N); i++) {
res.add(e.getKey());
}
}
Collections.shuffle(res);
This is nondeterministic, but gives a distribution of values close to MvG's. It suffers from the problem that it could give AAA right at the start. I post it here for completeness' sake given how it proves my dissent with MvG was misplaced (and I don't expect any upvotes).
Now, if someone has an idea for an expand function that is deterministic and won't just duplicate MvG's method (rendering the calc function useless), I'm all ears!
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"
"http://www.w3.org/TR/html4/strict.dtd">
<html>
<head>
<title>ErikE's answer</title>
</head>
<body>
<div id="output"></div>
<script type="text/javascript">
if (!Array.each) {
Array.prototype.each = function(callback) {
var i, l = this.length;
for (i = 0; i < l; i += 1) {
callback(i, this[i]);
}
};
}
if (!Array.prototype.sum) {
Array.prototype.sum = function() {
var sum = 0;
this.each(function(i, val) {
sum += val;
});
return sum;
};
}
function expand(counts) {
var
result = "",
charlist = [],
l,
index;
counts.each(function(i, val) {
char = String.fromCharCode(i + 65);
for ( ; val > 0; val -= 1) {
charlist.push(char);
}
});
l = charlist.length;
for ( ; l > 0; l -= 1) {
index = Math.floor(Math.random() * l);
result += charlist[index];
charlist.splice(index, 1);
}
return result;
}
function calc(n, proportions) {
var percents = [],
counts = [],
errors = [],
fnmap = [],
errorSum,
worstIndex;
fnmap[1] = "min";
fnmap[-1] = "max";
proportions.each(function(i, val) {
percents[i] = val / proportions.sum() * n;
counts[i] = Math.round(percents[i]);
errors[i] = counts[i] - percents[i];
});
errorSum = counts.sum() - n;
while (errorSum != 0) {
adjust = errorSum < 0 ? 1 : -1;
worstIndex = errors.indexOf(Math[fnmap[adjust]].apply(0, errors));
counts[worstIndex] += adjust;
errors[worstIndex] = counts[worstIndex] - percents[worstIndex];
errorSum += adjust;
}
return expand(counts);
}
document.body.onload = function() {
document.getElementById('output').innerHTML = calc(99, [25.1, 24.9, 25.9, 24.1]);
};
</script>
</body>
</html>

MergeSort in scala

I came across another codechef problem which I am attempting to solve in Scala. The problem statement is as follows:
Stepford Street was a dead end street. The houses on Stepford Street
were bought by wealthy millionaires. They had them extensively altered
so that as one progressed along the street, the height of the
buildings increased rapidly. However, not all millionaires were
created equal. Some refused to follow this trend and kept their houses
at their original heights. The resulting progression of heights was
thus disturbed. A contest to locate the most ordered street was
announced by the Beverly Hills Municipal Corporation. The criteria for
the most ordered street was set as follows: If there exists a house
with a lower height later in the street than the house under
consideration, then the pair (current house, later house) counts as 1
point towards the disorderliness index of the street. It is not
necessary that the later house be adjacent to the current house. Note:
No two houses on a street will be of the same height For example, for
the input: 1 2 4 5 3 6 The pairs (4,3), (5,3) form disordered pairs.
Thus the disorderliness index of this array is 2. As the criteria for
determining the disorderliness is complex, the BHMC has requested your
help to automate the process. You need to write an efficient program
that calculates the disorderliness index of a street.
A sample input output provided is as follows:
Input: 1 2 4 5 3 6
Output: 2
The output is 2 because of two pairs (4,3) and (5,3)
To solve this problem I thought I should use a variant of MergeSort,incrementing by 1 when the left element is greater than the right element.
My scala code is as follows:
def dysfunctionCalc(input:List[Int]):Int = {
val leftHalf = input.size/2
println("HalfSize:"+leftHalf)
val isOdd = input.size%2
println("Is odd:"+isOdd)
val leftList = input.take(leftHalf+isOdd)
println("LeftList:"+leftList)
val rightList = input.drop(leftHalf+isOdd)
println("RightList:"+rightList)
if ((leftList.size <= 1) && (rightList.size <= 1)){
println("Entering input where both lists are <= 1")
if(leftList.size == 0 || rightList.size == 0){
println("One of the lists is less than 0")
0
}
else if(leftList.head > rightList.head)1 else 0
}
else{
println("Both lists are greater than 1")
dysfunctionCalc(leftList) + dysfunctionCalc(rightList)
}
}
First off, my logic is wrong,it doesn't have a merge stage and I am not sure what would be the best way to percolate the result of the base-case up the stack and compare it with the other values. Also, using recursion to solve this problem may not be the most optimal way to go since for large lists, I maybe blowing up the stack. Also, there might be stylistic issues with my code as well.
I would be great if somebody could point out other flaws and the right way to solve this problem.
Thanks
Suppose you split your list into three pieces: the item you are considering, those on the left, and those on the right. Suppose further that those on the left are in a sorted set. Now you just need to walk through the list, moving items from "right" to "considered" and from "considered" to "left"; at each point, you look at the size of the subset of the sorted set that is greater than your item. In general, the size lookup can be done in O(log(N)) as can the add-element (with a Red-Black or AVL tree, for instance). So you have O(N log N) performance.
Now the question is how to implement this in Scala efficiently. It turns out that Scala has a Red-Black tree used for its TreeSet sorted set, and the implementation is actually quite simple (here in tail-recursive form):
import collection.immutable.TreeSet
final def calcDisorder(xs: List[Int], left: TreeSet[Int] = TreeSet.empty, n: Int = 0): Int = xs match {
case Nil => n
case x :: rest => calcDisorder(rest, left + x, n + left.from(x).size)
}
Unfortunately, left.from(x).size takes O(N) time (I believe), which yields a quadratic execution time. That's no good--what you need is an IndexedTreeSet which can do indexOf(x) in O(log(n)) (and then iterate with n + left.size - left.indexOf(x) - 1). You can build your own implementation or find one on the web. For instance, I found one here (API here) for Java that does exactly the right thing.
Incidentally, the problem with doing a mergesort is that you cannot easily work cumulatively. With merging a pair, you can keep track of how out-of-order it is. But when you merge in a third list, you must see how out of order it is with respect to both other lists, which spoils your divide-and-conquer strategy. (I am not sure whether there is some invariant one could find that would allow you to calculate directly if you kept track of it.)
Here is my try, I don't use MergeSort but it seems to solve the problem:
def calcDisorderness(myList:List[Int]):Int = myList match{
case Nil => 0
case t::q => q.count(_<t) + calcDisorderness(q)
}
scala> val input = List(1,2,4,5,3,6)
input: List[Int] = List(1, 2, 4, 5, 3, 6)
scala> calcDisorderness(input)
res1: Int = 2
The question is, is there a way to have a lower complexity?
Edit: tail recursive version of the same function and cool usage of default values in function arguments.
def calcDisorderness(myList:List[Int], disorder:Int=0):Int = myList match{
case Nil => disorder
case t::q => calcDisorderness(q, disorder + q.count(_<t))
}
A solution based on Merge Sort. Not super fast, potential slowdown could be in "xs.length".
def countSwaps(a: Array[Int]): Long = {
var disorder: Long = 0
def msort(xs: List[Int]): List[Int] = {
import Stream._
def merge(left: List[Int], right: List[Int], inc: Int): Stream[Int] = {
(left, right) match {
case (x :: xs, y :: ys) if x > y =>
cons(y, merge(left, ys, inc + 1))
case (x :: xs, _) => {
disorder += inc
cons(x, merge(xs, right, inc))
}
case _ => right.toStream
}
}
val n = xs.length / 2
if (n == 0)
xs
else {
val (ys, zs) = xs splitAt n
merge(msort(ys), msort(zs), 0).toList
}
}
msort(a.toList)
disorder
}
Another solution based on Merge Sort. Very fast: no FP or for-loop.
def countSwaps(a: Array[Int]): Count = {
var swaps: Count = 0
def mergeRun(begin: Int, run_len: Int, src: Array[Int], dst: Array[Int]) = {
var li = begin
val lend = math.min(begin + run_len, src.length)
var ri = begin + run_len
val rend = math.min(begin + run_len * 2, src.length)
var ti = begin
while (ti < rend) {
if (ri >= rend) {
dst(ti) = src(li); li += 1
swaps += ri - begin - run_len
} else if (li >= lend) {
dst(ti) = src(ri); ri += 1
} else if (a(li) <= a(ri)) {
dst(ti) = src(li); li += 1
swaps += ri - begin - run_len
} else {
dst(ti) = src(ri); ri += 1
}
ti += 1
}
}
val b = new Array[Int](a.length)
var run = 0
var run_len = 1
while (run_len < a.length) {
var begin = 0
while (begin < a.length) {
val (src, dst) = if (run % 2 == 0) (a, b) else (b, a)
mergeRun(begin, run_len, src, dst)
begin += run_len * 2
}
run += 1
run_len *= 2
}
swaps
}
Convert the above code to Functional style: no mutable variable, no loop.
All recursions are tail calls, thus the performance is good.
def countSwaps(a: Array[Int]): Count = {
def mergeRun(li: Int, lend: Int, rb: Int, ri: Int, rend: Int, di: Int, src: Array[Int], dst: Array[Int], swaps: Count): Count = {
if (ri >= rend && li >= lend) {
swaps
} else if (ri >= rend) {
dst(di) = src(li)
mergeRun(li + 1, lend, rb, ri, rend, di + 1, src, dst, ri - rb + swaps)
} else if (li >= lend) {
dst(di) = src(ri)
mergeRun(li, lend, rb, ri + 1, rend, di + 1, src, dst, swaps)
} else if (src(li) <= src(ri)) {
dst(di) = src(li)
mergeRun(li + 1, lend, rb, ri, rend, di + 1, src, dst, ri - rb + swaps)
} else {
dst(di) = src(ri)
mergeRun(li, lend, rb, ri + 1, rend, di + 1, src, dst, swaps)
}
}
val b = new Array[Int](a.length)
def merge(run: Int, run_len: Int, lb: Int, swaps: Count): Count = {
if (run_len >= a.length) {
swaps
} else if (lb >= a.length) {
merge(run + 1, run_len * 2, 0, swaps)
} else {
val lend = math.min(lb + run_len, a.length)
val rb = lb + run_len
val rend = math.min(rb + run_len, a.length)
val (src, dst) = if (run % 2 == 0) (a, b) else (b, a)
val inc_swaps = mergeRun(lb, lend, rb, rb, rend, lb, src, dst, 0)
merge(run, run_len, lb + run_len * 2, inc_swaps + swaps)
}
}
merge(0, 1, 0, 0)
}
It seems to me that the key is to break the list into a series of ascending sequences. For example, your example would be broken into (1 2 4 5)(3 6). None of the items in the first list can end a pair. Now you do a kind of merge of these two lists, working backwards:
6 > 5, so 6 can't be in any pairs
3 < 5, so its a pair
3 < 4, so its a pair
3 > 2, so we're done
I'm not clear from the definition on how to handle more than 2 such sequences.

Algorithm to divide text into 3 evenly-sized groups

I'm would like to create an algorithm that will divide text into 3-evenly sized groups (based on text length). Since this will be put to use for line-breaks, the order of the text needs to be maintained.
For instance this string:
Just testing to see how this works.
would sort to:
Just testing // 12 characters
to see how // 10 characters
this works. // 11 characters
Any ideas?
The "minimum raggedness" dynamic program, also from the Wikipedia article on word wrap, can be adapted to your needs. Set LineWidth = len(text)/n - 1 and ignore the comment about infinite penalties for exceeding the line width; use the definition of c(i, j) as is with P = 2.
Code. I took the liberty of modifying the DP always to return exactly n lines, at the cost of increasing the running time from O(#words ** 2) to O(#words ** 2 * n).
def minragged(text, n=3):
"""
>>> minragged('Just testing to see how this works.')
['Just testing', 'to see how', 'this works.']
>>> minragged('Just testing to see how this works.', 10)
['', '', 'Just', 'testing', 'to', 'see', 'how', 'this', 'works.', '']
"""
words = text.split()
cumwordwidth = [0]
# cumwordwidth[-1] is the last element
for word in words:
cumwordwidth.append(cumwordwidth[-1] + len(word))
totalwidth = cumwordwidth[-1] + len(words) - 1 # len(words) - 1 spaces
linewidth = float(totalwidth - (n - 1)) / float(n) # n - 1 line breaks
def cost(i, j):
"""
cost of a line words[i], ..., words[j - 1] (words[i:j])
"""
actuallinewidth = max(j - i - 1, 0) + (cumwordwidth[j] - cumwordwidth[i])
return (linewidth - float(actuallinewidth)) ** 2
# best[l][k][0] is the min total cost for words 0, ..., k - 1 on l lines
# best[l][k][1] is a minimizing index for the start of the last line
best = [[(0.0, None)] + [(float('inf'), None)] * len(words)]
# xrange(upper) is the interval 0, 1, ..., upper - 1
for l in xrange(1, n + 1):
best.append([])
for j in xrange(len(words) + 1):
best[l].append(min((best[l - 1][k][0] + cost(k, j), k) for k in xrange(j + 1)))
lines = []
b = len(words)
# xrange(upper, 0, -1) is the interval upper, upper - 1, ..., 1
for l in xrange(n, 0, -1):
a = best[l][b][1]
lines.append(' '.join(words[a:b]))
b = a
lines.reverse()
return lines
if __name__ == '__main__':
import doctest
doctest.testmod()
You can try the next simple heuristic for starters: Place to iterators in n/3 and 2n/3 and search for the closest space near each of them.
From http://en.wikipedia.org/wiki/Word_wrap:
SpaceLeft := LineWidth
for each Word in Text
if Width(Word) > SpaceLeft
insert line break before Word in Text
SpaceLeft := LineWidth - Width(Word)
else
SpaceLeft := SpaceLeft - (Width(Word) + SpaceWidth)
This method is used by many modern word processors, such as OpenOffice.org Writer and Microsoft Word. This algorithm is optimal in that it always puts the text on the minimum number of lines.
The answer from "someone" works fine. However, I had problems translating this into SWIFT code. Here is my translation for all those that are interested.
import Foundation
class SplitText{
typealias MinRag = (Float, Int) // meaning (cost for line (so far), word index)
// from http://stackoverflow.com/questions/6426017/word-wrap-to-x-lines-instead-of-maximum-width-least-raggedness?lq=1
class func splitText(text:String, numberOfLines:Int)-> [String]{
//preparations
var words = split(text, maxSplit:100, allowEmptySlices: false, isSeparator:{(s:Character)-> Bool in return s == " " || s == "\n"})
var cumwordwidth = [Int](); //cummulative word widths
cumwordwidth.append(0);
for word in words{
cumwordwidth.append(cumwordwidth[cumwordwidth.count - 1] + count(word));
}
var totalwidth = cumwordwidth[cumwordwidth.count - 1] + count(words) - 1;
var linewidth:Float = Float(totalwidth - (numberOfLines - 1)) / Float(numberOfLines)
// cost function for one line for words i .. j
var cost = { (i:Int,j:Int)-> Float in
var actuallinewidth = max(j - i - 1, 0) + (cumwordwidth[j] - cumwordwidth[i]);
var remainingWidth: Float = linewidth - Float(actuallinewidth)
return remainingWidth * remainingWidth
}
var best = [[MinRag]]()
var tmp = [MinRag]();
//ensure that data structure is initialised in a way that we start with adding the first word
tmp.append((0, -1));
for word in words {
tmp.append((Float.infinity , -1));
}
best.append(tmp);
//now we can start. We simply calculate the cost for all possible lines
for l in 1...numberOfLines {
tmp = [MinRag]()
for j in 0...words.count {
var min:MinRag = (best[l - 1][0].0 + cost(0, j), 0);
var k: Int
for k = 0; k < j + 1 ; ++k {
var loc:Float = best[l - 1][k].0 + cost(k, j);
if (loc < min.0 || (loc == min.0 && k < min.1)) {
min=(loc, k);
}
println("l=\(l), j=\(j), k=\(k), min=\(min)")
}
tmp.append(min);
}
best.append(tmp);
}
//now build the answer based on above calculations
var lines = [String]();
var b = words.count;
var o:Int
for o = numberOfLines; o > 0 ; --o {
var a = best[o][b].1;
lines.append(" ".join(words[a...b-1]));
b = a;
}
return reverse(lines);
}
}

Resources