Group up date ranges by intersecting ranges - algorithm

This is a variation of the gaps and islands problem. I tried finding solutions but they are all SQL based.
Problem: Given a list of date ranges, return a combined list of date ranges where none are overlapping:
Ex:
2022/01/01 - 2022/03/01
2022/02/01 - 2022/05/01
2022/07/01 - 2022/08/01
Returns:
2022/01/01 - 2022/05/01
2022/07/01 - 2022/08/01
The language doesn't really matter, just not SQL. Here is my attempt in python, I think it works but I'm not sure:
from datetime import date
from dateutil.relativedelta import relativedelta
from operator import itemgetter
# Sample input: three unsorted ranges; the first two overlap and the third is a
# single day.
date_ranges = [
    {"start": first_day, "end": last_day}
    for first_day, last_day in (
        (date(2022, 3, 1), date(2022, 6, 1)),
        (date(2022, 1, 1), date(2022, 4, 1)),
        (date(2022, 8, 1), date(2022, 8, 1)),
    )
]
def _get_agregated_dates(date_ranges):
def _is_overlapping(range1, range2):
return (range1["end"] + relativedelta(months=1)) >= range2["start"].replace(day=1)
date_ranges = sorted(date_ranges, key=itemgetter("start"))
new_date_ranges = []
range1 = date_ranges[0]
range2 = date_ranges[1]
for i in range(1, len(date_ranges)):
if _is_overlapping(range1, range2):
# Set new date range
range1["end"] = range1["end"] if range1["end"] > range2["end"] else range2["end"]
# We compared the last 2 dates, and they overlapped, so append before loop ends
if i == len(date_ranges) - 1:
new_date_ranges.append(range1)
else:
range2 = date_ranges[i+1]
else:
new_date_ranges.append(range1)
if i == len(date_ranges) - 1:
# We compared the last 2 dates, and they did't overlapped, so append before loop ends
new_date_ranges.append(range2)
else:
range1 = date_ranges[i]
range2 = date_ranges[i+1]
return new_date_ranges
# Demo: print the merged ranges computed from the sample `date_ranges`.
print(_get_agregated_dates(date_ranges))
N.B I count consecutive months as "overlapping" on purpose

Related

Optimizing the algorithm to run under 4 seconds for a quite large number of operations

I have the following code, which is for solving one of the practice challenges from HackerRank.
And there are literally 10^7 values to be created in a list and then each should be incremented according to 10^5 queries (with console read time included), I need to crack it within 4 seconds. Here is total inputs (with queries).
The first line contains two numbers: the first (n) is the number of values in the list, and the second (m) is the number of queries that follow. Each subsequent line is a query of three numbers: the first (a) and second (b) are indexes (starting from 1), and the third (k) is the value to be added to every list element between those indexes. Finally, the maximum value in the list should be printed to the console.
/** Reads one line from standard input; throws if the input is exhausted. */
private fun readLn(): String = readLine()!!

/** Reads one line and splits it on single spaces. */
private fun readStrings(): List<String> = readLn().split(" ")

/** Reads one line of space-separated integers. */
private fun readInts(): List<Int> = readStrings().map(String::toInt)
/**
 * Reads `n m`, then `m` queries `a b k` (1-based, inclusive bounds), and prints the
 * largest value in a list of `n` zeroes after adding `k` to positions `a..b` for
 * every query.
 *
 * Each query is recorded in O(1) in a difference array instead of touching every
 * position in the interval; one prefix-sum pass then recovers the values, so the
 * whole run is O(n + m) rather than O(n * m).
 */
fun main() {
    val (n, m) = readInts()
    // diff[i] is the change in value between position i - 1 and position i (0-based).
    // The extra slot absorbs the "- k" written just past the last position.
    val diff = LongArray(n + 1)
    repeat(m) {
        val query = readStrings()
        val a = query[0].toInt() - 1
        val b = query[1].toInt() - 1
        val k = query[2].toLong()
        diff[a] += k
        diff[b + 1] -= k
    }
    // Assumes n >= 1, as in the challenge constraints.
    var running = 0L
    var best = Long.MIN_VALUE
    for (i in 0 until n) {
        running += diff[i]
        if (running > best) best = running
    }
    println(best)
}
Currently it seems well optimized for me, but still can't do all the operations within 4 seconds.
Any help would be appreciated, Thanks in advance!
Edit - After answer provided by #Photon, I've modified the code but still with that algorithm as well the time limit is reached for same test cases.
Here is the modified code -
/** Reads one line from standard input; throws if the input is exhausted. */
private fun readLn(): String = readLine()!!

/** Reads one line and splits it on single spaces. */
private fun readStrings(): List<String> = readLn().split(" ")

/** Reads one line of space-separated integers. */
private fun readInts(): List<Int> = readStrings().map(String::toInt)
/**
 * Difference-array solution: reads `n m`, then `m` queries `a b k` (1-based,
 * inclusive), and prints the maximum value after applying every range increment.
 *
 * A primitive `LongArray` is used instead of `MutableList<Long>` so that the ~10^7
 * slots are not boxed, and the maximum is tracked during the prefix-sum pass rather
 * than in a second traversal.
 */
fun main() {
    val (n, m) = readInts()
    // Slots 1..n hold the deltas; slot 0 stays 0 and slot n + 1 absorbs "b + 1".
    val diff = LongArray(n + 2)
    repeat(m) {
        val query = readStrings()
        val a = query[0].toInt()
        val b = query[1].toInt()
        val k = query[2].toLong()
        diff[a] += k
        diff[b + 1] -= k
    }
    var running = 0L
    // Starts at 0 because the untouched slot 0 was part of the maximum before.
    var best = 0L
    for (i in 1..n) {
        running += diff[i]
        if (running > best) best = running
    }
    println(best)
}
Brute force is simply too slow no matter how much you optimize it. Here's a simple array trick to solve this in O(N + Q) time:
First we have array of zeroes of size N+2: A = [0, 0, 0, 0, ..., 0]
For query L R K instead of increasing all numbers in interval we can increase first one by K and R+1 one by -K
then after all queries we can modify array by adding A[i-1] for all i in [1, N]
this will be the same as doing all queries
It might be confusing so here's an example:
N=5 so our initial array: A = [0, 0, 0, 0, 0, 0, 0]
lets say we have a query: 1 3 3
updated array: A = [0, 3, 0, 0, -3, 0, 0]
lets say we have another query: 2 5 10
updated array: A = [0, 3, 10, 0, -3, 0, -10]
now after all queries we can add A[i-1] for all i in [1, 5]
updated array: A = [0, 3, 13, 13, 10, 10, 0]
Notice it's the same as doing all the queries by brute force.

SSRS expression for date difference to a number

I have this expression:
=COUNT(Fields!RecId.Value) -
IIF(Fields!Status.Value="Assigned",
DATEDIFF("d", Fields!CreatedDateTime.Value,Fields!ResolvedDateTime.Value),
DATEDIFF("d", Fields!CreatedDateTime.Value,Fields!AssignedDateTime.Value))
- IIF(Weekday(Parameters!StartDate.Value, 1) = 1, 1, 0)
- IIF(Weekday(Parameters!StartDate.Value, 1) = 7, 1, 0)
- IIF(Weekday(Parameters!EndDate.Value, 1) = 1, 1, 0)
- IIF(Weekday(Parameters!EndDate.Value, 1) = 7, 1, 0)
What I want to be able to return is the RecID value minus the date difference if the date is more than 1 day.
From the comment, it seems like you want the count of records minus the number of records where the number of work days between the Created Date and the end date (the Resolved Date if Status is "Assigned", otherwise the Assigned Date) is not more than 1.
=COUNT(Fields!RecId.Value) -
SUM(
  IIF(Fields!Status.Value = "Assigned",
    IIF(DATEDIFF("d", Fields!CreatedDateTime.Value, Fields!ResolvedDateTime.Value)
      - (DateDiff(DateInterval.WeekOfYear, Fields!CreatedDateTime.Value, Fields!ResolvedDateTime.Value) * 2)
      - (IIF(WEEKDAY(Fields!CreatedDateTime.Value) = 7, 1, 0)
      - (IIF(WEEKDAY(Fields!ResolvedDateTime.Value) = 6, 1, 0))
      - 1) > 1, 0, 1)
    ,
    IIF(DATEDIFF("d", Fields!CreatedDateTime.Value, Fields!AssignedDateTime.Value)
      - (DateDiff(DateInterval.WeekOfYear, Fields!CreatedDateTime.Value, Fields!AssignedDateTime.Value) * 2)
      - (IIF(WEEKDAY(Fields!CreatedDateTime.Value) = 7, 1, 0)
      - (IIF(WEEKDAY(Fields!AssignedDateTime.Value) = 6, 1, 0))
      - 1) > 1, 0, 1)
  )
)

Pattern recognition in binary numbers (pseudo code or MQL5)

0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
. . 1 1 1 1 1 0 0 0 1 1 1 1 1 0 0 0
Recognition starts at 17 and goes backwards to 0.
What can be seen is the most simple pattern.
Pattern starts with at least three 0s or three 1s but could be more of each but not mixed!
First pattern is then followed by at least five 0s or five 1s depending on what came in the first pattern. Since pattern one contains three 0, there must be at least five 1s and vice versa
Then we want to see the first pattern again: at least three 0s or three 1s, again depending on whether there were 1s or 0s before.
Finally we want to see the second pattern again, which means at least five 0s or five 1s, again, depending on which pattern was seen before
I tried using for loops and counters but did not manage to work it out. What I am struggling with is the fact that the pattern is not of a fixed size, as there can be more than three or five 0s and 1s in succession.
Is anybody able to provide some pseudo code how to implement this or even some MQL5 code?
The following Swift code is anything but optimal. It should just give you hints on how you could implement it.
A function to match a single pattern:
/// Counts how many consecutive elements equal to `number` occur in `numbers`,
/// scanning backwards (towards index 0) starting at `startIndex`.
///
/// - Parameters:
///   - numbers: The sequence to scan.
///   - startIndex: Index of the first element to examine.
///   - number: The value the run must consist of.
/// - Returns: The run length, or 0 when `startIndex` is outside `numbers` or the
///   element at `startIndex` is not `number`.
func matchPattern(numbers: [Int], startIndex: Int, number: Int) -> Int {
    guard numbers.indices.contains(startIndex) else { return 0 }
    var actualIndex = startIndex
    // Test the bound before reading so that index 0 is counted too and the
    // array is never accessed at -1.
    while actualIndex >= 0 && numbers[actualIndex] == number {
        actualIndex -= 1
    }
    return startIndex - actualIndex
}
A function to match the 4 patterns:
/// Checks whether `binNrs`, read backwards from `startIndex`, consists of four
/// alternating runs: at least 3 of one value, at least 5 of the other value,
/// at least 3 of the first value again, and at least 5 of the other value again.
///
/// - Parameters:
///   - binNrs: Sequence of 0s and 1s.
///   - startIndex: Index where recognition starts (defaults to 17, as before).
/// - Returns: `true` when all four runs are found; `false` otherwise, including
///   when `startIndex` lies outside `binNrs`.
func match(binNrs: [Int], startIndex: Int = 17) -> Bool {
    // Shorter inputs used to crash on the subscript below.
    guard binNrs.indices.contains(startIndex) else { return false }

    let firstPatternNr = binNrs[startIndex]
    let secondPatternNr = firstPatternNr == 0 ? 1 : 0
    // (value the run must consist of, minimum run length), in scan order.
    let expectedRuns = [
        (firstPatternNr, 3),
        (secondPatternNr, 5),
        (firstPatternNr, 3),
        (secondPatternNr, 5),
    ]

    var position = startIndex
    for (value, minimumLength) in expectedRuns {
        let length = matchPattern(numbers: binNrs, startIndex: position, number: value)
        if length < minimumLength { return false }
        // The next run begins right where this one ended.
        position -= length
    }
    return true
}
Some test patterns with results:
let match1 = match(binNrs: [0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0]) // true
let match2 = match(binNrs: [1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]) // false (4th sequence < 5)
let match3 = match(binNrs: [0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0]) // false (1st sequence < 3)
let match4 = match(binNrs: [0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1]) // false (2nd sequence < 5)

Partition a collection into "k" close-to-equal pieces (Scala, but language agnostic)

Defined before this block of code:
dataset can be a Vector or List
numberOfSlices is an Int denoting how many "times" to slice dataset
I want to split the dataset into numberOfSlices slices, distributed as evenly as possible. By "split" I guess I mean "partition" (intersection of all should be empty, union of all should be the original) to use the set theory term, though this is not necessarily a set, just an arbitrary collection.
e.g.
dataset = List(1, 2, 3, 4, 5, 6, 7)
numberOfSlices = 3
slices == ListBuffer(Vector(1, 2), Vector(3, 4), Vector(5, 6, 7))
Is there a better way to do it than what I have below? (which I'm not even sure is optimal...)
Or perhaps this is not an algorithmically feasible endeavor, in which case any known good heuristics?
// Splits `dataset` into `numberOfSlices` consecutive slices, appended in order.
val slices = new ListBuffer[Vector[Int]]
// Integer division: every slice except the last gets exactly this many elements.
val stepSize = dataset.length / numberOfSlices
// Start index of the next slice to cut.
var currentStep = 0
// Number of slices produced so far.
var looper = 0
while (looper != numberOfSlices) {
if (looper != numberOfSlices - 1) {
slices += dataset.slice(currentStep, currentStep + stepSize)
currentStep += stepSize
} else {
// The last slice runs to the end of the dataset, so it absorbs the whole
// remainder of the division (it can be up to numberOfSlices - 1 longer).
slices += dataset.slice(currentStep, dataset.length)
}
looper += 1
}
If the behavior of xs.grouped(xs.size / n) doesn't work for you, it's pretty easy to define exactly what you want. The quotient is the size of the smaller pieces, and the remainder is the number of the bigger pieces:
/** Cuts `xs` into `n` consecutive pieces whose sizes differ by at most one.
  *
  * The first `n - rem` pieces have `quot` elements and the last `rem` pieces
  * have `quot + 1`, where `quot`/`rem` are the quotient and remainder of
  * `xs.size / n`.
  *
  * @param xs the sequence to cut
  * @param n  the number of pieces; `0` still fails with an ArithmeticException
  * @return an iterator over the pieces, in order
  */
def cut[A](xs: Seq[A], n: Int) = {
  val (quot, rem) = (xs.size / n, xs.size % n)
  val (smaller, bigger) = xs.splitAt(xs.size - rem * (quot + 1))
  // `grouped(0)` throws, so when there are fewer elements than pieces the
  // "smaller" pieces are produced explicitly as empty ones (`smaller` is
  // empty in that case).
  val smallerPieces =
    if (quot == 0) Iterator.fill(n - rem)(smaller) else smaller.grouped(quot)
  smallerPieces ++ bigger.grouped(quot + 1)
}
The typical "optimal" partition calculates an exact fractional length after cutting and then rounds to find the actual number to take:
/** Cuts `xs` into `n` pieces by rounding the exact fractional cut positions,
  * so that longer and shorter pieces end up interleaved.
  *
  * @param xs the sequence to cut
  * @param n  the number of pieces
  * @return the pieces, in order
  */
def cut[A](xs: Seq[A], n: Int):Vector[Seq[A]] = {
  val total = xs.length
  // Rounded position of every cut, including the two ends.
  val boundaries = (0 to n).map(k => math.round((k.toDouble * total) / n).toInt)
  // Each pair of neighbouring boundaries delimits one piece.
  boundaries
    .zip(boundaries.drop(1))
    .map { case (from, until) => xs.slice(from, until) }
    .toVector
}
This way your longer and shorter blocks will be interspersed, which is often more desirable for evenness:
scala> cut(List(1,2,3,4,5,6,7,8,9,10),4)
res5: Vector[Seq[Int]] =
Vector(List(1, 2, 3), List(4, 5), List(6, 7, 8), List(9, 10))
You can even cut more times than you have elements:
scala> cut(List(1,2,3),5)
res6: Vector[Seq[Int]] =
Vector(List(1), List(), List(2), List(), List(3))
Here's a one-liner that does the job for me, using the familiar Scala trick of a recursive function that returns a Stream. Notice the use of (x+k/2)/k to round the chunk sizes, intercalating the smaller and larger chunks in the final list, all with sizes with at most one element of difference. If you round up instead, with (x+k-1)/k, you move the smaller blocks to the end, and x/k moves them to the beginning.
/** Splits `vv` into `k` consecutive folds whose sizes differ by at most one.
  *
  * Each fold takes the remaining length divided by the remaining fold count,
  * rounded to nearest via `(size + k/2) / k`; the final fold is whatever is left.
  */
def k_folds(k: Int, vv: Seq[Int]): Stream[Seq[Int]] =
  if (k <= 1) Stream(vv)
  else {
    val foldSize = (vv.size + k / 2) / k
    val (fold, remaining) = vv.splitAt(foldSize)
    fold +: k_folds(k - 1, remaining)
  }
Demo:
scala> val indices = scala.util.Random.shuffle(1 to 39)
scala> for (ff <- k_folds(7, indices)) println(ff)
Vector(29, 8, 24, 14, 22, 2)
Vector(28, 36, 27, 7, 25, 4)
Vector(6, 26, 17, 13, 23)
Vector(3, 35, 34, 9, 37, 32)
Vector(33, 20, 31, 11, 16)
Vector(19, 30, 21, 39, 5, 15)
Vector(1, 38, 18, 10, 12)
scala> for (ff <- k_folds(7, indices)) println(ff.size)
6
6
5
6
5
6
5
scala> for (ff <- indices.grouped((indices.size+7-1)/7)) println(ff)
Vector(29, 8, 24, 14, 22, 2)
Vector(28, 36, 27, 7, 25, 4)
Vector(6, 26, 17, 13, 23, 3)
Vector(35, 34, 9, 37, 32, 33)
Vector(20, 31, 11, 16, 19, 30)
Vector(21, 39, 5, 15, 1, 38)
Vector(18, 10, 12)
scala> for (ff <- indices.grouped((indices.size+7-1)/7)) println(ff.size)
6
6
6
6
6
6
3
Notice how grouped does not try to even out the size of all the sub-lists.
Here is my take on the problem:
/** Splits `items` into consecutive partitions, giving the first
  * `items.size % partitionsCount` partitions one extra element.
  *
  * Partitions are produced until `items` is used up, so fewer than
  * `partitionsCount` partitions come back when there are not enough items.
  *
  * @param items           the elements to distribute, order preserved
  * @param partitionsCount the requested number of partitions
  * @return the partitions, in order
  */
def partition[T](items: Seq[T], partitionsCount: Int): List[Seq[T]] = {
  val baseSize = items.size / partitionsCount
  var oversizedLeft = items.size % partitionsCount
  var pending = items
  // Built back to front, hence the final reverse.
  var collected = List.empty[Seq[T]]
  while (pending.nonEmpty) {
    val chunkSize =
      if (oversizedLeft > 0) {
        oversizedLeft -= 1
        baseSize + 1
      } else baseSize
    val (chunk, rest) = pending.splitAt(chunkSize)
    collected = chunk :: collected
    pending = rest
  }
  collected.reverse
}
It's more verbose than some of the other solutions but hopefully more clear as well. reverse is only necessary if you want the order to be preserved.
As Kaito mentions grouped is exactly what you are looking for. But if you just want to know how to implement such a method, there are many ways ;-). You could for example do it like this:
/** Splits `xs` into consecutive chunks of `size` elements; the last chunk may
  * be shorter.
  *
  * @param xs   the list to split
  * @param size the chunk length, which must be positive
  * @return the chunks, in order
  * @throws IllegalArgumentException if `size` is not positive (previously this
  *         recursed forever on a non-empty list)
  */
def grouped[A](xs: List[A], size: Int) = {
  require(size > 0, "size must be positive")
  // Chunks are prepended (O(1)) and reversed once at the end, instead of
  // appending to the result list on every step.
  @annotation.tailrec
  def loop(remaining: List[A], acc: List[List[A]]): List[List[A]] =
    if (remaining.isEmpty) acc.reverse
    else {
      val (slice, rest) = remaining.splitAt(size)
      loop(rest, slice :: acc)
    }
  loop(xs, Nil)
}
I'd approach it this way: Given n elements and m partitions (n>m), either n mod m == 0 in which case, each partition will have n/m elements, or n mod m = y, in which case you'll have each partition with n/m elements and you have to distribute y over some m.
You'll have y slots with n/m+1 elements and (m-y) slots with n/m. How you distribute them is your choice.

Levenshtein Distance: Inferring the edit operations from the matrix

I wrote the Levenshtein algorithm in C++.
If I input:
string s: democrat
string t: republican
I get the matrix D filled-up and the number of operations (the Levenshtein distance) can be read in D[10][8] = 8
Beyond the filled matrix, I want to construct the optimal solution. What should this solution look like? I have no idea.
Please just show me what it should look like for this example.
The question is
Given the matrix produced by the Levenshtein algorithm, how can one find "the optimal solution"?
i.e. how can we find the precise sequence of string operations: inserts, deletes and substitution [of a single letter], necessary to convert the 's string' into the 't string'?
First, it should be noted that in many cases there are SEVERAL optimal solutions. While the Levenshtein algorithm supplies the minimum number of operations (8 in democrat/republican example) there are many sequences (of 8 operations) which can produce this conversion.
By "decoding" the Levenshtein matrix, one can enumerate ALL such optimal sequences.
The general idea is that the optimal solutions all follow a "path", from top left corner to bottom right corner (or in the other direction), whereby the matrix cell values on this path either remain the same or increase by one (or decrease by one in the reverse direction), starting at 0 and ending at the optimal number of operations for the strings in question (0 thru 8 democrat/republican case). The number increases when an operation is necessary, it stays the same when the letter at corresponding positions in the strings are the same.
It is easy to produce an algorithm which produces such a path (slightly more complicated to produce all possible paths), and from such path deduce the sequence of operations.
This path finding algorithm should start at the lower right corner and work its way backward. The reason for this approach is that we know for a fact that to be an optimal solution it must end in this corner, and to end in this corner, it must have come from one of the 3 cells either immediately to its left, immediately above it or immediately diagonally. By selecting a cell among these three cells, one which satisfies our "same value or decreasing by one" requirement, we effectively pick a cell on one of the optimal paths. By repeating the operation till we get on upper left corner (or indeed until we reach a cell with a 0 value), we effectively backtrack our way on an optimal path.
Illustration with the democrat - republican example
It should also be noted that one can build the matrix in one of two ways: with 'democrat' horizontally or vertically. This doesn't change the computation of the Levenshtein distance nor does it change the list of operations needed; it only changes the way we interpret the matrix, for example moving horizontally on the "path" either means inserting a character [from the t string] or deleting a character [off the s string] depending whether 'string s' is "horizontal" or "vertical" in the matrix.
I'll use the following matrix. The conventions are therefore (only going in the left-to-right and/or top-to-bottom directions)
an horizontal move is an INSERTION of a letter from the 't string'
an vertical move is a DELETION of a letter from the 's string'
a diagonal move is either:
a no-operation (both letters at respective positions are the same); the number doesn't change
a SUBSTITUTION (letters at respective positions are distinct); the number increase by one.
Levenshtein matrix for s = "democrat", t="republican"
r e p u b l i c a n
0 1 2 3 4 5 6 7 8 9 10
d 1 1 2 3 4 5 6 7 8 9 10
e 2 2 1 2 3 4 5 6 7 8 9
m 3 3 2 2 3 4 5 6 7 8 9
o 4 4 3 3 3 4 5 6 7 8 9
c 5 5 4 4 4 4 5 6 6 7 8
r 6 5 5 5 5 5 5 6 7 7 8
a 7 6 6 6 6 6 6 6 7 7 8
t 8 7 7 7 7 7 7 7 7 8 8
The arbitrary approach I use to select one path among several possible optimal paths is loosely described below:
Starting at the bottom-rightmost cell, and working our way backward toward
the top left.
For each "backward" step, consider the 3 cells directly adjacent to the current
cell (in the left, top or left+top directions)
if the value in the diagonal cell (going up+left) is smaller or equal to the
values found in the other two cells
AND
if this is same or 1 minus the value of the current cell
then "take the diagonal cell"
if the value of the diagonal cell is one less than the current cell:
Add a SUBSTITUTION operation (from the letters corresponding to
the _current_ cell)
otherwise: do not add an operation this was a no-operation.
elseif the value in the cell to the left is smaller than or equal to the value of
the cell above the current cell
AND
if this value is same or 1 minus the value of the current cell
then "take the cell to left", and
add an INSERTION of the letter corresponding to the cell
else
take the cell above, add
Add a DELETION operation of the letter in 's string'
Following this informal pseudo-code, we get the following:
Start on the "n", "t" cell at bottom right.
Pick the [diagonal] "a", "a" cell as next destination since it is less than the other two (and satisfies the same or -1 condition).
Note that the new cell is one less than current cell
therefore the step 8 is substitute "t" with "n": democra N
Continue with "a", "a" cell,
Pick the [diagonal] "c", "r" cell as next destination...
Note that the new cell is same value as current cell ==> no operation needed.
Continue with "c", "r" cell,
Pick the [diagonal] "i", "c" cell as next destination...
Note that the new cell is one less than current cell
therefore the step 7 is substitute "r" with "c": democ C an
Continue with "i", "c" cell,
Pick the [diagonal] "l", "o" cell as next destination...
Note that the new cell is one less than current cell
therefore the step 6 is substitute "c" with "i": demo I can
Continue with "l", "o" cell,
Pick the [diagonal] "b", "m" cell as next destination...
Note that the new cell is one less than current cell
therefore the step 5 is substitute "o" with "l": dem L ican
Continue with "b", "m" cell,
Pick the [diagonal]"u", "e" cell as next destination...
Note that the new cell is one less than current cell
therefore the step 4 is substitute "m" with "b": de B lican
Continue with "u", "e" cell,
Note the "diagonal" cell doesn't qualify, because the "left" cell is less than it.
Pick the [left] "p", "e" cell as next destination...
therefore the step 3 is insert "u" after "e": de U blican
Continue with "p", "e" cell,
again the "diagonal" cell doesn't qualify
Pick the [left] "e", "e" cell as next destination...
therefore the step 2 is insert "p" after "e": de P ublican
Continue with "e", "e" cell,
Pick the [diagonal] "r", "d" cell as next destination...
Note that the new cell is same value as current cell ==> no operation needed.
Continue with "r", "d" cell,
Pick the [diagonal] "start" cell as next destination...
Note that the new cell is one less than current cell
therefore the step 1 is substitute "d" with "r": R epublican
You've arrived at a cell which value is 0 : your work is done!
The backtracking algorithm to infer the moves from the matrix implemented in python:
def _backtrack_string(matrix, output_word):
'''
Iteratively backtrack DP matrix to get optimal set of moves
Inputs: DP matrix (list:list:int),
Input word (str),
Output word (str),
Start x position in DP matrix (int),
Start y position in DP matrix (int)
Output: Optimal path (list)
'''
i = len(matrix) - 1
j = len(matrix[0]) - 1
optimal_path = []
while i > 0 and j > 0:
diagonal = matrix[i-1][j-1]
vertical = matrix[i-1][j]
horizontal = matrix[i][j-1]
current = matrix[i][j]
if diagonal <= vertical and diagonal <= horizontal and (diagonal <= current):
i = i - 1
j = j - 1
if diagonal == current - 1:
optimal_path.append("Replace " + str(j) + ", " + str(output_word[j]) )
elif horizontal <= vertical and horizontal <= current:
j = j - 1
optimal_path.append("Insert " + str(j) + ", " + str(output_word[j]))
elif vertical <= horizontal and vertical <= current:
i = i - 1
optimal_path.append("Delete " + str(i))
elif horizontal <= vertical and horizontal <= current:
j = j - 1
optimal_path.append("Insert " + str(j) + ", " + str(output_word[j]))
else:
i = i - 1
optimal_path.append("Delete " + str(i))
return reversed(optimal_path)
The output I get when I run the algorithm with original word "OPERATING" and desired word "CONSTANTINE" is the following
Insert 0, C
Replace 2, N
Replace 3, S
Replace 4, T
Insert 6, N
Replace 10, E
"" C O N S T A N T I N E
"" [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
<-- Insert 0, C
O [1, 1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
\ Replace 2, N
P [2, 2, 2, 2, 3, 4, 5, 6, 7, 8, 9, 10]
\ Replace 3, S
E [3, 3, 3, 3, 3, 4, 5, 6, 7, 8, 9, 9]
\ Replace 4, T
R [4, 4, 4, 4, 4, 4, 5, 6, 7, 8, 9, 10] No move
\ <-- Insert 6, N
A [5, 5, 5, 5, 5, 5, 4, 5, 6, 7, 8, 9]
\ No move
T [6, 6, 6, 6, 6, 5, 5, 5, 5, 6, 7, 8]
\ No move
I [7, 7, 7, 7, 7, 6, 6, 6, 6, 5, 6, 7]
\ No move
N [8, 8, 8, 7, 8, 7, 7, 6, 7, 6, 5, 6]
\ Replace 10, E
G [9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 6, 6]
Note that I had to add extra conditions if the element in the diagonal is the same as the current element. There could be a deletion or insertion depending on values in the vertical (up) and horizontal (left) positions. We only get a "no operation" or "replace" operation when the following occurs
# assume bottom right of a 2x2 matrix is the reference position
# and has value v
# the following is the situation where we get a replace operation
[v + 1 , v<]
[ v< , v]
# the following is the situation where we get a "no operation"
[v , v<]
[v<, v ]
I think this is where the algorithm described in the first answer could break. There could be other arrangements in the 2x2 matrix above when neither operations are correct. The example shown with input "OPERATING" and output "CONSTANTINE" breaks the algorithm unless this is taken into account.
It's been some times since I played with it, but it seems to me the matrix should look something like:
. . r e p u b l i c a n
. 0 1 2 3 4 5 6 7 8 9 10
d 1 1 2 3 4 5 6 7 8 9 10
e 2 2 1 2 3 4 5 6 7 8 9
m 3 3 2 2 3 4 5 6 7 8 9
o 4 4 3 3 3 4 5 6 7 8 9
c 5 5 4 4 4 4 5 6 7 8 9
r 6 5 5 5 5 5 5 6 7 8 9
a 7 6 6 6 6 6 6 6 7 7 8
t 8 7 7 7 7 7 7 7 7 7 8
Don't take it for granted though.
Here is a VBA algorithm based on mjv's answer.
(It is very well explained, but some cases were missing.)
' Manual test driver: runs Levenshtein on three sample string pairs.
Sub TU_Levenshtein()
    Levenshtein "democrat", "republican"
    Levenshtein "ooo", "u"
    Levenshtein "ceci est un test", "ceci n'est pas un test"
End Sub
' Computes the Levenshtein distance matrix between string1 and string2, writes it
' to the active worksheet, then backtracks one optimal path from the bottom-right
' cell to the top-left cell. Every step is reported with a MsgBox, so the
' operations appear in reverse order (last edit first).
Sub Levenshtein(ByVal string1 As String, ByVal string2 As String)
    ' Fill Matrix Levenshtein (-> array 'Distance')
    Dim i As Long, j As Long
    Dim string1_length As Long
    Dim string2_length As Long
    Dim distance() As Long
    string1_length = Len(string1)
    string2_length = Len(string2)
    ReDim distance(string1_length, string2_length)
    For i = 0 To string1_length
        distance(i, 0) = i
    Next
    For j = 0 To string2_length
        distance(0, j) = j
    Next
    For i = 1 To string1_length
        For j = 1 To string2_length
            If Asc(Mid$(string1, i, 1)) = Asc(Mid$(string2, j, 1)) Then
                distance(i, j) = distance(i - 1, j - 1)
            Else
                distance(i, j) = Application.WorksheetFunction.min _
                    (distance(i - 1, j) + 1, _
                    distance(i, j - 1) + 1, _
                    distance(i - 1, j - 1) + 1)
            End If
        Next
    Next
    LevenshteinDistance = distance(string1_length, string2_length) ' for information only

    ' Write the matrix on the sheet (visualisation only, not used in the calculation).
    ' distance(i, j) is shown in cell (i + 2, j + 2); row 1 and column 1 hold the letters.
    Cells.Clear
    For i = 1 To UBound(distance, 1)
        Cells(i + 2, 1).Value = Mid(string1, i, 1)
    Next i
    For i = 1 To UBound(distance, 2)
        Cells(1, i + 2).Value = Mid(string2, i, 1)
    Next i
    For i = 0 To UBound(distance, 1)
        For j = 0 To UBound(distance, 2)
            Cells(i + 2, j + 2) = distance(i, j)
        Next j
    Next i

    ' One solution
    current_posx = UBound(distance, 1)
    current_posy = UBound(distance, 2)
    ' Pre-tested loop: two empty strings start at (0, 0) and need no step at all.
    Do While Not (current_posx = 0 And current_posy = 0)
        cc = distance(current_posx, current_posy)
        ' Highlight the visited cell, using the same +2 offset the matrix was written with.
        Cells(current_posx + 2, current_posy + 2).Interior.Color = vbYellow

        If current_posy = 0 Then
            ' Left border: only deletions remain.
            MsgBox ("deletion. " & Mid(string1, current_posx, 1))
            current_posx = current_posx - 1
        ElseIf current_posx = 0 Then
            ' Top border: only insertions remain.
            MsgBox ("insertion. " & Mid(string2, current_posy, 1))
            current_posy = current_posy - 1
        Else
            ' Middle cases
            cc_L = distance(current_posx, current_posy - 1)
            cc_U = distance(current_posx - 1, current_posy)
            cc_D = distance(current_posx - 1, current_posy - 1)
            If (cc_D <= cc_L And cc_D <= cc_U) And (cc_D = cc - 1 Or cc_D = cc) Then
                If (cc_D = cc - 1) Then
                    MsgBox "substitution. " & Mid(string1, current_posx, 1) & " by " & Mid(string2, current_posy, 1)
                Else
                    MsgBox "no operation"
                End If
                current_posx = current_posx - 1
                current_posy = current_posy - 1
            ElseIf cc_L <= cc_D And cc_L = cc - 1 Then
                MsgBox ("insertion. " & Mid(string2, current_posy, 1))
                current_posy = current_posy - 1
            Else
                ' A deletion removes a character of string1 and moves UP one row.
                ' This branch used to index string1 by the column and move left.
                MsgBox ("deletion." & Mid(string1, current_posx, 1))
                current_posx = current_posx - 1
            End If
        End If
    Loop
End Sub
I've done some work with the Levenshtein distance algorithm's matrix recently. I needed to produce the operations which would transform one list into another. (This will work for strings too.)
Do the following (vows) tests show the sort of functionality that you're looking for?
, "lev - complex 2"
: { topic
: lev.diff([13, 6, 5, 1, 8, 9, 2, 15, 12, 7, 11], [9, 13, 6, 5, 1, 8, 2, 15, 12, 11])
, "check actions"
: function(topic) { assert.deepEqual(topic, [{ op: 'delete', pos: 9, val: 7 },
{ op: 'delete', pos: 5, val: 9 },
{ op: 'insert', pos: 0, val: 9 },
]); }
}
, "lev - complex 3"
: { topic
: lev.diff([9, 13, 6, 5, 1, 8, 2, 15, 12, 11], [13, 6, 5, 1, 8, 9, 2, 15, 12, 7, 11])
, "check actions"
: function(topic) { assert.deepEqual(topic, [{ op: 'delete', pos: 0, val: 9 },
{ op: 'insert', pos: 5, val: 9 },
{ op: 'insert', pos: 9, val: 7 }
]); }
}
, "lev - complex 4"
: { topic
: lev.diff([9, 13, 6, 5, 1, 8, 2, 15, 12, 11, 16], [13, 6, 5, 1, 8, 9, 2, 15, 12, 7, 11, 17])
, "check actions"
: function(topic) { assert.deepEqual(topic, [{ op: 'delete', pos: 0, val: 9 },
{ op: 'insert', pos: 5, val: 9 },
{ op: 'insert', pos: 9, val: 7 },
{ op: 'replace', pos: 11, val: 17 }
]); }
}
Here is some Matlab code; is this correct in your opinion? It seems to give the right results :)
% Script: builds the Levenshtein matrix for s -> t, then walks it backwards from
% the bottom-right corner, printing one labelled step per move.
clear all
s = char('democrat');
t = char('republican');
% Edit Matrix
% mat(i+1, j+1) holds the distance between the first i characters of s and the
% first j characters of t (MATLAB indices start at 1, hence the +1 offsets).
m=length(s);
n=length(t);
mat=zeros(m+1,n+1);
% First column / first row: distance to the empty prefix.
for i=1:1:m
mat(i+1,1)=i;
end
for j=1:1:n
mat(1,j+1)=j;
end
for i=1:m
for j=1:n
if (s(i) == t(j))
% Matching characters: carry the diagonal value over.
mat(i+1,j+1)=mat(i,j);
else
% Otherwise one more than the cheapest of left, up and diagonal.
mat(i+1,j+1)=1+min(min(mat(i+1,j),mat(i,j+1)),mat(i,j));
end
end
end
% Edit Sequence
s = char('democrat');
t = char('republican');
i = m+1;
j = n+1;
display([s ' --> ' t])
% NOTE(review): the loop stops as soon as either index reaches 1, so steps that
% remain along the first row or first column are not reported - confirm intended.
while(i ~= 1 && j ~= 1)
% Smallest of the diagonal, left and upper neighbours.
temp = min(min(mat(i-1,j-1), mat(i,j-1)), mat(i-1,j));
if(mat(i-1,j) == temp)
% Upper neighbour is smallest: move up one row and splice s(i) into t.
i = i - 1;
t = [t(1:j-1) s(i) t(j:end)];
disp(strcat(['iinsertion: i=' int2str(i) ' , j=' int2str(j) ' ; ' s ' --> ' t]))
elseif(mat(i-1,j-1) == temp)
if(mat(i-1,j-1) == mat(i,j))
% Diagonal with the same value: nothing to change.
i = i - 1;
j = j - 1;
disp(strcat(['uunchanged: i=' int2str(i) ' , j=' int2str(j) ' ; ' s ' --> ' t]))
else
% Diagonal with a different value: overwrite t(j) with s(i).
i = i - 1;
j = j - 1;
t(j) = s(i);
disp(strcat(['substition: i=' int2str(i) ' , j=' int2str(j) ' ; ' s ' --> ' t]))
end
elseif(mat(i,j-1) == temp)
% Left neighbour is smallest: move left one column and remove t(j).
j = j - 1;
t(j) = [];
disp(strcat(['dddeletion: i=' int2str(i) ' , j=' int2str(j) ' ; ' s ' --> ' t]))
end
end
C# implementation of JackIsJack answer with some changes:
Operations are output in 'forward' order (JackIsJack outputs in reverse order);
Last 'else' clause in original answer worked incorrectly (looks like copy-paste error).
Console application code:
// Console demo: prints the Levenshtein distance between two strings together
// with one optimal sequence of edit operations, in forward order.
class Program
{
    // Runs the sample pairs, each one in both directions.
    static void Main(string[] args)
    {
        var samples = new[]
        {
            new[] { "1", "1234567890" },
            new[] { "1234567890", "1" },
            new[] { "kitten", "mittens" },
            new[] { "mittens", "kitten" },
            new[] { "kitten", "sitting" },
            new[] { "sitting", "kitten" },
            new[] { "1234567890", "12356790" },
            new[] { "12356790", "1234567890" },
            new[] { "ceci est un test", "ceci n'est pas un test" },
            new[] { "ceci n'est pas un test", "ceci est un test" },
        };
        foreach (var pair in samples)
        {
            Levenshtein(pair[0], pair[1]);
        }
    }

    // Prints the header, the distance and the edit operations turning
    // string1 into string2.
    static void Levenshtein(string string1, string string2)
    {
        Console.WriteLine("Levenstein '" + string1 + "' => '" + string2 + "'");

        int[,] distance = BuildDistanceMatrix(string1, string2);
        Console.WriteLine($"Levernstein distance: {distance[string1.Length, string2.Length]}");

        // Backtracking yields the last operation first; print back to front.
        List<string> steps = Backtrack(distance, string1, string2);
        for (int k = steps.Count - 1; k >= 0; k--)
        {
            Console.WriteLine(steps[k]);
        }
    }

    // Fills the (len1 + 1) x (len2 + 1) dynamic-programming table.
    static int[,] BuildDistanceMatrix(string string1, string string2)
    {
        int rows = string1.Length + 1;
        int cols = string2.Length + 1;
        var table = new int[rows, cols];

        for (int r = 0; r < rows; r++)
        {
            table[r, 0] = r;
        }
        for (int c = 0; c < cols; c++)
        {
            table[0, c] = c;
        }

        for (int r = 1; r < rows; r++)
        {
            for (int c = 1; c < cols; c++)
            {
                if (string1[r - 1] == string2[c - 1])
                {
                    table[r, c] = table[r - 1, c - 1];
                    continue;
                }
                int cheapestNeighbour = Math.Min(
                    table[r - 1, c],
                    Math.Min(table[r, c - 1], table[r - 1, c - 1]));
                table[r, c] = cheapestNeighbour + 1;
            }
        }
        return table;
    }

    // Walks from the bottom-right cell back to the top-left one and returns the
    // operations met on the way (so in reverse order).
    static List<string> Backtrack(int[,] distance, string string1, string string2)
    {
        var steps = new List<string>();
        int row = string1.Length;
        int col = string2.Length;

        while (row != 0 || col != 0)
        {
            if (col == 0)
            {
                // Left border: only deletions are possible.
                steps.Add("Delete '" + string1[row - 1] + "'");
                row--;
            }
            else if (row == 0)
            {
                // Top border: only insertions are possible.
                steps.Add("Insert '" + string2[col - 1] + "'");
                col--;
            }
            else
            {
                int here = distance[row, col];
                int left = distance[row, col - 1];
                int up = distance[row - 1, col];
                int diag = distance[row - 1, col - 1];

                bool diagIsSmallest = diag <= left && diag <= up;
                if (diagIsSmallest && (diag == here - 1 || diag == here))
                {
                    if (diag == here - 1)
                    {
                        steps.Add("Substitute '" + string1[row - 1] + "' by '" + string2[col - 1] + "'");
                    }
                    else
                    {
                        steps.Add("Keep '" + string1[row - 1] + "'");
                    }
                    row--;
                    col--;
                }
                else if (left <= diag && left == here - 1)
                {
                    steps.Add("Insert '" + string2[col - 1] + "'");
                    col--;
                }
                else
                {
                    steps.Add("Delete '" + string1[row - 1] + "'");
                    row--;
                }
            }
        }
        return steps;
    }
}
The code to get all the edit paths according to edit matrix, source and target. Make a comment if there are any bugs. Thanks a lot!
import copy
from typing import List, Union
def edit_distance(source: Union[List[str], str],
                  target: Union[List[str], str],
                  return_distance: bool = False):
    """Build the Levenshtein edit matrix between ``source`` and ``target``.

    Rows follow ``source`` and columns follow ``target``; row 0 and column 0
    stand for the empty prefixes.

    Returns:
        The distance (bottom-right cell) when ``return_distance`` is true,
        otherwise the whole matrix as a list of rows.
    """
    row_count = len(source) + 1
    col_count = len(target) + 1
    # Only the border values of this initial fill survive; every interior
    # cell is overwritten below.
    edit_matrix = [[i + j for j in range(col_count)] for i in range(row_count)]

    for i, source_token in enumerate(source, start=1):
        above = edit_matrix[i - 1]
        row = edit_matrix[i]
        for j, target_token in enumerate(target, start=1):
            substitution_cost = 0 if source_token == target_token else 1
            row[j] = min(above[j] + 1,
                         row[j - 1] + 1,
                         above[j - 1] + substitution_cost)

    return edit_matrix[-1][-1] if return_distance else edit_matrix
def get_edit_paths(matrix: List[List[int]],
                   source: Union[List[str], str],
                   target: Union[List[str], str]):
    """Enumerate every optimal edit script encoded in a Levenshtein matrix.

    Args:
        matrix: Edit matrix with rows following ``source`` and columns
            following ``target`` (row 0 / column 0 are the empty prefixes).
        source: The sequence being edited.
        target: The sequence to obtain.

    Returns:
        A list of paths. Each path lists its operations in forward order as
        ``"Replace | <j> | <token>"``, ``"Delete | <i>"`` or
        ``"Insert | <j> | <token>"``, where ``i`` indexes ``source`` and ``j``
        indexes ``target``. Kept (matching) tokens produce no entry.
    """
    all_paths = []

    def _edit_path(i, j, optimal_path):
        # A path is complete only at the top-left corner. Stopping at the
        # first border would drop the remaining inserts or deletes.
        if i == 0 and j == 0:
            all_paths.append(list(reversed(optimal_path)))
            return

        current = matrix[i][j]

        # A move is part of an optimal path exactly when the neighbour's value
        # plus the cost of the move equals the current value. Testing that
        # directly (instead of "is the minimum neighbour") also finds the
        # diagonal "keep" moves that tie with a delete or an insert.
        if i > 0 and j > 0:
            diagonal = matrix[i - 1][j - 1]
            if source[i - 1] == target[j - 1]:
                if diagonal == current:
                    # Keep: no operation is recorded.
                    _edit_path(i - 1, j - 1, optimal_path)
            elif diagonal + 1 == current:
                _edit_path(
                    i - 1, j - 1,
                    optimal_path + [f"Replace | {j - 1} | {target[j - 1]}"])

        if i > 0 and matrix[i - 1][j] + 1 == current:
            _edit_path(i - 1, j, optimal_path + [f"Delete | {i - 1}"])

        if j > 0 and matrix[i][j - 1] + 1 == current:
            _edit_path(
                i, j - 1,
                optimal_path + [f"Insert | {j - 1} | {target[j - 1]}"])

    # Start from the bottom-right cell of the edit matrix.
    _edit_path(len(matrix) - 1, len(matrix[0]) - 1, optimal_path=[])
    return all_paths
if __name__ == "__main__":
    # Demo: list the edit paths that turn "BBDEF" into "ABCDF".
    source, target = "BBDEF", "ABCDF"
    matrix = edit_distance(source, target)
    print("print paths")
    paths = get_edit_paths(matrix, source=list(source), target=list(target))
    print(*paths, sep="\n")

Resources