Created
February 19, 2020 16:10
-
-
Save jlherren/d97839b1276b9bd7faa930f74711a4b6 to your computer and use it in GitHub Desktop.
Find Levenshtein distance between two strings and construct edit instructions to go from one string to the other
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy | |
def wagner_fisher(s: str, t: str): | |
""" | |
Computes the Levenshtein distance between the two strings. Returns a tuple containing | |
the distance itself and also the entire matrix for further processing. | |
See: https://en.wikipedia.org/wiki/Wagner%E2%80%93Fischer_algorithm | |
""" | |
m, n = len(s), len(t) | |
d = numpy.zeros(shape=(m + 1, n + 1), dtype='int32') | |
for i in range(1, m + 1): | |
d[i, 0] = i | |
for j in range(1, n + 1): | |
d[0, j] = j | |
for j in range(1, n + 1): | |
for i in range(1, m + 1): | |
if s[i - 1] == t[j - 1]: | |
substitutionCost = 0 | |
else: | |
substitutionCost = 1 | |
d[i, j] = min(d[i - 1, j] + 1, d[i, j - 1] + 1, d[i - 1, j - 1] + substitutionCost) | |
return d[m, n], d | |
def edit_instructions(s: str, t: str): | |
""" | |
Compute the edit operations required to get from string s to string t | |
""" | |
distance, d = wagner_fisher(s, t) | |
m, n = len(s), len(t) | |
instructions = [] | |
while m > 0 or n > 0: | |
deletion_score = d[m - 1, n] if m >= 1 else float('inf') | |
insertion_score = d[m, n - 1] if n >= 1 else float('inf') | |
substitution_or_noop_score = d[m - 1, n - 1] if m >= 1 and n >= 1 else float('inf') | |
smallest = min(deletion_score, insertion_score, substitution_or_noop_score) | |
if smallest == substitution_or_noop_score: | |
if d[m - 1, n - 1] < d[m, n]: | |
instructions.append('substitute "%s" with "%s" at position %d' % (s[m - 1], t[n - 1], n - 1)) | |
m -= 1 | |
n -= 1 | |
elif smallest == deletion_score: | |
instructions.append('delete "%s" at position %d' % (s[m - 1], n)) | |
m -= 1 | |
elif smallest == insertion_score: | |
instructions.append('insert "%s" at position %d' % (t[n - 1], n - 1)) | |
n -= 1 | |
if distance != len(instructions): | |
raise Exception('Internal error') | |
return instructions[::-1] | |
# A few tests | |
def test(s, t, instructions): | |
i = edit_instructions(s, t) | |
i = ', '.join(i) | |
if i != instructions: | |
print('Test failed for "%s" and "%s"' % (s, t)) | |
print('Expected:', instructions) | |
print('Obtained:', i) | |
for s in ['', 'a', 'aa', 'aaa', 'ab', 'abc', 'abcd']: | |
test(s, s, '') | |
test('aa', 'a', 'delete "a" at position 1') | |
test('a', 'aa', 'insert "a" at position 1') | |
test('a', 'b', 'substitute "a" with "b" at position 0') | |
test('a', 'ab', 'insert "b" at position 1') | |
test('a', 'ba', 'insert "b" at position 0') | |
test('bc', 'abc', 'insert "a" at position 0') | |
test('ac', 'abc', 'insert "b" at position 1') | |
test('ab', 'abc', 'insert "c" at position 2') | |
test('abc', 'bc', 'delete "a" at position 0') | |
test('abc', 'ac', 'delete "b" at position 1') | |
test('abc', 'ab', 'delete "c" at position 2') | |
test('abc', 'cba', 'substitute "a" with "c" at position 0, substitute "c" with "a" at position 2') | |
test('abc', '123', 'substitute "a" with "1" at position 0, substitute "b" with "2" at position 1, ' + | |
'substitute "c" with "3" at position 2') | |
test('abc', 'ab3', 'substitute "c" with "3" at position 2') | |
test('abc', 'a2c', 'substitute "b" with "2" at position 1') | |
test('abc', '1bc', 'substitute "a" with "1" at position 0') | |
test('abc--abc', 'bc--bc', 'delete "a" at position 0, delete "a" at position 4') | |
test('abc--abc', 'bc--0abc', 'delete "a" at position 0, insert "0" at position 4') | |
test('abc--abc', 'bc--1bc', 'delete "a" at position 0, substitute "a" with "1" at position 4') | |
test('abc--abc', '0abc--bc', 'insert "0" at position 0, delete "a" at position 6') | |
test('abc--abc', '0abc--0abc', 'insert "0" at position 0, insert "0" at position 6') | |
test('abc--abc', '0abc--1abc', 'insert "0" at position 0, insert "1" at position 6') | |
test('abc--abc', '1bc--bc', 'substitute "a" with "1" at position 0, delete "a" at position 5') | |
test('abc--abc', '1bc--0abc', 'substitute "a" with "1" at position 0, insert "0" at position 5') | |
test('abc--abc', '1bc--1bc', 'substitute "a" with "1" at position 0, substitute "a" with "1" at position 5') | |
test('abcdefghijklmnopqrstuvwxyz', 'a0bcefGhi0jkmnOpq0rsuvWxyz', 'insert "0" at position 1, delete "d" at position 4, ' + | |
'substitute "g" with "G" at position 6, insert "0" at position 9, delete "l" at position 12, ' + | |
'substitute "o" with "O" at position 14, insert "0" at position 17, delete "t" at position 20, ' + | |
'substitute "w" with "W" at position 22') |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment