brandonrobertz · December 9, 2021 20:44
diff --git a/damerau_levenshtein_distance.py b/damerau_levenshtein_distance.py
 # Damerau-Levenshtein edit distance implementation for Python 3
 # Python 3 compatible, forked from: https://gist.github.com/pombredanne/0d83ad58f45986ddeb0917266e106be0
 # Which was based on: https://gist.github.com/badocelot/5327427
 # Which was based on pseudocode from Wikipedia: https://en.wikipedia.org/wiki/Damerau-Levenshtein_distance

 # Possible improvement by treating 1 addition + 1 deletion = 1 substitution
 # between transposed characters:
 #
 # Damerau-Levenshtein distance for "abcdef" and "abcfad" = 3:
 #   1. substitute "d" for "f"
 #   2. substitute "e" for "a"
 #   3. substitute "f" for "d"
 #
 # Or alternatively:
 #   1. transpose "d" and "f"
 #   2. delete "a"
 #   3. insert "e"
 #
 # It's obvious that (2) and (3) in the second analysis are really just one
 # substitution:
 #   1. transpose "d" and "f"
 #   2. substitute "e" for "a"
 #
 # With this variant, the distance between "abcdef" and "abcfad" is in fact 2.

 def damerau_levenshtein_distance(a, b, normalized=False):
     """Return the Damerau-Levenshtein edit distance between ``a`` and ``b``.

     This is the variant in which the elements lying between two transposed
     elements cost ``max(gap_in_a, gap_in_b)`` instead of the sum of both
     gaps, i.e. one deletion plus one addition counts as one substitution.

     Args:
         a: First sequence (e.g. a string). Its elements must be hashable,
             because they are used as dictionary keys.
         b: Second sequence, compared element-wise against ``a``.
         normalized: If true, divide the distance by the length of the
             longer input so that 0 means "no differences" and 1 means
             "most different".

     Returns:
         The integer edit distance, or a float when ``normalized`` is true.
         Two empty inputs have a normalized distance of ``0.0``.
     """
     len_a, len_b = len(a), len(b)

     # "Infinity" -- greater than maximum possible edit distance.
     # Used to prevent transpositions for first characters.
     INF = len_a + len_b

     # Matrix: (M + 2) x (N + 2). Row 0 and column 0 hold INF sentinels;
     # row 1 and column 1 hold the distances from the empty prefix.
     matrix = [[INF] * (len_b + 2)]
     matrix.append([INF] + list(range(len_b + 1)))
     matrix.extend([INF, m] + [0] * len_b for m in range(1, len_a + 1))

     # Holds last row each element was encountered: DA in the Wikipedia
     # pseudocode.
     last_row = {}

     # Fill in costs
     for row, ch_a in enumerate(a, start=1):
         # Column of last match on this row: DB in pseudocode
         last_match_col = 0

         for col, ch_b in enumerate(b, start=1):
             # Last row with matching character (0 selects the INF row,
             # which rules the transposition out).
             last_matching_row = last_row.get(ch_b, 0)

             # Cost of substitution
             cost = 0 if ch_a == ch_b else 1

             # Transposition: start from the cost before the transposed
             # pair, add the cost of the letters between the transposed
             # letters (1 addition + 1 deletion = 1 substitution, hence
             # max rather than sum), then the transposition itself.
             transposition = (
                 matrix[last_matching_row][last_match_col]
                 + max(row - last_matching_row - 1, col - last_match_col - 1)
                 + 1
             )

             # Compute substring distance
             matrix[row + 1][col + 1] = min(
                 matrix[row][col] + cost,   # Substitution
                 matrix[row + 1][col] + 1,  # Addition
                 matrix[row][col + 1] + 1,  # Deletion
                 transposition,
             )

             # If there was a match, update last_match_col
             if cost == 0:
                 last_match_col = col

         # Update last row for current character
         last_row[ch_a] = row

     # Last element is the final edit distance
     dl_diff = matrix[-1][-1]

     if normalized:
         longest = max(len_a, len_b)
         if longest == 0:
             # Both inputs are empty: identical, and dividing by zero
             # would raise ZeroDivisionError.
             return 0.0
         # This gives us a 1-0 scale of distance, 1 meaning most
         # different, 0 meaning no differences.
         return dl_diff / longest

     return dl_diff
	# Damerau-Levenshtein edit distance implementation for Python 3
	# Python 3 compatible, forked from: https://gist.github.com/pombredanne/0d83ad58f45986ddeb0917266e106be0
	# Which was based on: https://gist.github.com/badocelot/5327427
	# Which was based on pseudocode from Wikipedia: https://en.wikipedia.org/wiki/Damerau-Levenshtein_distance

	# Possible improvement by treating 1 addition + 1 deletion = 1 substitution
	# between transposed characters:
	#
	# Damerau-Levenshtein distance for "abcdef" and "abcfad" = 3:
	# 1. substitute "d" for "f"
	# 2. substitute "e" for "a"
	# 3. substitute "f" for "d"
	#
	# Or alternatively:
	# 1. transpose "d" and "f"
	# 2. delete "a"
	# 3. insert "e"
	#
	# It's obvious that (2) and (3) in the second analysis are really just one
	# substitution:
	# 1. transpose "d" and "f"
	# 2. substitute "e" for "a"
	#
	# With this variant, the distance between "abcdef" and "abcfad" is in fact 2.

	def damerau_levenshtein_distance(a, b, normalized=False):
	    """Return the Damerau-Levenshtein edit distance between ``a`` and ``b``.

	    This is the variant in which the elements lying between two transposed
	    elements cost ``max(gap_in_a, gap_in_b)`` instead of the sum of both
	    gaps, i.e. one deletion plus one addition counts as one substitution.

	    Args:
	        a: First sequence (e.g. a string). Its elements must be hashable,
	            because they are used as dictionary keys.
	        b: Second sequence, compared element-wise against ``a``.
	        normalized: If true, divide the distance by the length of the
	            longer input so that 0 means "no differences" and 1 means
	            "most different".

	    Returns:
	        The integer edit distance, or a float when ``normalized`` is true.
	        Two empty inputs have a normalized distance of ``0.0``.
	    """
	    len_a, len_b = len(a), len(b)

	    # "Infinity" -- greater than maximum possible edit distance.
	    # Used to prevent transpositions for first characters.
	    INF = len_a + len_b

	    # Matrix: (M + 2) x (N + 2). Row 0 and column 0 hold INF sentinels;
	    # row 1 and column 1 hold the distances from the empty prefix.
	    matrix = [[INF] * (len_b + 2)]
	    matrix.append([INF] + list(range(len_b + 1)))
	    matrix.extend([INF, m] + [0] * len_b for m in range(1, len_a + 1))

	    # Holds last row each element was encountered: DA in the Wikipedia
	    # pseudocode.
	    last_row = {}

	    # Fill in costs
	    for row, ch_a in enumerate(a, start=1):
	        # Column of last match on this row: DB in pseudocode
	        last_match_col = 0

	        for col, ch_b in enumerate(b, start=1):
	            # Last row with matching character (0 selects the INF row,
	            # which rules the transposition out).
	            last_matching_row = last_row.get(ch_b, 0)

	            # Cost of substitution
	            cost = 0 if ch_a == ch_b else 1

	            # Transposition: start from the cost before the transposed
	            # pair, add the cost of the letters between the transposed
	            # letters (1 addition + 1 deletion = 1 substitution, hence
	            # max rather than sum), then the transposition itself.
	            transposition = (
	                matrix[last_matching_row][last_match_col]
	                + max(row - last_matching_row - 1, col - last_match_col - 1)
	                + 1
	            )

	            # Compute substring distance
	            matrix[row + 1][col + 1] = min(
	                matrix[row][col] + cost,   # Substitution
	                matrix[row + 1][col] + 1,  # Addition
	                matrix[row][col + 1] + 1,  # Deletion
	                transposition,
	            )

	            # If there was a match, update last_match_col
	            if cost == 0:
	                last_match_col = col

	        # Update last row for current character
	        last_row[ch_a] = row

	    # Last element is the final edit distance
	    dl_diff = matrix[-1][-1]

	    if normalized:
	        longest = max(len_a, len_b)
	        if longest == 0:
	            # Both inputs are empty: identical, and dividing by zero
	            # would raise ZeroDivisionError.
	            return 0.0
	        # This gives us a 1-0 scale of distance, 1 meaning most
	        # different, 0 meaning no differences.
	        return dl_diff / longest

	    return dl_diff