Created
October 19, 2020 02:47
-
-
Save sharkdeng/0be9cdad9b93b67f2e81656459f05eba to your computer and use it in GitHub Desktop.
Jaro-wrinkle to compare string similarity (updated version of Jaro)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def jaro(val1, val2): | |
''' | |
Computes the Jaro similarity between 2 sequences from: | |
Matthew A. Jaro (1989). Advances in record linkage methodology | |
as applied to the 1985 census of Tampa Florida. Journal of the | |
American Statistical Association. 84 (406): 414–20. | |
Returns a value between 0.0 and 1.0. | |
Implemented by Shark | |
''' | |
# If at least one of the values is empty return 0 | |
if (val1 == '') or (val2 == ''): | |
return 0.0 | |
# If both attribute values exactly match return 1 | |
elif (val1 == val2): | |
return 1.0 | |
# 1 get |s1| and |s2| | |
len1 = len(val1) | |
len2 = len(val2) | |
ass1 = '' # found characters | |
ass2 = '' | |
halflen = int(max(len1, len2)/2)-1 | |
JARO_MARKER_CHAR = '1' # to indicate this character has been found | |
# 2 get matches | |
m = 0 | |
# traverse the first string | |
for i in range(len1): | |
start = max(0, i-halflen) | |
end = min(i+halflen+1, len2) | |
j = val2.find(val1[i], start, end) | |
if j > -1: # found | |
ass1 += val1[i] | |
ass2 += val2[j] | |
m+=1 | |
# replace the found character to avoid duplicate finding | |
val2 = val2[:j] + JARO_MARKER_CHAR + val2[j+1:] | |
if m == 0: | |
return 0 | |
# 3 get transposition | |
t = 0 | |
for i in range(len(ass1)): | |
if ass1[i] != ass2[i]: | |
t += 1 | |
t = t/2.0 | |
# 4 get jaro | |
jaro = (m/len1 + m/len2 + (m-t)/m)/3 | |
assert (jaro >= 0) and (jaro <= 1), f'Jaro score {jaro} should be in [0, 1]' | |
return round(jaro, 4) | |
def jaro_wrinkle(val1, val2): | |
if val1 == val2: | |
return 1 | |
# Sj: jaro score | |
Sj = jaro(val1, val2) | |
if Sj == 0: | |
return 0 | |
# L: get the length of matching prefix | |
minlen = min(len(val1), len(val2)) | |
for L in range(1, minlen+1): # start 1, since val1[:0] = '' = val2[:0] | |
if val1[:L] != val2[:L]: | |
break | |
L -= 1 | |
L = 4 if L>4 else L | |
assert L > 0 | |
# P: scaling factor | |
P = 0.1 | |
# Sw = Sj + P * L * (1 – Sj) | |
Sw = Sj + P * L * (1-Sj) | |
return round(Sw, 4) | |
def test(): | |
assert jaro_wrinkle('DwAyNE', 'DuANE') == 0.84 | |
assert jaro_wrinkle('TRATE', 'TRACE') == 0.9067 | |
test() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment