Skip to content

Instantly share code, notes, and snippets.

@sharkdeng
Created October 19, 2020 02:47
Show Gist options
  • Save sharkdeng/0be9cdad9b93b67f2e81656459f05eba to your computer and use it in GitHub Desktop.
Save sharkdeng/0be9cdad9b93b67f2e81656459f05eba to your computer and use it in GitHub Desktop.
Jaro-wrinkle to compare string similarity (updated version of Jaro)
def jaro(val1, val2):
'''
Computes the Jaro similarity between 2 sequences from:
Matthew A. Jaro (1989). Advances in record linkage methodology
as applied to the 1985 census of Tampa Florida. Journal of the
American Statistical Association. 84 (406): 414–20.
Returns a value between 0.0 and 1.0.
Implemented by Shark
'''
# If at least one of the values is empty return 0
if (val1 == '') or (val2 == ''):
return 0.0
# If both attribute values exactly match return 1
elif (val1 == val2):
return 1.0
# 1 get |s1| and |s2|
len1 = len(val1)
len2 = len(val2)
ass1 = '' # found characters
ass2 = ''
halflen = int(max(len1, len2)/2)-1
JARO_MARKER_CHAR = '1' # to indicate this character has been found
# 2 get matches
m = 0
# traverse the first string
for i in range(len1):
start = max(0, i-halflen)
end = min(i+halflen+1, len2)
j = val2.find(val1[i], start, end)
if j > -1: # found
ass1 += val1[i]
ass2 += val2[j]
m+=1
# replace the found character to avoid duplicate finding
val2 = val2[:j] + JARO_MARKER_CHAR + val2[j+1:]
if m == 0:
return 0
# 3 get transposition
t = 0
for i in range(len(ass1)):
if ass1[i] != ass2[i]:
t += 1
t = t/2.0
# 4 get jaro
jaro = (m/len1 + m/len2 + (m-t)/m)/3
assert (jaro >= 0) and (jaro <= 1), f'Jaro score {jaro} should be in [0, 1]'
return round(jaro, 4)
def jaro_wrinkle(val1, val2):
if val1 == val2:
return 1
# Sj: jaro score
Sj = jaro(val1, val2)
if Sj == 0:
return 0
# L: get the length of matching prefix
minlen = min(len(val1), len(val2))
for L in range(1, minlen+1): # start 1, since val1[:0] = '' = val2[:0]
if val1[:L] != val2[:L]:
break
L -= 1
L = 4 if L>4 else L
assert L > 0
# P: scaling factor
P = 0.1
# Sw = Sj + P * L * (1 – Sj)
Sw = Sj + P * L * (1-Sj)
return round(Sw, 4)
def test():
assert jaro_wrinkle('DwAyNE', 'DuANE') == 0.84
assert jaro_wrinkle('TRATE', 'TRACE') == 0.9067
test()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment