Skip to content

Instantly share code, notes, and snippets.

@Gerenuk
Created June 29, 2020 16:42
Show Gist options
  • Save Gerenuk/d7fee2ebe3cfc1c945e86c5e1d75d572 to your computer and use it in GitHub Desktop.
Save Gerenuk/d7fee2ebe3cfc1c945e86c5e1d75d572 to your computer and use it in GitHub Desktop.
import re
import unicodedata
# States of the token parser in normalize_street(); they are visited in
# ascending order: street name -> first number -> second number -> done.
STATE_FIND_STREETNAME = 1
STATE_FIND_FIRST_NUMBER = 2
STATE_FIND_SECOND_NUMBER = 3
STATE_ALL_FOUND = 4


def normalize_street(
    street,
    regex=re.compile("[a-zß]+|[0-9]+"),  # relevant after unicode normalization
    MAX_HAUSNUMMER=10000,
    MAX_HAUSNUMMER_DIFF=2,
    LETTERS_AFTER_HAUSNUMMER="abc",
    replace=None,
):
    """Split a street address into a normalized name and house numbers.

    The input is lower-cased, stripped of accents (NFKD decomposition with
    combining marks removed) and tokenized with ``regex``. The token
    sequence is then expected to look like::

        Alpha+ ((Num1 Letter?) (Num2 Letter?)?)?

    Consider replacing ä->ae, straße->str etc. via ``replace``, maybe.

    Args:
        street: Raw address string. A falsy value (``None``, ``""``)
            yields an empty result.
        regex: Compiled pattern whose ``findall`` produces the tokens.
        MAX_HAUSNUMMER: Largest accepted house number; numbers outside
            ``1..MAX_HAUSNUMMER`` are reported and dropped.
        MAX_HAUSNUMMER_DIFF: Largest ``Num2 - Num1`` that is still expanded
            into a range of house numbers.
        LETTERS_AFTER_HAUSNUMMER: Iterable of suffix tokens that may follow
            a house number (e.g. the "a" in "21a"). Each element is matched
            as a whole token.
        replace: Optional mapping ``old -> new`` of substring replacements
            applied to the raw input before lower-casing, so keys are
            case-sensitive.

    Returns:
        A tuple ``(norm_street, numbers, clean_match)``: the space-joined
        street name tokens, the set of recognized house numbers, and a flag
        that is ``False`` when a fallback had to be used (trailing text
        after the numbers, or two numbers that do not form a small
        ascending range).

    Raises:
        ValueError: If the first token after the street name is not a
            decimal number (only possible with a custom ``regex``).

    >>> normalize_street("Abc dßäöüf ghi a21 b - 23 a")
    ('abc dßaouf ghi a', {21, 22, 23}, True)
    """
    if not street:  # empty input
        # Same shape as every other return, so callers can always unpack
        # three values.
        return "", set(), True

    x = street
    if replace is not None:
        for old, new in replace.items():
            x = x.replace(old, new)
    x = x.lower()
    # Remove accents: decompose, then drop the combining marks.
    x = unicodedata.normalize("NFKD", x)
    x = "".join(c for c in x if not unicodedata.combining(c))

    parts_iter = iter(regex.findall(x))
    # A set gives whole-token membership; ``in`` on the raw string would be
    # a substring test and wrongly accept tokens such as "ab".
    suffixes = set(LETTERS_AFTER_HAUSNUMMER)

    street_name_parts = []
    number1 = None
    number2 = None
    clean_match = True

    # Running out of tokens at any point simply ends parsing, hence the
    # single StopIteration handler around the whole state machine.
    try:
        state = STATE_FIND_STREETNAME
        part = next(parts_iter)
        while True:
            if state == STATE_FIND_STREETNAME:
                if part.isalpha():
                    street_name_parts.append(part)
                    part = next(parts_iter)
                else:
                    state = STATE_FIND_FIRST_NUMBER
            elif state == STATE_FIND_FIRST_NUMBER:
                if not part.isdecimal():
                    raise ValueError(f"Sollte Nummer sein: {part} in {street}")
                number1 = int(part)
                part = next(parts_iter)
                if part in suffixes:
                    part = next(parts_iter)
                state = STATE_FIND_SECOND_NUMBER
            elif state == STATE_FIND_SECOND_NUMBER:
                if part.isdecimal():
                    number2 = int(part)
                    part = next(parts_iter)
                    if part in suffixes:
                        part = next(parts_iter)
                # Advance even without a second number; the current token
                # is then reported as trailing text.
                state = STATE_ALL_FOUND
            elif state == STATE_ALL_FOUND:
                print(
                    f"Mehr Text '{part}' nachdem Straße/Hausnummer schon erkannt: {street}"
                )
                clean_match = False
                break
            else:
                raise AssertionError("Illegal state")
    except StopIteration:
        pass

    # Drop implausible house numbers.
    if number1 is not None and not (0 < number1 <= MAX_HAUSNUMMER):
        print(f"Illegale Hausnummer {number1} in '{street}'")
        number1 = None
    if number2 is not None and not (0 < number2 <= MAX_HAUSNUMMER):
        print(f"Illegale Hausnummer {number2} in '{street}'")
        number2 = None

    if number1 is None:
        # Without a valid first number nothing is reported, even if a
        # second number was parsed.
        numbers = set()
    elif number2 is None:
        numbers = {number1}
    elif number1 < number2 and number2 - number1 <= MAX_HAUSNUMMER_DIFF:
        numbers = set(range(number1, number2 + 1))
    else:
        print(f"Keine etwas ansteigenden Nummern in '{street}'")
        numbers = {number1}
        clean_match = False

    norm_street = " ".join(street_name_parts)

    if not clean_match:  # Usually for ambiguous normalizations
        numbers_text = ",".join(map(str, sorted(numbers)))
        print(
            f"Fallback normalization: '{street}' --> '{norm_street} {numbers_text}'"
        )

    return norm_street, numbers, clean_match
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment