# normalize_neologd.py -- GitHub gist hideaki-t/198898f44aab078ed1a1 (last active August 29, 2015 14:18). Contains bidirectional/non-ASCII Unicode text; review it in an editor that reveals hidden Unicode characters.
# encoding: utf8 | |
from __future__ import unicode_literals | |
import re | |
import unicodedata | |
def unicode_normalize(cls, s):
    """Apply NFKC normalization only to runs of characters listed in ``cls``.

    Args:
        cls: Body of a regular-expression character class, i.e. the text
            that goes between ``[`` and ``]`` (ranges are allowed).
        s: Text to normalize.

    Returns:
        ``s`` with every maximal run of ``cls`` characters replaced by its
        NFKC form; every other character is returned unchanged.
    """
    if not cls:
        # An empty class would compile to the invalid pattern "[]+";
        # with nothing selected there is nothing to normalize.
        return s
    pt = re.compile('[{}]+'.format(cls))
    # Substituting each matched run is equivalent to splitting on the runs,
    # normalizing the matching pieces and joining everything back together.
    return pt.sub(lambda m: unicodedata.normalize('NFKC', m.group()), s)
def remove_extra_spaces(s):
    """Collapse runs of spaces and drop spaces that touch Japanese text.

    Runs of ASCII and ideographic (U+3000) spaces are first collapsed to a
    single ASCII space.  A single space is then deleted when it sits between
    two characters of the CJK blocks listed below, or between one such
    character and a Basic Latin character (in either order).  A space
    between two Basic Latin characters is kept.

    Args:
        s: Text to clean up.

    Returns:
        The text with the redundant spaces removed.
    """
    # U+3000 is written as an escape so the full-width space cannot be
    # silently lost or confused with an ASCII space.
    s = re.sub('[ \u3000]+', ' ', s)
    blocks = ''.join((
        '\u4E00-\u9FFF',  # CJK UNIFIED IDEOGRAPHS
        '\u3040-\u309F',  # HIRAGANA
        '\u30A0-\u30FF',  # KATAKANA
        '\u3000-\u303F',  # CJK SYMBOLS AND PUNCTUATION
        '\uFF00-\uFFEF',  # HALFWIDTH AND FULLWIDTH FORMS
    ))
    basic_latin = '\u0000-\u007F'

    def remove_space_between(cls1, cls2, s):
        # Lookbehind/lookahead leave the neighbours unconsumed, so chains
        # such as "A B C" are handled in one pass instead of rescanning
        # the string until no match remains.
        p = re.compile('(?<=[{}]) (?=[{}])'.format(cls1, cls2))
        return p.sub('', s)

    s = remove_space_between(blocks, blocks, s)
    s = remove_space_between(blocks, basic_latin, s)
    s = remove_space_between(basic_latin, blocks, s)
    return s
def normalize_neologd(s):
    """Normalize Japanese text ``s`` (NEologd-style, per the function name).

    Steps, in order:

    1. Strip leading and trailing whitespace.
    2. NFKC-normalize full-width digits and Latin letters and the
       half-width punctuation/katakana forms.
    3. Map ASCII punctuation to full-width look-alikes, so that the
       space-removal step treats it like other full-width characters.
    4. Fold hyphen look-alikes to ``-``, fold runs of long-vowel look-alikes
       to a single prolonged sound mark, and delete tildes.
    5. Collapse and remove spaces with ``remove_extra_spaces``.
    6. NFKC-normalize the full-width punctuation again, except the
       full-width equals sign and the Japanese punctuation marks.

    Args:
        s: Text to normalize.

    Returns:
        The normalized text.
    """
    s = s.strip()

    # Ranges are spelled as escapes so that each one is unambiguous: a
    # look-alike of "-" between two characters does not form a range.
    s = unicode_normalize(
        '\uFF10-\uFF19'   # FULLWIDTH DIGIT ZERO..NINE
        '\uFF21-\uFF3A'   # FULLWIDTH LATIN CAPITAL LETTER A..Z
        '\uFF41-\uFF5A'   # FULLWIDTH LATIN SMALL LETTER A..Z
        '\uFF61-\uFF9F',  # HALFWIDTH punctuation and katakana forms
        s)

    def maketrans(f, t):
        # Code-point table for translate(); works for Python 2 unicode
        # strings as well as Python 3 str.
        return {ord(x): ord(y) for x, y in zip(f, t)}

    # The backslash is written as "\\": a bare "\]" is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    # NOTE: the backslash maps to the full-width yen sign, so after the
    # final NFKC pass a backslash in the input comes out as a yen sign.
    s = s.translate(
        maketrans('!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~。、・「」',
                  '!”#$%&’()*+,−./:;<=>?@[¥]^_`{|}〜。、・「」'))

    # normalize hyphens: U+02D7, U+058A, U+2010..U+2013, U+2043, U+207B,
    # U+208B and U+2212 all become an ASCII hyphen-minus.
    s = re.sub('[\u02D7\u058A\u2010\u2011\u2012\u2013'
               '\u2043\u207B\u208B\u2212]+', '-', s)
    # normalize choonpus: runs of U+FE63, U+FF0D, U+FF70, U+2014, U+2015,
    # U+2500, U+2501 and U+30FC become one U+30FC (prolonged sound mark).
    s = re.sub('[\uFE63\uFF0D\uFF70\u2014\u2015\u2500\u2501\u30FC]+',
               '\u30FC', s)
    # remove tildes: U+007E, U+223C, U+223E, U+301C, U+3030 and U+FF5E.
    s = re.sub('[~\u223C\u223E\u301C\u3030\uFF5E]', '', s)

    s = remove_extra_spaces(s)

    # keep =,・,「,」 (they are deliberately absent from this class)
    s = unicode_normalize('!”#$%&’()*+,−./:;<>?@[¥]^_`{|}〜', s)
    return s
if __name__ == "__main__":
    # Self-test: each pair is (expected output, raw input).
    # NOTE(review): in a few pairs the input looks identical to the expected
    # value (e.g. the digit and katakana cases); presumably the inputs were
    # full-width / half-width forms before this text was re-encoded --
    # confirm against the upstream gist.
    cases = (
        ("0", "0"),
        ("ハンカク", "ハンカク"),
        ("o-o", "o₋o"),
        ("majikaー", "majika━"),
        ("わい", "わ〰い"),
        ("スーパー", "スーパーーーー"),
        ("!#", "!#"),
        ("ゼンカクスペース", "ゼンカク スペース"),
        ("おお", "お お"),
        ("おお", " おお"),
        ("おお", "おお "),
        ("検索エンジン自作入門を買いました!!!",
         "検索 エンジン 自作 入門 を 買い ました!!!"),
        ("アルゴリズムC", "アルゴリズム C"),
        ("PRML副読本", " PRML 副 読 本 "),
        ("Coding the Matrix", "Coding the Matrix"),
        ("南アルプスの天然水Sparking Lemonレモン一絞り",
         "南アルプスの 天然水 Sparking Lemon レモン一絞り"),
        ("南アルプスの天然水- Sparking*Lemon+レモン一絞り",
         "南アルプスの 天然水- Sparking* Lemon+ レモン一絞り"),
    )
    for expected, raw in cases:
        actual = normalize_neologd(raw)
        # Report the offending case instead of a bare AssertionError.
        assert expected == actual, '{!r} -> {!r}, expected {!r}'.format(
            raw, actual, expected)
# (GitHub page footer: sign up for free, or sign in, to comment on this gist.)