Skip to content

Instantly share code, notes, and snippets.

@mkyt
Created May 18, 2020 07:32
Show Gist options
  • Save mkyt/f500ac3a0fad1918f89baa80bd2fdcb9 to your computer and use it in GitHub Desktop.
Save mkyt/f500ac3a0fad1918f89baa80bd2fdcb9 to your computer and use it in GitHub Desktop.
法令漢数字→アラビア数字
from kanjize import kanji2int
import re
# One or more kanji numeral characters (plain digits, formal variants and
# rank characters); shared by every rule below.
_KNUM = r'[〇一二三四五六七八九十壱弐参拾百千万萬億兆]+'
# The same run as a capturing group: the part of a match that gets converted.
_KNUM_GROUP = '(' + _KNUM + ')'

# Conversion rules as (regex, index of the capture group to convert).
# subst() applies them one after another, in this order.
exprs = [
    (f'月{_KNUM_GROUP}日', 1),  # day of month, e.g. 三月二十八日
    (f'年{_KNUM_GROUP}月', 1),  # month after a year, e.g. 昭和四十二年三月
    (f'(明治|大正|昭和|平成|令和){_KNUM_GROUP}年', 2),  # era year, e.g. 昭和四十二年
    (f'第{_KNUM_GROUP}(章|節|条|項|号)', 1),  # numbered unit, e.g. 第二十三条
    (f'{_KNUM}の{_KNUM_GROUP}', 1),  # nested branch number, e.g. 第十九条の六の十五
    (f'条の{_KNUM_GROUP}', 1),  # first branch number, e.g. 第十九条の六
    (f'^{_KNUM_GROUP}', 1),  # <line start> numeral, e.g. 二
    (f'^第{_KNUM_GROUP}', 1),  # <line start> 第 + numeral, e.g. 第二
    (rf'^\({_KNUM_GROUP}\)', 1),  # <line start> parenthesised numeral, e.g. (三)
    (f'{_KNUM_GROUP}(週|日|時間|分|秒|回)', 1),  # quantity with a unit, e.g. 七十二時間
]
# Plain kanji digits in value order: each character's index is its value.
knums = '〇一二三四五六七八九'
# Lookup table from a kanji digit to its ASCII digit ('〇' -> '0' ... '九' -> '9').
# Built with a comprehension so no loop variables leak into module scope.
kn2n = {kanji: str(value) for value, kanji in enumerate(knums)}
def is_complex(kn):
    """Return True if ``kn`` needs full kanji-number parsing.

    A string counts as complex when it contains at least one of the
    characters 十壱弐参拾百千万萬億兆. Besides the rank characters this set
    includes 壱弐参, which the plain digit table ``kn2n`` does not contain.

    Args:
        kn: String of kanji numeral characters.

    Returns:
        True if any character of ``kn`` is in the set above, otherwise False
        (including for the empty string).
    """
    rank_chars = '十壱弐参拾百千万萬億兆'
    return any(ch in rank_chars for ch in kn)
def gen_k2i(arg_n):
    """Build a ``re.sub`` replacement callback that converts one capture group.

    Args:
        arg_n: Index of the capture group that holds the kanji numeral.

    Returns:
        A function that takes a match object and returns the full matched
        text with only the span of group ``arg_n`` replaced by digits.
    """
    def res(m):
        k = m.group(arg_n)
        if is_complex(k):
            # Contains rank characters: delegate parsing to kanji2int.
            i = str(kanji2int(k))
        else:
            # treat as a simple list of kanji numerals e.g. "第一三〇号"
            i = ''.join(kn2n[kn] for kn in k)
        # Splice by position instead of str.replace(): replace() rewrites
        # every occurrence of the captured text inside the match, so
        # "十六の六" with group "六" would become "十6の6".
        whole = m.group(0)
        offset = m.start(0)
        lo = m.start(arg_n) - offset
        hi = m.end(arg_n) - offset
        return whole[:lo] + i + whole[hi:]
    return res
def subst(s):
    """Apply every rule in ``exprs`` to ``s`` and return the converted text.

    Rules run in list order, each on the output of the previous one. Patterns
    are compiled with ``re.MULTILINE`` so that ``^`` matches at the start of
    every line.

    Args:
        s: Input text.

    Returns:
        The text after all substitutions.
    """
    for pattern_src, group_no in exprs:
        rule = re.compile(pattern_src, flags=re.MULTILINE)
        s = rule.sub(gen_k2i(group_no), s)
    return s
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment