Skip to content

Instantly share code, notes, and snippets.

@e-mon
Created June 28, 2015 19:26
Show Gist options
  • Save e-mon/73d53835abec0d22e51e to your computer and use it in GitHub Desktop.
Save e-mon/73d53835abec0d22e51e to your computer and use it in GitHub Desktop.
SIMPLE
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import sys
from math import log
from collections import Counter,defaultdict
from functools import reduce
from itertools import chain
import Help
class SIMPLE:
    """Word-unigram kana-kanji converter trained on a segmented corpus.

    The learning corpus is a text file with one sentence per line.  Each
    whitespace-separated token is a ``word/kkci`` pair, where ``kkci`` is
    the input-symbol string that should be converted into ``word``.

    Attributes set by ``__init__``:
        UTMAXLEN: maximum length (in input symbols) of an unknown word.
        KKCInput: tuple of input-symbol groups returned by ``Input()``.
        BT, UT:   special symbols for the sentence boundary and the
                  unknown word.
        CharLogP: cost of generating one input symbol (or the BT that
                  terminates an unknown word) under a uniform model.
        LC:       path of the learning corpus.
        PairFreq: smoothed frequency table ``pair -> count``.
        Freq:     total count over all pairs (before smoothing).
        Dict:     conversion dictionary ``kkci -> [pair, ...]``.
    """

    # ---------------------------------------------------------------------
    # set variables
    # ---------------------------------------------------------------------
    def __init__(self, learning_corpus):
        """Build the language model and dictionary from *learning_corpus*.

        Args:
            learning_corpus: path of the corpus file.

        Raises:
            SystemExit: if the corpus file cannot be opened.
        """
        self.UTMAXLEN = 4
        self.KKCInput = self.Input()
        self.BT, self.UT = ("BT", "UT")
        # The first three groups are flat lists; the fourth is a list of lists.
        flatten_KKCInput = (list(chain.from_iterable(self.KKCInput[:3]))
                            + list(chain.from_iterable(self.KKCInput[3])))
        # "+ 1" accounts for BT, which ends the spelling of an unknown word.
        self.CharLogP = log(1 + len(flatten_KKCInput))
        self.LC = learning_corpus  # corpus file path
        PairFreq = self.generate_model()
        self.PairFreq, self.Freq = self.smoothing(PairFreq)
        self.Dict = self.create_dictionary(self.PairFreq)

    # ---------------------------------------------------------------------
    # build the language model (pair frequencies)
    # ---------------------------------------------------------------------
    def generate_model(self):
        """Count every token of the corpus, plus one BT per line.

        Returns:
            A ``Counter`` mapping each token (and ``self.BT``) to its
            frequency.  An empty corpus yields an empty ``Counter``.

        Raises:
            SystemExit: if ``self.LC`` cannot be opened; the message is
                written to stderr and the exit status is non-zero.
        """
        PairFreq = Counter()
        try:
            with open(self.LC, 'r') as corpus:
                for line in corpus:
                    PairFreq.update(line.split())
                    PairFreq[self.BT] += 1  # sentence-end symbol
        except IOError:
            sys.exit('"%s" cannot be opened.' % self.LC)
        return PairFreq

    # ---------------------------------------------------------------------
    # smoothing
    # ---------------------------------------------------------------------
    def smoothing(self, PairFreq):
        """Fold every pair seen exactly once into the unknown-word count.

        *PairFreq* is modified in place.  The special symbols BT and UT are
        never folded, so a sentence boundary seen once keeps its own count.

        Args:
            PairFreq: mapping ``pair -> count`` (``Counter`` or ``dict``).

        Returns:
            ``(PairFreq, Freq)`` where ``Freq`` is the sum of all counts
            as they were before folding.
        """
        Freq = 0  # f() = sum of f(word/kkci)
        specials = (self.BT, self.UT)
        for pair in list(PairFreq):  # snapshot: the table is mutated below
            freq = PairFreq[pair]
            Freq += freq
            if freq == 1 and pair not in specials:
                PairFreq[self.UT] = PairFreq.get(self.UT, 0) + freq
                del PairFreq[pair]
        return PairFreq, Freq

    # ---------------------------------------------------------------------
    # build the kana-kanji conversion dictionary
    # ---------------------------------------------------------------------
    def create_dictionary(self, PairFreq):
        """Index the known pairs by their input-symbol string.

        Args:
            PairFreq: mapping whose keys are ``word/kkci`` pairs (BT and UT
                keys are skipped).

        Returns:
            A ``defaultdict(list)`` mapping ``kkci -> [pair, ...]``.

        Raises:
            IndexError: if a key other than BT/UT contains no ``/``.
        """
        Dict = defaultdict(list)  # KKCI => <Word, KKCI>+
        specials = (self.BT, self.UT)
        for pair in PairFreq:
            if pair in specials:  # special symbols are not dictionary entries
                continue
            kkci = pair.split('/')[1]  # input-symbol part of the pair
            Dict[kkci].append(pair)
        return Dict

    # ---------------------------------------------------------------------
    # main
    # ---------------------------------------------------------------------
    def convert(self, inputs):
        """Convert one input-symbol string with the trained model.

        Args:
            inputs: the string to convert.

        Returns:
            The list of pairs produced by ``KKConv``.
        """
        return self.KKConv(inputs, self.PairFreq, self.Freq, self.Dict)

    # ---------------------------------------------------------------------
    # input symbol set
    # ---------------------------------------------------------------------
    def Input(self):
        """Return the input-symbol groups ``(LATINU, NUMBER, HIRAGANA, OTHERS)``.

        The first three are flat lists of single symbols; ``OTHERS`` is a
        list of such lists.
        """
        LATINU = "A B C D E F G H I J K L M N O P Q R S T U V W X Y Z".split()
        NUMBER = "0 1 2 3 4 5 6 7 8 9".split()
        HIRAGANA = ("ぁ あ ぃ い ぅ う ぇ え ぉ お か が き ぎ く"
                    " ぐ け げ こ ご さ ざ し じ す ず せ ぜ そ ぞ た"
                    " だ ち ぢ っ つ づ て で と ど な に ぬ ね の は"
                    " ば ぱ ひ び ぴ ふ ぶ ぷ へ べ ぺ ほ ぼ ぽ ま み"
                    " む め も ゃ や ゅ ゆ ょ よ ら り る れ ろ ゎ わ"
                    " ゐ ゑ を ん").split()
        OTHERS = [" ヴ ヵ ヶ ".split(),  # katakana-only characters
                  "ー = ¥ ` 「 」 ; ’ 、 。".split(),  # / => ・ (if US101)
                  "! @ # $ % ^ & * ( ) _ + | 〜 { } : ” < > ?".split(),
                  "・".split()]  # for JP106 keyboard
        return LATINU, NUMBER, HIRAGANA, OTHERS

    # ---------------------------------------------------------------------
    # KKConv
    # ---------------------------------------------------------------------
    def KKConv(self, sent, PairFreq, Freq, Dict):
        """Find the lowest-cost segmentation of *sent* by dynamic programming.

        A node is a tuple ``(previous_node, pair, cost)``; ``cost`` is the
        accumulated negative log probability up to and including ``pair``.

        Args:
            sent: input-symbol string to convert.
            PairFreq: mapping ``pair -> count`` including the UT entry.
            Freq: total count used as the probability denominator.
            Dict: mapping ``kkci -> [pair, ...]`` of known pairs.

        Returns:
            The pairs on the best path, left to right.  A span covered by an
            unknown word appears as ``kkci + '/' + self.UT``.  The list is
            empty for empty input, or when no node reaches the right end
            (which requires ``UTMAXLEN < 1``).
        """
        def cost(count):
            # -log(count / Freq); an event never observed costs +infinity
            # instead of raising on log(0) or dividing by zero.
            if count <= 0 or Freq <= 0:
                return float('inf')
            return -log(count / Freq)

        def node_cost(node):
            return node[2]

        POSI = len(sent)                              # rightmost position
        VTable = [[] for _ in range(POSI + 1)]        # Viterbi table
        VTable[0].append((None, self.BT, 0))          # left edge of the DP
        ut_cost = cost(PairFreq.get(self.UT, 0))

        for posi in range(1, POSI + 1):               # right end of the span
            for _from in range(posi):                 # left end of the span
                if not VTable[_from]:
                    continue
                # The cost of a pair does not depend on its predecessor, so
                # the cheapest predecessor is the best one for every pair
                # starting here.
                prev = min(VTable[_from], key=node_cost)
                kkci = sent[_from:posi]
                # .get() avoids inserting an empty list for every substring.
                for pair in Dict.get(kkci, ()):       # known words
                    logP = prev[2] + cost(PairFreq.get(pair, 0))
                    VTable[posi].append((prev, pair, logP))
                if posi - _from <= self.UTMAXLEN:     # unknown word
                    # UT itself, then each input symbol and the closing BT.
                    logP = (prev[2] + ut_cost
                            + (posi - _from + 1) * self.CharLogP)
                    VTable[posi].append((prev, kkci + '/' + self.UT, logP))

        if not VTable[POSI]:
            return []
        # The final transition to BT adds the same cost to every node, so
        # the cheapest node at the right end is the end of the best path.
        node = min(VTable[POSI], key=node_cost)

        # Walk the back-pointers to the left edge, then restore the order.
        result = []                                   # <word, kkci>+
        while node[0] is not None:
            result.append(node[1])
            node = node[0]
        result.reverse()
        return result
if __name__ == '__main__':
    # ---------------------------------------------------------------------
    # Command-line entry point: train on the corpus named by the single
    # argument, then convert every line read from standard input.
    # ---------------------------------------------------------------------
    argv = sys.argv
    # Exactly one argument is required, and it must not be "-help".
    if not (len(argv) == 2 and argv[1] != "-help"):
        Help.Help('./kkc-word-1.perl')
        quit()

    LC = argv[1]
    simple = SIMPLE(LC)

    # All of stdin is read before the first line is converted.
    inputs = sys.stdin.read().splitlines()
    for sent in inputs:
        result = simple.convert(sent.rstrip())
        # One line of space-separated pairs per input line.
        print(' '.join(result))
        # Alternative output (surface forms only):
        # print(''.join(map(lambda x:x.split('/')[0],result)))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment