Last active
December 11, 2018 14:52
-
-
Save wannaphong/9dd03f2347424d1c2d4944309d67f4a3 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Apache License 2.0 | |
file_name="data" # ชื่อไฟล์คลังข้อมูล | |
import codecs | |
from pythainlp.tokenize import word_tokenize | |
from pythainlp.tag import pos_tag | |
from nltk.tokenize import RegexpTokenizer | |
import glob | |
import nltk | |
import re | |
#จัดการประโยคซ้ำ | |
data_not=[] | |
def Unique(p): | |
text=re.sub("<[^>]*>","",p) | |
text=re.sub("\[(.*?)\]","",text) | |
text=re.sub("\[\/(.*?)\]","",text) | |
if text not in data_not: | |
data_not.append(text) | |
return True | |
else: | |
return False | |
# เตรียมตัวตัด tag ด้วย re | |
pattern = r'\[(.*?)\](.*?)\[\/(.*?)\]' | |
tokenizer = RegexpTokenizer(pattern) # ใช้ nltk.tokenize.RegexpTokenizer เพื่อตัด [TIME]8.00[/TIME] ให้เป็น ('TIME','ไง','TIME') | |
# จัดการกับ tag ที่ไม่ได้ tag | |
def toolner_to_tag(text): | |
text=text.strip() | |
text=re.sub("<[^>]*>","",text) | |
text=re.sub("(\[\/(.*?)\])","\\1***",text)#.replace('(\[(.*?)\])','***\\1')# ตัดการกับพวกไม่มี tag word | |
text=re.sub("(\[\w+\])","***\\1",text) | |
text2=[] | |
for i in text.split('***'): | |
if "[" in i: | |
text2.append(i) | |
else: | |
text2.append("[word]"+i+"[/word]") | |
text="".join(text2)#re.sub("[word][/word]","","".join(text2)) | |
return text.replace("[word][/word]","") | |
# แปลง text ให้เป็น conll2002 | |
def text2conll2002(text,pos=True): | |
""" | |
ใช้แปลงข้อความให้กลายเป็น conll2002 | |
""" | |
text=toolner_to_tag(text) # นำไปใส่ tag [word] | |
text=text.replace("''",'"') | |
text=text.replace("’",'"').replace("‘",'"')#.replace('"',"") | |
tag=tokenizer.tokenize(text) # แยก tag ออกมาจากข้อความ | |
j=0 | |
conll2002="" # ประกาศตัวแปรเก็บ conll2002 | |
for tagopen,text,tagclose in tag: # ลูปใน tag โดยเป็น (tagopen,text,tagclose) | |
word_cut=word_tokenize(text,engine="newmm") # ใช้ตัวตัดคำ newmm ของ PyThaiNLP | |
i=0 | |
txt5="" | |
while i<len(word_cut): #ลูปตามจำนวน token ที่ตัดในtag | |
if word_cut[i]=="''" or word_cut[i]=='"':pass | |
elif i==0 and tagopen!='word': # ไม่เป็น tag [word] และเป็น i หรือตัวเริ่มต้น tag | |
txt5+=word_cut[i] | |
txt5+='\t'+'B-'+tagopen | |
elif tagopen!='word': | |
txt5+=word_cut[i] | |
txt5+='\t'+'I-'+tagopen | |
else: # เป็น [word] | |
txt5+=word_cut[i] | |
txt5+='\t'+'O' | |
txt5+='\n' | |
#j+=1 | |
i+=1 | |
conll2002+=txt5 | |
if pos==False: | |
return conll2002 | |
return postag(conll2002) # เพิ่ม postag ใส่ | |
# ใช้สำหรับกำกับ pos tag เพื่อใช้กับ NER | |
# print(text2conll2002(t,pos=False)) | |
def postag(text): | |
listtxt=[i for i in text.split('\n') if i!=''] | |
list_word=[] | |
for data in listtxt: | |
list_word.append(data.split('\t')[0]) | |
#print(text) | |
list_word=pos_tag(list_word,engine="perceptron") | |
text="" | |
i=0 | |
for data in listtxt: | |
text+=data.split('\t')[0]+'\t'+list_word[i][1]+'\t'+data.split('\t')[1]+'\n' | |
i+=1 | |
return text | |
# อ่านข้อมูลจากไฟล์ | |
def get_data(fileopen): | |
""" | |
สำหรับใช้อ่านทั้งหมดทั้งในไฟล์ทีละรรทัดออกมาเป็น list | |
""" | |
with codecs.open(fileopen, 'r',encoding='utf-8-sig') as f: | |
lines = f.read().splitlines() | |
return [a for a in lines if Unique(a)] # เอาไม่ซ้ำกัน | |
def alldata(lists): | |
text="" | |
for data in lists: | |
text+=text2conll2002(data) | |
text+='\n' | |
return text | |
def alldata_list(lists): | |
data_all=[] | |
for data in lists: | |
data_num=[] | |
try: | |
txt=text2conll2002(data,pos=True).split('\n') # นำไปแปลงเป็น conll2002 | |
for d in txt: | |
tt=d.split('\t') | |
if d!="": | |
if len(tt)==3: | |
data_num.append((tt[0],tt[1],tt[2])) | |
else: | |
data_num.append((tt[0],tt[1])) | |
#print(data_num) | |
data_all.append(data_num) | |
except: | |
print(data) | |
#print(data_all) | |
return data_all | |
def alldata_list_str(lists): | |
string="" | |
for data in lists: | |
string1="" | |
for j in data: | |
string1+=j[0]+" "+j[1]+" "+j[2]+"\n" | |
string1+="\n" | |
string+=string1 | |
return string | |
def get_data_tag(listd): | |
list_all=[] | |
c=[] | |
for i in listd: | |
if i !='': | |
c.append((i.split("\t")[0],i.split("\t")[1],i.split("\t")[2])) | |
else: | |
list_all.append(c) | |
c=[] | |
return list_all | |
def getall(lista): | |
ll=[] | |
for i in lista: | |
o=True | |
for j in ll: | |
if re.sub("\[(.*?)\]","",i)==re.sub("\[(.*?)\]","",j): | |
o=False | |
break | |
if o==True: | |
ll.append(i) | |
return ll | |
data1=getall(get_data(file_name+".txt")) # นำคลังเข้าไป แยกออกเป็น list ละบรรทัด | |
datatofile=alldata_list(data1) # นำไปผ่านขั้นตอน 1 2 3 4 | |
tt=[] | |
with open(file_name+"-pos.conll","w") as f: | |
i=0 | |
while i<len(datatofile): | |
for j in datatofile[i]: | |
f.write(j[0]+"\t"+j[1]+"\t"+j[2]+"\n") | |
if i+1<len(datatofile): | |
f.write("\n") | |
i+=1 | |
with open(file_name+".conll","w") as f: | |
i=0 | |
while i<len(datatofile): | |
for j in datatofile[i]: | |
f.write(j[0]+"\t"+j[2]+"\n") | |
if i+1<len(datatofile): | |
f.write("\n") | |
i+=1 | |
from sklearn_crfsuite import scorers,metrics | |
from sklearn.metrics import make_scorer | |
from sklearn.model_selection import cross_validate,train_test_split | |
import sklearn_crfsuite | |
def doc2features(doc, i): | |
word = doc[i][0] | |
postag = doc[i][1] | |
# Features from current word | |
features={ | |
'word.word': word, | |
'word.isspace':word.isspace(), | |
'postag':postag, | |
'word.isdigit()': word.isdigit() | |
} | |
if i > 0: | |
prevword = doc[i-1][0] | |
postag1 = doc[i-1][1] | |
features['word.prevword'] = prevword | |
features['word.previsspace']=prevword.isspace() | |
features['word.prepostag'] = postag1 | |
features['word.prevwordisdigit'] = prevword.isdigit() | |
else: | |
features['BOS'] = True # Special "Beginning of Sequence" tag | |
# Features from next word | |
if i < len(doc)-1: | |
nextword = doc[i+1][0] | |
postag1 = doc[i+1][1] | |
features['word.nextword'] = nextword | |
features['word.nextisspace']=nextword.isspace() | |
features['word.nextpostag'] = postag1 | |
features['word.nextwordisdigit'] = nextword.isdigit() | |
else: | |
features['EOS'] = True # Special "End of Sequence" tag | |
return features | |
def extract_features(doc): | |
return [doc2features(doc, i) for i in range(len(doc))] | |
def get_labels(doc): | |
return [tag for (token,postag,tag) in doc] | |
X_data = [extract_features(doc) for doc in datatofile] # เอา คำ แยกออกมา | |
y_data = [get_labels(doc) for doc in datatofile] # เอา tag แยกออกมา | |
X, X_test, y, y_test = train_test_split(X_data, y_data, test_size=0.1) # แบ่ง 0.1 หรือ 10% | |
crf = sklearn_crfsuite.CRF( | |
algorithm='lbfgs', | |
c1=0.1, | |
c2=0.1, | |
max_iterations=500, | |
all_possible_transitions=True, | |
model_filename=file_name+"-pos.model0" # ตั้งชื่อโมเดล | |
) | |
crf.fit(X, y); # train | |
labels = list(crf.classes_) | |
labels.remove('O') | |
y_pred = crf.predict(X_test) | |
e=metrics.flat_f1_score(y_test, y_pred, | |
average='weighted', labels=labels) | |
print(e) # โชว์ประสิทธิภาพ | |
sorted_labels = sorted( | |
labels, | |
key=lambda name: (name[1:], name[0]) | |
) | |
print(metrics.flat_classification_report( | |
y_test, y_pred, labels=sorted_labels, digits=3 | |
)) | |
def get_ner(text): | |
word_cut=word_tokenize(text,engine="newmm") | |
list_word=pos_tag(word_cut,engine='perceptron') | |
X_test = extract_features([(data,list_word[i][1]) for i,data in enumerate(word_cut)]) | |
y_=crf.predict_single(X_test) | |
return [(word_cut[i],list_word[i][1],data) for i,data in enumerate(y_)] | |
while True: | |
t=input("Text : ") | |
print(get_ner(t)) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment