Created
September 15, 2023 19:30
-
-
Save jmkim/eea59a5128590c2359762054e1154a64 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from collections.abc import Sequence
from typing import NamedTuple, Optional

import orjson
# Path of the dictionary JSON dump that the module-level code opens and parses.
IN_FILE = "전체 내려받기_한국어기초사전_json_20230901/3_5000_20230901.json"
class WordEquivalent(NamedTuple):
    """Immutable record holding the fields of one "Equivalent" entry.

    Every field is optional and defaults to ``None``.
    """

    language: Optional[str] = None
    lemma: Optional[str] = None
    definition: Optional[str] = None
class WordExample(NamedTuple):
    """Immutable record holding the fields of one example entry.

    ``type`` defaults to ``None`` and ``example`` to an empty sequence.
    """

    type: Optional[str] = None
    # NamedTuple defaults are evaluated once and shared by every instance, so
    # the default must be immutable: an empty tuple rather than a list.
    example: Sequence[str] = ()
class WordForm(NamedTuple):
    """Immutable record holding the fields of one "WordForm" entry.

    Every field is optional and defaults to ``None``.
    """

    type: Optional[str] = None
    pronunciation: Optional[str] = None
    sound: Optional[str] = None
class WordType(NamedTuple):
    """Immutable record of entry-level attributes of a word.

    Every field is optional and defaults to ``None``.
    """

    homonym_number: Optional[str] = None
    lexicalUnit: Optional[str] = None
    partOfSpeech: Optional[str] = None
    vocabularyLevel: Optional[str] = None
    semanticCategory: Optional[str] = None
    # NOTE(review): probably a misspelling of "subjectCategory"; the name is
    # kept as-is because renaming a field would break existing callers.
    subjectCategiory: Optional[str] = None
    annotation: Optional[str] = None
    origin: Optional[str] = None
class WordMeaning(NamedTuple):
    """Immutable record of one sense (meaning) of a word.

    Scalar fields default to ``None``; collection fields default to an empty
    sequence.
    """

    id: Optional[int] = None
    definition: Optional[str] = None
    syntacticPattern: Optional[str] = None
    annotation: Optional[str] = None
    # NamedTuple defaults are created once and shared by every instance, so
    # the collection defaults are immutable empty tuples rather than lists.
    # NOTE(review): json_to_word in this file fills these fields with
    # flattened feat dicts rather than the record types named here; confirm
    # which shape is intended.
    equivalent: Sequence[WordEquivalent] = ()
    form: Sequence[WordForm] = ()
    type: Sequence[WordType] = ()
    examples: Sequence[WordExample] = ()
class Word(NamedTuple):
    """Immutable record of one dictionary headword and its meanings.

    ``writtenForm`` and ``variant`` default to ``None``; ``meanings`` defaults
    to an empty sequence.
    """

    writtenForm: Optional[str] = None
    variant: Optional[str] = None
    # An immutable default: a list here would be one object shared by every
    # Word created without explicit meanings.
    meanings: Sequence[WordMeaning] = ()
# Wrap the value in a list when it is not already a list
def flatten_arrays(arr_or_obj):
    """Return *arr_or_obj* unchanged if it is a list, else a one-item list holding it.

    Only ``list`` instances pass through as-is; tuples, dicts, strings and
    ``None`` are all wrapped.
    """
    return arr_or_obj if isinstance(arr_or_obj, list) else [arr_or_obj]
# Original feat in the dictionary : { "feat": [ { "att": KEY, "val": VALUE } ] }
# Flattened result                : { KEY: VALUE }
def flatten_feats(feats):
    """Collapse a feat container into a plain ``{att: val}`` dict.

    Args:
        feats: A mapping with a ``"feat"`` key whose value is one att/val
            object or a list of them, or a list of such mappings.

    Returns:
        A dict mapping every ``"att"`` to its ``"val"`` (later duplicates
        overwrite earlier ones), or ``None`` when *feats* is ``None`` or empty.

    Raises:
        KeyError: If a mapping has no ``"feat"`` key, or a feat object lacks
            ``"att"`` or ``"val"``.
    """
    if feats is None or len(feats) <= 0:
        return None

    flat_feats = {}

    # Irregular shape: several feat containers wrapped in a list. Flatten each
    # container and merge the results.
    # Example: [ { "feat": {} }, { "feat": {} } ]
    if isinstance(feats, list) and feats[0].get("feat") is not None:
        for container in feats:
            nested = flatten_feats(container)
            # An empty container flattens to None; merging None into a dict
            # raises TypeError, so such elements are skipped.
            if nested:
                flat_feats.update(nested)
        return flat_feats

    # Regular shape: { "feat": [ {}, {} ] } (a lone feat object is accepted too)
    for feat in flatten_arrays(feats["feat"]):
        flat_feats[feat["att"]] = feat["val"]
    return flat_feats
def _flatten_each(containers):
    """Flatten every feat container in *containers* (a lone object or a list of them)."""
    return [flatten_feats(container) for container in flatten_arrays(containers)]


def _sense_to_meaning(sense, entry):
    """Build one WordMeaning from a "Sense" object and the entry that owns it."""
    sense_feats = flatten_feats(sense)
    entry_feat = entry.get("feat")
    # An entry without "feat" would make flatten_feats index into None, so the
    # entry-level attributes are reported as None in that case.
    word_type = None
    if entry_feat is not None:
        word_type = flatten_feats({"feat": entry_feat})
    return WordMeaning(
        id=int(sense["val"]),
        definition=sense_feats.get("definition"),
        syntacticPattern=sense_feats.get("syntacticPattern"),
        annotation=sense_feats.get("annotation"),
        # Each child may be a single object or a list; _flatten_each accepts both.
        equivalent=_flatten_each(sense.get("Equivalent", [])),
        form=_flatten_each(sense.get("WordForm", [])),
        type=word_type,
        examples=_flatten_each(sense.get("SenseExample", [])),
    )


# Convert the dictionary JSON into Word objects
def json_to_word(json):
    """Convert a parsed dictionary document into ``Word`` records.

    Walks ``json["LexicalResource"]``: the flattened "GlobalInformation" feats
    are printed, and every "LexicalEntry" under "Lexicon" is turned into a
    ``Word`` that is printed and collected.

    Args:
        json: The parsed JSON document (a dict with a "LexicalResource" key).

    Returns:
        A list of the ``Word`` objects built, in document order.

    Raises:
        KeyError: If an entry lacks "Lemma" or "Sense", or a sense lacks "val"
            or "feat".
    """
    words = []
    for section, payload in json.get("LexicalResource").items():
        if section == "GlobalInformation":
            print(flatten_feats(payload))
            continue
        if section != "Lexicon":
            continue

        print(section)
        for lex_key, lex_value in payload.items():
            if lex_key != "LexicalEntry":
                continue
            print(lex_key)
            # A lone entry object is wrapped so it is not iterated key by key.
            for entry in flatten_arrays(lex_value):
                lemma = flatten_feats(entry["Lemma"])
                word = Word(
                    writtenForm=lemma.get("writtenForm"),
                    variant=lemma.get("variant"),
                    meanings=[
                        _sense_to_meaning(sense, entry)
                        for sense in flatten_arrays(entry["Sense"])
                    ],
                )
                print(word)
                words.append(word)
    return words
# Read the whole dictionary dump, parse it, and run the conversion.
with open(IN_FILE, "r", encoding="utf-8") as f:
    raw_text = f.read()

json_obj = orjson.loads(raw_text)
json_to_word(json_obj)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment