Created
March 15, 2023 06:52
-
-
Save MarioZZJ/586afa2b4923a78101eeb5e3c3117921 to your computer and use it in GitHub Desktop.
采集 MeSH 树中所有 MeSH 词,保存为 json 和 csv。
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
""" | |
采集 MeSH 树中所有 MeSH 词,保存为 json 和 csv。 | |
Author: MarioZZJ <zjzheng@smail.nju.edu.cn> | |
Usage: | |
python3 download_mesh_tree.py | |
""" | |
import requests | |
import json | |
import pandas as pd | |
from bs4 import BeautifulSoup | |
def get_childs(treecode, records=None, timeout=30):
    """Recursively fetch all descendants of a MeSH tree node (depth-first).

    MeSH is hierarchical, so a recursive function is enough to visit every
    node: each child that has an ``<i>`` element in its list item is expanded
    by a nested call before its own record is stored.

    Args:
        treecode: Tree code of the node whose children are requested
            (the script entry point passes the root letters such as ``'A'``).
        records: Optional list that receives one ``[ui, name, treecode]`` row
            per visited node. When omitted, the module-level ``mesh_record``
            list created by the script entry point is used.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of dicts with the keys ``name``, ``treecode``, ``ui`` and
        ``child``; ``child`` holds the same structure for the node's own
        children (empty when the node was not expanded).

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    if records is None:
        # Backward-compatible default: the script defines this list globally.
        records = mesh_record

    # '\r' rewrites the same console line so progress does not scroll.
    print('\r' + treecode, end=' #')

    response = requests.get(
        "https://meshb.nlm.nih.gov/api/tree/" + treecode,
        timeout=timeout,  # without a timeout a stalled connection hangs forever
    )
    # Fail loudly instead of parsing an error page into an empty subtree.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")
    childs = []
    for li in soup.find_all("li"):
        # Parse the node data out of the list item's markup.
        a_tag = li.find('a')
        label = a_tag.find('span').text
        # The label looks like "<name> [<treecode>]"; split on the LAST " ["
        # so a name that itself contains " [" is not truncated.
        name, _, code = label.rpartition(' [')
        this_tree = {
            'name': name,
            'treecode': code.replace(']', ''),
            'ui': a_tag.attrs["href"].split('ui=')[1],
            'child': [],
        }
        # NOTE(review): an <i> element presumably marks an expandable node;
        # confirm against the markup returned by the API.
        if li.find('i'):
            this_tree['child'] = get_childs(
                this_tree['treecode'], records, timeout
            )
        childs.append(this_tree)
        # Stored after the recursion, so descendants precede their parent.
        records.append([
            this_tree['ui'],
            this_tree['name'],
            this_tree['treecode'],
        ])
    return childs
if __name__ == '__main__':
    # Nested tree, one dict per root category.
    mesh_tree = []
    # Flat [ui, name, treecode] rows; get_childs() appends to this
    # module-level list while it crawls.
    mesh_record = []
    # The root nodes are hard-coded so no initial crawl is needed; everything
    # below them is fetched recursively.
    roots = {
        'A': 'Anatomy',
        'B': 'Organisms',
        'D': 'Chemicals and Drugs',
        'E': 'Analytical, Diagnostic and Therapeutic Techniques, and Equipment',
        'F': 'Psychiatry and Psychology',
        'G': 'Phenomena and Processes',
        'H': 'Disciplines and Occupations',
        'I': 'Anthropology, Education, Sociology, and Social Phenomena',
        'J': 'Technology, Industry, and Agriculture',
        'K': 'Humanities',
        'M': 'Named Groups',
        'N': 'Health Care',
        'V': 'Publication Characteristics',
        'Z': 'Geographicals',
    }
    for key, name in roots.items():
        # get_childs() performs the HTTP request itself, so no separate
        # (and previously unused) request is issued here.
        mesh_tree.append({
            "treecode": key,
            "name": name,
            "ui": '',
            "child": get_childs(key),  # recursive crawl
        })

    # Save the results.
    with open('./mesh_tree.json', 'w', encoding='utf-8') as f:
        json.dump(mesh_tree, f)
    pd.DataFrame(
        mesh_record,
        columns=['dUI', 'name', 'treecode'],
    ).to_csv('./mesh_tree.csv', header=True, index=False)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment