Skip to content

Instantly share code, notes, and snippets.

@MarioZZJ
Created March 15, 2023 06:52
Show Gist options
  • Save MarioZZJ/586afa2b4923a78101eeb5e3c3117921 to your computer and use it in GitHub Desktop.
Save MarioZZJ/586afa2b4923a78101eeb5e3c3117921 to your computer and use it in GitHub Desktop.
采集 MeSH 树中所有 MeSH 词,保存为 json 和 csv。
#!/usr/bin/env python
"""
采集 MeSH 树中所有 MeSH 词,保存为 json 和 csv。
Author: MarioZZJ <zjzheng@smail.nju.edu.cn>
Usage:
python3 download_mesh_tree.py
"""
import requests
import json
import pandas as pd
from bs4 import BeautifulSoup
def get_childs(treecode):
    """
    Recursively collect every MeSH node below ``treecode`` (depth-first).

    Fetches ``https://meshb.nlm.nih.gov/api/tree/<treecode>``, parses the
    returned markup and builds one dict per ``<li>`` element. Each usable
    ``<li>`` must contain ``<a href="...ui=<UI>"><span>Name [tree.code]</span></a>``;
    items that do not match this shape are skipped. An ``<li>`` containing an
    ``<i>`` tag (presumably the "expandable" icon -- confirm against the live
    markup) is treated as having children and is crawled recursively.

    Side effect: appends ``[ui, name, treecode]`` for every node to the
    module-level list ``mesh_record``, which must exist before the call.

    Args:
        treecode: MeSH tree number (or root category letter) to expand.

    Returns:
        A list of dicts with keys ``name``, ``treecode``, ``ui`` and ``child``
        (``child`` is the recursively built list for that node).

    Raises:
        requests.RequestException: on timeout, connection failure, or a
            non-success HTTP status.
    """
    print('\r'+treecode, end=' #')
    childs = []
    # A timeout keeps one stalled connection from hanging the whole crawl.
    response = requests.get(
        "https://meshb.nlm.nih.gov/api/tree/" + treecode,
        timeout=30,
    )
    # Fail loudly: an error page parses to zero <li> items, which would
    # otherwise silently drop this entire subtree from the output.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    for li in soup.find_all("li"):
        # Extract the node data from the tag structure.
        a_tag = li.find('a')
        if a_tag is None:
            continue
        span_tag = a_tag.find('span')
        if span_tag is None:
            continue
        # The tree code is the LAST bracketed group, so split from the right;
        # descriptor names may themselves contain " [".
        name, sep, code = span_tag.text.rpartition(' [')
        if not sep:
            continue
        this_tree = {
            'name': name,
            'treecode': code.replace(']', ''),
            # Empty string when the link carries no "ui=" query parameter.
            'ui': a_tag.get('href', '').partition('ui=')[2],
            'child': [],
        }
        if li.find('i'):
            this_tree['child'] = get_childs(this_tree['treecode'])
        childs.append(this_tree)
        mesh_record.append([
            this_tree['ui'],
            this_tree['name'],
            this_tree['treecode'],
        ])
    return childs
if __name__ == '__main__':
    # Nested tree written to JSON; one dict per root category.
    mesh_tree = []
    # Flat [ui, name, treecode] rows written to CSV. get_childs() appends to
    # this list as a module-level global, so it must be defined before the
    # crawl starts.
    mesh_record = []
    # Root categories are hard-coded so no initial request is needed to
    # discover them; everything below them is crawled recursively.
    roots = {
        'A': 'Anatomy',
        'B': 'Organisms',
        'D': 'Chemicals and Drugs',
        'E': 'Analytical, Diagnostic and Therapeutic Techniques, and Equipment',
        'F': 'Psychiatry and Psychology',
        'G': 'Phenomena and Processes',
        'H': 'Disciplines and Occupations',
        'I': 'Anthropology, Education, Sociology, and Social Phenomena',
        'J': 'Technology, Industry, and Agriculture',
        'K': 'Humanities',
        'M': 'Named Groups',
        'N': 'Health Care',
        'V': 'Publication Characteristics',
        'Z': 'Geographicals'
    }
    for key, name in roots.items():
        # get_childs() performs the HTTP request itself; the previous extra
        # requests.get() here fetched the same URL and discarded the result.
        tree = {
            "treecode": key,
            "name": name,
            "ui": '',
            "child": get_childs(key),  # recursive crawl
        }
        mesh_tree.append(tree)
    # Save the results.
    with open('./mesh_tree.json', 'w') as f:
        json.dump(mesh_tree, f)
    pd.DataFrame(mesh_record,index=None,columns=['dUI','name','treecode']).to_csv('./mesh_tree.csv',header=True,index=False)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment