Using biobert-large-cased to represent paper title and abstract text
from transformers import AutoConfig, AutoModel, AutoTokenizer
import pandas as pd
import torch
import csv
import argparse
from tqdm import tqdm

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--year', type=int, default=1991)
    parser.add_argument('--cache_dir', type=str, default='/Public/huggingface')
    parser.add_argument('--model_name', type=str, default='dmis-lab/biobert-large-cased-v1.1')
    parser.add_argument('--proxy_ip', type=str, default='127.0.0.1')
    parser.add_argument('--proxy_port', type=int, default=10809)
    parser.add_argument('--batch_size', type=int, default=16)
    args = parser.parse_args()

    year = args.year
    proxy_ip = args.proxy_ip
    proxy_port = args.proxy_port
    model_name = args.model_name
    cache_dir = args.cache_dir
    proxies = {
        'http': f'http://{proxy_ip}:{proxy_port}',
        'https': f'http://{proxy_ip}:{proxy_port}'
    }
    batch_size = args.batch_size

    # load huggingface model (from local cache) and move it to the GPU
    config = AutoConfig.from_pretrained(model_name, proxies=proxies, cache_dir=cache_dir, local_files_only=True)
    tokenizer = AutoTokenizer.from_pretrained(model_name, proxies=proxies, cache_dir=cache_dir, local_files_only=True)
    model = AutoModel.from_pretrained(model_name, proxies=proxies, config=config, cache_dir=cache_dir, local_files_only=True)
    model = model.to('cuda')
    model.eval()

    # load and parse data
    data = pd.read_csv(f'data/IEK4D/pmid_title_abstract/{year}.tsv', sep='\t', quoting=csv.QUOTE_ALL, quotechar='"', escapechar='\\')
    data = data.dropna(axis=0, subset=['pmid', 'publish_year'])
    data = data.fillna(' ')
    data['publish_year'] = data['publish_year'].astype(int)

    def embed_batch(texts):
        # tokenize, run the encoder, and mean-pool the last hidden states over non-padding tokens
        batch_dict = tokenizer(texts, return_tensors='pt', padding='max_length', truncation=True, max_length=512).to('cuda')
        with torch.no_grad():
            outputs = model(**batch_dict)
        last_hidden_states = outputs.last_hidden_state
        attention_mask = batch_dict['attention_mask']
        last_hidden = last_hidden_states.masked_fill(~attention_mask.unsqueeze(-1).bool(), 0.0)
        batch_embedding = last_hidden.sum(dim=1) / attention_mask.sum(dim=1).unsqueeze(-1)
        return batch_embedding.detach().cpu()

    # embed documents (title + abstract) in batches
    pmids = []
    embs = torch.tensor([])
    batch_text = []
    batch_pmids = []
    for _, row in tqdm(data.iterrows(), total=len(data), desc=f'Embedding {year}'):
        pmid = row['pmid']
        title = row['title']
        abstract = row['abstract']
        text = title + ' ' + abstract
        batch_text.append(text)
        batch_pmids.append(pmid)
        if len(batch_pmids) < batch_size:
            continue
        embs = torch.cat((embs, embed_batch(batch_text)), dim=0)
        pmids.extend(batch_pmids)
        batch_text = []
        batch_pmids = []

    # flush the final partial batch so trailing documents are not dropped
    if batch_pmids:
        embs = torch.cat((embs, embed_batch(batch_text)), dim=0)
        pmids.extend(batch_pmids)

    # save embeddings (embs is already a tensor; no need to round-trip through numpy)
    torch.save(embs, f'data/IEK4D/embedding/text/document/{year}_emb.pt')
    torch.save(pmids, f'data/IEK4D/embedding/text/document/{year}_pmids.pt')
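For reference, a minimal sketch of reading the saved outputs back (paths and the example year value are taken from the script above; adjust as needed):

import torch

year = 1991
embs = torch.load(f'data/IEK4D/embedding/text/document/{year}_emb.pt')     # tensor of shape [num_docs, hidden_size]
pmids = torch.load(f'data/IEK4D/embedding/text/document/{year}_pmids.pt')  # list of PMIDs, row-aligned with embs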
It seems that using the [CLS] token to represent the whole document for downstream classification is the more common practice? E.g. SPECTER.
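For comparison, a minimal sketch of [CLS] pooling under that alternative, reusing the tokenizer, model, and batch_text names from the script above; this illustrates the suggestion and is not the script's own pooling method:

# assumes tokenizer, model, and batch_text as defined in the script above
batch_dict = tokenizer(batch_text, return_tensors='pt', padding=True, truncation=True, max_length=512).to('cuda')
with torch.no_grad():
    outputs = model(**batch_dict)
# take the hidden state of the first ([CLS]) token as the document embedding
cls_embedding = outputs.last_hidden_state[:, 0, :].detach().cpu()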
This has been optimized.